 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfparser import PDFParser
+from pdfminer.pdftypes import PDFObject
 from pytesseract import TesseractNotFoundError, TesseractError
 from skimage import io
 from skimage.feature import canny
 from skimage.transform import hough_line, hough_line_peaks, rotate

-from PDFScraper.dataStructure import Document, Documents
+from PDFScraper.dataStructure import Document


 # Set up logger
 log_level = 20
 logger.setLevel(log_level)


-def find_pdfs_in_path(docs: Documents, path: str):
+def find_pdfs_in_path(path: str):
+    pdfs = []
     if os.path.exists(path):
         if os.path.isdir(path):  # find PDFs in directory and add them to the list
             count = 0
             for f in os.listdir(path):
                 count += 1
-                find_pdfs_in_path(docs, path + '/' + f)
+                pdfs.extend(find_pdfs_in_path(path + '/' + f))
+
         elif os.path.isfile(path) and path.endswith(".pdf"):
+            pdfs.append(Document(path, True))
-            docs.num_docs += 1
-            docs.docs.append(Document(path, docs, True))
         elif os.path.isfile(path) and (path.endswith(".bmp") or path.endswith(".jpg") or path.endswith(".pbm")
                                        or path.endswith(".pgm") or path.endswith(".ppm") or path.endswith(".jpeg")
                                        or path.endswith(".jpe") or path.endswith(".jp2") or path.endswith(".tiff")
                                        or path.endswith(".tif") or path.endswith(".png")):
-            docs.num_docs += 1
-            docs.docs.append(Document(path, docs, False))
+            pdfs.append(Document(path, False))

     else:
         raise Exception("Provided path does not exist")
+    return pdfs


 # Get filename from path
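A quick usage sketch of the refactored helper (a hypothetical caller, not part of the commit): find_pdfs_in_path now returns the collected list instead of mutating a shared Documents container, so results from nested directories must be gathered from the return value.

    import os

    # collect Document objects for every PDF and supported image under a directory
    documents = find_pdfs_in_path(os.path.expanduser("~/scans"))
    for doc in documents:
        print(doc.path, doc.is_pdf)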
@@ -72,7 +74,7 @@ def pdf_to_image(document: Document):
     except FileExistsError:
         pass

-    if document.isPDF:
+    if document.is_pdf:
         pages = pdf2image.convert_from_path(pdf_path=document.path, dpi=300)
         # TODO: implement saving to temp dir with mkstemp for better security
         for i in range(len(pages)):
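The TODO above could be addressed roughly as follows (a sketch only, assuming the pages are saved as JPEGs): tempfile.mkstemp creates each file atomically with an unpredictable name, so another process cannot guess or pre-create the path.

    import os
    import tempfile

    import pdf2image

    pages = pdf2image.convert_from_path(pdf_path="input.pdf", dpi=300)
    for page in pages:
        # mkstemp returns an open descriptor plus a unique path in the temp dir
        fd, image_path = tempfile.mkstemp(suffix=".jpg", prefix="PDFScraper-")
        with os.fdopen(fd, "wb") as out:
            page.save(out, format="JPEG")  # PIL images accept an open file object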
@@ -116,10 +118,10 @@ def image_resize(image, width=None, height=None, inter=cv2.INTER_AREA):


 # determine skew angle of image
-def determine_skew(image):
-    edges = canny(image, sigma=3.0)
+def determine_skew(image, sigma=3.0, num_peaks=20):
+    edges = canny(image, sigma=sigma)
     h, a, d = hough_line(edges)
-    _, ap, _ = hough_line_peaks(h, a, d, num_peaks=20)
+    _, ap, _ = hough_line_peaks(h, a, d, num_peaks=num_peaks)

     if len(ap) == 0:
         return 0
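determine_skew estimates page rotation by detecting edges with Canny, voting line candidates in Hough space, and keeping the angles of the strongest peaks; the most frequent angle (see get_max_freq_elem below) is then treated as the skew. The commit lifts sigma and num_peaks into parameters so callers can tune the edge-detector smoothing and the number of candidate lines. A hedged usage sketch:

    from skimage import io
    from skimage.color import rgb2gray

    # determine_skew expects a grayscale scikit-image array; a noisier scan
    # may need a larger sigma, a sparse page fewer peaks
    page = rgb2gray(io.imread("page_0.jpg"))
    angle = determine_skew(page, sigma=3.0, num_peaks=20)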
@@ -207,7 +209,6 @@ def get_max_freq_elem(arr):

 # Apply deskewing to the image
 def deskew(image):
     angle = determine_skew(image)
-
     if 0 <= angle <= 90:
         rot_angle = angle - 90
     if -45 <= angle < 0:
@@ -224,8 +225,8 @@ def preprocess_image(image):
     # RGB to grayscale
     image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
     # Thresholding
-    image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
-
+    # image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
+    image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
     # save and reread to convert to scikit-image image type
     temp_image_path = tempfile.gettempdir() + "/PDFScraper" + "/" + "deskew.jpg"
     cv2.imwrite(temp_image_path, image)
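The commit swaps the global Otsu threshold for adaptive Gaussian thresholding. Otsu picks one threshold for the whole page, which breaks down when scan lighting is uneven; the adaptive variant thresholds each pixel against the Gaussian-weighted mean of its 11x11 neighbourhood minus the constant 2. A side-by-side sketch of the two calls:

    import cv2

    gray = cv2.imread("page.jpg", cv2.IMREAD_GRAYSCALE)

    # global threshold chosen by Otsu's method (the replaced approach)
    _, global_bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # per-pixel threshold from a Gaussian-weighted 11x11 neighbourhood, offset by 2
    adaptive_bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                        cv2.THRESH_BINARY, 11, 2)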
@@ -250,13 +251,16 @@ def convert_to_pdf(document: Document, tessdata_location: str, config_options=""
         # Resize input image if not PDF
         # if not document.isPDF:
         #     img = image_resize(img, width=1024)
-        img = preprocess_image(img)

-        # Extract testing using OCR
+        img = preprocess_image(img)

-        # Extract language only from the first page
+        # Extract language from the first page only
         if i == 0:
             language = get_language(img, tessdata_location)
+            # fall back to English unless the detected language is English or Slovene
+            if language not in ("eng", "slv"):
+                language = "eng"
+
         try:
             # uses provided config if available
             if config_options == "":
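A hedged sketch of the OCR step this function presumably wraps (the project's exact call is not shown in this hunk): pytesseract can render a searchable PDF layer for a page image, with lang set to the code detected above ("eng" after the fallback).

    import pytesseract
    from PIL import Image

    image = Image.open("page_0.jpg")
    # returns the bytes of a searchable PDF for this page
    pdf_bytes = pytesseract.image_to_pdf_or_hocr(image, extension="pdf", lang="eng")
    with open("page_0_ocr.pdf", "wb") as out:
        out.write(pdf_bytes)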
@@ -311,44 +315,46 @@ def get_language(img, tessdata_location: str):


 # parses Document to PDFDocument
 def get_pdf_object(document: Document):
+    if document.filename is None:
+        get_filename(document)
     # use OCR processed file if available
     file = open(document.ocr_path, 'rb')
     parser = PDFParser(file)
-    document.doc = PDFDocument(parser)
-    parser.set_document(document.doc)
+    pdf_object = PDFDocument(parser)
+    parser.set_document(pdf_object)

-    if document.doc.is_extractable:
+    if pdf_object.is_extractable:
         document.extractable = True
+    return pdf_object


 def extract_info(document: Document):
-    if document.isPDF:
+    if document.filename is None:
+        get_filename(document)
+    if document.is_pdf:
         with open(document.path, 'rb') as f:
             pdf = PdfFileReader(f, strict=False)
             # TODO: Handle encrypted files

             document.num_pages = pdf.getNumPages()
-            info = pdf.getDocumentInfo()
-            if info is not None:
-                document.author = "unknown" if not info.author else info.author
-                document.creator = "unknown" if not info.creator else info.creator
-                document.producer = "unknown" if not info.producer else info.producer
-                document.subject = "unknown" if not info.subject else info.subject
-                document.title = "unknown" if not info.title else info.title
+            informations = pdf.getDocumentInfo()
+            if informations is not None:
+                document.info.author = "unknown" if not informations.author else informations.author
+                document.info.creator = "unknown" if not informations.creator else informations.creator
+                document.info.producer = "unknown" if not informations.producer else informations.producer
+                document.info.subject = "unknown" if not informations.subject else informations.subject
+                document.info.title = "unknown" if not informations.title else informations.title
     else:
         document.num_pages = 1
-        document.author = "unknown"
-        document.creator = "unknown"
-        document.producer = "unknown"
-        document.subject = "unknown"
-        document.title = "unknown"
+        document.info.author = "unknown"
+        document.info.creator = "unknown"
+        document.info.producer = "unknown"
+        document.info.subject = "unknown"
+        document.info.title = "unknown"

 # layout analysis for every page
-def extract_page_layouts(document: Document, config_options="line_margin=0.8"):
-    # calls get_pdf_object if document.doc, which contains PDFObject, is empty
-    if document.doc is None:
-        get_pdf_object(document)
+def extract_page_layouts(pdf_object: PDFObject, config_options="line_margin=0.8"):
     # converts config_options from a comma-separated string to a dict so it can be passed as **kwargs to LAParams
     args = dict(e.split('=') for e in config_options.split(','))
     for key in args:
@@ -360,15 +366,17 @@ def extract_page_layouts(document: Document, config_options="line_margin=0.8"):
     laparams = LAParams(**args)
     page_aggregator = PDFPageAggregator(resource_manager, laparams=laparams)
     interpreter = PDFPageInterpreter(resource_manager, page_aggregator)
-    for page in PDFPage.create_pages(document.doc):
+    page_layouts = []
+    for page in PDFPage.create_pages(pdf_object):
         interpreter.process_page(page)
-        document.page_layouts.append(page_aggregator.get_result())
+        page_layouts.append(page_aggregator.get_result())
+    return page_layouts


-def extract_table_of_contents(document: Document):
+def extract_table_of_contents(document: Document, pdf_object):
     try:
-        for (level, title, dest, a, se) in document.doc.get_outlines():
-            document.table_of_contents.append((level, title))
+        for (level, title, dest, a, se) in pdf_object.get_outlines():
+            document.info.table_of_contents.append((level, title))
     except PDFNoOutlines:
         logger.warning("Could not get table of contents for document at path " + document.path)
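The new document.info attribute implies the per-document metadata moved into a nested structure inside PDFScraper.dataStructure. Its real definition is not part of this diff; an assumed shape, for orientation only:

    from dataclasses import dataclass, field
    from typing import List, Tuple

    @dataclass
    class DocumentInfo:  # assumed shape, not the actual source
        author: str = "unknown"
        creator: str = "unknown"
        producer: str = "unknown"
        subject: str = "unknown"
        title: str = "unknown"
        table_of_contents: List[Tuple[int, str]] = field(default_factory=list)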
@@ -388,19 +396,11 @@ def doOverlap(l1, r1, l2, r2):


 # extracts LTTextBoxHorizontal and LTImage from layouts
-def parse_layouts(document: Document, preserve_pdfminer_structure=True, config_options=""):
+def parse_layouts(document: Document, page_layouts):
     count = 1
-    # perform layout analysis if document.page_layouts is empty
-    if len(document.page_layouts) == 0:
-        extract_page_layouts(document, config_options)
-
-    for page_layout in document.page_layouts:
+    for page_layout in page_layouts:
         parse_elements(document, page_layout, count)
         count = count + 1
-    # keep data structure small by deleting pdfminer objects, which are not needed anymore
-    if not preserve_pdfminer_structure:
-        page_layout = []
-        document.doc = None


 # Recursively iterate over all the lt elements from pdfminer.six
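Taken together, the refactor replaces hidden state on Document (the old doc and page_layouts attributes) with values passed explicitly between calls. A plausible call sequence under the new signatures (order inferred from this diff, not taken from the project's entry point):

    for document in find_pdfs_in_path("/path/to/input"):
        pdf_object = get_pdf_object(document)            # pdfminer PDFDocument
        extract_info(document)                           # fills document.info metadata
        extract_table_of_contents(document, pdf_object)
        page_layouts = extract_page_layouts(pdf_object)
        parse_layouts(document, page_layouts)            # consumes the returned layouts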
@@ -435,26 +435,27 @@ def parse_elements(document, page_layout, page):
             parse_elements(document, el, page)


-def extract_tables(document: Document, output_path: str, config_options="flavor=lattice"):
+def extract_tables(document: Document, config_options="pages=all,flavor=lattice,parallel=True"):
     # converts config_options from a comma-separated string to a dict so it can be passed as **kwargs to camelot
     args = dict(e.split('=') for e in config_options.split(','))
     for key in args:
         try:
             args[key] = int(args[key])
         except ValueError:
             pass
-    tables = camelot.read_pdf(document.path, pages='1-' + str(document.num_pages), **args)
+    # use new OCR path if available
+    tables = camelot.read_pdf(document.ocr_path, **args)
     # remove tables with bad accuracy
     tables = [table for table in tables if table.accuracy > 90]
     document.tables = tables


-def find_words_paragraphs(document: Document, search_mode, search_word, match_score):
+def find_words_paragraphs(paragraphs, search_mode, search_words, match_score):
     result = []
-    for paragraph in document.paragraphs:
+    for paragraph in paragraphs:
         # split paragraph into sentences.
         split = paragraph.split(".")
-        for word in search_word.split(","):
+        for word in search_words:
             found = False
             for string in split:
                 if (len(word) <= len(string)) and fuzz.partial_ratio(word, string) > match_score:
@@ -471,16 +472,16 @@ def find_words_paragraphs(document: Document, search_mode, search_word, match_sc
     return result


-def find_words_tables(document: Document, search_mode, search_word, match_score):
+def find_words_tables(tables, search_mode, search_words, match_score):
     result = []
-    for table in document.tables:
+    for table in tables:
         table.df[0].str.strip('.!? \n\t ')
         # perform fuzzy search over all columns
         found = False
         for i in range(0, table.shape[1]):
             if found:
                 break
-            for x in process.extract(search_word, table.df[i].astype(str).values.tolist(),
+            for x in process.extract(search_words[0], table.df[i].astype(str).values.tolist(),
                                      scorer=fuzz.partial_ratio):
                 if x[1] > 80:
                     found = True
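Both search functions now take their inputs directly: find_words_paragraphs receives the paragraph list and an already-split list of search words, while find_words_tables queries only the first search word (search_words[0]). A minimal sketch of the matching primitive they share (fuzz and process may come from fuzzywuzzy or rapidfuzz; the calls are the same either way):

    from fuzzywuzzy import fuzz, process

    sentence = "Total invoice amount: 1,200 EUR"
    print(fuzz.partial_ratio("invoice amount", sentence))  # high score for a near-substring match

    # process.extract ranks a list of candidates by the chosen scorer
    print(process.extract("invoice", [sentence, "Delivery note"], scorer=fuzz.partial_ratio))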