@@ -226,7 +226,7 @@ def preprocess_image(image):
226
226
227
227
228
228
# Preprocess the images for OCR then extract them
229
- def convert_to_pdf (document : Document , tessdata_location : str ):
229
+ def convert_to_pdf (document : Document , tessdata_location : str , config_options = "" ):
230
230
pdf_pages = []
231
231
for i in range (document .num_pages ):
232
232
img = cv2 .imread (tempfile .gettempdir () + "/PDFScraper" + "/" + document .filename + "_" + str (i ) + ".jpg" )
@@ -243,7 +243,9 @@ def convert_to_pdf(document: Document, tessdata_location: str):
243
243
if i == 0 :
244
244
language = get_language (img , tessdata_location )
245
245
try :
246
- config_options = '--psm 1 --tessdata-dir ' + tessdata_location
246
+ # uses provided config if available
247
+ if config_options == "" :
248
+ config_options = '--psm 1 --tessdata-dir ' + tessdata_location
247
249
text = pytesseract .image_to_pdf_or_hocr (img , extension = 'pdf' , lang = language , config = config_options )
248
250
with open (tempfile .gettempdir () + "/PDFScraper" + "/" + document .filename + "_" + str (i ) + ".pdf" ,
249
251
'w+b' ) as f :
@@ -334,9 +336,22 @@ def extract_info(document: Document):
334
336
335
337
336
338
# layout analysis for every page
337
- def extract_page_layouts (document : Document ):
339
+ def extract_page_layouts (document : Document , config_options = "" ):
340
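+ # calls get_pdf_object if document.doc, which contains the PDFObject, is empty (NOTE(review): the check below tests `is not None`, the opposite of this - confirm which is intended)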
341
+ if document .doc is not None :
342
+ get_pdf_object (document )
343
+ # use config_options if specified
344
+ if config_options == "" :
345
+ config_options = "line_margin=0.8"
346
+ # converts config_options, a comma-separated string of key=value pairs, to a dictionary so it can be passed as **kwargs to LAParams
347
+ args = dict (e .split ('=' ) for e in config_options .split (',' ))
348
+ for key in args :
349
+ try :
350
+ args [key ] = float (args [key ])
351
+ except ValueError :
352
+ pass
338
353
resource_manager = PDFResourceManager ()
339
- laparams = LAParams (line_margin = 0.8 )
354
+ laparams = LAParams (** args )
340
355
page_aggregator = PDFPageAggregator (resource_manager , laparams = laparams )
341
356
interpreter = PDFPageInterpreter (resource_manager , page_aggregator )
342
357
for page in PDFPage .create_pages (document .doc ):
@@ -366,15 +381,20 @@ def doOverlap(l1, r1, l2, r2):
366
381
return True
367
382
368
383
369
# extracts LTTextBoxHorizontal and LTImage from layouts
def parse_layouts(document: Document, preserve_pdfminer_structure=True, config_options=""):
    """Feed every page layout of ``document`` to ``parse_elements``.

    :param document: document whose ``page_layouts`` are parsed; when that
        collection is empty, ``extract_page_layouts`` is called first to
        fill it.
    :param preserve_pdfminer_structure: when false, ``document.doc`` is reset
        to ``None`` once all pages have been handed to ``parse_elements``.
    :param config_options: forwarded unchanged to ``extract_page_layouts``
        when layout analysis has to be performed here.
    """
    # perform layout analysis if document.page_layouts is empty
    if not document.page_layouts:
        extract_page_layouts(document, config_options)

    # pages are numbered starting from 1
    for page_number, page_layout in enumerate(document.page_layouts, start=1):
        parse_elements(document, page_layout, page_number)

    # keep data structure small by dropping the pdfminer document object, which is not needed anymore
    # (rebinding the loop variable, as done previously, released nothing and was removed)
    if not preserve_pdfminer_structure:
        document.doc = None
378
398
379
399
380
400
# Recursively iterate over all the lt elements from pdfminer.six
@@ -409,8 +429,18 @@ def parse_elements(document, page_layout, page):
409
429
parse_elements (document , el , page )
410
430
411
431
412
def _coerce_option_value(raw):
    """Convert the text of one option value to bool, int or float when it reads as one."""
    lowered = raw.lower()
    if lowered in ("true", "false"):
        return lowered == "true"
    # int is tried first so that "40" stays an integer rather than becoming 40.0
    for convert in (int, float):
        try:
            return convert(raw)
        except ValueError:
            continue
    return raw


def _parse_config_options(config_options):
    """Parse a comma-separated ``key=value`` string into a keyword-argument dictionary."""
    options = {}
    for item in config_options.split(','):
        item = item.strip()
        if not item:
            # tolerate trailing or doubled commas
            continue
        # split on the first '=' only, so values may themselves contain '='
        key, separator, value = item.partition('=')
        key = key.strip()
        if not separator or not key:
            raise ValueError("invalid config option %r, expected key=value" % item)
        options[key] = _coerce_option_value(value.strip())
    return options


def extract_tables(document: Document, output_path: str, config_options=""):
    """Run camelot over all pages of ``document`` and store the result in ``document.tables``.

    :param document: document providing ``path`` and ``num_pages``; its
        ``tables`` attribute is overwritten with the value returned by
        ``camelot.read_pdf``.
    :param output_path: not used by this function; kept for interface
        compatibility.
    :param config_options: comma-separated ``key=value`` pairs passed to
        ``camelot.read_pdf`` as keyword arguments, e.g.
        ``"flavor=stream,row_tol=10"``. Values that read as booleans,
        integers or floats are converted; everything else stays a string.
        Empty or ``None`` selects ``"flavor=lattice"``.
    :raises ValueError: if an item in ``config_options`` is not ``key=value``.
    """
    # use config_options if specified
    if not config_options:
        config_options = "flavor=lattice"

    # all pages by default; an explicit "pages=..." option overrides it instead of
    # colliding with a hard-coded keyword argument
    read_pdf_kwargs = {'pages': '1-' + str(document.num_pages)}
    read_pdf_kwargs.update(_parse_config_options(config_options))

    tables = camelot.read_pdf(document.path, **read_pdf_kwargs)
    document.tables = tables
415
445
416
446
@@ -421,4 +451,3 @@ def extract_tables(document: Document, output_path: str):
421
451
argumentParser .add_argument ('--path' , help = 'path to pdf file' , required = True )
422
452
args = vars (argumentParser .parse_args ())
423
453
doc = Document (args ["path" ])
424
- print (extract_text (doc ))
0 commit comments