@@ -368,43 +368,61 @@ def extract_table_of_contents(document: Document):
368
368
logger .warning ("Could not get table of contents for document at path " + document .path )
369
369
370
370
371
def doOverlap(l1, r1, l2, r2):
    """Return True if two axis-aligned rectangles overlap.

    Each rectangle is described by its top-left corner (``l1`` / ``l2``) and
    its bottom-right corner (``r1`` / ``r2``), both given as ``(x, y)``
    sequences. The comparisons assume a y-axis that grows upwards, i.e. the
    top-left corner carries the larger y value.

    Rectangles that only share an edge or a corner are not reported as
    overlapping, because the separation checks use ``>=`` / ``<=``.
    """
    # One rectangle lies entirely to the left of the other.
    if l1[0] >= r2[0] or l2[0] >= r1[0]:
        return False

    # One rectangle lies entirely above the other.
    return not (l1[1] <= r2[1] or l2[1] <= r1[1])
383
+
384
+
371
385
def parse_layouts(document: Document):
    """Run ``parse_elements`` over every page layout of ``document``.

    Pages are numbered from 1 in the iteration order of
    ``document.page_layouts``; that number is handed to ``parse_elements``
    as its ``page`` argument.
    """
    for page_number, page_layout in enumerate(document.page_layouts, start=1):
        parse_elements(document, page_layout, page_number)
390
+
391
+
392
def _fix_extracted_text(text):
    """Repair known extraction artefacts in the text of a text box."""
    # Fix Slovene characters whose caron was extracted as a separate glyph.
    text = text.replace("ˇs", "š")
    text = text.replace("ˇc", "č")
    text = text.replace("ˇz", "ž")
    # Remove every hyphen that is directly followed by whitespace
    # (presumably to re-join words hyphenated at a line break -- note this
    # also removes free-standing dashes such as "a - b").
    return re.sub(r"-\s", "", text)


def _overlaps_detected_table(document, element, page):
    """Return True if ``element`` overlaps a table of ``document`` on ``page``.

    Elements without an ``x0`` attribute have no bounding box to compare and
    are never treated as overlapping.
    """
    if not hasattr(element, "x0"):
        return False
    # Corners are passed as (left, top) / (right, bottom), matching the
    # argument order doOverlap expects.
    top_left = (element.x0, element.y1)
    bottom_right = (element.x1, element.y0)
    return any(
        table.page == page
        and doOverlap(
            top_left,
            bottom_right,
            (table._bbox[0], table._bbox[3]),
            (table._bbox[2], table._bbox[1]),
        )
        for table in document.tables
    )


def parse_elements(document, page_layout, page):
    """Recursively collect paragraphs and images from a layout container.

    Every element of ``page_layout`` that does not overlap a table detected
    on the same page is handled as follows:

    * ``LTTextBoxHorizontal``: its cleaned-up text is appended to
      ``document.paragraphs``;
    * ``LTImage``: the object is appended to ``document.images``;
    * any other iterable element: its children are processed recursively.

    Args:
        document: object providing ``tables``, ``paragraphs`` and ``images``.
        page_layout: iterable of layout elements (a page or a nested
            container).
        page: page number compared against ``table.page``.
    """
    for element in page_layout:
        # TODO: improve efficiency
        # Extract text and images only if there is no table in that location.
        if _overlaps_detected_table(document, element, page):
            continue

        if isinstance(element, LTTextBoxHorizontal):
            document.paragraphs.append(_fix_extracted_text(element.get_text()))
        elif isinstance(element, LTImage):
            # Save image objects.
            document.images.append(element)
        elif hasattr(element, "__iter__"):
            # Descend into the container itself so that its direct children
            # (e.g. an image placed straight inside a figure) are classified
            # as well, not only its grandchildren.
            parse_elements(document, element, page)
398
422
399
423
400
424
def extract_tables (document : Document , output_path : str ):
401
425
tables = camelot .read_pdf (document .path , pages = '1-' + str (document .num_pages ), flavor = 'lattice' )
402
- # find coordinates of table regions to exclude them from text extraction
403
- for table in tables :
404
- first_cell_coord = table .cells [0 ][0 ].lt
405
- last_cel_coord = table .cells [- 1 ][- 1 ].rb
406
- document .tables_coordinates .append (first_cell_coord + last_cel_coord )
407
-
408
426
document .tables = tables
409
427
410
428
0 commit comments