@@ -610,14 +610,20 @@ def extract_pdf_content(file_path, form_recognizer_client, use_layout=False):
610
610
roles_end [para_end ] = paragraph .role
611
611
612
612
for page_num , page in enumerate (form_recognizer_results .pages ):
613
+ page_offset = page .spans [0 ].offset
614
+ page_length = page .spans [0 ].length
615
+
613
616
if use_layout :
614
- tables_on_page = [table for table in form_recognizer_results .tables if table .bounding_regions [0 ].page_number == page_num + 1 ]
617
+ tables_on_page = []
618
+ for table in form_recognizer_results .tables :
619
+ table_offset = table .spans [0 ].offset
620
+ table_length = table .spans [0 ].length
621
+ if page_offset <= table_offset and table_offset + table_length < page_offset + page_length :
622
+ tables_on_page .append (table )
615
623
else :
616
624
tables_on_page = []
617
625
618
626
# (if using layout) mark all positions of the table spans in the page
619
- page_offset = page .spans [0 ].offset
620
- page_length = page .spans [0 ].length
621
627
table_chars = [- 1 ]* page_length
622
628
for table_id , table in enumerate (tables_on_page ):
623
629
for span in table .spans :
@@ -654,14 +660,14 @@ def extract_pdf_content(file_path, form_recognizer_client, use_layout=False):
654
660
655
661
full_text = "" .join ([page_text for _ , _ , page_text in page_map ])
656
662
657
- # If the images are preserved, add image tags to the full text
658
- document = fitz .open (file_path )
659
-
663
+ # Extract any images
660
664
image_mapping = {}
661
665
662
666
if "figures" in form_recognizer_results .keys () and file_path .endswith (".pdf" ):
667
+ document = fitz .open (file_path )
668
+
663
669
for figure in form_recognizer_results ["figures" ]:
664
- bounding_box = figure [ "boundingRegions" ] [0 ]
670
+ bounding_box = figure . bounding_regions [0 ]
665
671
666
672
page_number = bounding_box ['pageNumber' ] - 1 # Page numbers in PyMuPDF start from 0
667
673
x0 , y0 , x1 , y1 = polygon_to_bbox (bounding_box ['polygon' ])
0 commit comments