Skip to content

Commit db2b55c

Browse files
committed
Fix table parsing code
1 parent 10b40df commit db2b55c

File tree

1 file changed

+13
-7
lines changed

1 file changed

+13
-7
lines changed

scripts/data_utils.py

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -610,14 +610,20 @@ def extract_pdf_content(file_path, form_recognizer_client, use_layout=False):
610610
roles_end[para_end] = paragraph.role
611611

612612
for page_num, page in enumerate(form_recognizer_results.pages):
613+
page_offset = page.spans[0].offset
614+
page_length = page.spans[0].length
615+
613616
if use_layout:
614-
tables_on_page = [table for table in form_recognizer_results.tables if table.bounding_regions[0].page_number == page_num + 1]
617+
tables_on_page = []
618+
for table in form_recognizer_results.tables:
619+
table_offset = table.spans[0].offset
620+
table_length = table.spans[0].length
621+
if page_offset <= table_offset and table_offset + table_length < page_offset + page_length:
622+
tables_on_page.append(table)
615623
else:
616624
tables_on_page = []
617625

618626
# (if using layout) mark all positions of the table spans in the page
619-
page_offset = page.spans[0].offset
620-
page_length = page.spans[0].length
621627
table_chars = [-1]*page_length
622628
for table_id, table in enumerate(tables_on_page):
623629
for span in table.spans:
@@ -654,14 +660,14 @@ def extract_pdf_content(file_path, form_recognizer_client, use_layout=False):
654660

655661
full_text = "".join([page_text for _, _, page_text in page_map])
656662

657-
# If the images are preserved, add image tags to the full text
658-
document = fitz.open(file_path)
659-
663+
# Extract any images
660664
image_mapping = {}
661665

662666
if "figures" in form_recognizer_results.keys() and file_path.endswith(".pdf"):
667+
document = fitz.open(file_path)
668+
663669
for figure in form_recognizer_results["figures"]:
664-
bounding_box = figure["boundingRegions"][0]
670+
bounding_box = figure.bounding_regions[0]
665671

666672
page_number = bounding_box['pageNumber'] - 1 # Page numbers in PyMuPDF start from 0
667673
x0, y0, x1, y1 = polygon_to_bbox(bounding_box['polygon'])

0 commit comments

Comments
 (0)