Skip to content

Commit d5d8ec6

Browse files
committed
Recursive iteration over pdfminer elements
1 parent 5af8a83 commit d5d8ec6

File tree

4 files changed

+51
-34
lines changed

4 files changed

+51
-34
lines changed

PDFScraper/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "1.0.8"
1+
__version__ = "1.0.9"
22

33
import argparse
44
import logging

PDFScraper/dataStructure.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ def __init__(self, path: str, parent: Documents, isPDF: bool):
2222
self.text = []
2323
self.images = []
2424
self.tables = []
25-
self.tables_coordinates = []
2625
self.ltfigures = []
2726
self.paragraphs = []
2827
self.table_of_contents = []

PDFScraper/pdfParser.py

Lines changed: 49 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -368,43 +368,61 @@ def extract_table_of_contents(document: Document):
368368
logger.warning("Could not get table of contents for document at path " + document.path)
369369

370370

371+
def doOverlap(l1, r1, l2, r2):
    """Return True if rectangles (l1, r1) and (l2, r2) overlap.

    Every corner is an indexable ``(x, y)`` pair.  The comparisons treat
    ``l*`` as the corner with the smaller x and larger y and ``r*`` as the
    corner with the larger x and smaller y (``parse_elements`` passes
    ``(x0, y1)`` and ``(x1, y0)``).  Rectangles that only share an edge or
    a corner are reported as not overlapping.
    """
    disjoint = (
        # one rectangle lies entirely to the left of the other
        l1[0] >= r2[0] or l2[0] >= r1[0]
        # one rectangle lies entirely above the other
        or l1[1] <= r2[1] or l2[1] <= r1[1]
    )
    return not disjoint
383+
384+
371385
def parse_layouts(document: Document):
    """Parse the elements of every page layout stored on ``document``.

    Layouts are taken from ``document.page_layouts`` in order and numbered
    starting at 1; each layout is handed to ``parse_elements`` together
    with its page number.
    """
    for page_number, layout in enumerate(document.page_layouts, start=1):
        parse_elements(document, layout, page_number)
390+
391+
392+
def parse_elements(document, page_layout, page):
    """Recursively collect paragraphs and images from layout elements.

    Elements that overlap a table detected on the same page are skipped.
    Horizontal text boxes are cleaned up and appended to
    ``document.paragraphs``; images are appended to ``document.images``;
    any other element exposing ``_objs`` is descended into.

    Args:
        document: Object providing ``tables`` (read) and the ``paragraphs``
            and ``images`` lists (appended to).
        page_layout: Iterable of layout elements to examine.
        page: Page number, compared against ``table.page``.
    """
    for element in page_layout:
        # TODO: improve efficiency
        # extract text and images only if there is no table in that location
        if hasattr(element, "x0") and any(
            table.page == page
            and doOverlap(
                (element.x0, element.y1), (element.x1, element.y0),
                (table._bbox[0], table._bbox[3]),
                (table._bbox[2], table._bbox[1]),
            )
            for table in document.tables
        ):
            # element is inside an already detected table
            continue

        if isinstance(element, LTTextBoxHorizontal):
            text = element.get_text()
            # fix Slovene chars and other anomalies
            text = re.sub(r'ˇs', "š", text)
            text = re.sub(r"ˇc", "č", text)
            text = re.sub(r"ˇz", "ž", text)
            text = re.sub(r"-\s", "", text)
            document.paragraphs.append(text)
        elif isinstance(element, LTImage):
            # Save image objects
            document.images.append(element)
        elif hasattr(element, '_objs'):
            # Recurse over the direct children themselves, not only over the
            # contents of iterable children: otherwise an image or text box
            # sitting directly inside this container would never be examined.
            parse_elements(document, element._objs, page)
398422

399423

400424
def extract_tables(document: Document, output_path: str):
    """Detect tables in the PDF at ``document.path`` and store them.

    Camelot is run with the ``lattice`` flavor over pages 1 through
    ``document.num_pages`` and the result is assigned to
    ``document.tables``.  ``output_path`` is accepted but not used by
    these statements.
    """
    page_range = '1-' + str(document.num_pages)
    tables = camelot.read_pdf(document.path, pages=page_range, flavor='lattice')
    document.tables = tables
409427

410428

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
"yattag==1.14.0",
5050
],
5151
name="PDFScraper",
52-
version="1.0.8",
52+
version="1.0.9",
5353
author="Erik Kastelec",
5454
author_email="erikkastelec@gmail.com",
5555
description="PDF text and table search",

0 commit comments

Comments
 (0)