Skip to content

Commit 3d9eec4

Browse files
committed
Made functions handle extra options and work as a library.
1 parent 00b7dbc commit 3d9eec4

File tree

3 files changed

+41
-12
lines changed

3 files changed

+41
-12
lines changed

PDFScraper/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "1.0.12"
1+
__version__ = "1.0.13"
22

33
import argparse
44
import logging

PDFScraper/pdfParser.py

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ def preprocess_image(image):
226226

227227

228228
# Preprocess the images for OCR then extract them
229-
def convert_to_pdf(document: Document, tessdata_location: str):
229+
def convert_to_pdf(document: Document, tessdata_location: str, config_options=""):
230230
pdf_pages = []
231231
for i in range(document.num_pages):
232232
img = cv2.imread(tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + "_" + str(i) + ".jpg")
@@ -243,7 +243,9 @@ def convert_to_pdf(document: Document, tessdata_location: str):
243243
if i == 0:
244244
language = get_language(img, tessdata_location)
245245
try:
246-
config_options = '--psm 1 --tessdata-dir ' + tessdata_location
246+
# uses provided config if available
247+
if config_options == "":
248+
config_options = '--psm 1 --tessdata-dir ' + tessdata_location
247249
text = pytesseract.image_to_pdf_or_hocr(img, extension='pdf', lang=language, config=config_options)
248250
with open(tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + "_" + str(i) + ".pdf",
249251
'w+b') as f:
@@ -334,9 +336,22 @@ def extract_info(document: Document):
334336

335337

336338
# layout analysis for every page
337-
def extract_page_layouts(document: Document):
339+
def extract_page_layouts(document: Document, config_options=""):
340+
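# calls get_pdf_object if document.doc, which holds the PDFObject, is empty (NOTE(review): the check below tests `is not None`, the opposite of this comment - confirm intent)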
341+
if document.doc is not None:
342+
get_pdf_object(document)
343+
# use config_options if specified
344+
if config_options == "":
345+
config_options = "line_margin=0.8"
346+
# converts config_options (a comma-separated string of key=value pairs) to a dictionary, so it can be passed as **kwargs to LAParams
347+
args = dict(e.split('=') for e in config_options.split(','))
348+
for key in args:
349+
try:
350+
args[key] = float(args[key])
351+
except ValueError:
352+
pass
338353
resource_manager = PDFResourceManager()
339-
laparams = LAParams(line_margin=0.8)
354+
laparams = LAParams(**args)
340355
page_aggregator = PDFPageAggregator(resource_manager, laparams=laparams)
341356
interpreter = PDFPageInterpreter(resource_manager, page_aggregator)
342357
for page in PDFPage.create_pages(document.doc):
@@ -366,15 +381,20 @@ def doOverlap(l1, r1, l2, r2):
366381
return True
367382

368383

369-
# parse pdfminer.six layouts
370-
def parse_layouts(document: Document, preserve_pdfminer_structure=True):
384+
# extracts LTTextBoxHorizontal and LTImage from layouts
385+
def parse_layouts(document: Document, preserve_pdfminer_structure=True, config_options=""):
371386
count = 1
387+
# perform layout analysis if document.page_layouts is empty
388+
if len(document.page_layouts) == 0:
389+
extract_page_layouts(document, config_options)
390+
372391
for page_layout in document.page_layouts:
373392
parse_elements(document, page_layout, count)
374393
count = count + 1
375-
# keep data structure small
394+
# keep data structure small by deleting pdfminer objects, which are not needed anymore
376395
if not preserve_pdfminer_structure:
377396
page_layout = []
397+
document.doc = None
378398

379399

380400
# Recursively iterate over all the lt elements from pdfminer.six
@@ -409,8 +429,18 @@ def parse_elements(document, page_layout, page):
409429
parse_elements(document, el, page)
410430

411431

412-
def extract_tables(document: Document, output_path: str):
413-
tables = camelot.read_pdf(document.path, pages='1-' + str(document.num_pages), flavor='lattice')
432+
def extract_tables(document: Document, output_path: str, config_options=""):
433+
# use config_options if specified
434+
if config_options == "":
435+
config_options = "flavor=lattice"
436+
# converts config_options (a comma-separated string of key=value pairs) to a dictionary, so it can be passed as **kwargs to camelot
437+
args = dict(e.split('=') for e in config_options.split(','))
438+
for key in args:
439+
try:
440+
args[key] = int(args[key])
441+
except ValueError:
442+
pass
443+
tables = camelot.read_pdf(document.path, pages='1-' + str(document.num_pages), **args)
414444
document.tables = tables
415445

416446

@@ -421,4 +451,3 @@ def extract_tables(document: Document, output_path: str):
421451
argumentParser.add_argument('--path', help='path to pdf file', required=True)
422452
args = vars(argumentParser.parse_args())
423453
doc = Document(args["path"])
424-
print(extract_text(doc))

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
"yattag==1.14.0",
5050
],
5151
name="PDFScraper",
52-
version="1.0.12",
52+
version="1.0.13",
5353
author="Erik Kastelec",
5454
author_email="erikkastelec@gmail.com",
5555
description="PDF text and table search",

0 commit comments

Comments
 (0)