Commit dd6f29c

Refactoring

1 parent 1c19bff, commit dd6f29c

File tree: 6 files changed (+108, -115 lines)


PDFScraper/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-__version__ = "1.1.0"
+__version__ = "1.1.1"
 
 import logging
 
PDFScraper/cli.py
Lines changed: 23 additions & 25 deletions

@@ -8,7 +8,6 @@
 
 from PDFScraper.core import get_filename, pdf_to_image, convert_to_pdf, get_pdf_object, extract_page_layouts, \
     extract_tables, parse_layouts, extract_table_of_contents, extract_info, find_pdfs_in_path
-from PDFScraper.dataStructure import Documents
 from PDFScraper.outputGenerator import generate_html
 
 # Define logger level helper
@@ -88,58 +87,57 @@ def signal_handler(sign, frame):
 
 
 def cli():
+    path = os.path.abspath(args["path"])
+    logger.info('Finding PDFs in ' + path)
     # Read PDFs from path
-    docs = Documents(path=os.path.abspath(args["path"]))
-    logger.info('Finding PDFs in ' + docs.path)
     try:
-        find_pdfs_in_path(docs, docs.path)
+        docs = find_pdfs_in_path(path)
     except Exception as e:
         logger.error(e)
         sys.exit(1)
-    logger.info('Found ' + str(docs.num_docs) + ' PDFs')
+    logger.info('Found ' + str(len(docs)) + ' PDFs')
 
-    logger.info('Parsing ' + str(docs.num_docs) + ' documents')
+    logger.info('Parsing ' + str(len(docs)) + ' documents')
     # Extract information about PDFs
     progress_counter = 1
-    for doc in docs.docs:
+    for doc in docs:
         extract_info(doc)
-        if doc.isPDF:
-            get_pdf_object(doc)
+        get_filename(doc)
+        if doc.is_pdf:
+            pdf_object = get_pdf_object(doc)
             if doc.extractable:
 
                 logger.debug('Document information:' + '\n' + doc.document_info_to_string())
-                extract_table_of_contents(doc)
+                extract_table_of_contents(doc, pdf_object)
                 logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
-                extract_page_layouts(doc)
+                page_layouts = extract_page_layouts(pdf_object)
                 # table extraction is possible only for text based PDFs
                 if tables_extract:
-                    extract_tables(doc, output_path)
-                parse_layouts(doc)
+                    extract_tables(doc)
+                parse_layouts(doc, page_layouts)
                 if len(doc.paragraphs) == 0:
                     logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
-                    get_filename(doc)
                     pdf_to_image(doc)
                     convert_to_pdf(doc, tessdata_location)
-                    get_pdf_object(doc)
-                    extract_page_layouts(doc)
+                    pdf_object = get_pdf_object(doc)
+                    page_layouts = extract_page_layouts(pdf_object)
                     if tables_extract:
-                        extract_tables(doc, output_path)
-                    parse_layouts(doc)
+                        extract_tables(doc)
+                    parse_layouts(doc, page_layouts)
 
             else:
-                logger.warning("Skipping parsing. Document is not extractable.")
-                logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(docs.num_docs) + ' documents')
+                logger.warning("Skipping parsing. Document is not extractable.")
+                logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
            progress_counter += 1
         else:
            logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
-            get_filename(doc)
            pdf_to_image(doc)
            convert_to_pdf(doc, tessdata_location)
-            get_pdf_object(doc)
-            extract_page_layouts(doc)
+            pdf_object = get_pdf_object(doc)
+            page_layouts = extract_page_layouts(pdf_object)
            if tables_extract:
-                extract_tables(doc, output_path)
-            parse_layouts(doc)
        logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))
    logger.info('Done parsing PDFs')
    logger.info('Stopping')
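
Taken together, the cli() changes replace the mutated Documents container with explicit return values threaded through the pipeline. A minimal sketch of the new calling pattern for one extractable, text-based PDF (the input path is hypothetical; error handling and the OCR fallback are omitted):

    import os

    from PDFScraper.core import (extract_info, extract_page_layouts, find_pdfs_in_path,
                                 get_filename, get_pdf_object, parse_layouts)

    docs = find_pdfs_in_path(os.path.abspath("./samples"))  # now returns a list of Document
    for doc in docs:
        extract_info(doc)                                    # fills doc.num_pages and doc.info
        get_filename(doc)
        if doc.is_pdf:
            pdf_object = get_pdf_object(doc)                 # returns the pdfminer PDFDocument
            page_layouts = extract_page_layouts(pdf_object)  # returns the layout list
            parse_layouts(doc, page_layouts)                 # fills doc.paragraphs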

PDFScraper/core.py
Lines changed: 61 additions & 60 deletions

@@ -21,12 +21,13 @@
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfparser import PDFParser
+from pdfminer.pdftypes import PDFObject
 from pytesseract import TesseractNotFoundError, TesseractError
 from skimage import io
 from skimage.feature import canny
 from skimage.transform import hough_line, hough_line_peaks, rotate
 
-from PDFScraper.dataStructure import Document, Documents
+from PDFScraper.dataStructure import Document
 
 # Set up logger
 log_level = 20
@@ -36,27 +37,28 @@
 logger.setLevel(log_level)
 
 
-def find_pdfs_in_path(docs: Documents, path: str):
+def find_pdfs_in_path(path: str):
+    pdfs = []
     if os.path.exists(path):
         if os.path.isdir(path):  # find PDFs in directory and add them to the list
             count = 0
             for f in os.listdir(path):
                 count += 1
-                find_pdfs_in_path(docs, path + '/' + f)
+                find_pdfs_in_path(path + '/' + f)
+
         elif os.path.isfile(path) and (path.endswith(".pdf")):
+            pdfs.append(Document(path, True))
 
-            docs.num_docs += 1
-            docs.docs.append(Document(path, docs, True))
         elif os.path.isfile(path) and (path.endswith(".bmp") or path.endswith(".jpg") or path.endswith(".pbm")
                                        or path.endswith(".pgm") or path.endswith(".ppm") or path.endswith(".jpeg")
                                        or path.endswith(".jpe") or path.endswith(".jp2") or path.endswith(".tiff")
                                        or path.endswith(".tif") or path.endswith(".png")):
-            docs.num_docs += 1
 
-            docs.docs.append(Document(path, docs, False))
+            pdfs.append(Document(path, False))
 
     else:
         raise Exception("Provided path does not exist")
+    return pdfs
 
 
 # Get filename from path
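
With the container gone, find_pdfs_in_path now builds and returns its own list. A rough usage sketch (the path is made up); note that the recursive call in the directory branch discards its return value as shown, so results from nested directories would only accumulate if that call were collected as well, e.g. with pdfs.extend(find_pdfs_in_path(...)):

    from PDFScraper.core import find_pdfs_in_path

    # a directory, a single .pdf, or one of the supported image types all work as input
    docs = find_pdfs_in_path("/tmp/scans")
    for doc in docs:
        print(doc.path, doc.is_pdf)  # Document(path, True) for PDFs, Document(path, False) for images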
@@ -72,7 +74,7 @@ def pdf_to_image(document: Document):
     except FileExistsError:
         pass
 
-    if document.isPDF:
+    if document.is_pdf:
         pages = pdf2image.convert_from_path(pdf_path=document.path, dpi=300)
         # TODO: implement saving to temp dir with mkstemp for better security
         for i in range(len(pages)):
@@ -116,10 +118,10 @@ def image_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
 
 
 # determine skew angle of image
-def determine_skew(image):
-    edges = canny(image, sigma=3.0)
+def determine_skew(image, sigma=3.0, num_peaks=20):
+    edges = canny(image, sigma=sigma)
     h, a, d = hough_line(edges)
-    _, ap, _ = hough_line_peaks(h, a, d, num_peaks=20)
+    _, ap, _ = hough_line_peaks(h, a, d, num_peaks=num_peaks)
 
     if len(ap) == 0:
         return 0
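
Promoting sigma and num_peaks to parameters makes the Hough-transform skew estimate tunable per scan: sigma controls how much the Canny edge map is smoothed, and num_peaks caps how many detected lines vote on the angle. A sketch of calling it directly on a grayscale page image (the file name is hypothetical):

    from skimage import io
    from skimage.color import rgb2gray

    from PDFScraper.core import determine_skew

    image = rgb2gray(io.imread("page_0.jpg"))  # canny() expects a 2-D grayscale image
    angle = determine_skew(image, sigma=2.0, num_peaks=50)
    print("estimated skew:", angle, "degrees")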
@@ -207,7 +209,6 @@ def get_max_freq_elem(arr):
 # Apply deskewing to the image
 def deskew(image):
     angle = determine_skew(image)
-
     if 0 <= angle <= 90:
         rot_angle = angle - 90
     if -45 <= angle < 0:
@@ -224,8 +225,8 @@ def preprocess_image(image):
     # RGB to grayscale
     image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
     # Thresholding
-    image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
-
+    # image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
+    image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
     # save and reread to convert to scikit-image image type
     temp_image_path = tempfile.gettempdir() + "/PDFScraper" + "/" + "deskew.jpg"
     cv2.imwrite(temp_image_path, image)
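
The preprocessing switch trades Otsu's single global threshold for a Gaussian-weighted adaptive one, where each pixel is thresholded against its own 11x11 neighbourhood minus the constant 2; that tends to hold up better on unevenly lit scans. Both calls side by side for comparison (the input file is hypothetical; both are standard OpenCV APIs):

    import cv2

    gray = cv2.imread("scan.jpg", cv2.IMREAD_GRAYSCALE)

    # old: one global threshold for the whole page, chosen by Otsu's method
    _, otsu = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # new: a local threshold per pixel from a Gaussian-weighted 11x11 window, offset by 2
    adaptive = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 11, 2)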
@@ -250,13 +251,16 @@ def convert_to_pdf(document: Document, tessdata_location: str, config_options=""
         # Resize imput image if not PDF
         # if not document.isPDF:
         #     img = image_resize(img, width=1024)
-        img = preprocess_image(img)
 
-        # Extract testing using OCR
+        img = preprocess_image(img)
 
-        # Extract language only from the first page
+        # Extract language from the first page only
         if i == 0:
             language = get_language(img, tessdata_location)
+            # if not english or slovene set to english
+            if language != "eng" or language != "slv":
+                language = "eng"
+
         try:
             # uses provided config if available
             if config_options == "":
@@ -311,44 +315,46 @@ def get_language(img, tessdata_location: str):
 
 # parses Document to PDFDocument
 def get_pdf_object(document: Document):
+    if document.filename is None:
+        get_filename(document)
     # use OCR processed file if available
     file = open(document.ocr_path, 'rb')
     parser = PDFParser(file)
-    document.doc = PDFDocument(parser)
-    parser.set_document(document.doc)
+    pdf_object = PDFDocument(parser)
+    parser.set_document(pdf_object)
 
-    if document.doc.is_extractable:
+    if pdf_object.is_extractable:
         document.extractable = True
+    return pdf_object
 
 
 def extract_info(document: Document):
-    if document.isPDF:
+    if document.filename is None:
+        get_filename(document)
+    if document.is_pdf:
         with open(document.path, 'rb') as f:
             pdf = PdfFileReader(f, strict=False)
             # TODO: Handle encrypted files
 
             document.num_pages = pdf.getNumPages()
-            info = pdf.getDocumentInfo()
-            if info is not None:
-                document.author = "unknown" if not info.author else info.author
-                document.creator = "unknown" if not info.creator else info.creator
-                document.producer = "unknown" if not info.producer else info.producer
-                document.subject = "unknown" if not info.subject else info.subject
-                document.title = "unknown" if not info.title else info.title
+            informations = pdf.getDocumentInfo()
+            if informations is not None:
+                document.info.author = "unknown" if not informations.author else informations.author
+                document.info.creator = "unknown" if not informations.creator else informations.creator
+                document.info.producer = "unknown" if not informations.producer else informations.producer
+                document.info.subject = "unknown" if not informations.subject else informations.subject
+                document.info.title = "unknown" if not informations.title else informations.title
     else:
         document.num_pages = 1
-        document.author = "unknown"
-        document.creator = "unknown"
-        document.producer = "unknown"
-        document.subject = "unknown"
-        document.title = "unknown"
+        document.info.author = "unknown"
+        document.info.creator = "unknown"
+        document.info.producer = "unknown"
+        document.info.subject = "unknown"
+        document.info.title = "unknown"
 
 
 # layout analysis for every page
-def extract_page_layouts(document: Document, config_options="line_margin=0.8"):
-    # calls get_pdf_object if document.doc, which contains PDFObject, is empty
-    if document.doc is None:
-        get_pdf_object(document)
+def extract_page_layouts(pdf_object: PDFObject, config_options="line_margin=0.8"):
     # converts config_options, which is a string to dictionary, so it can be passed as **kwargs to camelot
     args = dict(e.split('=') for e in config_options.split(','))
     for key in args:
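
The PDFScraper/dataStructure.py diff is one of the three changed files not shown in this excerpt, but the call sites above pin down its new shape: Documents is gone, Document(path, is_pdf) loses its parent argument, isPDF becomes is_pdf, and the metadata fields move onto a nested info object. A speculative sketch, where every name not visible in the diff is an assumption:

    class DocumentInfo:
        # hypothetical container for the document.info.* fields written by extract_info
        def __init__(self):
            self.author = self.creator = self.producer = self.subject = self.title = "unknown"
            self.table_of_contents = []


    class Document:
        def __init__(self, path, is_pdf):
            self.path = path
            self.is_pdf = is_pdf      # renamed from isPDF in this commit
            self.filename = None      # filled lazily by get_filename
            self.ocr_path = path      # replaced once OCR produces a searchable PDF
            self.extractable = False
            self.num_pages = 0
            self.info = DocumentInfo()
            self.paragraphs = []
            self.tables = []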
@@ -360,15 +366,17 @@ def extract_page_layouts(document: Document, config_options="line_margin=0.8"):
     laparams = LAParams(**args)
     page_aggregator = PDFPageAggregator(resource_manager, laparams=laparams)
     interpreter = PDFPageInterpreter(resource_manager, page_aggregator)
-    for page in PDFPage.create_pages(document.doc):
+    page_layouts = []
+    for page in PDFPage.create_pages(pdf_object):
         interpreter.process_page(page)
-        document.page_layouts.append(page_aggregator.get_result())
+        page_layouts.append(page_aggregator.get_result())
+    return page_layouts
 
 
-def extract_table_of_contents(document: Document):
+def extract_table_of_contents(document: Document, pdf_object):
     try:
-        for (level, title, dest, a, se) in document.doc.get_outlines():
-            document.table_of_contents.append((level, title))
+        for (level, title, dest, a, se) in pdf_object.get_outlines():
+            document.info.table_of_contents.append((level, title))
     except PDFNoOutlines:
         logger.warning("Could not get table of contents for document at path " + document.path)
 
@@ -388,19 +396,11 @@ def doOverlap(l1, r1, l2, r2):
 
 
 # extracts LTTextBoxHorizontal and LTImage from layouts
-def parse_layouts(document: Document, preserve_pdfminer_structure=True, config_options=""):
+def parse_layouts(document: Document, page_layouts):
     count = 1
-    # perform layout analysis if document.page_layouts is empty
-    if len(document.page_layouts) == 0:
-        extract_page_layouts(document, config_options)
-
-    for page_layout in document.page_layouts:
+    for page_layout in page_layouts:
         parse_elements(document, page_layout, count)
         count = count + 1
-    # keep data structure small by deleting pdfminer objects, which are not needed anymore
-    if not preserve_pdfminer_structure:
-        page_layout = []
-        document.doc = None
 
 
 # Recursively iterate over all the lt elements from pdfminer.six
@@ -435,26 +435,27 @@ def parse_elements(document, page_layout, page):
             parse_elements(document, el, page)
 
 
-def extract_tables(document: Document, output_path: str, config_options="flavor=lattice"):
+def extract_tables(document: Document, config_options="pages=all,flavor=lattice,parallel=True"):
     # converts config_options, which is a string to dictionary, so it can be passed as **kwargs to camelot
     args = dict(e.split('=') for e in config_options.split(','))
     for key in args:
         try:
             args[key] = int(args[key])
         except ValueError:
             pass
-    tables = camelot.read_pdf(document.path, pages='1-' + str(document.num_pages), **args)
+    # use new OCR path if available
+    tables = camelot.read_pdf(document.ocr_path, **args)
     # remove tables with bad accuracy
     tables = [table for table in tables if table.accuracy > 90]
     document.tables = tables
 
 
-def find_words_paragraphs(document: Document, search_mode, search_word, match_score):
+def find_words_paragraphs(paragraphs, search_mode, search_words, match_score):
     result = []
-    for paragraph in document.paragraphs:
+    for paragraph in paragraphs:
         # split paragraph into sentences.
         split = paragraph.split(".")
-        for word in search_word.split(","):
+        for word in search_words:
             found = False
             for string in split:
                 if (len(word) <= len(string)) and fuzz.partial_ratio(word, string) > match_score:
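
Worth noting how the new default config string reaches camelot: the parser only converts values that survive int(), so pages, flavor, and parallel all arrive as strings ('all', 'lattice', 'True'). camelot accepts pages='all'; 'True' is a truthy string rather than the boolean, which is worth knowing when debugging. A quick trace of that parsing:

    config_options = "pages=all,flavor=lattice,parallel=True"
    args = dict(e.split('=') for e in config_options.split(','))
    for key in args:
        try:
            args[key] = int(args[key])  # only purely numeric values become ints
        except ValueError:
            pass
    print(args)  # {'pages': 'all', 'flavor': 'lattice', 'parallel': 'True'}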
@@ -471,16 +472,16 @@ def find_words_paragraphs(document: Document, search_mode, search_word, match_sc
     return result
 
 
-def find_words_tables(document: Document, search_mode, search_word, match_score):
+def find_words_tables(tables, search_mode, search_words, match_score):
     result = []
-    for table in document.tables:
+    for table in tables:
         table.df[0].str.strip('.!? \n\t')
         # perform fuzzy search over all columns
         found = False
         for i in range(0, table.shape[1]):
             if found:
                 break
-            for x in process.extract(search_word, table.df[i].astype(str).values.tolist(),
+            for x in process.extract(search_words[0], table.df[i].astype(str).values.tolist(),
                                      scorer=fuzz.partial_ratio):
                 if x[1] > 80:
                     found = True
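
Both search helpers now take plain data (a paragraph list, a table list) instead of a Document, so they can be exercised in isolation. A sketch against the new find_words_paragraphs signature; the sample text is made up, and the accepted search_mode values are not visible in this hunk, so "and" is a guess:

    from PDFScraper.core import find_words_paragraphs

    paragraphs = ["The quick brown fox. It jumps over the lazy dog."]
    # search_words is now already a list; the old API split a comma-separated string itself
    matches = find_words_paragraphs(paragraphs, "and", ["lazy dog"], match_score=80)
    print(matches)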
