Skip to content

Commit b9b6339

Browse files
committed
added unique filename in preprocess_image
1 parent d93e526 commit b9b6339

File tree

5 files changed

+12
-54
lines changed

5 files changed

+12
-54
lines changed

PDFScraper/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "1.1.3"
1+
__version__ = "1.1.6"
22

33
import logging
44

PDFScraper/cli.py

Lines changed: 7 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,7 @@ def process_doc(doc):
114114

115115
else:
116116
logger.warning("Skipping parsing. Document is not extractable.")
117-
# logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
118-
# progress_counter += 1
117+
119118
else:
120119
logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
121120
pdf_to_image(doc)
@@ -142,13 +141,16 @@ def cli():
142141
# Extract information about PDFs
143142
progress_counter = 1
144143

145-
# Multiprocessing
144+
# Multiprocessing -- Improves speed of processing multiple documents significantly
145+
# !! BAD PERFORMANCE OF OCR WITH MULTIPLE FILES
146146
if args["multiprocessing"]:
147147
pool = multiprocessing.Pool()
148148
pool.map(process_doc, docs)
149149
else:
150150
for doc in docs:
151151
process_doc(doc)
152+
logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
153+
progress_counter += 1
152154
logger.info('Done parsing PDFs')
153155
logger.info('Generating summary')
154156
generate_html(output_path, docs, search_word, search_mode)
@@ -158,41 +160,5 @@ def cli():
158160
sys.exit(0)
159161

160162

161-
def process_doc(doc):
162-
extract_info(doc)
163-
get_filename(doc)
164-
if doc.is_pdf:
165-
pdf_object = get_pdf_object(doc)
166-
if doc.extractable:
167-
168-
logger.debug('Document information:' + '\n' + doc.document_info_to_string())
169-
extract_table_of_contents(doc, pdf_object)
170-
logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
171-
page_layouts = extract_page_layouts(pdf_object)
172-
# table extraction is possible only for text based PDFs
173-
if tables_extract:
174-
extract_tables(doc)
175-
parse_layouts(doc, page_layouts)
176-
if len(doc.paragraphs) == 0:
177-
logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
178-
pdf_to_image(doc)
179-
convert_to_pdf(doc, tessdata_location)
180-
pdf_object = get_pdf_object(doc)
181-
page_layouts = extract_page_layouts(pdf_object)
182-
if tables_extract:
183-
extract_tables(doc)
184-
parse_layouts(doc, page_layouts)
185-
186-
else:
187-
logger.warning("Skipping parsing. Document is not extractable.")
188-
# logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
189-
# progress_counter += 1
190-
else:
191-
logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
192-
pdf_to_image(doc)
193-
convert_to_pdf(doc, tessdata_location)
194-
pdf_object = get_pdf_object(doc)
195-
page_layouts = extract_page_layouts(pdf_object)
196-
if tables_extract:
197-
extract_tables(doc)
198-
parse_layouts(doc, page_layouts)
163+
if __name__ == "__main__":
164+
cli()

PDFScraper/core.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import re
55
import sys
66
import tempfile
7+
import uuid
78
from typing import TYPE_CHECKING
89

910
import camelot
@@ -228,11 +229,10 @@ def preprocess_image(image):
228229
# image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
229230
image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
230231
# save and reread to convert to scikit-image image type
231-
temp_image_path = tempfile.gettempdir() + "/PDFScraper" + "/" + "deskew.jpg"
232+
temp_image_path = tempfile.gettempdir() + "/PDFScraper" + "/" + str(uuid.uuid4()) + "deskew.jpg"
232233
cv2.imwrite(temp_image_path, image)
233234
image = io.imread(temp_image_path)
234235
os.remove(temp_image_path)
235-
# perform deskewing
236236
image = deskew(image)
237237
image = image * 255
238238
io.imsave(temp_image_path, image.astype(np.uint8))
@@ -496,11 +496,3 @@ def find_words_tables(tables, search_mode, search_words, match_score):
496496
result.append(table)
497497
return result
498498

499-
500-
if __name__ == "__main__":
501-
import argparse
502-
503-
argumentParser = argparse.ArgumentParser()
504-
argumentParser.add_argument('--path', help='path to pdf file', required=True)
505-
args = vars(argumentParser.parse_args())
506-
doc = Document(args["path"])

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ optional arguments:
4444

4545
`search_mode`, by default in 'and' mode, specifies whether all the search terms need to be contained inside paragraph. In 'or' mode, the paragraph is returned if any of the terms are contained. In 'and' mode, the paragraph is returned if all the terms are contained.
4646

47-
`multiprocessing`, by default True, runs process in multiple threads to speed up.
47+
`multiprocessing`, by default True, runs document processing in multiple worker processes to speed it up. **Should not be used with OCR, as it significantly decreases performance.**
4848
### OCR
4949

5050
**tessdata pretrained language [files](https://github.com/tesseract-ocr/tessdata_best) need to be manually added to the tessdata directory.**

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
"yattag==1.14.0",
5050
],
5151
name="PDFScraper",
52-
version="1.1.3",
52+
version="1.1.6",
5353
author="Erik Kastelec",
5454
author_email="erikkastelec@gmail.com",
5555
description="PDF text and table search",

0 commit comments

Comments
 (0)