Skip to content

Commit 3bddfcb

Browse files
committed
Fixes for package release
1 parent 9958220 commit 3bddfcb

File tree

7 files changed

+405
-137
lines changed

7 files changed

+405
-137
lines changed

PDFScraper/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
__version__ = "0.0.1"
1+
__version__ = "1.0.4"


def version():
    """Return the PDFScraper release string (e.g. ``"1.0.4"``)."""
    return __version__
6+

PDFScraper/__main__.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import argparse
2+
import logging
3+
import shutil
4+
import signal
5+
import sys
6+
import tempfile
7+
8+
from PDFScraper.batchProcessing import find_pdfs_in_path
9+
from PDFScraper.dataStructure import Documents
10+
from PDFScraper.outputGenerator import generate_html
11+
from PDFScraper.pdfParser import get_filename, pdf_to_image, extract_text_ocr, get_pdf_object, extract_page_layouts, \
12+
extract_tables, parse_layouts, extract_table_of_contents, extract_info
13+
14+
15+
def main():
    """Command-line entry point for PDFScraper.

    Parses CLI arguments, configures console + file logging, walks the
    given path for PDF files, extracts text/tables from each document
    (falling back to OCR when regular extraction yields no paragraphs),
    writes an HTML report, and cleans up the temporary work directory.
    Exits the process with status 0 on success, 1 if PDF discovery fails.
    """
    # CLI level name -> stdlib numeric logging level (same values the
    # original literal table 50/40/30/20/10 encoded).
    level_map = {
        'critical': logging.CRITICAL,
        'error': logging.ERROR,
        'warning': logging.WARNING,
        'info': logging.INFO,
        'debug': logging.DEBUG,
    }

    def str2bool(v):
        """argparse type converter: accept common yes/no spellings."""
        if isinstance(v, bool):
            return v
        lowered = v.lower()
        if lowered in ('yes', 'true', 't', 'y', '1'):
            return True
        if lowered in ('no', 'false', 'f', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError('Boolean value expected.')

    # Command-line interface.
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', help='path to pdf folder or file', default=".")
    parser.add_argument('--out', help='path to output file location', default=".")
    parser.add_argument('--log_level',
                        choices=['critical', 'error', 'warning', 'info', 'debug'],
                        help='logger level to use (default: info)',
                        default='info')
    parser.add_argument('--search', help='word to search for', default="default")
    parser.add_argument('--tessdata', help='location of tesseract data files',
                        default="/usr/share/tessdata")
    parser.add_argument('--tables', type=str2bool,
                        help='should tables be extracted and searched', default=True)

    args = vars(parser.parse_args())
    output_path = args["out"]
    log_level = level_map.get(args["log_level"])
    search_word = args["search"]
    tessdata_location = args["tessdata"]
    tables_extract = args["tables"]

    # Log to the console and to PDFScraper.log (truncated on each run).
    logger = logging.getLogger(__name__)
    logger.setLevel(log_level)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler = logging.StreamHandler()
    console_handler.setLevel(log_level)
    console_handler.setFormatter(formatter)
    file_handler = logging.FileHandler('PDFScraper.log', 'w')
    file_handler.setLevel(log_level)
    file_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.info("Started")

    # Exit cleanly on Ctrl+C.
    def signal_handler(sign, frame):
        logger.info("Ctrl+C pressed")
        logger.info("Stopping")
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    # Discover PDF documents under the requested path.
    docs = Documents(args["path"])
    logger.info(f'Finding PDFs in {docs.path}')
    try:
        find_pdfs_in_path(docs, docs.path)
    except Exception as e:
        logger.error(e)
        sys.exit(1)
    logger.info(f'Found {docs.num_docs} PDFs')

    logger.info(f'Parsing {docs.num_docs} documents')
    # Extract information about each PDF; progress is 1-based.
    for progress_counter, doc in enumerate(docs.docs, start=1):
        get_pdf_object(doc)

        if doc.extractable:
            extract_info(doc)
            logger.debug(f'Document information:\n{doc.document_info_to_string()}')
            extract_table_of_contents(doc)
            logger.debug(f'Table of contents: \n{doc.table_of_contents_to_string()}')
            extract_page_layouts(doc)
            # Table extraction is possible only for text based PDFs.
            if tables_extract:
                extract_tables(doc, output_path)
            parse_layouts(doc)
            if not doc.paragraphs:
                # Nothing came out of regular extraction; rasterize the
                # pages and retry the whole pipeline on the OCR output.
                logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
                get_filename(doc)
                pdf_to_image(doc)
                extract_text_ocr(doc, tessdata_location)
                get_pdf_object(doc)
                extract_page_layouts(doc)
                if tables_extract:
                    extract_tables(doc, output_path)
                parse_layouts(doc)
            logger.debug(doc.text)
            logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))

        else:
            logger.warning("Skipping parsing. Document is not extractable.")
        logger.info(f'Parsed {progress_counter} out of {docs.num_docs} documents')

    logger.info('Done parsing PDFs')
    logger.info('Stopping')
    generate_html(output_path, docs, search_word)
    # Clean up the temporary directory used during processing.
    shutil.rmtree(tempfile.gettempdir() + "/PDFScraper", ignore_errors=True)
    sys.exit(0)


if __name__ == "__main__":
    main()

PDFScraper/main.py

Lines changed: 0 additions & 131 deletions
This file was deleted.

Pipfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ verify_ssl = true
66
[dev-packages]
77
pylint = "==2.4.4"
88
mypy = "*"
9+
pipenv-setup = "*"
910

1011
[packages]
1112
fuzzywuzzy = "*"
@@ -23,4 +24,4 @@ python-levenshtein = "==0.12.0"
2324
pdfminer-six = "==20200726"
2425

2526
[requires]
26-
python_version = "3.8"
27+
python_version = ">=3.6"

0 commit comments

Comments
 (0)