Skip to content

Commit 1c19bff

Browse files
committed
Refactoring
1 parent a16d7d0 commit 1c19bff

File tree

8 files changed

+266
-267
lines changed

8 files changed

+266
-267
lines changed

PDFScraper/__init__.py

Lines changed: 10 additions & 156 deletions
Original file line numberDiff line numberDiff line change
@@ -1,164 +1,18 @@
1-
__version__ = "1.0.14"
1+
__version__ = "1.1.0"
22

3-
import argparse
43
import logging
5-
import os
6-
import shutil
7-
import signal
8-
import sys
9-
import tempfile
10-
11-
from PDFScraper.batchProcessing import find_pdfs_in_path
12-
from PDFScraper.dataStructure import Documents
13-
from PDFScraper.outputGenerator import generate_html
14-
from PDFScraper.pdfParser import get_filename, pdf_to_image, convert_to_pdf, get_pdf_object, extract_page_layouts, \
15-
extract_tables, parse_layouts, extract_table_of_contents, extract_info
164

175

186
def version():
    """Return the installed PDFScraper package version string (``__version__``)."""
    return __version__
208

219

22-
def main():
23-
# Define logger level helper
24-
logger_switcher = {
25-
'critical': 50,
26-
'error': 40,
27-
'warning': 30,
28-
'info': 20,
29-
'debug': 10
30-
}
31-
32-
# boolean input helper for ArgumentParser
33-
def str2bool(v):
34-
if isinstance(v, bool):
35-
return v
36-
if v.lower() in ('yes', 'true', 't', 'y', '1'):
37-
return True
38-
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
39-
return False
40-
else:
41-
raise argparse.ArgumentTypeError('Boolean value expected.')
42-
43-
# boolean input helper for search_mode
44-
def search_mode_helper(v):
45-
if isinstance(v, bool):
46-
return v
47-
if v.lower() in ('and', '&', 't', 'y', '1', 'true'):
48-
return True
49-
elif v.lower() in ('or', '|', 'f', 'n', '0', 'false'):
50-
return False
51-
else:
52-
raise argparse.ArgumentTypeError('"and" or "or" value expected')
53-
54-
55-
# Parse arguments from command line
56-
argumentParser = argparse.ArgumentParser()
57-
argumentParser.add_argument('--path', help='path to pdf folder or file', default=".")
58-
argumentParser.add_argument('--out', help='path to output file location', default=".")
59-
argumentParser.add_argument('--log_level', choices=['critical', 'error', 'warning', 'info', 'debug'], help='logger '
60-
'level to '
61-
'use ('
62-
'default: '
63-
'info)',
64-
default='info')
65-
argumentParser.add_argument('--search', help='word to search for', default="default")
66-
argumentParser.add_argument('--tessdata', help='location of tesseract data files', default="/usr/share/tessdata")
67-
argumentParser.add_argument('--tables', type=str2bool, help='should tables be extracted and searched', default=True)
68-
# True -> and mode, False -> or mode
69-
argumentParser.add_argument('--search_mode', type=search_mode_helper, help='And or Or search, when multiple '
70-
'search words are provided',
71-
default=True)
72-
73-
args = vars(argumentParser.parse_args())
74-
output_path = args["out"]
75-
log_level = logger_switcher.get(args["log_level"])
76-
search_word = args["search"]
77-
tessdata_location = args["tessdata"]
78-
tables_extract = args["tables"]
79-
search_mode = args["search_mode"]
80-
81-
# Set up logger
82-
logger = logging.getLogger(__name__)
83-
logger.setLevel(log_level)
84-
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
85-
consoleHandler = logging.StreamHandler()
86-
consoleHandler.setLevel(log_level)
87-
consoleHandler.setFormatter(formatter)
88-
fileHandler = logging.FileHandler('PDFScraper.log', 'w')
89-
fileHandler.setLevel(log_level)
90-
fileHandler.setFormatter(formatter)
91-
logger.addHandler(consoleHandler)
92-
logger.addHandler(fileHandler)
93-
logger.info("Started")
94-
95-
# Define signal handlers
96-
def signal_handler(sign, frame):
97-
logger.info("Ctrl+C pressed")
98-
logger.info("Stopping")
99-
sys.exit(0)
100-
101-
# Start signal handlers
102-
signal.signal(signal.SIGINT, signal_handler)
103-
104-
# Read PDFs from path
105-
docs = Documents(path=os.path.abspath(args["path"]))
106-
logger.info('Finding PDFs in ' + docs.path)
107-
try:
108-
find_pdfs_in_path(docs, docs.path)
109-
except Exception as e:
110-
logger.error(e)
111-
sys.exit(1)
112-
logger.info('Found ' + str(docs.num_docs) + ' PDFs')
113-
114-
logger.info('Parsing ' + str(docs.num_docs) + ' documents')
115-
# Extract information about PDFs
116-
progress_counter = 1
117-
for doc in docs.docs:
118-
extract_info(doc)
119-
if doc.isPDF:
120-
get_pdf_object(doc)
121-
if doc.extractable:
122-
123-
logger.debug('Document information:' + '\n' + doc.document_info_to_string())
124-
extract_table_of_contents(doc)
125-
logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
126-
extract_page_layouts(doc)
127-
# table extraction is possible only for text based PDFs
128-
if tables_extract:
129-
extract_tables(doc, output_path)
130-
parse_layouts(doc)
131-
if len(doc.paragraphs) == 0:
132-
logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
133-
get_filename(doc)
134-
pdf_to_image(doc)
135-
convert_to_pdf(doc, tessdata_location)
136-
get_pdf_object(doc)
137-
extract_page_layouts(doc)
138-
if tables_extract:
139-
extract_tables(doc, output_path)
140-
parse_layouts(doc)
141-
142-
else:
143-
logger.warning("Skipping parsing. Document is not extractable.")
144-
logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(docs.num_docs) + ' documents')
145-
progress_counter += 1
146-
else:
147-
logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
148-
get_filename(doc)
149-
pdf_to_image(doc)
150-
convert_to_pdf(doc, tessdata_location)
151-
get_pdf_object(doc)
152-
extract_page_layouts(doc)
153-
if tables_extract:
154-
extract_tables(doc, output_path)
155-
parse_layouts(doc)
156-
logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))
157-
logger.info('Done parsing PDFs')
158-
logger.info('Stopping')
159-
generate_html(output_path, docs, search_word, search_mode)
160-
# clean up temporary directory
161-
shutil.rmtree(tempfile.gettempdir() + "/PDFScraper", ignore_errors=True)
162-
sys.exit(0)
163-
164-
10+
# set up logging
# Shared package logger: one console handler plus a file handler writing
# PDFScraper.log (truncated on each run). The log *level* is applied later by
# the CLI (PDFScraper/cli.py), so until then this logger inherits the root
# default.
logger = logging.getLogger("PDFScraper")
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(formatter)
fileHandler = logging.FileHandler('PDFScraper.log', 'w')
# Fix: the refactor dropped setFormatter on the file handler (the pre-refactor
# code set it), so file log records lost their timestamp/name/level prefix.
fileHandler.setFormatter(formatter)
logger.addHandler(consoleHandler)
logger.addHandler(fileHandler)
logger.info("Started")

PDFScraper/__main__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Allow running the package with ``python -m PDFScraper``: delegate to the CLI.
from PDFScraper.cli import cli


if __name__ == "__main__":
    cli()

PDFScraper/batchProcessing.py

Lines changed: 0 additions & 28 deletions
This file was deleted.

PDFScraper/cli.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
import argparse
2+
import logging
3+
import os
4+
import shutil
5+
import signal
6+
import sys
7+
import tempfile
8+
9+
from PDFScraper.core import get_filename, pdf_to_image, convert_to_pdf, get_pdf_object, extract_page_layouts, \
10+
extract_tables, parse_layouts, extract_table_of_contents, extract_info, find_pdfs_in_path
11+
from PDFScraper.dataStructure import Documents
12+
from PDFScraper.outputGenerator import generate_html
13+
14+
# Define logger level helper
# Maps the --log_level CLI choice to the stdlib logging level. Use the named
# logging constants instead of their magic numeric values (identical values:
# 50/40/30/20/10).
logger_switcher = {
    'critical': logging.CRITICAL,
    'error': logging.ERROR,
    'warning': logging.WARNING,
    'info': logging.INFO,
    'debug': logging.DEBUG,
}
22+
23+
24+
# boolean input helper for ArgumentParser
def str2bool(v):
    """Convert an argparse string value to a bool.

    Accepts common truthy/falsy spellings; a bool passes through unchanged.
    Raises argparse.ArgumentTypeError for anything unrecognised.
    """
    if isinstance(v, bool):
        return v
    normalized = v.lower()
    if normalized in ('yes', 'true', 't', 'y', '1'):
        return True
    if normalized in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
34+
35+
36+
# boolean input helper for search_mode
def search_mode_helper(v):
    """Translate an and/or search-mode flag to a bool (True = AND, False = OR).

    A bool passes through unchanged; unrecognised strings raise
    argparse.ArgumentTypeError.
    """
    if isinstance(v, bool):
        return v
    mode = v.lower()
    if mode in ('and', '&', 't', 'y', '1', 'true'):
        return True
    if mode in ('or', '|', 'f', 'n', '0', 'false'):
        return False
    raise argparse.ArgumentTypeError('"and" or "or" value expected')
46+
47+
48+
# Parse arguments from command line
# NOTE(review): this runs at import time, so merely importing PDFScraper.cli
# parses sys.argv (and argparse may exit the process on unknown arguments,
# e.g. under a test runner) — consider moving the parser construction and the
# parse_args() call inside cli().
argumentParser = argparse.ArgumentParser()
argumentParser.add_argument('--path', help='path to pdf folder or file', default=".")
argumentParser.add_argument('--out', help='path to output file location', default=".")
argumentParser.add_argument('--log_level', choices=['critical', 'error', 'warning', 'info', 'debug'], help='logger '
                                                                                                           'level to '
                                                                                                           'use ('
                                                                                                           'default: '
                                                                                                           'info)',
                            default='info')
argumentParser.add_argument('--search', help='word to search for', default="default")
argumentParser.add_argument('--tessdata', help='location of tesseract data files', default="/usr/share/tessdata")
argumentParser.add_argument('--tables', type=str2bool, help='should tables be extracted and searched', default=True)
# True -> and mode, False -> or mode
argumentParser.add_argument('--search_mode', type=search_mode_helper, help='And or Or search, when multiple '
                                                                           'search words are provided',
                            default=True)

args = vars(argumentParser.parse_args())
output_path = args["out"]  # directory that receives the HTML report (and extracted tables)
log_level = logger_switcher.get(args["log_level"])  # numeric stdlib logging level
search_word = args["search"]
tessdata_location = args["tessdata"]
tables_extract = args["tables"]
search_mode = args["search_mode"]  # True = AND over search words, False = OR
73+
74+
# Set up logger
# Handlers and formatter are attached in PDFScraper/__init__.py; here only the
# level chosen via --log_level is applied to the shared package logger.
logger = logging.getLogger("PDFScraper")
logger.setLevel(log_level)


# Define signal handlers
def signal_handler(sign, frame):
    # Graceful Ctrl+C: log and exit with success status.
    logger.info("Ctrl+C pressed")
    logger.info("Stopping")
    sys.exit(0)


# Start signal handlers
# NOTE(review): registered at import time, so any process that imports this
# module has its SIGINT handler replaced — confirm that is intended.
signal.signal(signal.SIGINT, signal_handler)
88+
89+
90+
def cli():
    """Command-line entry point.

    Finds PDFs under the --path argument, extracts document info, layouts and
    (optionally) tables from each one — falling back to an OCR pipeline when
    regular text extraction yields nothing or the file is not a text PDF —
    then writes an HTML report via generate_html.

    Reads the module-level CLI settings (args, output_path, tessdata_location,
    tables_extract, search_word, search_mode). Exits the process: status 1 if
    PDF discovery fails, status 0 otherwise.
    """

    def run_ocr_pipeline(doc):
        # OCR fallback, shared by both call sites below: rasterise the
        # document, convert it back to a text PDF with tesseract, then redo
        # layout/table extraction on the converted result.
        logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
        get_filename(doc)
        pdf_to_image(doc)
        convert_to_pdf(doc, tessdata_location)
        get_pdf_object(doc)
        extract_page_layouts(doc)
        if tables_extract:
            extract_tables(doc, output_path)
        parse_layouts(doc)

    # Read PDFs from path
    docs = Documents(path=os.path.abspath(args["path"]))
    logger.info('Finding PDFs in ' + docs.path)
    try:
        find_pdfs_in_path(docs, docs.path)
    except Exception as e:
        logger.error(e)
        sys.exit(1)
    logger.info('Found ' + str(docs.num_docs) + ' PDFs')

    logger.info('Parsing ' + str(docs.num_docs) + ' documents')
    # Extract information about PDFs
    progress_counter = 1
    for doc in docs.docs:
        extract_info(doc)
        if doc.isPDF:
            get_pdf_object(doc)
            if doc.extractable:
                logger.debug('Document information:' + '\n' + doc.document_info_to_string())
                extract_table_of_contents(doc)
                logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
                extract_page_layouts(doc)
                # table extraction is possible only for text based PDFs
                if tables_extract:
                    extract_tables(doc, output_path)
                parse_layouts(doc)
                # Nothing extracted the regular way -> retry through OCR.
                if len(doc.paragraphs) == 0:
                    run_ocr_pipeline(doc)
            else:
                logger.warning("Skipping parsing. Document is not extractable.")
            logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(docs.num_docs) + ' documents')
            progress_counter += 1
        else:
            # Not a text PDF at all: go straight to the OCR pipeline.
            run_ocr_pipeline(doc)
            logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))
    logger.info('Done parsing PDFs')
    logger.info('Stopping')
    generate_html(output_path, docs, search_word, search_mode)
    # clean up temporary directory
    shutil.rmtree(tempfile.gettempdir() + "/PDFScraper", ignore_errors=True)
    sys.exit(0)

0 commit comments

Comments
 (0)