-__version__ = "1.0.14"
+__version__ = "1.1.0"
 
-import argparse
 import logging
-import os
-import shutil
-import signal
-import sys
-import tempfile
-
-from PDFScraper.batchProcessing import find_pdfs_in_path
-from PDFScraper.dataStructure import Documents
-from PDFScraper.outputGenerator import generate_html
-from PDFScraper.pdfParser import get_filename, pdf_to_image, convert_to_pdf, get_pdf_object, extract_page_layouts, \
-    extract_tables, parse_layouts, extract_table_of_contents, extract_info
 
 
 def version():
     return __version__
 
 
-def main():
-    # Define logger level helper
-    logger_switcher = {
-        'critical': 50,
-        'error': 40,
-        'warning': 30,
-        'info': 20,
-        'debug': 10
-    }
-
-    # boolean input helper for ArgumentParser
-    def str2bool(v):
-        if isinstance(v, bool):
-            return v
-        if v.lower() in ('yes', 'true', 't', 'y', '1'):
-            return True
-        elif v.lower() in ('no', 'false', 'f', 'n', '0'):
-            return False
-        else:
-            raise argparse.ArgumentTypeError('Boolean value expected.')
-
-    # boolean input helper for search_mode
-    def search_mode_helper(v):
-        if isinstance(v, bool):
-            return v
-        if v.lower() in ('and', '&', 't', 'y', '1', 'true'):
-            return True
-        elif v.lower() in ('or', '|', 'f', 'n', '0', 'false'):
-            return False
-        else:
-            raise argparse.ArgumentTypeError('"and" or "or" value expected')
-
-    # Parse arguments from command line
-    argumentParser = argparse.ArgumentParser()
-    argumentParser.add_argument('--path', help='path to pdf folder or file', default=".")
-    argumentParser.add_argument('--out', help='path to output file location', default=".")
-    argumentParser.add_argument('--log_level', choices=['critical', 'error', 'warning', 'info', 'debug'],
-                                help='logger level to use (default: info)', default='info')
-    argumentParser.add_argument('--search', help='word to search for', default="default")
-    argumentParser.add_argument('--tessdata', help='location of tesseract data files', default="/usr/share/tessdata")
-    argumentParser.add_argument('--tables', type=str2bool, help='should tables be extracted and searched', default=True)
-    # True -> and mode, False -> or mode
-    argumentParser.add_argument('--search_mode', type=search_mode_helper,
-                                help='And or Or search, when multiple search words are provided', default=True)
-
-    args = vars(argumentParser.parse_args())
-    output_path = args["out"]
-    log_level = logger_switcher.get(args["log_level"])
-    search_word = args["search"]
-    tessdata_location = args["tessdata"]
-    tables_extract = args["tables"]
-    search_mode = args["search_mode"]
-
-    # Set up logger
-    logger = logging.getLogger(__name__)
-    logger.setLevel(log_level)
-    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-    consoleHandler = logging.StreamHandler()
-    consoleHandler.setLevel(log_level)
-    consoleHandler.setFormatter(formatter)
-    fileHandler = logging.FileHandler('PDFScraper.log', 'w')
-    fileHandler.setLevel(log_level)
-    fileHandler.setFormatter(formatter)
-    logger.addHandler(consoleHandler)
-    logger.addHandler(fileHandler)
-    logger.info("Started")
-
-    # Define signal handlers
-    def signal_handler(sign, frame):
-        logger.info("Ctrl+C pressed")
-        logger.info("Stopping")
-        sys.exit(0)
-
-    # Start signal handlers
-    signal.signal(signal.SIGINT, signal_handler)
-
-    # Read PDFs from path
-    docs = Documents(path=os.path.abspath(args["path"]))
-    logger.info('Finding PDFs in ' + docs.path)
-    try:
-        find_pdfs_in_path(docs, docs.path)
-    except Exception as e:
-        logger.error(e)
-        sys.exit(1)
-    logger.info('Found ' + str(docs.num_docs) + ' PDFs')
-
-    logger.info('Parsing ' + str(docs.num_docs) + ' documents')
-    # Extract information about PDFs
-    progress_counter = 1
-    for doc in docs.docs:
-        extract_info(doc)
-        if doc.isPDF:
-            get_pdf_object(doc)
-            if doc.extractable:
-                logger.debug('Document information:' + '\n' + doc.document_info_to_string())
-                extract_table_of_contents(doc)
-                logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
-                extract_page_layouts(doc)
-                # table extraction is possible only for text based PDFs
-                if tables_extract:
-                    extract_tables(doc, output_path)
-                parse_layouts(doc)
-                if len(doc.paragraphs) == 0:
-                    logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
-                    get_filename(doc)
-                    pdf_to_image(doc)
-                    convert_to_pdf(doc, tessdata_location)
-                    get_pdf_object(doc)
-                    extract_page_layouts(doc)
-                    if tables_extract:
-                        extract_tables(doc, output_path)
-                    parse_layouts(doc)
-            else:
-                logger.warning("Skipping parsing. Document is not extractable.")
-            logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(docs.num_docs) + ' documents')
-            progress_counter += 1
-        else:
-            logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
-            get_filename(doc)
-            pdf_to_image(doc)
-            convert_to_pdf(doc, tessdata_location)
-            get_pdf_object(doc)
-            extract_page_layouts(doc)
-            if tables_extract:
-                extract_tables(doc, output_path)
-            parse_layouts(doc)
-        logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))
-    logger.info('Done parsing PDFs')
-    logger.info('Stopping')
-    generate_html(output_path, docs, search_word, search_mode)
-    # clean up temporary directory
-    shutil.rmtree(tempfile.gettempdir() + "/PDFScraper", ignore_errors=True)
-    sys.exit(0)
+# set up logging
+logger = logging.getLogger("PDFScraper")
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+consoleHandler = logging.StreamHandler()
+consoleHandler.setFormatter(formatter)
+fileHandler = logging.FileHandler('PDFScraper.log', 'w')
+logger.addHandler(consoleHandler)
+logger.addHandler(fileHandler)
+logger.info("Started")
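
The net effect of the commit: logging moves from inside main() to module scope, under the fixed name "PDFScraper" instead of __name__. A minimal sketch of how downstream code might pick that logger up, assuming the edited file is the package's __init__.py (the __version__ assignment and version() helper suggest so); everything here beyond the names taken from the diff is standard logging API:

import logging

# Importing the package runs the new module-level setup, attaching the
# console and file handlers to the "PDFScraper" logger.
import PDFScraper

# Any module can reach the same logger object through its fixed name.
logger = logging.getLogger("PDFScraper")

# The refactored setup attaches handlers but never calls setLevel, so the
# effective level is inherited from the root logger (WARNING by default),
# and the import-time logger.info("Started") is filtered out. Callers must
# opt in to more verbose output explicitly:
logger.setLevel(logging.INFO)
logger.info("now emitted to the console and to PDFScraper.log")

print(PDFScraper.version())  # version() still returns __version__, now "1.1.0"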