import argparse
import logging
import shutil
import signal
import sys
import tempfile

from PDFScraper.batchProcessing import find_pdfs_in_path
from PDFScraper.dataStructure import Documents
from PDFScraper.outputGenerator import generate_html
from PDFScraper.pdfParser import get_filename, pdf_to_image, extract_text_ocr, get_pdf_object, extract_page_layouts, \
    extract_tables, parse_layouts, extract_table_of_contents, extract_info


def main():
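    """Find PDFs under --path, extract their text, tables and metadata (falling
    back to OCR when no text can be extracted directly), search them for the
    --search word and write an HTML report to --out."""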
    # Map log level names to the numeric values used by the logging module
    switcher = {
        'critical': 50,
        'error': 40,
        'warning': 30,
        'info': 20,
        'debug': 10
    }

    # boolean input helper for ArgumentParser
    def str2bool(v):
        if isinstance(v, bool):
            return v
        if v.lower() in ('yes', 'true', 't', 'y', '1'):
            return True
        elif v.lower() in ('no', 'false', 'f', 'n', '0'):
            return False
        else:
            raise argparse.ArgumentTypeError('Boolean value expected.')

    # Parse arguments from command line
    argumentParser = argparse.ArgumentParser()
    argumentParser.add_argument('--path', help='path to pdf folder or file', default=".")
    argumentParser.add_argument('--out', help='path to output file location', default=".")
    argumentParser.add_argument('--log_level', choices=['critical', 'error', 'warning', 'info', 'debug'],
                                help='logger level to use (default: info)', default='info')
    argumentParser.add_argument('--search', help='word to search for', default="default")
    argumentParser.add_argument('--tessdata', help='location of tesseract data files', default="/usr/share/tessdata")
    argumentParser.add_argument('--tables', type=str2bool, help='should tables be extracted and searched', default=True)

    args = vars(argumentParser.parse_args())
    output_path = args["out"]
    log_level = switcher.get(args["log_level"])
    searchWord = args["search"]
    tessdata_location = args["tessdata"]
    tables_extract = args["tables"]

    # Set up logger
    logger = logging.getLogger(__name__)
    logger.setLevel(log_level)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    consoleHandler = logging.StreamHandler()
    consoleHandler.setLevel(log_level)
    consoleHandler.setFormatter(formatter)
    fileHandler = logging.FileHandler('PDFScraper.log', 'w')
    fileHandler.setLevel(log_level)
    fileHandler.setFormatter(formatter)
    logger.addHandler(consoleHandler)
    logger.addHandler(fileHandler)
    logger.info("Started")

    # Define signal handlers
    def signal_handler(sign, frame):
        logger.info("Ctrl+C pressed")
        logger.info("Stopping")
        sys.exit(0)

    # Start signal handlers
    signal.signal(signal.SIGINT, signal_handler)

    # Read PDFs from path
    docs = Documents(args["path"])
    logger.info('Finding PDFs in ' + docs.path)
    try:
        find_pdfs_in_path(docs, docs.path)
    except Exception as e:
        logger.error(e)
        sys.exit(1)
    logger.info('Found ' + str(docs.num_docs) + ' PDFs')

    logger.info('Parsing ' + str(docs.num_docs) + ' documents')
    # Extract information about PDFs
    progress_counter = 1
    for doc in docs.docs:
        get_pdf_object(doc)

        if doc.extractable:
            extract_info(doc)
            logger.debug('Document information:' + '\n' + doc.document_info_to_string())
            extract_table_of_contents(doc)
            logger.debug('Table of contents:\n' + doc.table_of_contents_to_string())
            extract_page_layouts(doc)
            # table extraction is possible only for text-based PDFs
            if tables_extract:
                extract_tables(doc, output_path)
            parse_layouts(doc)
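            # No paragraphs could be extracted directly (typical for scanned,
            # image-only PDFs), so fall back to OCR-based extraction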
            if len(doc.paragraphs) == 0:
                logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
                get_filename(doc)
                pdf_to_image(doc)
                extract_text_ocr(doc, tessdata_location)
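                # Re-run the parsing steps on the OCR output so layouts (and,
                # if enabled, tables) can be extracted from the recognized text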
                get_pdf_object(doc)
                extract_page_layouts(doc)
                if tables_extract:
                    extract_tables(doc, output_path)
                parse_layouts(doc)
            logger.debug(doc.text)
            logger.debug('Paragraphs:\n' + '\n'.join(doc.paragraphs))

        else:
            logger.warning("Skipping parsing. Document is not extractable.")
        logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(docs.num_docs) + ' documents')
        progress_counter += 1

    logger.info('Done parsing PDFs')
    logger.info('Stopping')
    generate_html(output_path, docs, searchWord)
    # clean up temporary directory
    shutil.rmtree(tempfile.gettempdir() + "/PDFScraper", ignore_errors=True)
    sys.exit(0)


if __name__ == "__main__":
    main()