2222from pdfminer .pdfparser import PDFParser
2323from pytesseract import TesseractNotFoundError , TesseractError
2424
25- from pdfExtractor .dataStructure import Document
25+ from PDFScraper .dataStructure import Document
2626
2727# Set up logger
2828log_level = 20
2929if TYPE_CHECKING :
30- from pdfExtractor .main import log_level
30+ from PDFScraper .main import log_level
3131logger = logging .getLogger (__name__ )
3232logger .setLevel (log_level )
3333formatter = logging .Formatter ('%(asctime)s - %(name)s - %(levelname)s - %(message)s' )
3434consoleHandler = logging .StreamHandler ()
3535consoleHandler .setLevel (log_level )
3636consoleHandler .setFormatter (formatter )
37- fileHandler = logging .FileHandler ('pdfExtractor .log' , 'w' )
37+ fileHandler = logging .FileHandler ('PDFScraper .log' , 'w' )
3838fileHandler .setLevel (log_level )
3939fileHandler .setFormatter (formatter )
4040logger .addHandler (consoleHandler )
@@ -50,7 +50,7 @@ def get_filename(document: Document):
5050def pdf_to_image (document : Document ):
5151 pages = pdf2image .convert_from_path (pdf_path = document .path , dpi = 300 )
5252 # TODO: implement saving to temp dir with mkstemp for better security
53- tempfile_path = tempfile .gettempdir () + "/pdfExtractor "
53+ tempfile_path = tempfile .gettempdir () + "/PDFScraper "
5454 try :
5555 os .makedirs (tempfile_path )
5656 except FileExistsError :
@@ -64,9 +64,9 @@ def pdf_to_image(document: Document):
6464def extract_text_ocr (document : Document , tessdata_location : str ):
6565 pdf_pages = []
6666 for i in range (document .num_pages ):
67- img = cv2 .imread (tempfile .gettempdir () + "/pdfExtractor " + "/" + document .filename + "_" + str (i ) + ".jpg" )
67+ img = cv2 .imread (tempfile .gettempdir () + "/PDFScraper " + "/" + document .filename + "_" + str (i ) + ".jpg" )
6868 # remove temporary image file
69- os .remove (tempfile .gettempdir () + "/pdfExtractor " + "/" + document .filename + "_" + str (i ) + ".jpg" )
69+ os .remove (tempfile .gettempdir () + "/PDFScraper " + "/" + document .filename + "_" + str (i ) + ".jpg" )
7070 # RGB to grayscale
7171 img = cv2 .cvtColor (img , cv2 .COLOR_BGR2GRAY )
7272 # Threshold
@@ -81,11 +81,11 @@ def extract_text_ocr(document: Document, tessdata_location: str):
8181 try :
8282 config_options = '--psm 1 --tessdata-dir ' + tessdata_location
8383 text = pytesseract .image_to_pdf_or_hocr (img , extension = 'pdf' , lang = language , config = config_options )
84- with open (tempfile .gettempdir () + "/pdfExtractor " + "/" + document .filename + "_" + str (i ) + ".pdf" ,
84+ with open (tempfile .gettempdir () + "/PDFScraper " + "/" + document .filename + "_" + str (i ) + ".pdf" ,
8585 'w+b' ) as f :
8686 f .write (text )
8787 pdf_pages .append (
88- tempfile .gettempdir () + "/pdfExtractor " + "/" + document .filename + "_" + str (i ) + ".pdf" )
88+ tempfile .gettempdir () + "/PDFScraper " + "/" + document .filename + "_" + str (i ) + ".pdf" )
8989 except TesseractNotFoundError :
9090 logger .error ("Tesseract is not installed. Exiting" )
9191 sys .exit (1 )
@@ -99,10 +99,10 @@ def extract_text_ocr(document: Document, tessdata_location: str):
9999 for i in range (pdf_reader .numPages ):
100100 page = pdf_reader .getPage (i )
101101 pdf_writer .addPage (page )
102- with open (tempfile .gettempdir () + "/pdfExtractor " + "/" + document .filename + ".pdf" , 'w+b' ) as out :
102+ with open (tempfile .gettempdir () + "/PDFScraper " + "/" + document .filename + ".pdf" , 'w+b' ) as out :
103103 pdf_writer .write (out )
104104 out .close ()
105- document .ocr_path = tempfile .gettempdir () + "/pdfExtractor " + "/" + document .filename + ".pdf"
105+ document .ocr_path = tempfile .gettempdir () + "/PDFScraper " + "/" + document .filename + ".pdf"
106106 # cleanup temporary files
107107 for filename in pdf_pages :
108108 os .remove (filename )
0 commit comments