Skip to content

Commit 9958220

Browse files
committed
Renaming
1 parent 1626421 commit 9958220

File tree

8 files changed

+27
-27
lines changed

8 files changed

+27
-27
lines changed
File renamed without changes.

pdfExtractor/batchProcessing.py renamed to PDFScraper/batchProcessing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import os
22

3-
from pdfExtractor.dataStructure import Document
4-
from pdfExtractor.dataStructure import Documents
3+
from PDFScraper.dataStructure import Document
4+
from PDFScraper.dataStructure import Documents
55

66

77
def find_pdfs_in_path(docs: Documents, path: str):
File renamed without changes.

pdfExtractor/main.py renamed to PDFScraper/main.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
import sys
66
import tempfile
77

8-
from pdfExtractor.batchProcessing import find_pdfs_in_path
9-
from pdfExtractor.dataStructure import Documents
10-
from pdfExtractor.outputGenerator import generate_html
11-
from pdfExtractor.pdfParser import extract_info, extract_table_of_contents, get_pdf_object, \
12-
extract_page_layouts, get_filename, pdf_to_image, parse_layouts, extract_text_ocr
8+
from PDFScraper.batchProcessing import find_pdfs_in_path
9+
from PDFScraper.dataStructure import Documents
10+
from PDFScraper.outputGenerator import generate_html
11+
from PDFScraper.pdfParser import extract_info, extract_table_of_contents, get_pdf_object, \
12+
extract_page_layouts, get_filename, pdf_to_image, parse_layouts, extract_text_ocr, extract_tables
1313

1414
# Define logger level helper
1515
switcher = {
@@ -52,7 +52,7 @@ def str2bool(v):
5252
log_level = switcher.get(args["log_level"])
5353
searchWord = args["search"]
5454
tessdata_location = args["tessdata"]
55-
extract_tables = args["tables"]
55+
tables_extract = args["tables"]
5656

5757
# Set up logger
5858
logger = logging.getLogger(__name__)
@@ -61,7 +61,7 @@ def str2bool(v):
6161
consoleHandler = logging.StreamHandler()
6262
consoleHandler.setLevel(log_level)
6363
consoleHandler.setFormatter(formatter)
64-
fileHandler = logging.FileHandler('pdfExtractor.log', 'w')
64+
fileHandler = logging.FileHandler('PDFScraper.log', 'w')
6565
fileHandler.setLevel(log_level)
6666
fileHandler.setFormatter(formatter)
6767
logger.addHandler(consoleHandler)
@@ -102,7 +102,7 @@ def signal_handler(sign, frame):
102102
logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
103103
extract_page_layouts(doc)
104104
# table extraction is possible only for text based PDFs
105-
if extract_tables:
105+
if tables_extract:
106106
extract_tables(doc, output_path)
107107
parse_layouts(doc)
108108
if len(doc.paragraphs) == 0:
@@ -112,7 +112,7 @@ def signal_handler(sign, frame):
112112
extract_text_ocr(doc, tessdata_location)
113113
get_pdf_object(doc)
114114
extract_page_layouts(doc)
115-
if extract_tables:
115+
if tables_extract:
116116
extract_tables(doc, output_path)
117117
parse_layouts(doc)
118118
logger.debug(doc.text)
@@ -127,5 +127,5 @@ def signal_handler(sign, frame):
127127
logger.info('Stopping')
128128
generate_html(output_path, docs, searchWord)
129129
# clean up temporary directory
130-
shutil.rmtree(tempfile.gettempdir() + "/pdfExtractor", ignore_errors=True)
130+
shutil.rmtree(tempfile.gettempdir() + "/PDFScraper", ignore_errors=True)
131131
sys.exit(0)

pdfExtractor/outputGenerator.py renamed to PDFScraper/outputGenerator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from fuzzywuzzy import fuzz, process
88
from yattag import Doc, indent
99

10-
from pdfExtractor.dataStructure import Documents
10+
from PDFScraper.dataStructure import Documents
1111

1212

1313
def generate_html(output_path: str, docs: Documents, search_word: str):
@@ -282,7 +282,7 @@ def generate_html(output_path: str, docs: Documents, search_word: str):
282282
for table in document.tables:
283283
with tag('div', id="table" + str(table_index), klass="container"):
284284
table_index += 1
285-
tempfile_path = tempfile.gettempdir() + "/pdfExtractor"
285+
tempfile_path = tempfile.gettempdir() + "/PDFScraper"
286286
try:
287287
os.makedirs(tempfile_path)
288288
except FileExistsError:

pdfExtractor/pdfParser.py renamed to PDFScraper/pdfParser.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,19 @@
2222
from pdfminer.pdfparser import PDFParser
2323
from pytesseract import TesseractNotFoundError, TesseractError
2424

25-
from pdfExtractor.dataStructure import Document
25+
from PDFScraper.dataStructure import Document
2626

2727
# Set up logger
2828
log_level = 20
2929
if TYPE_CHECKING:
30-
from pdfExtractor.main import log_level
30+
from PDFScraper.main import log_level
3131
logger = logging.getLogger(__name__)
3232
logger.setLevel(log_level)
3333
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
3434
consoleHandler = logging.StreamHandler()
3535
consoleHandler.setLevel(log_level)
3636
consoleHandler.setFormatter(formatter)
37-
fileHandler = logging.FileHandler('pdfExtractor.log', 'w')
37+
fileHandler = logging.FileHandler('PDFScraper.log', 'w')
3838
fileHandler.setLevel(log_level)
3939
fileHandler.setFormatter(formatter)
4040
logger.addHandler(consoleHandler)
@@ -50,7 +50,7 @@ def get_filename(document: Document):
5050
def pdf_to_image(document: Document):
5151
pages = pdf2image.convert_from_path(pdf_path=document.path, dpi=300)
5252
# TODO: implement saving to temp dir with mkstemp for better security
53-
tempfile_path = tempfile.gettempdir() + "/pdfExtractor"
53+
tempfile_path = tempfile.gettempdir() + "/PDFScraper"
5454
try:
5555
os.makedirs(tempfile_path)
5656
except FileExistsError:
@@ -64,9 +64,9 @@ def pdf_to_image(document: Document):
6464
def extract_text_ocr(document: Document, tessdata_location: str):
6565
pdf_pages = []
6666
for i in range(document.num_pages):
67-
img = cv2.imread(tempfile.gettempdir() + "/pdfExtractor" + "/" + document.filename + "_" + str(i) + ".jpg")
67+
img = cv2.imread(tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + "_" + str(i) + ".jpg")
6868
# remove temporary image file
69-
os.remove(tempfile.gettempdir() + "/pdfExtractor" + "/" + document.filename + "_" + str(i) + ".jpg")
69+
os.remove(tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + "_" + str(i) + ".jpg")
7070
# RGB to grayscale
7171
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
7272
# Threshold
@@ -81,11 +81,11 @@ def extract_text_ocr(document: Document, tessdata_location: str):
8181
try:
8282
config_options = '--psm 1 --tessdata-dir ' + tessdata_location
8383
text = pytesseract.image_to_pdf_or_hocr(img, extension='pdf', lang=language, config=config_options)
84-
with open(tempfile.gettempdir() + "/pdfExtractor" + "/" + document.filename + "_" + str(i) + ".pdf",
84+
with open(tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + "_" + str(i) + ".pdf",
8585
'w+b') as f:
8686
f.write(text)
8787
pdf_pages.append(
88-
tempfile.gettempdir() + "/pdfExtractor" + "/" + document.filename + "_" + str(i) + ".pdf")
88+
tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + "_" + str(i) + ".pdf")
8989
except TesseractNotFoundError:
9090
logger.error("Tesseract is not installed. Exiting")
9191
sys.exit(1)
@@ -99,10 +99,10 @@ def extract_text_ocr(document: Document, tessdata_location: str):
9999
for i in range(pdf_reader.numPages):
100100
page = pdf_reader.getPage(i)
101101
pdf_writer.addPage(page)
102-
with open(tempfile.gettempdir() + "/pdfExtractor" + "/" + document.filename + ".pdf", 'w+b') as out:
102+
with open(tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + ".pdf", 'w+b') as out:
103103
pdf_writer.write(out)
104104
out.close()
105-
document.ocr_path = tempfile.gettempdir() + "/pdfExtractor" + "/" + document.filename + ".pdf"
105+
document.ocr_path = tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + ".pdf"
106106
# cleanup temporary files
107107
for filename in pdf_pages:
108108
os.remove(filename)

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# pdfSearch
1+
# PDFScraper
22
A CLI program for searching text and tables inside PDF documents and displaying the results in HTML. It combines [Pdfminer.six](https://github.com/pdfminer/pdfminer.six), [Camelot](https://github.com/camelot-dev/camelot) and [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) into a single, simple-to-use program.
33

44
# How to install

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@
44
long_description = fh.read()
55

66
setuptools.setup(
7-
name="pdfSearch",
7+
name="PDFScraper",
88
version="1.0.0",
99
author="Erik Kastelec",
1010
author_email="erikkastelec@gmail.com",
1111
description="PDF text and table search",
1212
long_description=long_description,
1313
long_description_content_type="text/markdown",
14-
url="https://github.com/erikkastelec/pdfSearch",
14+
url="https://github.com/erikkastelec/PDFScraper",
1515
packages=setuptools.find_packages(),
1616
classifiers=[
1717
"Programming Language :: Python :: 3",

0 commit comments

Comments
 (0)