Skip to content

Commit 33a8c8e

Browse files
committed
Implemented multi-word search with "and" and "or" modes.
1 parent 17683c3 commit 33a8c8e

File tree

6 files changed

+786
-12
lines changed

6 files changed

+786
-12
lines changed

PDFScraper/__init__.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "1.0.5"
1+
__version__ = "1.0.6"
22

33
import argparse
44
import logging
@@ -20,7 +20,7 @@ def version():
2020

2121
def main():
2222
# Define logger level helper
23-
switcher = {
23+
logger_switcher = {
2424
'critical': 50,
2525
'error': 40,
2626
'warning': 30,
@@ -39,6 +39,18 @@ def str2bool(v):
3939
else:
4040
raise argparse.ArgumentTypeError('Boolean value expected.')
4141

42+
# helper that converts the --search_mode argument to a boolean (True -> "and" mode, False -> "or" mode)
43+
def search_mode_helper(v):
    """Convert the ``--search_mode`` command-line value to a boolean.

    Args:
        v: Raw argument value; either an existing ``bool`` or a string
            keyword (matched case-insensitively).

    Returns:
        ``True`` for "and" mode, ``False`` for "or" mode.

    Raises:
        argparse.ArgumentTypeError: If ``v`` is not a recognised keyword.
    """
    if isinstance(v, bool):
        return v
    # Ignore surrounding whitespace so quoted values such as " and " are
    # accepted instead of being rejected as unknown keywords.
    mode = v.strip().lower()
    if mode in ('and', '&', 't', 'y', '1', 'true'):
        return True
    if mode in ('or', '|', 'f', 'n', '0', 'false'):
        return False
    raise argparse.ArgumentTypeError('"and" or "or" value expected')
52+
53+
4254
# Parse arguments from command line
4355
argumentParser = argparse.ArgumentParser()
4456
argumentParser.add_argument('--path', help='path to pdf folder or file', default=".")
@@ -52,13 +64,18 @@ def str2bool(v):
5264
argumentParser.add_argument('--search', help='word to search for', default="default")
5365
argumentParser.add_argument('--tessdata', help='location of tesseract data files', default="/usr/share/tessdata")
5466
argumentParser.add_argument('--tables', type=str2bool, help='should tables be extracted and searched', default=True)
67+
# True -> "and" mode (all words must match), False -> "or" mode (any word may match)
68+
argumentParser.add_argument('--search_mode', type=search_mode_helper, help='And or Or search, when multiple '
69+
'search words are provided',
70+
default=True)
5571

5672
args = vars(argumentParser.parse_args())
5773
output_path = args["out"]
58-
log_level = switcher.get(args["log_level"])
59-
searchWord = args["search"]
74+
log_level = logger_switcher.get(args["log_level"])
75+
search_word = args["search"]
6076
tessdata_location = args["tessdata"]
6177
tables_extract = args["tables"]
78+
search_mode = args["search_mode"]
6279

6380
# Set up logger
6481
logger = logging.getLogger(__name__)
@@ -140,7 +157,9 @@ def signal_handler(sign, frame):
140157
logger.debug(doc.text)
141158
logger.info('Done parsing PDFs')
142159
logger.info('Stopping')
143-
generate_html(output_path, docs, searchWord)
160+
generate_html(output_path, docs, search_word, search_mode)
144161
# clean up temporary directory
145162
shutil.rmtree(tempfile.gettempdir() + "/PDFScraper", ignore_errors=True)
146163
sys.exit(0)
164+
165+

PDFScraper/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from PDFScraper import main
22

33
if __name__ == "__main__":
4-
main()
4+
main()

PDFScraper/outputGenerator.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from PDFScraper.dataStructure import Documents
1111

1212

13-
def generate_html(output_path: str, docs: Documents, search_word: str):
13+
def generate_html(output_path: str, docs: Documents, search_word: str, search_mode: bool):
1414
# TODO: implement html generation
1515
doc, tag, text = Doc().tagtext()
1616

@@ -263,13 +263,23 @@ def generate_html(output_path: str, docs: Documents, search_word: str):
263263
header_printed = False
264264

265265
# output extracted paragraphs
266+
266267
for paragraph in document.paragraphs:
268+
# split paragraph into sentences.
267269
split = paragraph.split(".")
268-
print_paragraph = False
269-
for string in split:
270-
if (len(search_word) <= len(string)) and fuzz.partial_ratio(search_word, string) > 80:
271-
print_paragraph = True
270+
for word in search_word.split(","):
271+
print_paragraph = False
272+
for string in split:
273+
if (len(word) <= len(string)) and fuzz.partial_ratio(word, string) > 80:
274+
print_paragraph = True
275+
break
276+
# exit after finding the first match when "or" mode is selected
277+
if print_paragraph and not search_mode:
278+
break
279+
# exit if one of the words was not found in "and" mode
280+
if not print_paragraph and search_mode:
272281
break
282+
273283
if print_paragraph:
274284
with tag('p'):
275285
if not header_printed:

0 commit comments

Comments
 (0)