erikkastelec
diff --git a/‎PDFScraper/__init__.py
Lines changed: 24 additions & 5 deletions b/‎PDFScraper/__init__.py
Lines changed: 24 additions & 5 deletions
diff --git a/‎PDFScraper/__main__.py
Lines changed: 1 addition & 1 deletion b/‎PDFScraper/__main__.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎PDFScraper/outputGenerator.py
Lines changed: 15 additions & 5 deletions b/‎PDFScraper/outputGenerator.py
Lines changed: 15 additions & 5 deletions
@@ -1,4 +1,4 @@
-__version__ = "1.0.5"
+__version__ = "1.0.6"
 
 import argparse
 import logging
@@ -20,7 +20,7 @@ def version():
 
 def main():
     # Define logger level helper
-    switcher = {
+    logger_switcher = {
         'critical': 50,
         'error': 40,
         'warning': 30,
@@ -39,6 +39,18 @@ def str2bool(v):
         else:
             raise argparse.ArgumentTypeError('Boolean value expected.')
 
+    # argparse type helper for --search_mode: "and"-like values -> True, "or"-like values -> False
+    def search_mode_helper(v):
+        if isinstance(v, bool):
+            return v
+        if v.lower() in ('and', '&', 't', 'y', '1', 'true'):
+            return True
+        elif v.lower() in ('or', '|', 'f', 'n', '0', 'false'):
+            return False
+        else:
+            raise argparse.ArgumentTypeError('"and" or "or" value expected')
+
+
     # Parse arguments from command line
     argumentParser = argparse.ArgumentParser()
     argumentParser.add_argument('--path', help='path to pdf folder or file', default=".")
@@ -52,13 +64,18 @@ def str2bool(v):
     argumentParser.add_argument('--search', help='word to search for', default="default")
     argumentParser.add_argument('--tessdata', help='location of tesseract data files', default="/usr/share/tessdata")
     argumentParser.add_argument('--tables', type=str2bool, help='should tables be extracted and searched', default=True)
+    # True -> and mode, False -> or mode
+    argumentParser.add_argument('--search_mode', type=search_mode_helper, help='And or Or search, when multiple '
+                                                                               'search words are provided',
+                                default=True)
 
     args = vars(argumentParser.parse_args())
     output_path = args["out"]
-    log_level = switcher.get(args["log_level"])
-    searchWord = args["search"]
+    log_level = logger_switcher.get(args["log_level"])
+    search_word = args["search"]
     tessdata_location = args["tessdata"]
     tables_extract = args["tables"]
+    search_mode = args["search_mode"]
 
     # Set up logger
     logger = logging.getLogger(__name__)
@@ -140,7 +157,9 @@ def signal_handler(sign, frame):
             logger.debug(doc.text)
     logger.info('Done parsing PDFs')
     logger.info('Stopping')
-    generate_html(output_path, docs, searchWord)
+    generate_html(output_path, docs, search_word, search_mode)
     # clean up temporary directory
     shutil.rmtree(tempfile.gettempdir() + "/PDFScraper", ignore_errors=True)
     sys.exit(0)
+
+
@@ -1,4 +1,4 @@
 from PDFScraper import main
 
 if __name__ == "__main__":
-    main()
+    main()
@@ -10,7 +10,7 @@
 from PDFScraper.dataStructure import Documents
 
 
-def generate_html(output_path: str, docs: Documents, search_word: str):
+def generate_html(output_path: str, docs: Documents, search_word: str, search_mode: bool):
     # TODO: implement html generation
     doc, tag, text = Doc().tagtext()
 
@@ -263,13 +263,23 @@ def generate_html(output_path: str, docs: Documents, search_word: str):
                     header_printed = False
 
                     # output extracted paragraphs
+
                     for paragraph in document.paragraphs:
+                        # split the paragraph on "." to get approximate sentences
                         split = paragraph.split(".")
-                        print_paragraph = False
-                        for string in split:
-                            if (len(search_word) <= len(string)) and fuzz.partial_ratio(search_word, string) > 80:
-                                print_paragraph = True
+                        for word in search_word.split(","):
+                            print_paragraph = False
+                            for string in split:
+                                if (len(word) <= len(string)) and fuzz.partial_ratio(word, string) > 80:
+                                    print_paragraph = True
+                                    break
+                            # in "or" mode, stop at the first word that matches
+                            if print_paragraph and not search_mode:
+                                break
+                            # in "and" mode, stop as soon as one of the words is not found
+                            if not print_paragraph and search_mode:
                                 break
+
                         if print_paragraph:
                             with tag('p'):
                                 if not header_printed:
-Original file line number
+Diff line change
@@ @@ -1,4 +1,4 @@ @@
 from PDFScraper import main
 if __name__ == "__main__":
 -    main()
 +    main()