Fix for files being left open.

erikkastelec · erikkastelec · commit 08ef6c0ec4c4 · 2020-09-11T09:19:53.000+02:00
diff --git a/PDFScraper/__init__.py b/PDFScraper/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "1.1.6"
+__version__ = "1.1.7"
 
 import logging
 
diff --git a/PDFScraper/cli.py b/PDFScraper/cli.py
@@ -7,6 +7,8 @@
 import sys
 import tempfile
 
+import psutil
+
 from PDFScraper.core import get_filename, pdf_to_image, convert_to_pdf, get_pdf_object, extract_page_layouts, \
     extract_tables, parse_layouts, extract_table_of_contents, extract_info, find_pdfs_in_path
 from PDFScraper.outputGenerator import generate_html
@@ -126,6 +128,7 @@ def process_doc(doc):
         parse_layouts(doc, page_layouts)
     logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))
 
+
 def cli():
     path = os.path.abspath(args["path"])
     logger.info('Finding PDFs in ' + path)
@@ -157,6 +160,13 @@ def cli():
     # clean up temporary directory
     logger.info('Stopping')
     shutil.rmtree(tempfile.gettempdir() + "/PDFScraper", ignore_errors=True)
+    # close all files that were left open
+    try:
+        for fd in psutil.Process().open_files():
+            open(fd.fd).close()
+    except Exception as e:
+        print(e)
+
     sys.exit(0)
 
 
diff --git a/PDFScraper/core.py b/PDFScraper/core.py
@@ -284,6 +284,7 @@ def convert_to_pdf(document: Document, tessdata_location: str, config_options=""
         for i in range(pdf_reader.numPages):
             page = pdf_reader.getPage(i)
             pdf_writer.addPage(page)
+        pdf_file.close()
     with open(tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + ".pdf", 'w+b') as out:
         pdf_writer.write(out)
         out.close()
@@ -378,7 +379,8 @@ def extract_table_of_contents(document: Document, pdf_object):
         for (level, title, dest, a, se) in pdf_object.get_outlines():
             document.info.table_of_contents.append((level, title))
     except PDFNoOutlines:
-        logger.warning("Could not get table of contents for document at path " + document.path)
+        pass
+        # logger.warning("Could not get table of contents for document at path " + document.path)
 
 
 # Returns true if two rectangles(l1, r1)
diff --git a/PDFScraper/outputGenerator.py b/PDFScraper/outputGenerator.py
@@ -290,7 +290,7 @@ def generate_html(output_path: str, docs, search_word: str, search_mode: bool):
                                     with tag('h2'):
                                         text("Found in document with location: " + str(document.path))
                                 doc.asis(tab)
-                                os.remove(tempfile_path)
+                            os.remove(tempfile_path)
 
     # write HTML to file
     # check if output path is a directory
diff --git a/Pipfile b/Pipfile
@@ -24,6 +24,8 @@ python-levenshtein = "==0.12.0"
 pdfminer-six = "==20200726"
 scipy = "==1.5.2"
 scikit-image = "*"
+psutil = "*"
+memory-profiler = "*"
 
 [requires]
 python_version = ">=3.6"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
         "cffi==1.14.2",
         "chardet==3.0.4; python_version > '3.0'",
         "click==7.1.2",
-        "cryptography==3.0",
+        "cryptography==3.1",
         "cycler==0.10.0",
         "decorator==4.4.2",
         "distro==1.5.0",
@@ -22,15 +22,17 @@
         "kiwisolver==1.2.0",
         "langdetect==1.0.8",
         "matplotlib==3.3.1",
-        "networkx==2.4",
-        "numpy==1.19.1",
+        "memory-profiler==0.57.0",
+        "networkx==2.5",
+        "numpy==1.19.2",
         "opencv-python==4.3.0.36",
-        "openpyxl==3.0.4",
-        "pandas==1.1.0",
-        "pdf2image==1.13.1",
+        "openpyxl==3.0.5",
+        "pandas==1.1.2",
+        "pdf2image==1.14.0",
         "pdfminer-six==20200726",
         "pdfminer.six==20200726",
         "pillow==7.2.0",
+        "psutil==5.7.2",
         "pycparser==2.20",
         "pyparsing==2.4.7",
         "pypdf2==1.26.0",
@@ -43,13 +45,13 @@
         "scipy==1.5.2",
         "six==1.15.0",
         "sortedcontainers==2.2.2",
-        "tabula-py==2.1.1",
-        "tifffile==2020.8.13",
+        "tabula-py==2.2.0",
+        "tifffile==2020.9.3",
         "wand==0.6.2",
         "yattag==1.14.0",
     ],
     name="PDFScraper",
-    version="1.1.6",
+    version="1.1.7",
     author="Erik Kastelec",
     author_email="erikkastelec@gmail.com",
     description="PDF text and table search",

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,4 +1,4 @@` |
| `1` | | `-__version__ = "1.1.6"` |
| | `1` | `+__version__ = "1.1.7"` |
| `2` | `2` | |
| `3` | `3` | `import logging` |
| `4` | `4` | |