Skip to content

Commit 08ef6c0

Browse files
committed
Fix for files being left open.
1 parent b9b6339 commit 08ef6c0

File tree

7 files changed

+99
-59
lines changed

7 files changed

+99
-59
lines changed

PDFScraper/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "1.1.6"
1+
__version__ = "1.1.7"
22

33
import logging
44

PDFScraper/cli.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import sys
88
import tempfile
99

10+
import psutil
11+
1012
from PDFScraper.core import get_filename, pdf_to_image, convert_to_pdf, get_pdf_object, extract_page_layouts, \
1113
extract_tables, parse_layouts, extract_table_of_contents, extract_info, find_pdfs_in_path
1214
from PDFScraper.outputGenerator import generate_html
@@ -126,6 +128,7 @@ def process_doc(doc):
126128
parse_layouts(doc, page_layouts)
127129
logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))
128130

131+
129132
def cli():
130133
path = os.path.abspath(args["path"])
131134
logger.info('Finding PDFs in ' + path)
@@ -157,6 +160,13 @@ def cli():
157160
# clean up temporary directory
158161
logger.info('Stopping')
159162
shutil.rmtree(tempfile.gettempdir() + "/PDFScraper", ignore_errors=True)
163+
# close all files that were left open
164+
try:
165+
for fd in psutil.Process().open_files():
166+
open(fd.fd).close()
167+
except Exception as e:
168+
print(e)
169+
160170
sys.exit(0)
161171

162172

PDFScraper/core.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,7 @@ def convert_to_pdf(document: Document, tessdata_location: str, config_options=""
284284
for i in range(pdf_reader.numPages):
285285
page = pdf_reader.getPage(i)
286286
pdf_writer.addPage(page)
287+
pdf_file.close()
287288
with open(tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + ".pdf", 'w+b') as out:
288289
pdf_writer.write(out)
289290
out.close()
@@ -378,7 +379,8 @@ def extract_table_of_contents(document: Document, pdf_object):
378379
for (level, title, dest, a, se) in pdf_object.get_outlines():
379380
document.info.table_of_contents.append((level, title))
380381
except PDFNoOutlines:
381-
logger.warning("Could not get table of contents for document at path " + document.path)
382+
pass
383+
# logger.warning("Could not get table of contents for document at path " + document.path)
382384

383385

384386
# Returns true if two rectangles(l1, r1)

PDFScraper/outputGenerator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ def generate_html(output_path: str, docs, search_word: str, search_mode: bool):
290290
with tag('h2'):
291291
text("Found in document with location: " + str(document.path))
292292
doc.asis(tab)
293-
os.remove(tempfile_path)
293+
os.remove(tempfile_path)
294294

295295
# write HTML to file
296296
# check if output path is a directory

Pipfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ python-levenshtein = "==0.12.0"
2424
pdfminer-six = "==20200726"
2525
scipy = "==1.5.2"
2626
scikit-image = "*"
27+
psutil = "*"
28+
memory-profiler = "*"
2729

2830
[requires]
2931
python_version = ">=3.6"

Pipfile.lock

Lines changed: 71 additions & 47 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

setup.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
"cffi==1.14.2",
1111
"chardet==3.0.4; python_version > '3.0'",
1212
"click==7.1.2",
13-
"cryptography==3.0",
13+
"cryptography==3.1",
1414
"cycler==0.10.0",
1515
"decorator==4.4.2",
1616
"distro==1.5.0",
@@ -22,15 +22,17 @@
2222
"kiwisolver==1.2.0",
2323
"langdetect==1.0.8",
2424
"matplotlib==3.3.1",
25-
"networkx==2.4",
26-
"numpy==1.19.1",
25+
"memory-profiler==0.57.0",
26+
"networkx==2.5",
27+
"numpy==1.19.2",
2728
"opencv-python==4.3.0.36",
28-
"openpyxl==3.0.4",
29-
"pandas==1.1.0",
30-
"pdf2image==1.13.1",
29+
"openpyxl==3.0.5",
30+
"pandas==1.1.2",
31+
"pdf2image==1.14.0",
3132
"pdfminer-six==20200726",
3233
"pdfminer.six==20200726",
3334
"pillow==7.2.0",
35+
"psutil==5.7.2",
3436
"pycparser==2.20",
3537
"pyparsing==2.4.7",
3638
"pypdf2==1.26.0",
@@ -43,13 +45,13 @@
4345
"scipy==1.5.2",
4446
"six==1.15.0",
4547
"sortedcontainers==2.2.2",
46-
"tabula-py==2.1.1",
47-
"tifffile==2020.8.13",
48+
"tabula-py==2.2.0",
49+
"tifffile==2020.9.3",
4850
"wand==0.6.2",
4951
"yattag==1.14.0",
5052
],
5153
name="PDFScraper",
52-
version="1.1.6",
54+
version="1.1.7",
5355
author="Erik Kastelec",
5456
author_email="erikkastelec@gmail.com",
5557
description="PDF text and table search",

0 commit comments

Comments
 (0)