Skip to content

Commit 1615719

Browse files
committed
Fixed duplicate results in table search
1 parent fcc9260 commit 1615719

File tree

3 files changed

+18
-15
lines changed

3 files changed

+18
-15
lines changed

PDFScraper/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
__version__ = "1.0.10"
1+
__version__ = "1.0.11"
22

33
import argparse
44
import logging

PDFScraper/outputGenerator.py

Lines changed: 16 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -300,20 +300,23 @@ def generate_html(output_path: str, docs: Documents, search_word: str, search_mo
300300
tempfile_path = tempfile_path + "/table"
301301
table.df[0].str.strip('.!? \n\t')
302302
# perform fuzzy search over all columns
303+
found = False
303304
for i in range(0, table.shape[1]):
304-
for x in process.extract(search_word, table.df[i].astype(str).values.tolist(),
305-
scorer=fuzz.partial_ratio):
306-
if x[1] > 80:
307-
table.to_html(tempfile_path, classes="responsive-table", index=False)
308-
with codecs.open(tempfile_path, 'r') as table_file:
309-
# replace \n in table to fix formatting
310-
tab = re.sub(r'\\n', '<br>', table_file.read())
311-
if not header_printed:
312-
with tag('h2'):
313-
text("Found in document with location: " + str(document.path))
314-
doc.asis(tab)
315-
os.remove(tempfile_path)
316-
break
305+
if not found:
306+
for x in process.extract(search_word, table.df[i].astype(str).values.tolist(),
307+
scorer=fuzz.partial_ratio):
308+
if x[1] > 80:
309+
table.to_html(tempfile_path, classes="responsive-table", index=False)
310+
with codecs.open(tempfile_path, 'r') as table_file:
311+
# replace \n in table to fix formatting
312+
tab = re.sub(r'\\n', '<br>', table_file.read())
313+
if not header_printed:
314+
with tag('h2'):
315+
text("Found in document with location: " + str(document.path))
316+
doc.asis(tab)
317+
os.remove(tempfile_path)
318+
found = True
319+
break
317320

318321
# write HTML to file
319322
# check if output path is a directory

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@
4949
"yattag==1.14.0",
5050
],
5151
name="PDFScraper",
52-
version="1.0.10",
52+
version="1.0.11",
5353
author="Erik Kastelec",
5454
author_email="erikkastelec@gmail.com",
5555
description="PDF text and table search",

0 commit comments

Comments
 (0)