Skip to content

Commit 00b7dbc

Browse files
committed
Cleanup
1 parent 1615719 commit 00b7dbc

File tree

4 files changed

+10
-24
lines changed

4 files changed

+10
-24
lines changed

PDFScraper/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "1.0.11"
1+
__version__ = "1.0.12"
22

33
import argparse
44
import logging
@@ -119,7 +119,7 @@ def signal_handler(sign, frame):
119119
if doc.isPDF:
120120
get_pdf_object(doc)
121121
if doc.extractable:
122-
#extract_info(doc)
122+
123123
logger.debug('Document information:' + '\n' + doc.document_info_to_string())
124124
extract_table_of_contents(doc)
125125
logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())

PDFScraper/dataStructure.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,8 @@ def __init__(self, path: str, parent: Documents, isPDF: bool):
1919
self.path = path
2020
self.ocr_path = path
2121
self.num_pages = None
22-
self.text = []
2322
self.images = []
2423
self.tables = []
25-
self.ltfigures = []
2624
self.paragraphs = []
2725
self.table_of_contents = []
2826
self.page_layouts = []

PDFScraper/pdfParser.py

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import re
55
import sys
66
import tempfile
7-
from io import StringIO
87
from typing import TYPE_CHECKING
98

109
import camelot
@@ -15,7 +14,7 @@
1514
from iso639 import languages
1615
from langdetect import detect_langs
1716
from pdf2image import pdf2image
18-
from pdfminer.converter import PDFPageAggregator, TextConverter
17+
from pdfminer.converter import PDFPageAggregator
1918
from pdfminer.layout import LAParams, LTTextBoxHorizontal, LTImage
2019
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
2120
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
@@ -306,21 +305,6 @@ def get_pdf_object(document: Document):
306305
document.extractable = True
307306

308307

309-
def extract_text(document: Document):
310-
output_string = StringIO()
311-
with open(document.path, 'rb') as in_file:
312-
parser = PDFParser(in_file)
313-
pdf = PDFDocument(parser)
314-
codec = 'unicode'
315-
rsrcmgr = PDFResourceManager()
316-
device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=LAParams())
317-
interpreter = PDFPageInterpreter(rsrcmgr, device)
318-
for page in PDFPage.create_pages(pdf):
319-
interpreter.process_page(page)
320-
321-
return output_string.getvalue()
322-
323-
324308
def extract_info(document: Document):
325309
if document.isPDF:
326310
with open(document.path, 'rb') as f:
@@ -382,14 +366,18 @@ def doOverlap(l1, r1, l2, r2):
382366
return True
383367

384368

385-
def parse_layouts(document: Document):
369+
# parse pdfminer.six layouts
370+
def parse_layouts(document: Document, preserve_pdfminer_structure=True):
    """Parse every pdfminer.six page layout stored on ``document``.

    Calls ``parse_elements`` once per entry of ``document.page_layouts``,
    passing the layout together with its 1-based page number.

    Args:
        document: Document whose ``page_layouts`` are parsed.
        preserve_pdfminer_structure: When False, the layouts are dropped from
            ``document.page_layouts`` once parsing is done, to keep the data
            structure small. When True (default) they are left untouched.
    """
    for page_number, page_layout in enumerate(document.page_layouts, start=1):
        parse_elements(document, page_layout, page_number)

    # keep data structure small
    if not preserve_pdfminer_structure:
        # Assigning to the loop variable would only rebind a local name and
        # release nothing; the document's own reference has to be replaced.
        # This happens after the loop so the list is never changed while it
        # is being iterated.
        document.page_layouts = []
390378

391379

392-
# Recursively iterate over all the elements
380+
# Recursively iterate over all the lt elements from pdfminer.six
393381
def parse_elements(document, page_layout, page):
394382
for element in page_layout:
395383
# TODO: improve efficiency

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
"yattag==1.14.0",
5050
],
5151
name="PDFScraper",
52-
version="1.0.11",
52+
version="1.0.12",
5353
author="Erik Kastelec",
5454
author_email="erikkastelec@gmail.com",
5555
description="PDF text and table search",

0 commit comments

Comments
 (0)