Skip to content

Commit 27f74df

Browse files
committed
Improved skew correction
1 parent 33a8c8e commit 27f74df

File tree

6 files changed

+317
-67
lines changed

6 files changed

+317
-67
lines changed

PDFScraper/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "1.0.6"
1+
__version__ = "1.0.7"
22

33
import argparse
44
import logging
@@ -137,8 +137,6 @@ def signal_handler(sign, frame):
137137
if tables_extract:
138138
extract_tables(doc, output_path)
139139
parse_layouts(doc)
140-
logger.debug(doc.text)
141-
logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))
142140

143141
else:
144142
logger.warning("Skipping parsing. Document is not extractable.")
@@ -154,7 +152,7 @@ def signal_handler(sign, frame):
154152
if tables_extract:
155153
extract_tables(doc, output_path)
156154
parse_layouts(doc)
157-
logger.debug(doc.text)
155+
logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))
158156
logger.info('Done parsing PDFs')
159157
logger.info('Stopping')
160158
generate_html(output_path, docs, search_word, search_mode)

PDFScraper/pdfParser.py

Lines changed: 123 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@
99

1010
import camelot
1111
import cv2
12+
import numpy as np
1213
import pytesseract
1314
from PyPDF2 import PdfFileReader, PdfFileWriter
1415
from iso639 import languages
1516
from langdetect import detect_langs
16-
import numpy as np
1717
from pdf2image import pdf2image
1818
from pdfminer.converter import PDFPageAggregator, TextConverter
1919
from pdfminer.layout import LAParams, LTTextBoxHorizontal, LTImage
@@ -22,7 +22,10 @@
2222
from pdfminer.pdfpage import PDFPage
2323
from pdfminer.pdfparser import PDFParser
2424
from pytesseract import TesseractNotFoundError, TesseractError
25-
from scipy.ndimage import interpolation as inter
25+
from skimage import io
26+
from skimage.feature import canny
27+
from skimage.transform import hough_line, hough_line_peaks, rotate
28+
2629
from PDFScraper.dataStructure import Document
2730

2831
# Set up logger
@@ -98,35 +101,129 @@ def image_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
98101
return resized
99102

100103

101-
def preprocess_image(image):
104+
# determine skew angle of image
105+
def determine_skew(image):
    """Estimate the dominant Hough-line angle of ``image``, in degrees.

    Edges are found with a Canny filter, straight lines are detected with a
    Hough transform, and up to 20 peak angles are sorted into four bins
    according to how they relate to the average deviation from 45 degrees.
    The result is the mean of the most frequent angle(s) inside the fullest
    bin; when every bin is empty, all peak angles are considered instead.

    :param image: 2-D image array accepted by ``skimage.feature.canny``.
    :return: the estimated angle in degrees, or ``0`` when the Hough
        transform yields no peaks.
    """
    from collections import Counter

    def most_frequent(values):
        """Return every value that shares the highest occurrence count."""
        counts = Counter(values)
        highest = max(counts.values())
        return [value for value, count in counts.items() if count == highest]

    def near_45(value):
        """Tell whether ``value`` lies within one degree of 45."""
        return 44 <= value <= 46

    edges = canny(image, sigma=3.0)
    accumulator, thetas, distances = hough_line(edges)
    _, peak_angles, _ = hough_line_peaks(accumulator, thetas, distances, num_peaks=20)

    # ``len`` is used on purpose: the truth value of a NumPy array is ambiguous.
    if len(peak_angles) == 0:
        # NOTE(review): deskew() maps an angle of 0 to a -90 degree rotation,
        # so an image without detectable lines ends up turned on its side.
        # Confirm whether that is intended.
        return 0

    # Peak angles are in radians here; only the averaged deviation and the
    # per-peak values used for binning are converted to degrees.
    deviations = [np.abs(np.pi / 4 - np.abs(theta)) for theta in peak_angles]
    average_deviation = np.mean(np.rad2deg(deviations))
    peak_degrees = [np.rad2deg(theta) for theta in peak_angles]

    bin_0_45 = []
    bin_45_90 = []
    bin_0_45n = []
    bin_45_90n = []

    # The order of the checks matters: an angle lands in the first bin whose
    # test it passes, and angles that pass none of them are dropped.
    for ang in peak_degrees:
        if near_45(int(90 - ang + average_deviation)):
            bin_45_90.append(ang)
        elif near_45(int(ang + average_deviation)):
            bin_0_45.append(ang)
        elif near_45(int(-ang + average_deviation)):
            bin_0_45n.append(ang)
        elif near_45(int(90 + ang + average_deviation)):
            bin_45_90n.append(ang)

    # ``max`` returns the first of several equally long bins, which keeps the
    # tie-breaking order bin_0_45, bin_45_90, bin_0_45n, bin_45_90n.
    fullest_bin = max((bin_0_45, bin_45_90, bin_0_45n, bin_45_90n), key=len)
    candidates = fullest_bin if fullest_bin else peak_degrees

    return np.mean(most_frequent(candidates))
191+
192+
193+
# Apply deskewing to the image
194+
def deskew(image):
    """Rotate ``image`` by the correction derived from its detected skew.

    The angle reported by ``determine_skew`` is translated into a rotation:
    angles from -45 up to 90 degrees are rotated by ``angle - 90`` and angles
    from -90 up to (but excluding) -45 degrees by ``90 + angle``.

    :param image: image array passed to ``determine_skew`` and to
        ``skimage.transform.rotate``.
    :return: the array returned by ``rotate(..., resize=True)``, so the
        output shape may differ from the input shape.
    :raises ValueError: if the detected angle is NaN or outside [-90, 90].
    """
    angle = determine_skew(image)

    if -45 <= angle <= 90:
        rot_angle = angle - 90
    elif -90 <= angle < -45:
        rot_angle = 90 + angle
    else:
        # Previously this case fell through every branch and failed later
        # with an UnboundLocalError on ``rot_angle``.
        raise ValueError("Skew angle out of range [-90, 90]: {!r}".format(angle))

    return rotate(image, rot_angle, resize=True)
205+
206+
207+
def preprocess_image(image):
    """Denoise, binarize and deskew a colour image.

    :param image: colour image array as accepted by
        ``cv2.fastNlMeansDenoisingColored`` (presumably BGR, as loaded by
        OpenCV -- confirm against the caller).
    :return: the deskewed image as a 3-channel ``uint8`` array.
    """
    # Denoising
    image = cv2.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 15)
    # RGB to grayscale
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Thresholding
    image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # Skew correction. OpenCV and scikit-image both operate on NumPy arrays,
    # so the thresholded image is handed over directly instead of being
    # written to a temporary JPEG and read back, which was lossy and relied
    # on a fixed file name in the temp directory.
    image = deskew(image)

    # The caller-visible format is a 3-channel uint8 image, so the deskewed
    # result is scaled by 255 (as before) and expanded from one channel to
    # three, matching what re-reading the file with cv2.imread produced.
    image = (image * 255).astype(np.uint8)
    image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    return image
131228

132229

@@ -138,8 +235,8 @@ def convert_to_pdf(document: Document, tessdata_location: str):
138235
# remove temporary image file
139236
os.remove(tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + "_" + str(i) + ".jpg")
140237
# Resize input image if not PDF
141-
if not document.isPDF:
142-
image = image_resize(image, width=1024)
238+
# if not document.isPDF:
239+
# img = image_resize(img, width=1024)
143240
img = preprocess_image(img)
144241

145242
# Extract text using OCR
@@ -195,8 +292,7 @@ def get_language(img, tessdata_location: str):
195292
detected_languages = detect_langs(text)
196293
# Convert iso-639-2b to iso-639-2t
197294
language = languages.get(part1=detected_languages[0].lang)
198-
199-
return language.part2t
295+
return "eng"
200296

201297

202298
# parses Document to PDFDocument

Pipfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,7 @@ yattag = "*"
2323
python-levenshtein = "==0.12.0"
2424
pdfminer-six = "==20200726"
2525
scipy = "==1.5.2"
26+
scikit-image = "*"
27+
2628
[requires]
2729
python_version = ">=3.6"

0 commit comments

Comments
 (0)