9
9
10
10
import camelot
11
11
import cv2
12
+ import numpy as np
12
13
import pytesseract
13
14
from PyPDF2 import PdfFileReader , PdfFileWriter
14
15
from iso639 import languages
15
16
from langdetect import detect_langs
16
- import numpy as np
17
17
from pdf2image import pdf2image
18
18
from pdfminer .converter import PDFPageAggregator , TextConverter
19
19
from pdfminer .layout import LAParams , LTTextBoxHorizontal , LTImage
22
22
from pdfminer .pdfpage import PDFPage
23
23
from pdfminer .pdfparser import PDFParser
24
24
from pytesseract import TesseractNotFoundError , TesseractError
25
- from scipy .ndimage import interpolation as inter
25
+ from skimage import io
26
+ from skimage .feature import canny
27
+ from skimage .transform import hough_line , hough_line_peaks , rotate
28
+
26
29
from PDFScraper .dataStructure import Document
27
30
28
31
# Set up logger
@@ -98,35 +101,129 @@ def image_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
98
101
return resized
99
102
100
103
101
- def preprocess_image (image ):
104
+ # determine skew angle of image
105
def determine_skew(image):
    """Estimate the dominant line angle of ``image`` in degrees.

    The image is passed through Canny edge detection and a Hough line
    transform. The resulting peak angles are sorted into four angular bins;
    the most frequent angle(s) of the best-populated bin are averaged.

    Returns:
        0 when the Hough transform yields no peaks; otherwise the mean of the
        most frequent peak angle(s), in degrees.
    """
    edge_map = canny(image, sigma=3.0)
    accumulator, thetas, dists = hough_line(edge_map)
    _, peak_angles, _ = hough_line_peaks(accumulator, thetas, dists, num_peaks=20)

    if len(peak_angles) == 0:
        return 0

    # Distance of every peak angle (still in radians) from 45 degrees; only
    # the average is used, and it is converted to degrees.
    deviations_rad = [np.abs(np.pi / 4 - np.abs(theta)) for theta in peak_angles]
    mean_deviation = np.mean(np.rad2deg(deviations_rad))
    peaks_deg = [np.rad2deg(theta) for theta in peak_angles]

    def near_45(value):
        # Accept integer sums within one degree of 45.
        return 44 <= value <= 46

    bin_0_45, bin_45_90, bin_0_45n, bin_45_90n = [], [], [], []

    # Each angle lands in the first bin whose (truncated) sum with the mean
    # deviation is close to 45; angles matching no bin are dropped.
    for deg in peaks_deg:
        if near_45(int(90 - deg + mean_deviation)):
            bin_45_90.append(deg)
        elif near_45(int(deg + mean_deviation)):
            bin_0_45.append(deg)
        elif near_45(int(-deg + mean_deviation)):
            bin_0_45n.append(deg)
        elif near_45(int(90 + deg + mean_deviation)):
            bin_45_90n.append(deg)

    def most_frequent(values):
        # Collect every value sharing the highest occurrence count, in order
        # of first appearance.
        counts = {}
        for value in values:
            counts[value] = counts.get(value, 0) + 1
        top_count = max(counts.values())
        return [value for value, count in counts.items() if count == top_count]

    # max() returns the first of several equally long bins, so ties resolve
    # in the listed order.
    fullest_bin = max([bin_0_45, bin_45_90, bin_0_45n, bin_45_90n], key=len)

    # Fall back to all peak angles when no angle matched any bin.
    candidates = fullest_bin if fullest_bin else peaks_deg
    return np.mean(most_frequent(candidates))
191
+
192
+
193
+ # Apply deskewing to the image
194
def deskew(image):
    """Rotate ``image`` to compensate for the angle found by ``determine_skew``.

    The detected angle (degrees) is mapped to a rotation angle:
    angles in [-45, 90] rotate by ``angle - 90`` and angles in [-90, -45)
    rotate by ``90 + angle``.

    Returns:
        The rotated image as produced by ``rotate(..., resize=True)``.

    Raises:
        ValueError: if the detected angle lies outside [-90, 90] degrees
            (previously this surfaced as an UnboundLocalError).
    """
    angle = determine_skew(image)

    # NOTE(review): determine_skew returns 0 when no Hough peaks are found,
    # which maps to a -90 degree rotation here -- confirm this is intended.
    if -45 <= angle <= 90:
        rot_angle = angle - 90
    elif -90 <= angle < -45:
        rot_angle = 90 + angle
    else:
        raise ValueError(
            "skew angle {!r} is outside the supported range [-90, 90]".format(angle)
        )

    return rotate(image, rot_angle, resize=True)
205
+
206
+
207
def preprocess_image(image):
    """Denoise, binarize and deskew an image before OCR.

    Steps: colored non-local-means denoising, conversion to grayscale, Otsu
    thresholding, then deskewing via ``deskew``. The image is exchanged
    between OpenCV and scikit-image through a temporary JPEG file in
    ``<tempdir>/PDFScraper``.

    Args:
        image: color image as accepted by ``cv2.fastNlMeansDenoisingColored``.

    Returns:
        The deskewed image as loaded back by ``cv2.imread``.
    """
    # Denoising
    image = cv2.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 15)
    # RGB to grayscale
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Thresholding
    image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # Save and reread to convert to scikit-image image type
    temp_dir = os.path.join(tempfile.gettempdir(), "PDFScraper")
    # Make sure the scratch directory exists so the write below cannot fail
    # silently.
    os.makedirs(temp_dir, exist_ok=True)
    temp_image_path = os.path.join(temp_dir, "deskew.jpg")
    try:
        cv2.imwrite(temp_image_path, image)
        image = io.imread(temp_image_path)
        # Perform deskewing
        image = deskew(image)
        # Scale back to 0-255 before saving as uint8; presumably the rotated
        # image holds floats in [0, 1] -- confirm against skimage's rotate.
        image = image * 255
        io.imsave(temp_image_path, image.astype(np.uint8))
        image = cv2.imread(temp_image_path)
    finally:
        # Always clean up the scratch file, even when deskewing fails.
        if os.path.exists(temp_image_path):
            os.remove(temp_image_path)
    return image
131
228
132
229
@@ -138,8 +235,8 @@ def convert_to_pdf(document: Document, tessdata_location: str):
138
235
# remove temporary image file
139
236
os .remove (tempfile .gettempdir () + "/PDFScraper" + "/" + document .filename + "_" + str (i ) + ".jpg" )
140
237
# Resize input image if not PDF
141
- if not document .isPDF :
142
- image = image_resize (image , width = 1024 )
238
+ # if not document.isPDF:
239
+ # img = image_resize(img , width=1024)
143
240
img = preprocess_image (img )
144
241
145
242
# Extract text using OCR
@@ -195,8 +292,7 @@ def get_language(img, tessdata_location: str):
195
292
detected_languages = detect_langs (text )
196
293
# Convert iso-639-2b to iso-639-2t
197
294
language = languages .get (part1 = detected_languages [0 ].lang )
198
-
199
- return language .part2t
295
+ return "eng"
200
296
201
297
202
298
# parses Document to PDFDocument
0 commit comments