1
- import re
2
1
import abc
3
2
4
- from abc import ABC
5
3
from lxml .html import document_fromstring
6
4
7
- from python_advanced_search .models .serp import (
8
- LinkType ,
9
- Serp ,
10
- Video ,
11
- Link ,
12
- MapLink ,
13
- AdLink ,
14
- OtherSearchEngineLink ,
15
- GalleryBlock ,
16
- AdBlock ,
17
- MapBlock ,
18
- VideoBlock ,
19
- FAQBlock ,
20
- SimilarRequestBlock ,
21
- RightBlock ,
22
- OtherSearchEngineBlock ,
23
- )
5
+ from python_advanced_search .models .serp import Serp
24
6
25
7
26
8
class SerpAnalyzer :
@@ -32,296 +14,3 @@ def __init__(self, html):
32
14
@abc .abstractmethod
33
15
def get_serp (self ):
34
16
""" This method must be implemented"""
35
-
36
-
37
class GoogleSerpAnalyzer(SerpAnalyzer, ABC):
    """Extract structured data from a Google result page.

    Every extraction runs XPath queries against ``self.document`` and the
    outcome is stored on ``self.serp``.  Both attributes are expected to be
    prepared by ``SerpAnalyzer`` (its constructor is not visible from this
    class -- TODO confirm).

    The XPath expressions rely on Google's generated class names, ``jsname``
    and ``jscontroller`` values; a missing node is never an error, the
    matching field is simply left at its default.
    """

    # One run of ASCII digits; used to rebuild integers written with
    # thousands separators.
    __DIGITS = re.compile(r"[0-9]+")
    # An integer part, optionally followed by "." or "," and a fraction.
    __DECIMAL = re.compile(r"([0-9]+)(?:[.,]([0-9]+))?")

    def get_serp(self):
        """Fill ``self.serp`` with everything found on the page and return it."""
        self.serp.title = self.__get_title()
        self.serp.nb_results = self.__get_nb_results()
        self.serp.nb_pages = self.__get_nb_pages()
        self.serp.ms = self.__get_ms()
        self.serp.blocks = self.__get_blocks()
        self.serp.links = self.__get_links()
        return self.serp

    @staticmethod
    def __first(nodes):
        """Return the first item of the list *nodes*, or None when empty."""
        # Test the list, never the element: lxml elements without children
        # evaluate as false.
        return nodes[0] if nodes else None

    @classmethod
    def __parse_decimal(cls, text):
        """Return the first decimal number written in *text*, or None.

        Only the first number is considered, so trailing figures (such as a
        review count following a rating) can neither break the conversion
        nor be mistaken for the fractional part.
        """
        match = cls.__DECIMAL.search(text)
        if match is None:
            return None
        whole, fraction = match.groups()
        if fraction is None:
            return float(whole)
        return float(whole + '.' + fraction)

    def __get_title(self):
        """Return the text of the first ``<title>`` element, or None."""
        title = self.__first(self.document.xpath('//title'))
        if title is None:
            return None
        return title.text_content()

    def __get_nb_results(self):
        """Return the result count read from ``#result-stats`` (0 if absent).

        All digit groups of the element's leading text are concatenated, so
        a count written with separators is read as a single integer.
        """
        stats = self.__first(self.document.xpath('//*[@id="result-stats"]'))
        if stats is None or stats.text is None:
            return 0

        digits = self.__DIGITS.findall(stats.text)
        if not digits:
            return 0
        return int(''.join(digits))

    def __get_nb_pages(self):
        """Return the number of result pages listed in the pager.

        Two cells are subtracted from the pager's ``<td>`` count --
        presumably the previous/next cells, TODO confirm -- and the result
        is never negative.
        """
        cells = int(self.document.xpath('count(//*[@class="AaVjTc"]//td)'))
        return max(cells - 2, 0)

    def __get_ms(self):
        """Return the duration shown in ``#result-stats`` as a float.

        Returns 0 when the element, its text or a number is missing.
        """
        timing = self.__first(
            self.document.xpath('//*[@id="result-stats"]/nobr'))
        if timing is None or timing.text is None:
            return 0

        value = self.__parse_decimal(timing.text)
        return 0 if value is None else value

    def __get_blocks(self):
        """Return every special block of the page, grouped by block type."""
        blocks = []

        blocks += self.__faq_blocks()
        blocks += self.__ad_blocks()
        blocks += self.__right_blocks()
        blocks += self.__video_blocks()
        blocks += self.__gallery_blocks()
        blocks += self.__similar_request_blocks()
        blocks += self.__map_blocks()
        blocks += self.__other_search_engine_blocks()

        return blocks

    def __faq_blocks(self):
        """Return one ``FAQBlock`` per question container found in ``#rso``."""
        blocks = []

        for faq_block in self.document.xpath(
                './/*[@id="rso"]//*[contains(@class, "AuVD")]'):
            block = FAQBlock()

            for question in faq_block.xpath(
                    './/*[@jsname="Cpkphb"]//*[@class="wWOJcd"]//span'):
                block.questions.append(question.text_content())

            blocks.append(block)
        return blocks

    def __ad_blocks(self):
        """Return one ``AdBlock`` per ``#tads`` / ``#tadsb`` container."""
        blocks = []

        for ad_block in self.document.xpath(
                './/*[@id="tads" or @id="tadsb"]'):
            block = AdBlock()

            for ad in ad_block.xpath('.//*[@class="uEierd"]'):
                link = AdLink()

                anchor = self.__first(ad.xpath('.//*[@class="sVXRqc"]'))
                if anchor is not None:
                    link.url = anchor.get('href')

                # The first <span> of the ad is used as its title.
                label = self.__first(ad.xpath('.//span'))
                if label is not None:
                    link.title = label.text_content()

                block.links.append(link)

            blocks.append(block)
        return blocks

    def __right_blocks(self):
        """Return one ``RightBlock`` per ``liYKde`` panel."""
        blocks = []

        for right_block in self.document.xpath(
                './/*[contains(@class, "liYKde")]'):
            block = RightBlock()

            if right_block.xpath('.//*[@jsname="cQhrTd"]'):
                block.source = 'Google Business'

            # An explicit source label wins over the default set above.
            label = self.__first(
                right_block.xpath('.//*[contains(@class, "ruhjFe")]'))
            if label is not None:
                block.source = label.text_content()

            blocks.append(block)
        return blocks

    def __video_blocks(self):
        """Return one ``VideoBlock`` per ``uVMCKf`` container."""
        blocks = []

        for video_block in self.document.xpath(
                './/*[contains(@class, "uVMCKf")]'):
            block = VideoBlock()

            for entry in video_block.xpath('.//video-voyager'):
                video = Video()

                anchor = self.__first(entry.xpath('.//a'))
                if anchor is not None:
                    video.url = anchor.get('href')

                title = self.__first(entry.xpath('.//span[@class="cHaqb"]'))
                if title is not None:
                    video.title = title.text_content()

                company = self.__first(entry.xpath('.//cite'))
                if company is not None:
                    video.company = company.text_content()

                block.videos.append(video)

            blocks.append(block)
        return blocks

    def __gallery_blocks(self):
        """Return one ``GalleryBlock`` per ``#iur`` element."""
        blocks = []

        for gallery_block in self.document.xpath('//*[@id="iur"]'):
            block = GalleryBlock()

            for suggest in gallery_block.xpath('.//*[@class="dgdd6c"]'):
                block.suggests.append(suggest.text_content())

            blocks.append(block)
        return blocks

    def __similar_request_blocks(self):
        """Return one ``SimilarRequestBlock`` per ``#botstuff`` element."""
        blocks = []

        for similar_request_block in self.document.xpath(
                './/*[@id="botstuff"]'):
            block = SimilarRequestBlock()

            for similar_request in similar_request_block.xpath(
                    './/*[@class="y6Uyqe"]//a'):
                block.requests.append(similar_request.text_content())

            blocks.append(block)
        return blocks

    def __map_blocks(self):
        """Return one ``MapBlock`` per ``jscontroller="OWrb3e"`` element."""
        blocks = []

        for map_block in self.document.xpath(
                './/*[@jscontroller="OWrb3e"]'):
            block = MapBlock()

            for entry in map_block.xpath('.//*[contains(@class, "w7Dbne")]'):
                map_link = MapLink()

                anchor = self.__first(
                    entry.xpath('.//a[contains(@class, "L48Cpd")]'))
                if anchor is not None:
                    map_link.url = anchor.get('href')

                title = self.__first(
                    entry.xpath('.//span[@class="OSrXXb"]'))
                if title is not None:
                    map_link.title = title.text_content()

                block.links.append(map_link)

            blocks.append(block)
        return blocks

    def __other_search_engine_blocks(self):
        """Return one ``OtherSearchEngineBlock`` per ``#i4BWVe`` element."""
        blocks = []

        for engine_block in self.document.xpath('.//*[@id="i4BWVe"]'):
            block = OtherSearchEngineBlock()

            for anchor in engine_block.xpath(
                    './/a[contains(@class, "t2Yvdb")]'):
                engine_link = OtherSearchEngineLink()
                engine_link.url = anchor.get('href')

                title = self.__first(
                    anchor.xpath('.//*[contains(@class, "NNFu9b")]'))
                if title is not None:
                    engine_link.title = title.text_content()

                company = self.__first(
                    anchor.xpath('.//span[@class="izosSe"]'))
                if company is not None:
                    engine_link.company = company.text_content()

                block.links.append(engine_link)

            blocks.append(block)
        return blocks

    def __get_links(self):
        """Return one ``Link`` per organic result found in ``#rso``.

        Besides url, title and description, a link may receive these
        ``extra`` entries when the matching markup exists: ``reviews``
        (float, or ``'-'`` when no number can be read), ``faq`` and ``data``
        (lists of strings) and ``sub_links`` (list of title/url dicts).
        """
        links = []

        results = self.document.xpath(
            '//*[@id="rso"]//*[@jscontroller="SC7lYd" or contains(@class, "dFd2Tb")]')
        for result in results:
            link = Link()

            anchor = self.__first(result.xpath('.//a'))
            if anchor is not None:
                link.url = anchor.get('href')

            heading = self.__first(result.xpath('.//h3'))
            if heading is not None:
                link.title = heading.text

            snippet = self.__first(
                result.xpath('.//*[contains(@class, "VwiC3b")]'))
            if snippet is not None:
                link.desc = snippet.text_content()

            # One image marks a thumbnail result, several a thumbnail list.
            images = result.xpath('.//img')
            if len(images) > 1:
                link.typeof = LinkType.THUMBS_LIST
            elif images:
                link.typeof = LinkType.CLASSIC_THUMB

            # The video marker takes precedence over the image-based types.
            css_class = result.get('class')
            if css_class and 'dFd2Tb' in css_class:
                link.typeof = LinkType.VIDEO

            stars = self.__first(result.xpath('.//g-review-stars'))
            if stars is not None:
                # The score is read from the first <span> next to the stars.
                score = '-'
                label = self.__first(stars.getparent().xpath('./span'))
                if label is not None:
                    value = self.__parse_decimal(label.text_content())
                    if value is not None:
                        score = value
                link.extra['reviews'] = score

            questions = result.xpath('.//*[contains(@class, "Zh9jr")]/span')
            if questions:
                link.extra['faq'] = [
                    node.text_content() for node in questions]

            details = result.xpath('.//*[contains(@class, "wFMWsc")]')
            if details:
                link.extra['data'] = [
                    node.text_content() for node in details]

            sub_anchors = result.xpath('.//*[contains(@class, "HiHjCd")]/a')
            if sub_anchors:
                link.extra['sub_links'] = [
                    {'title': node.text_content(), 'url': node.get('href')}
                    for node in sub_anchors
                ]

            links.append(link)
        return links
0 commit comments