Skip to content

Commit 6695606

Browse files
committed
Bing SERP extractor
1 parent 8fd0c1f commit 6695606

File tree

13 files changed

+2978
-193
lines changed

13 files changed

+2978
-193
lines changed
Lines changed: 315 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,315 @@
1+
import re
2+
from abc import ABC
3+
4+
from python_advanced_search.models.serp import (
5+
LinkType,
6+
Video,
7+
Link,
8+
MapLink,
9+
AdLink,
10+
OtherSearchEngineLink,
11+
GalleryBlock,
12+
AdBlock,
13+
MapBlock,
14+
VideoBlock,
15+
FAQBlock,
16+
SimilarRequestBlock,
17+
RightBlock,
18+
OtherSearchEngineBlock,
19+
)
20+
from python_advanced_search.models.serp.analyzer import SerpAnalyzer
21+
22+
23+
class BingSerpAnalyzer(SerpAnalyzer, ABC):
    """Build a ``Serp`` from the parsed HTML of a Bing results page.

    The parsed tree (``self.document``) and the result container
    (``self.serp``) are created by ``SerpAnalyzer.__init__``.

    NOTE(review): apart from the ``sb_count``, ``brsv3``,
    ``b_suggestionText``, ``b_results``, ``b_algo`` and ``b_algoSlug``
    selectors, the ids/classes used below (and the 'Google Business'
    label) look like they were carried over from the Google analyzer --
    confirm that they match Bing markup.
    """

    def get_serp(self):
        """Fill ``self.serp`` from ``self.document`` and return it."""
        self.serp.title = self.__get_title()
        self.serp.nb_results = self.__get_nb_results()
        self.serp.nb_pages = self.__get_nb_pages()
        self.serp.ms = self.__get_ms()
        self.serp.blocks = self.__get_blocks()
        self.serp.links = self.__get_links()
        return self.serp

    @staticmethod
    def _leading_decimal(text):
        """Parse the first decimal number out of *text*.

        The first run of digits is taken as the integer part and the
        second run, if any, as the fractional part, whatever separator
        stands between them.  Any further digit runs are ignored so that
        text holding several numbers cannot produce an unparsable string
        such as ``'4.5.1.234'``.

        Returns:
            The parsed ``float``, or ``None`` when *text* has no digits.
        """
        digit_runs = re.findall("[0-9]+", text)
        if not digit_runs:
            return None
        return float('.'.join(digit_runs[:2]))

    def __get_title(self):
        """Return the first text node under ``<title>``, or ``None``."""
        titles = self.document.xpath('//title//text()')
        return titles[0] if titles else None

    def __get_nb_results(self):
        """Return the result count read from ``span.sb_count``, or 0.

        Every digit run of the first text node is concatenated, so
        thousands separators are dropped.
        """
        counts = self.document.xpath('//span[@class="sb_count"]/text()')
        if counts:
            digit_runs = re.findall("[0-9]+", counts[0])
            if digit_runs:
                return int(''.join(digit_runs))
        return 0

    def __get_nb_pages(self):
        """Return the number of pager cells minus two, never below 0."""
        # The full number of pages is not available from the page itself.
        cell_count = self.document.xpath('count(//*[@class="AaVjTc"]//td)')
        # Two cells are discounted (presumably the previous/next controls
        # -- TODO confirm).  Clamp so a lone cell cannot give a negative
        # page count.
        return max(int(cell_count) - 2, 0)

    def __get_ms(self):
        """Return the query time found in ``#result-stats/nobr``, or 0."""
        # The query time may not be available on this engine.
        stats = self.document.xpath('//*[@id="result-stats"]/nobr')
        if stats:
            text = stats[0].text
            if text is not None:
                value = self._leading_decimal(text)
                if value is not None:
                    return value
        return 0

    def __get_blocks(self):
        """Return every special block found, grouped by block kind."""
        return (
            self.__faq_blocks()
            + self.__ad_blocks()
            + self.__right_blocks()
            + self.__video_blocks()
            + self.__gallery_blocks()
            + self.__similar_request_blocks()
            + self.__map_blocks()
            + self.__other_search_engine_blocks()
        )

    def __faq_blocks(self):
        """Return one ``FAQBlock`` per matching container, with its questions."""
        blocks = []
        for container in self.document.xpath('.//*[@id="rso"]//*[contains(@class, "AuVD")]'):
            block = FAQBlock()
            for question in container.xpath('.//*[@jsname="Cpkphb"]//*[@class="wWOJcd"]//span'):
                block.questions.append(question.text_content())
            blocks.append(block)
        return blocks

    def __ad_blocks(self):
        """Return one ``AdBlock`` per ad container, with its ``AdLink`` items."""
        blocks = []
        for container in self.document.xpath('.//*[@id="tads" or @id="tadsb"]'):
            block = AdBlock()
            for ad in container.xpath('.//*[@class="uEierd"]'):
                ad_link = AdLink()

                anchors = ad.xpath('.//*[@class="sVXRqc"]')
                if anchors:
                    ad_link.url = anchors[0].get('href')

                spans = ad.xpath('.//span')
                if spans:
                    ad_link.title = spans[0].text_content()

                block.links.append(ad_link)
            blocks.append(block)
        return blocks

    def __right_blocks(self):
        """Return one ``RightBlock`` per side panel, with its source label."""
        blocks = []
        for panel in self.document.xpath('.//*[contains(@class, "liYKde")]'):
            block = RightBlock()

            if panel.xpath('.//*[@jsname="cQhrTd"]'):
                block.source = 'Google Business'

            # An explicit source label, when present, overrides the default.
            labels = panel.xpath('.//*[contains(@class, "ruhjFe")]')
            if labels:
                block.source = labels[0].text_content()

            blocks.append(block)
        return blocks

    def __video_blocks(self):
        """Return one ``VideoBlock`` per video container, with its videos."""
        blocks = []
        for container in self.document.xpath('.//*[contains(@class, "uVMCKf")]'):
            block = VideoBlock()
            for node in container.xpath('.//video-voyager'):
                video = Video()

                anchors = node.xpath('.//a')
                if anchors:
                    video.url = anchors[0].get('href')

                titles = node.xpath('.//span[@class="cHaqb"]')
                if titles:
                    video.title = titles[0].text_content()

                companies = node.xpath('.//cite')
                if companies:
                    video.company = companies[0].text_content()

                block.videos.append(video)
            blocks.append(block)
        return blocks

    def __gallery_blocks(self):
        """Return one ``GalleryBlock`` per gallery, with its suggestions."""
        blocks = []
        for gallery in self.document.xpath('//*[@id="iur"]'):
            block = GalleryBlock()
            for suggestion in gallery.xpath('.//*[@class="dgdd6c"]'):
                block.suggests.append(suggestion.text_content())
            blocks.append(block)
        return blocks

    def __similar_request_blocks(self):
        """Return one ``SimilarRequestBlock`` per related-searches container."""
        blocks = []
        for container in self.document.xpath('.//*[@id="brsv3"]'):
            block = SimilarRequestBlock()
            for suggestion in container.xpath('.//div[contains(@class, "b_suggestionText")]'):
                block.requests.append(suggestion.text_content())
            blocks.append(block)
        return blocks

    def __map_blocks(self):
        """Return one ``MapBlock`` per map container, with its ``MapLink`` items."""
        blocks = []
        for container in self.document.xpath('.//*[@jscontroller="OWrb3e"]'):
            block = MapBlock()
            for entry in container.xpath('.//*[contains(@class, "w7Dbne")]'):
                map_link = MapLink()

                anchors = entry.xpath('.//a[contains(@class, "L48Cpd")]')
                if anchors:
                    map_link.url = anchors[0].get('href')

                titles = entry.xpath('.//span[contains(@class, "OSrXXb")]//text()')
                if titles:
                    # The query selects text nodes, which are already
                    # strings; they have no text_content() method.
                    map_link.title = titles[0]

                block.links.append(map_link)
            blocks.append(block)
        return blocks

    def __other_search_engine_blocks(self):
        """Return one ``OtherSearchEngineBlock`` per container, with its links."""
        blocks = []
        for container in self.document.xpath('.//*[@id="i4BWVe"]'):
            block = OtherSearchEngineBlock()
            for anchor in container.xpath('.//a[contains(@class, "t2Yvdb")]'):
                engine_link = OtherSearchEngineLink()
                engine_link.url = anchor.get('href')

                titles = anchor.xpath('.//*[contains(@class, "NNFu9b")]')
                if titles:
                    engine_link.title = titles[0].text_content()

                companies = anchor.xpath('.//span[@class="izosSe"]')
                if companies:
                    engine_link.company = companies[0].text_content()

                block.links.append(engine_link)
            blocks.append(block)
        return blocks

    def __get_links(self):
        """Return one ``Link`` per organic result (``li.b_algo`` in ``#b_results``).

        For each result the URL (text of ``<cite>``), title, description
        and link type are set, and ``link.extra`` receives the optional
        'reviews', 'faq', 'data' and 'sub_links' entries when the
        corresponding markup is present.
        """
        links = []

        results = self.document.xpath('//ol[@id="b_results"]/li[contains(@class, "b_algo")]')
        for result in results:
            link = Link()

            cite_texts = result.xpath('.//cite//text()')
            if cite_texts:
                link.url = ''.join(cite_texts).strip()

            title_texts = result.xpath('.//h2/a//text()')
            if title_texts:
                link.title = title_texts[0]

            desc_texts = result.xpath('.//*[contains(@class, "b_algoSlug")]//text()')
            if desc_texts:
                link.desc = ' '.join(desc_texts)

            # One image marks a thumbnail result, several a thumbnail list.
            images = result.xpath('.//img')
            if len(images) > 1:
                link.typeof = LinkType.THUMBS_LIST
            elif images:
                link.typeof = LinkType.CLASSIC_THUMB

            # The video marker class takes precedence over thumbnails.
            css_class = result.get('class')
            if css_class and 'dFd2Tb' in css_class:
                link.typeof = LinkType.VIDEO

            stars = result.xpath('.//g-review-stars')
            if stars:
                spans = stars[0].getparent().xpath('./span')
                score = '-'  # placeholder kept when no score can be parsed
                if spans:
                    parsed = self._leading_decimal(spans[0].text_content())
                    if parsed is not None:
                        score = parsed
                link.extra['reviews'] = score

            faq_nodes = result.xpath('.//*[contains(@class, "Zh9jr")]/span')
            if faq_nodes:
                link.extra['faq'] = [node.text_content() for node in faq_nodes]

            data_nodes = result.xpath('.//*[contains(@class, "wFMWsc")]')
            if data_nodes:
                link.extra['data'] = [node.text_content() for node in data_nodes]

            sub_anchors = result.xpath('.//*[contains(@class, "HiHjCd")]/a')
            if sub_anchors:
                link.extra['sub_links'] = [
                    {
                        'title': anchor.text_content(),
                        'url': anchor.get('href'),
                    }
                    for anchor in sub_anchors
                ]

            links.append(link)
        return links

python_advanced_search/services/serp/__init__.py renamed to python_advanced_search/engines/google/serp/__init__.py

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
11
import re
2-
import abc
3-
42
from abc import ABC
5-
from lxml.html import document_fromstring
63

74
from python_advanced_search.models.serp import (
85
LinkType,
9-
Serp,
106
Video,
117
Link,
128
MapLink,
@@ -21,17 +17,7 @@
2117
RightBlock,
2218
OtherSearchEngineBlock,
2319
)
24-
25-
26-
class SerpAnalyzer:
27-
def __init__(self, html):
28-
self.html = html
29-
self.document = document_fromstring(self.html)
30-
self.serp = Serp()
31-
32-
@abc.abstractmethod
33-
def get_serp(self):
34-
""" This method must be implemented"""
20+
from python_advanced_search.models.serp.analyzer import SerpAnalyzer
3521

3622

3723
class GoogleSerpAnalyzer(SerpAnalyzer, ABC):

python_advanced_search/models/query/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import urllib.parse
22

3+
from python_crawler.crawler import GoogleRequest, BingRequest
4+
35
from python_advanced_search.models.commands.expressions import Expression
46
from python_advanced_search.models.commands import (
57
ExpressionCommand,
@@ -28,7 +30,6 @@
2830
DefineCommand,
2931
)
3032
from python_advanced_search.models.location import Location
31-
from python_advanced_search.services.crawler import GoogleRequest, BingRequest
3233

3334

3435
class Query:
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import abc
2+
3+
from lxml.html import document_fromstring
4+
5+
from python_advanced_search.models.serp import Serp
6+
7+
8+
class SerpAnalyzer(abc.ABC):
    """Base class for search-engine result page (SERP) analyzers.

    Deriving from ``abc.ABC`` is what makes ``@abc.abstractmethod``
    effective: the base class, and any subclass that does not override
    ``get_serp``, cannot be instantiated.
    """

    def __init__(self, html):
        """Parse *html* and prepare an empty ``Serp`` to be filled.

        Args:
            html: Markup of the result page, handed to
                ``lxml.html.document_fromstring``.
        """
        self.html = html
        self.document = document_fromstring(self.html)
        self.serp = Serp()

    @abc.abstractmethod
    def get_serp(self):
        """ This method must be implemented"""

python_advanced_search/services/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)