Skip to content

Commit e78062e

Browse files
committed
Bing SERP extractor
1 parent 8fd0c1f commit e78062e

File tree

10 files changed

+3388
-412
lines changed

10 files changed

+3388
-412
lines changed

python_advanced_search/services/crawler/__init__.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,16 @@
1+
from enum import Enum
2+
13
from playwright.sync_api import sync_playwright
24
from playwright_stealth import stealth_sync
35

46
from python_advanced_search.models.location import Location
57

68

9+
class BrowserType(Enum):
    """Browser engines the crawler is able to launch.

    Each member's value is an ``(id, label)`` tuple.  The numeric ids are
    distinct so a member can be identified by its id alone, not only by its
    label.
    """

    CHROMIUM = 1, 'Chromium'
    # Was ``1`` as well, which duplicated the Chromium id.
    FIREFOX = 2, 'Firefox'
12+
13+
714
class Response:
815
def __init__(self):
916
self.status_code = 500
@@ -61,7 +68,10 @@ def __init__(self, query, tld=Location.WORLDWIDE):
6168
self.domain = 'google%s' % tld.value
6269

6370
super().__init__(
64-
crawler=Crawler(domain='.%s' % self.domain),
71+
crawler=Crawler(
72+
domain='.%s' % self.domain,
73+
browser_type=BrowserType.CHROMIUM
74+
),
6575
url='https://%s/search?%s' % (
6676
self.domain,
6777
query.encoded_str
@@ -74,7 +84,10 @@ def __init__(self, query, tld=Location.WORLDWIDE):
7484
self.domain = 'bing%s' % tld.value
7585

7686
super().__init__(
77-
crawler=Crawler(domain='.%s' % self.domain),
87+
crawler=Crawler(
88+
domain='.%s' % self.domain,
89+
browser_type=BrowserType.FIREFOX
90+
),
7891
url='https://%s/search?%s' % (
7992
self.domain,
8093
query.encoded_str
@@ -91,13 +104,19 @@ def __init__(self, url):
91104

92105

93106
class Crawler:
94-
def __init__(self, domain='.google.com'):
107+
def __init__(self, domain='.google.com', browser_type=BrowserType.CHROMIUM):
95108
self.domain = domain
96109
self.playwright = sync_playwright().start()
97-
self.browser = self.playwright.chromium.launch(
98-
headless=True,
99-
args=['--single-process', '--no-zygote', '--no-sandbox']
100-
)
110+
111+
if browser_type == BrowserType.FIREFOX:
112+
self.browser = self.playwright.firefox.launch(
113+
headless=True,
114+
)
115+
else:
116+
self.browser = self.playwright.chromium.launch(
117+
headless=True,
118+
args=['--single-process', '--no-zygote', '--no-sandbox']
119+
)
101120

102121
self.context = self.browser.new_context()
103122
self.context.add_cookies([
Lines changed: 1 addition & 312 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,8 @@
1-
import re
21
import abc
32

4-
from abc import ABC
53
from lxml.html import document_fromstring
64

7-
from python_advanced_search.models.serp import (
8-
LinkType,
9-
Serp,
10-
Video,
11-
Link,
12-
MapLink,
13-
AdLink,
14-
OtherSearchEngineLink,
15-
GalleryBlock,
16-
AdBlock,
17-
MapBlock,
18-
VideoBlock,
19-
FAQBlock,
20-
SimilarRequestBlock,
21-
RightBlock,
22-
OtherSearchEngineBlock,
23-
)
5+
from python_advanced_search.models.serp import Serp
246

257

268
class SerpAnalyzer:
@@ -32,296 +14,3 @@ def __init__(self, html):
3214
@abc.abstractmethod
3315
def get_serp(self):
3416
""" This method must be implemented"""
35-
36-
37-
class GoogleSerpAnalyzer(SerpAnalyzer, ABC):
    """Build a ``Serp`` from the HTML of a Google result page.

    Every extractor queries ``self.document`` through ``xpath`` and the
    results are stored on ``self.serp``.  Both attributes are expected to be
    prepared by ``SerpAnalyzer`` (its constructor is not visible here --
    TODO confirm).

    NOTE(review): the ids, class names and ``jsname``/``jscontroller`` values
    used in the XPath expressions look like generated identifiers from
    Google's markup; presumably they must be revisited whenever that markup
    changes.
    """

    def get_serp(self):
        """Populate ``self.serp`` with everything found on the page and return it."""
        self.serp.title = self.__get_title()
        self.serp.nb_results = self.__get_nb_results()
        self.serp.nb_pages = self.__get_nb_pages()
        self.serp.ms = self.__get_ms()
        self.serp.blocks = self.__get_blocks()
        self.serp.links = self.__get_links()
        return self.serp

    @staticmethod
    def _first(node, path):
        """Return the first node matching *path* under *node*, or ``None``."""
        matches = node.xpath(path)
        return matches[0] if matches else None

    @staticmethod
    def _digit_groups(text):
        """Return every run of ASCII digits in *text* (``[]`` for ``None``)."""
        if text is None:
            return []
        return re.findall(r'[0-9]+', text)

    @staticmethod
    def _as_decimal(groups):
        """Read digit groups as ``<integer part>.<fraction part>``.

        Only the first two groups are used: joining three or more with dots
        would hand ``float`` something like ``'1.2.3'`` and raise
        ``ValueError``.
        """
        return float('.'.join(groups[:2]))

    def __get_title(self):
        """Return the text of the page ``<title>``, or ``None`` when absent."""
        title = self._first(self.document, '//title')
        if title is not None:
            return title.text_content()

    def __get_nb_results(self):
        """Return the result count shown in ``#result-stats`` (0 if not found).

        All digit groups of the element's leading text are concatenated, so
        thousands separators are ignored.
        """
        stats = self._first(self.document, '//*[@id="result-stats"]')
        if stats is not None:
            groups = self._digit_groups(stats.text)
            if groups:
                return int(''.join(groups))
        return 0

    def __get_nb_pages(self):
        """Return the number of result pages listed in the pagination table.

        Two cells are subtracted from the cell count -- presumably the
        "previous"/"next" cells, TODO confirm.  The result is clamped at 0 so
        that a table with fewer than two cells no longer yields a negative
        page count.
        """
        count = self.document.xpath('count(//*[@class="AaVjTc"]//td)')
        return max(int(count) - 2, 0)

    def __get_ms(self):
        """Return the number parsed from ``#result-stats/nobr`` (0 if not found).

        NOTE(review): presumably the search duration displayed by Google,
        e.g. ``0,45`` read as ``0.45`` -- confirm the unit.
        """
        timing = self._first(self.document, '//*[@id="result-stats"]/nobr')
        if timing is not None:
            groups = self._digit_groups(timing.text)
            if groups:
                return self._as_decimal(groups)
        return 0

    def __get_blocks(self):
        """Collect every special block of the page, grouped by block kind."""
        blocks = []

        blocks += self.__faq_blocks()
        blocks += self.__ad_blocks()
        blocks += self.__right_blocks()
        blocks += self.__video_blocks()
        blocks += self.__gallery_blocks()
        blocks += self.__similar_request_blocks()
        blocks += self.__map_blocks()
        blocks += self.__other_search_engine_blocks()

        return blocks

    def __faq_blocks(self):
        """Return one ``FAQBlock`` per FAQ container, filled with its questions."""
        blocks = []

        for container in self.document.xpath('.//*[@id="rso"]//*[contains(@class, "AuVD")]'):
            block = FAQBlock()

            for question in container.xpath('.//*[@jsname="Cpkphb"]//*[@class="wWOJcd"]//span'):
                block.questions.append(question.text_content())

            blocks.append(block)
        return blocks

    def __ad_blocks(self):
        """Return one ``AdBlock`` per ``#tads``/``#tadsb`` container."""
        blocks = []

        for container in self.document.xpath('.//*[@id="tads" or @id="tadsb"]'):
            block = AdBlock()

            for ad in container.xpath('.//*[@class="uEierd"]'):
                link = AdLink()

                anchor = self._first(ad, './/*[@class="sVXRqc"]')
                if anchor is not None:
                    link.url = anchor.get('href')

                label = self._first(ad, './/span')
                if label is not None:
                    link.title = label.text_content()

                block.links.append(link)

            blocks.append(block)
        return blocks

    def __right_blocks(self):
        """Return one ``RightBlock`` per right-hand panel, with its source."""
        blocks = []

        for panel in self.document.xpath('.//*[contains(@class, "liYKde")]'):
            block = RightBlock()

            if self._first(panel, './/*[@jsname="cQhrTd"]') is not None:
                block.source = 'Google Business'

            # Checked last on purpose: an explicit source label overrides
            # the 'Google Business' default set just above.
            source = self._first(panel, './/*[contains(@class, "ruhjFe")]')
            if source is not None:
                block.source = source.text_content()

            blocks.append(block)
        return blocks

    def __video_blocks(self):
        """Return one ``VideoBlock`` per video container, with its videos."""
        blocks = []

        for container in self.document.xpath('.//*[contains(@class, "uVMCKf")]'):
            block = VideoBlock()

            for item in container.xpath('.//video-voyager'):
                video = Video()

                anchor = self._first(item, './/a')
                if anchor is not None:
                    video.url = anchor.get('href')

                title = self._first(item, './/span[@class="cHaqb"]')
                if title is not None:
                    video.title = title.text_content()

                company = self._first(item, './/cite')
                if company is not None:
                    video.company = company.text_content()

                block.videos.append(video)

            blocks.append(block)
        return blocks

    def __gallery_blocks(self):
        """Return one ``GalleryBlock`` per ``#iur`` element, with its suggestions."""
        blocks = []

        for container in self.document.xpath('//*[@id="iur"]'):
            block = GalleryBlock()

            for suggest in container.xpath('.//*[@class="dgdd6c"]'):
                block.suggests.append(suggest.text_content())

            blocks.append(block)
        return blocks

    def __similar_request_blocks(self):
        """Return one ``SimilarRequestBlock`` per ``#botstuff`` element."""
        blocks = []

        for container in self.document.xpath('.//*[@id="botstuff"]'):
            block = SimilarRequestBlock()

            for request in container.xpath('.//*[@class="y6Uyqe"]//a'):
                block.requests.append(request.text_content())

            blocks.append(block)
        return blocks

    def __map_blocks(self):
        """Return one ``MapBlock`` per map container, with its place links."""
        blocks = []

        for container in self.document.xpath('.//*[@jscontroller="OWrb3e"]'):
            block = MapBlock()

            for place in container.xpath('.//*[contains(@class, "w7Dbne")]'):
                map_link = MapLink()

                anchor = self._first(place, './/a[contains(@class, "L48Cpd")]')
                if anchor is not None:
                    map_link.url = anchor.get('href')

                title = self._first(place, './/span[@class="OSrXXb"]')
                if title is not None:
                    map_link.title = title.text_content()

                block.links.append(map_link)

            blocks.append(block)
        return blocks

    def __other_search_engine_blocks(self):
        """Return one ``OtherSearchEngineBlock`` per ``#i4BWVe`` element."""
        blocks = []

        for container in self.document.xpath('.//*[@id="i4BWVe"]'):
            block = OtherSearchEngineBlock()

            for anchor in container.xpath('.//a[contains(@class, "t2Yvdb")]'):
                engine_link = OtherSearchEngineLink()
                engine_link.url = anchor.get('href')

                title = self._first(anchor, './/*[contains(@class, "NNFu9b")]')
                if title is not None:
                    engine_link.title = title.text_content()

                company = self._first(anchor, './/span[@class="izosSe"]')
                if company is not None:
                    engine_link.company = company.text_content()

                block.links.append(engine_link)

            blocks.append(block)
        return blocks

    def __get_links(self):
        """Return a ``Link`` for every result found under ``#rso``.

        Besides url/title/description, each link gets a ``typeof`` derived
        from its thumbnails and CSS class, and optional ``extra`` entries:
        ``reviews``, ``faq``, ``data`` and ``sub_links``.
        """
        links = []

        results = self.document.xpath(
            '//*[@id="rso"]//*[@jscontroller="SC7lYd" or contains(@class, "dFd2Tb")]'
        )
        for result in results:
            link = Link()

            anchor = self._first(result, './/a')
            if anchor is not None:
                link.url = anchor.get('href')

            heading = self._first(result, './/h3')
            if heading is not None:
                link.title = heading.text

            description = self._first(result, './/*[contains(@class, "VwiC3b")]')
            if description is not None:
                link.desc = description.text_content()

            # One image means a single thumbnail, several mean a thumbnail list.
            images = result.xpath('.//img')
            if len(images) > 1:
                link.typeof = LinkType.THUMBS_LIST
            elif images:
                link.typeof = LinkType.CLASSIC_THUMB

            # The video marker class takes precedence over the thumbnail type.
            css_class = result.get('class')
            if css_class and 'dFd2Tb' in css_class:
                link.typeof = LinkType.VIDEO

            stars = self._first(result, './/g-review-stars')
            if stars is not None:
                # '-' is stored when the stars widget has no readable score.
                score = '-'
                spans = stars.getparent().xpath('./span')
                if spans:
                    groups = self._digit_groups(spans[0].text_content())
                    if groups:
                        score = self._as_decimal(groups)
                link.extra['reviews'] = score

            faq_nodes = result.xpath('.//*[contains(@class, "Zh9jr")]/span')
            if faq_nodes:
                link.extra['faq'] = [node.text_content() for node in faq_nodes]

            data_nodes = result.xpath('.//*[contains(@class, "wFMWsc")]')
            if data_nodes:
                link.extra['data'] = [node.text_content() for node in data_nodes]

            sub_anchors = result.xpath('.//*[contains(@class, "HiHjCd")]/a')
            if sub_anchors:
                link.extra['sub_links'] = [
                    {
                        'title': sub_anchor.text_content(),
                        'url': sub_anchor.get('href'),
                    }
                    for sub_anchor in sub_anchors
                ]

            links.append(link)
        return links

0 commit comments

Comments
 (0)