Skip to content

Commit 6f836f6

Browse files
Merge pull request #16 from shaikhsajid1111/fixes
Fixes for scrolling issue and Timeout
2 parents fa23d56 + 66694bd commit 6f836f6

File tree

7 files changed

+98
-53
lines changed

7 files changed

+98
-53
lines changed

README.MD

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,12 @@ page_name = "facebookai"
4848
posts_count = 10
4949
browser = "firefox"
5050
proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
51-
meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy)
51+
timeout = 600 #600 seconds
52+
meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy,timeout=timeout)
5253

5354
```
5455

55-
<h3> Parameters for <code>Facebook_scraper(page_name,posts_count,browser,proxy) </code> class </h3>
56+
<h3> Parameters for <code>Facebook_scraper(page_name,posts_count,browser,proxy,timeout) </code> class </h3>
5657
<table>
5758
<th>
5859
<tr>
@@ -109,6 +110,18 @@ string
109110
optional argument, if user wants to set proxy, if proxy requires authentication then the format will be <code> user:password@IP:PORT </code>
110111
</td>
111112
</tr>
113+
<tr>
114+
<td>
115+
timeout
116+
</td>
117+
<td>
118+
integer
119+
</td>
120+
<td>
121+
The maximum amount of time the bot should run for. If not passed, the default timeout is set to 10 minutes
122+
</code>
123+
</td>
124+
</tr>
112125

113126
</table>
114127
<br>

changelog.MD

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,14 @@
11
<h1> Changelog </h1>
22
<section>
3+
<h2> 2.0.0 </h2>
4+
<h3>Added</h3>
5+
<li>Timeout argument to set the maximum amount of time the bot should run in case no posts were found.</li>
6+
<h3>Changed</h3>
7+
<li>Updated selenium from version <code>3.141.0</code> to <code>4.1.0</code> </li>
8+
<h3>Fixed</h3>
9+
<li>Fixed issue of the browser continuing to scroll upward despite the scroll-down method being called, which happened due to multiple different function calls </li>
10+
<br>
11+
<section>
312
<h2> 0.1.10 </h2>
413
<h3>Added</h3>
514
<li>Support for new Facebook Layout</li>

facebook_page_scraper/driver_utilities.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
#!/usr/bin/env python3
2+
from fileinput import close
3+
4+
25
try:
36
from selenium.webdriver.support.ui import WebDriverWait
47
from selenium.webdriver.support import expected_conditions as EC
@@ -28,7 +31,7 @@ def __close_error_popup(driver):
2831
then click on close button to skip that popup.'''
2932
try:
3033
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,'a.layerCancel'))) #wait for popup to show
31-
button = driver.find_element_by_css_selector("a.layerCancel") #grab that popup's close button
34+
button = driver.find_element(By.CSS_SELECTOR,"a.layerCancel") #grab that popup's close button
3235
button.click() #click "close" button
3336
except WebDriverException:
3437
#it is possible that even after waiting for given amount of time,modal may not appear
@@ -49,6 +52,19 @@ def __scroll_down_half(driver):
4952
Utilities.__close_driver(driver)
5053
print("error at scroll_down_half method : {}".format(ex))
5154

55+
@staticmethod
56+
def __close_modern_layout_signup_modal(driver):
57+
try:
58+
driver.execute_script(
59+
"window.scrollTo(0, document.body.scrollHeight);")
60+
close_button = driver.find_element(By.CSS_SELECTOR,'[aria-label="Close"]')
61+
close_button.click()
62+
except NoSuchElementException:
63+
pass
64+
except Exception as ex:
65+
print("error at close_modern_layout_signup_modal: {}".format(ex))
66+
67+
5268
@staticmethod
5369
def __scroll_down(driver,layout):
5470
"""expects driver's instance as an argument, and it scrolls down the page to the very bottom"""
@@ -57,9 +73,13 @@ def __scroll_down(driver,layout):
5773
driver.execute_script(
5874
"window.scrollTo(0, document.body.scrollHeight);")
5975
elif layout == "new":
60-
body = driver.find_element_by_css_selector("body")
61-
for _ in range(randint(2, 3)):
76+
body = driver.find_element(By.CSS_SELECTOR,"body")
77+
for _ in range(randint(5,6)):
78+
body.send_keys(Keys.PAGE_UP)
79+
for _ in range(randint(5, 8)):
6280
body.send_keys(Keys.PAGE_DOWN)
81+
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
82+
#Utilities.__close_modern_layout_signup_modal(driver)
6383
except Exception as ex:
6484
#if any error occurred then close the driver and exit
6585
Utilities.__close_driver(driver)
@@ -69,11 +89,11 @@ def __scroll_down(driver,layout):
6989
def __close_popup(driver):
7090
"""expects driver's instance and closes the modal that asks for login, by clicking the "Not Now" button """
7191
try:
72-
Utilities.__scroll_down_half(driver) #try to scroll
92+
#Utilities.__scroll_down_half(driver) #try to scroll
7393
#wait for popup to show
7494
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,'expanding_cta_close_button')))
7595
#grab "Not Now" button
76-
popup_close_button = driver.find_element_by_id('expanding_cta_close_button')
96+
popup_close_button = driver.find_element(By.ID,'expanding_cta_close_button')
7797
popup_close_button.click() #click the button
7898
except WebDriverException:
7999
#modal may not popup, so no need to raise exception in case it is not found
@@ -91,7 +111,7 @@ def __wait_for_element_to_appear(driver,layout):
91111
try:
92112
if layout == "old":
93113
#wait for page to load so posts are visible
94-
body = driver.find_element_by_css_selector("body")
114+
body = driver.find_element(By.CSS_SELECTOR,"body")
95115
for _ in range(randint(3, 5)):
96116
body.send_keys(Keys.PAGE_DOWN)
97117
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR,'.userContentWrapper')))
@@ -115,7 +135,7 @@ def __click_see_more(driver,content):
115135
"""expects driver's instance and selenium element, click on "see more" link to open hidden content"""
116136
try:
117137
#find element and click 'see more' button
118-
element = content.find_element_by_css_selector('span.see_more_link_inner')
138+
element = content.find_element(By.CSS_SELECTOR,'span.see_more_link_inner')
119139
driver.execute_script("arguments[0].click();", element) #click button using js
120140

121141
except NoSuchElementException:

facebook_page_scraper/element_finder.py

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from dateutil.parser import parse
1010
import dateutil
1111
import datetime
12+
from selenium.webdriver.common.by import By
1213
except Exception as ex:
1314
print(ex)
1415

@@ -59,14 +60,13 @@ def __find_status(post,layout):
5960
if layout == "old":
6061
#aim is to find element that looks like <a href="URL" class="_5pcq"></a>
6162
#after finding that element, get it's href value and pass it to different method that extracts post_id from that href
62-
status_link = post.find_element_by_class_name("_5pcq").get_attribute("href")
63+
status_link = post.find_element(By.CLASS_NAME,"_5pcq").get_attribute("href")
6364
#extract out post id from post's url
6465
status = Scraping_utilities._Scraping_utilities__extract_id_from_link(status_link)
6566
elif layout == "new":
66-
links = post.find_elements_by_css_selector("a[role='link']")
67-
link = Finder.__get_status_link(links)
67+
#links = post.find_elements(By.CSS_SELECTOR,"a[role='link']")
68+
link = post.find_element(By.CSS_SELECTOR,'.gpro0wi8.b1v8xokw')
6869
status_link = link.get_attribute('href')
69-
print("Status Link: ",status_link)
7070
status = Scraping_utilities._Scraping_utilities__extract_id_from_link(
7171
status_link)
7272
except NoSuchElementException:
@@ -85,10 +85,10 @@ def __find_share(post,layout):
8585
try:
8686
if layout == "old":
8787
#aim is to find element that have datatest-id attribute as UFI2SharesCount/root
88-
shares = post.find_element_by_css_selector("[data-testid='UFI2SharesCount/root']").get_attribute('textContent')
88+
shares = post.find_element(By.CSS_SELECTOR,"[data-testid='UFI2SharesCount/root']").get_attribute('textContent')
8989
shares = Scraping_utilities._Scraping_utilities__extract_numbers(shares)
9090
elif layout == "new":
91-
elements = post.find_elements_by_css_selector("div.gtad4xkn")
91+
elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn")
9292
shares = "0"
9393
for element in elements:
9494
text = element.text
@@ -112,8 +112,7 @@ def __find_reactions(post):
112112
"""finds all reaction of the facebook post using selenium's webdriver's method"""
113113
try:
114114
#find element that have attribute aria-label as 'See who reacted to this
115-
reactions_all = post.find_element_by_css_selector(
116-
'[aria-label="See who reacted to this"]')
115+
reactions_all = post.find_element(By.CSS_SELECTOR,'[aria-label="See who reacted to this"]')
117116
except NoSuchElementException:
118117
reactions_all = ""
119118
except Exception as ex:
@@ -126,11 +125,11 @@ def __find_comments(post,layout):
126125
try:
127126
comments = ""
128127
if layout == "old":
129-
comments = post.find_element_by_css_selector("a._3hg-").get_attribute('textContent')
128+
comments = post.find_element(By.CSS_SELECTOR,"a._3hg-").get_attribute('textContent')
130129
#extract numbers from text
131130
comments = Scraping_utilities._Scraping_utilities__extract_numbers(comments)
132131
elif layout == "new":
133-
elements = post.find_elements_by_css_selector("div.gtad4xkn")
132+
elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn")
134133
comments = "0"
135134
for element in elements:
136135
text = element.text
@@ -164,7 +163,7 @@ def __fetch_post_passage(href):
164163
@staticmethod
165164
def __element_exists(element,css_selector):
166165
try:
167-
found = element.find_element_by_css_selector(css_selector)
166+
found = element.find_element(By.CSS_SELECTOR,css_selector)
168167
return True
169168
except NoSuchElementException:
170169
return False
@@ -174,12 +173,12 @@ def __find_content(post,driver,layout):
174173
"""finds content of the facebook post using selenium's webdriver's method and returns string containing text of the posts"""
175174
try:
176175
if layout == "old":
177-
post_content = post.find_element_by_class_name('userContent')
176+
post_content = post.find_element(By.CLASS_NAME,'userContent')
178177
elif layout == "new":
179-
post_content = post.find_element_by_css_selector('[data-ad-preview="message"]')
178+
post_content = post.find_element(By.CSS_SELECTOR,'[data-ad-preview="message"]')
180179
#if 'See more' or 'Continue reading' is present in post
181180
if Finder._Finder__element_exists(post_content,"span.text_exposed_link > a"):
182-
element = post_content.find_element_by_css_selector("span.text_exposed_link > a") #grab that element
181+
element = post_content.find_element(By.CSS_SELECTOR,"span.text_exposed_link > a") #grab that element
183182
#if element have already the onclick function, that means it is expandable paragraph
184183
if element.get_attribute("onclick"):
185184
Utilities._Utilities__click_see_more(driver,post_content) #click 'see more' button to get hidden text as well
@@ -209,7 +208,7 @@ def __find_posted_time(post,layout,link_element):
209208
#extract element that looks like <abbr class='_5ptz' data-utime="some unix timestamp"> </abbr>
210209
#posted_time = post.find_element_by_css_selector("abbr._5ptz").get_attribute("data-utime")
211210
if layout == "old":
212-
posted_time = post.find_element_by_tag_name("abbr").get_attribute('data-utime')
211+
posted_time = post.find_element(By.TAG_NAME,"abbr").get_attribute('data-utime')
213212
return datetime.datetime.fromtimestamp(float(posted_time)).isoformat()
214213
elif layout == "new":
215214
aria_label_value = link_element.get_attribute("aria-label")
@@ -233,7 +232,7 @@ def __find_video_url(post,page_name,status):
233232
"""finds video of the facebook post using selenium's webdriver's method"""
234233
try:
235234
#if video is found in the post, then create a video URL by concatenating post's id with page_name
236-
video_element = post.find_element_by_tag_name("video")
235+
video_element = post.find_element(By.TAG_NAME,"video")
237236
video = "https://www.facebook.com/{}/videos/{}".format(page_name,status)
238237

239238
except NoSuchElementException:
@@ -250,7 +249,7 @@ def __find_image_url(post):
250249
"""finds all image of the facebook post using selenium's webdriver's method"""
251250
try:
252251
#find all img tag that looks like <img class="scaledImageFitWidth img" src="">
253-
images = post.find_elements_by_css_selector("img.scaledImageFitWidth.img")
252+
images = post.find_elements(By.CSS_SELECTOR,"img.scaledImageFitWidth.img")
254253
#extract src attribute from all the img tag,store it in list
255254
sources = [image.get_attribute("src") for image in images] if len(images) > 0 else []
256255
except NoSuchElementException:
@@ -268,10 +267,9 @@ def __find_all_posts(driver,layout):
268267
try:
269268
#find all posts that looks like <div class="userContentWrapper"> </div>
270269
if layout == "old":
271-
all_posts = driver.find_elements_by_css_selector("div.userContentWrapper")
270+
all_posts = driver.find_elements(By.CSS_SELECTOR,"div.userContentWrapper")
272271
elif layout == "new":
273-
all_posts = driver.find_elements_by_css_selector(
274-
'[aria-posinset]')
272+
all_posts = driver.find_elements(By.CSS_SELECTOR,'[aria-posinset]')
275273
return all_posts
276274
except NoSuchElementException:
277275
print("Cannot find any posts! Exiting!")
@@ -288,17 +286,17 @@ def __find_name(driver,layout):
288286
"""finds name of the facebook page using selenium's webdriver's method"""
289287
try:
290288
if layout == "old":
291-
name = driver.find_element_by_css_selector('a._64-f').get_attribute('textContent')
289+
name = driver.find_element(By.CSS_SELECTOR,'a._64-f').get_attribute('textContent')
292290
elif layout == "new":
293-
name = driver.find_element_by_tag_name("strong").get_attribute("textContent")
291+
name = driver.find_element(By.TAG_NAME,"strong").get_attribute("textContent")
294292
return name
295293
except Exception as ex:
296294
print("error at __find_name method : {}".format(ex))
297295

298296
@staticmethod
299297
def __detect_ui(driver):
300298
try:
301-
driver.find_element_by_id("pagelet_bluebar")
299+
driver.find_element(By.ID,"pagelet_bluebar")
302300
return "old"
303301
except NoSuchElementException:
304302
return "new"
@@ -311,10 +309,10 @@ def __detect_ui(driver):
311309
def __find_reaction(layout, reactions_all):
312310
try:
313311
if layout == "old":
314-
return reactions_all.find_elements_by_tag_name(
312+
return reactions_all.find_elements(By.TAG_NAME,
315313
"a")
316314
elif layout == "new":
317-
return reactions_all.find_elements_by_tag_name(
315+
return reactions_all.find_elements(By.TAG_NAME,
318316
"div")
319317

320318
except Exception as ex:

facebook_page_scraper/scraper.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import json
99
import csv
1010
import os
11+
import time
1112

1213
except Exception as ex:
1314
print(ex)
@@ -38,9 +39,8 @@ class Facebook_scraper:
3839
#on each iteration __close_after_retry is called to check if retry has turned to 0
3940
# if it returns true,it will break the loop. After coming out of loop,driver will be closed and it will return post whatever was found
4041

41-
retry = 10
4242

43-
def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None):
43+
def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None,timeout=600):
4444
self.page_name = page_name
4545
self.posts_count = int(posts_count)
4646
#self.URL = "https://en-gb.facebook.com/pg/{}/posts".format(self.page_name)
@@ -49,21 +49,30 @@ def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None):
4949
self.__driver = ''
5050
self.proxy = proxy
5151
self.__layout = ''
52+
self.timeout = timeout
5253

5354
def __start_driver(self):
5455
"""changes the class member __driver value to driver on call"""
5556
self.__driver = Initializer(self.browser,self.proxy).init()
56-
def __handle_popup_old_layout(self,layout):
57+
def __handle_popup(self,layout):
5758
#while scrolling, wait for login popup to show, it can be skipped by clicking "Not Now" button
5859
try:
59-
Utilities._Utilities__close_popup(self.__driver)
60-
except:
61-
pass
60+
if layout == "old":
61+
#if during scrolling any of error or signup popup shows
62+
Utilities._Utilities__close_error_popup(self.__driver)
63+
Utilities._Utilities__close_popup(self.__driver)
64+
elif layout == "new":
65+
Utilities._Utilities__close_modern_layout_signup_modal(self.__driver)
66+
except Exception as ex:
67+
print(ex)
68+
69+
def __check_timeout(self,start_time,current_time):
70+
return (current_time-start_time) > self.timeout
6271

6372
def scrap_to_json(self):
6473
#call the __start_driver and override class member __driver to webdriver's instance
6574
self.__start_driver()
66-
75+
starting_time = time.time()
6776
#navigate to URL
6877
self.__driver.get(self.URL)
6978

@@ -75,21 +84,18 @@ def scrap_to_json(self):
7584
Utilities._Utilities__wait_for_element_to_appear(self.__driver,self.__layout)
7685
#scroll down to bottom most
7786
Utilities._Utilities__scroll_down(self.__driver,self.__layout)
78-
self.__handle_popup_old_layout(self.__layout)
87+
self.__handle_popup(self.__layout)
7988

8089

8190
name = Finder._Finder__find_name(self.__driver,self.__layout) #find name element
8291

8392
while len(self.__data_dict) <= self.posts_count:
84-
85-
#if during scrolling any of error or signup popup shows
86-
Utilities._Utilities__close_error_popup(self.__driver)
87-
self.__handle_popup_old_layout(self.__layout)
93+
self.__handle_popup(self.__layout)
8894
self.__find_elements(name)
89-
90-
if self.__close_after_retry() is True:
91-
#keep a check if posts are available, if retry is 0, than it breaks loop
92-
break
95+
current_time = time.time()
96+
if self.__check_timeout(starting_time,current_time) is True:
97+
print("Timeout...")
98+
break
9399
Utilities._Utilities__scroll_down(self.__driver, self.__layout) #scroll down
94100
#print(len(self.__data_dict))
95101
#close the browser window after job is done.
@@ -163,7 +169,6 @@ def __find_elements(self,name):
163169
all_posts = Finder._Finder__find_all_posts(self.__driver,self.__layout) #find all posts
164170
all_posts = self.__remove_duplicates(all_posts) #remove duplicates from the list
165171

166-
self.__no_post_found(all_posts) #after removing duplicates if length is 0, retry will decrease by 1
167172
#iterate over all the posts and find details from the same
168173
for post in all_posts:
169174
try:

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
selenium==3.141.0
1+
selenium==4.1.0
22
webdriver-manager==3.2.2
33
selenium-wire==4.3.1
44
python-dateutil==2.8.2

0 commit comments

Comments
 (0)