Skip to content

Commit fa23d56

Browse files
Merge pull request #14 from shaikhsajid1111/new_ui_changes
New UI changes
2 parents c19835c + e7d0821 commit fa23d56

File tree

8 files changed

+321
-143
lines changed

8 files changed

+321
-143
lines changed

README.MD

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ page_name = "facebookai"
4848
posts_count = 10
4949
browser = "firefox"
5050
proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
51-
facebook_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy)
51+
meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy)
5252

5353
```
5454

@@ -124,36 +124,36 @@ optional argument, if user wants to set proxy, if proxy requires authentication
124124
```python
125125
#call the scrap_to_json() method
126126

127-
json_data = facebook_ai.scrap_to_json()
127+
json_data = meta_ai.scrap_to_json()
128128
print(json_data)
129129

130130
```
131131
Output:
132132
```javascript
133133

134134
{
135-
"1739843239525955": {
136-
"name": "Facebook AI",
137-
"shares": 43,
138-
"reactions": {
139-
"likes": 129,
140-
"loves": 11,
141-
"wow": 8,
142-
"cares": 0,
143-
"sad": 0,
144-
"angry": 0,
145-
"haha": 0
146-
},
147-
"reaction_count": 148,
148-
"comments": 3,
149-
"content": "We’re transitioning the Visdom project to the team at FOSSASIA. Visdom is a flexible tool for creating, organizing, and sharing visualizations of live, rich data. It aims to facilitate visualization of remote data with an emphasis on supporting scientific experimentation. We’re excited to see where the team, in collaboration with the developer and user community, take the project.",
150-
"posted_on": "2021-01-05T17:22:54",
151-
"video": "https://www.facebook.com/facebookai/videos/1739843239525955",
152-
"image": [
153-
"https://scontent-bom1-2.xx.fbcdn.net/v/t1.0-0/s526x296/135871741_1739843246192621_8564947121610203331_o.png?_nc_cat=108&ccb=2&_nc_sid=da1649&_nc_ohc=Hk7peLe8e-cAX_xLejp&_nc_ht=scontent-bom1-2.xx&_nc_tp=30&oh=856a17109cbc4a6657dbb68564dfc568&oe=60291FC7"
154-
],
155-
"post_url": "https://www.facebook.com/facebookai/posts/1739843239525955"
156-
}, ...
135+
"2024182624425347": {
136+
"name": "Meta AI",
137+
"shares": 0,
138+
"reactions": {
139+
"likes": 154,
140+
"loves": 19,
141+
"wow": 0,
142+
"cares": 0,
143+
"sad": 0,
144+
"angry": 0,
145+
"haha": 0
146+
},
147+
"reaction_count": 173,
148+
"comments": 2,
149+
"content": "We’ve built data2vec, the first general high-performance self-supervised algorithm for speech, vision, and text. We applied it to different modalities and found it matches or outperforms the best self-supervised algorithms. We hope this brings us closer to a world where computers can learn to solve many different tasks without supervision. Learn more and get the code: https://ai.facebook.com/…/the-first-high-performance-self-s…",
150+
"posted_on": "2022-01-20T22:43:35",
151+
"video": "",
152+
"image": [
153+
"https://scontent-bom1-2.xx.fbcdn.net/v/t39.30808-6/s480x480/272147088_2024182621092014_6532581039236849529_n.jpg?_nc_cat=100&ccb=1-5&_nc_sid=8024bb&_nc_ohc=j4_1PAndJTIAX82OLNq&_nc_ht=scontent-bom1-2.xx&oh=00_AT9us__TvC9eYBqRyQEwEtYSit9r2UKYg0gFoRK7Efrhyw&oe=61F17B71"
154+
],
155+
"post_url": "https://www.facebook.com/MetaAI/photos/a.360372474139712/2024182624425347/?type=3&__xts__%5B0%5D=68.ARBoSaQ-pAC_ApucZNHZ6R-BI3YUSjH4sXsfdZRQ2zZFOwgWGhjt6dmg0VOcmGCLhSFyXpecOY9g1A94vrzU_T-GtYFagqDkJjHuhoyPW2vnkn7fvfzx-ql7fsBYxL5DgQVSsiC1cPoycdCvHmi6BV5Sc4fKADdgDhdFvVvr-ttzXG1ng2DbLzU-XfSes7SAnrPs-gxjODPKJ7AdqkqkSQJ4HrsLgxMgcLFdCsE6feWL7rXjptVWegMVMthhJNVqO0JHu986XBfKKqB60aBFvyAzTSEwJD6o72GtnyzQ-BcH7JxmLtb2_A&__tn__=-R"
156+
}, ...
157157

158158
}
159159

@@ -199,14 +199,14 @@ Output Structure for JSON format:
199199

200200
filename = "data_file" #file name without CSV extension,where data will be saved
201201
directory = "E:\data" #directory where CSV file will be saved
202-
facebook_ai.scrap_to_csv(filename,directory)
202+
meta_ai.scrap_to_csv(filename,directory)
203203

204204
```
205205

206206
content of ```data_file.csv```:
207207
```csv
208208
id,name,shares,likes,loves,wow,cares,sad,angry,haha,reactions_count,comments,content,posted_on,video,image,post_url
209-
1791700921006853,Facebook AI,45,150,19,5,0,0,0,0,174,8,"Facebook AI has built TimeSformer, an entirely new architecture for video understanding. It’s the first that’s based exclusively on the self-attention mechanism used in Transformers. TimeSformer outperforms the state of the art while being more efficient than 3D ConvNets for video.",2021-03-15T17:14:30,,https://scontent-bom1-2.xx.fbcdn.net/v/t39.2365-6/p540x282/156274680_471569777206221_706631440205169419_n.jpg?_nc_cat=110&ccb=1-3&_nc_sid=eaa83b&_nc_ohc=eyfETEUuHzQAX8DqwMU&_nc_ht=scontent-bom1-2.xx&tp=6&oh=2e9c6490fe3ad19a398905b3b615c88b&oe=6075FFE4,https://www.facebook.com/FacebookAI/posts/1791700921006853?__xts__%5B0%5D=68.ARCfsjOoZa0yc0TPws1koBr9ezS44Xf6Up04CqOhWnoDqrO35NdIdgjNSTWBrsUtm_y7MamZTjc_-p2rTobXe5WvxWd_eywuSzt98B7Vaj5hobF4OTZhe7VRgVJJY1wxEeAJf4nCZSs1tF1gWJJ0s5pPUGMmJsfD1UM5a3eERo-2t1JnTBHOSYs9Xsj5fV0iL-FiWAms_2-9KNRGqoojg9KfSAlffh_qxL8ztgznqC1sxfcU6MwAqdPN2va_T8cez29ZvJ1Er1j26VR7pnpWGyTMuW5wMrNxC-pz_8pVls8uk0iDramIOA&__tn__=-R
209+
2024182624425347,Meta AI,0,154,19,0,0,0,0,0,173,2,"We’ve built data2vec, the first general high-performance self-supervised algorithm for speech, vision, and text. We applied it to different modalities and found it matches or outperforms the best self-supervised algorithms. We hope this brings us closer to a world where computers can learn to solve many different tasks without supervision. Learn more and get the code: https://ai.facebook.com/…/the-first-high-performance-self-s…",2022-01-20T22:43:35,,https://scontent-bom1-2.xx.fbcdn.net/v/t39.30808-6/s480x480/272147088_2024182621092014_6532581039236849529_n.jpg?_nc_cat=100&ccb=1-5&_nc_sid=8024bb&_nc_ohc=j4_1PAndJTIAX82OLNq&_nc_ht=scontent-bom1-2.xx&oh=00_AT9us__TvC9eYBqRyQEwEtYSit9r2UKYg0gFoRK7Efrhyw&oe=61F17B71,https://www.facebook.com/MetaAI/photos/a.360372474139712/2024182624425347/?type=3&__xts__%5B0%5D=68.ARAse4eiZmZQDOZumNZEDR0tQkE5B6g50K6S66JJPccb-KaWJWg6Yz4v19BQFSZRMd04MeBmV24VqvqMB3oyjAwMDJUtpmgkMiITtSP8HOgy8QEx_vFlq1j-UEImZkzeEgSAJYINndnR5aSQn0GUwL54L3x2BsxEqL1lElL7SnHfTVvIFUDyNfAqUWIsXrkI8X5KjoDchUj7aHRga1HB5EE0x60dZcHogUMb1sJDRmKCcx8xisRgk5XzdZKCQDDdEkUqN-Ch9_NYTMtxlchz1KfR0w9wRt8y9l7E7BNhfLrmm4qyxo-ZpA&__tn__=-R
210210
...
211211
```
212212

@@ -438,6 +438,7 @@ URL for that post
438438
<ul>
439439
<li> <a href="https://www.selenium.dev/" target='_blank'>selenium</a>
440440
<li> <a href="https://pypi.org/project/webdriver-manager/" target='_blank'>webdriver manager</a>
441+
<li> <a href="https://pypi.org/project/python-dateutil/" target='_blank'>python dateutil</a>
441442
</ul>
442443
<br>
443444

changelog.MD

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
<h1> Changelog </h1>
22
<section>
3+
<h2> 0.1.10 </h2>
4+
<h3>Added</h3>
5+
<li>Support for new Facebook Layout</li>
6+
7+
<br>
8+
</section>
9+
<section>
310
<h2> 0.1.9 </h2>
411
<h3>Added</h3>
512
<li>Added feature for using proxy while scraping</li>

facebook_page_scraper/driver_utilities.py

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@
44
from selenium.webdriver.support import expected_conditions as EC
55
from selenium.webdriver.common.by import By
66
from selenium.common.exceptions import NoSuchElementException,WebDriverException
7-
import time
7+
from random import randint
8+
from selenium.webdriver.common.keys import Keys
89
import sys
910
except Exception as ex:
1011
print(ex)
11-
12+
1213
class Utilities:
1314

1415
@staticmethod
@@ -19,10 +20,10 @@ def __close_driver(driver):
1920
driver.quit()
2021
except Exception as ex:
2122
print("error at close_driver method : {}".format(ex))
22-
23+
2324
@staticmethod
2425
def __close_error_popup(driver):
25-
'''expects driver's instance as a argument and checks if error shows up
26+
'''expects driver's instance as an argument and checks if an error shows up
2627
like "We could not process your request. Please try again later" ,
2728
then clicks on the close button to skip that popup.'''
2829
try:
@@ -34,30 +35,36 @@ def __close_error_popup(driver):
3435
pass
3536
except NoSuchElementException:
3637
pass #passing this error silently because it may happen that popup never shows up
37-
38+
3839
except Exception as ex:
3940
#if any other error occurred besides the one above
4041
print("error at close_error_popup method : {}".format(ex))
41-
42+
4243
@staticmethod
4344
def __scroll_down_half(driver):
4445
try:
45-
driver.execute_script("window.scrollTo(0, document.body.scrollHeight / 2);")
46+
driver.execute_script("window.scrollTo(0, document.body.scrollHeight / 2);")
4647
except Exception as ex:
4748
#if any error occurred, close the driver and report the error
4849
Utilities.__close_driver(driver)
4950
print("error at scroll_down_half method : {}".format(ex))
5051

5152
@staticmethod
52-
def __scroll_down(driver):
53+
def __scroll_down(driver,layout):
5354
"""expects driver's instance and the page layout ("old" or "new") as arguments; for the old layout it scrolls to the very bottom of the page, for the new layout it presses PAGE_DOWN two or three times"""
5455
try:
55-
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
56+
if layout == "old":
57+
driver.execute_script(
58+
"window.scrollTo(0, document.body.scrollHeight);")
59+
elif layout == "new":
60+
body = driver.find_element_by_css_selector("body")
61+
for _ in range(randint(2, 3)):
62+
body.send_keys(Keys.PAGE_DOWN)
5663
except Exception as ex:
5764
#if any error occurred, close the driver and report the error
5865
Utilities.__close_driver(driver)
5966
print("error at scroll_down method : {}".format(ex))
60-
67+
6168
@staticmethod
6269
def __close_popup(driver):
6370
"""expects driver's instance and closes modal that ask for login, by clicking "Not Now" button """
@@ -75,34 +82,42 @@ def __close_popup(driver):
7582
pass #passing this exception silently as modal may not show up
7683
except Exception as ex:
7784
print("error at close_popup method : {}".format(ex))
78-
85+
7986
@staticmethod
80-
def __wait_for_element_to_appear(driver):
87+
def __wait_for_element_to_appear(driver,layout):
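8188
"""expects driver's instance and the page layout ("old" or "new"), waits for posts to show.
8289
old layout: post's CSS class name is userContentWrapper; new layout: posts are matched by the [aria-posinset] attribute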
8390
"""
8491
try:
85-
#wait for page to load so posts are visible
86-
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR,'.userContentWrapper')))
92+
if layout == "old":
93+
#wait for page to load so posts are visible
94+
body = driver.find_element_by_css_selector("body")
95+
for _ in range(randint(3, 5)):
96+
body.send_keys(Keys.PAGE_DOWN)
97+
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR,'.userContentWrapper')))
98+
elif layout == "new":
99+
WebDriverWait(driver, 30).until(
100+
EC.presence_of_element_located((By.CSS_SELECTOR, "[aria-posinset]")))
101+
87102
except WebDriverException:
88103
#if it was not found, it means either the page is not loading or it does not exist
89104
print("No posts were found!")
90105
Utilities.__close_driver(driver)
91106
sys.exit(1) #exit the program, because if posts do not exist, we cannot go further
92107
except Exception as ex:
93108
print("error at wait_for_element_to_appear method : {}".format(ex))
94-
Utilities.__close_driver(driver)
95-
96-
109+
Utilities.__close_driver(driver)
110+
111+
97112

98113
@staticmethod
99114
def __click_see_more(driver,content):
100115
"""expects driver's instance and selenium element, click on "see more" link to open hidden content"""
101116
try:
102117
#find element and click 'see more' button
103118
element = content.find_element_by_css_selector('span.see_more_link_inner')
104-
driver.execute_script("arguments[0].click();", element) #click button using js
105-
119+
driver.execute_script("arguments[0].click();", element) #click button using js
120+
106121
except NoSuchElementException:
107122
#if it doesn't exist then there is no need to raise any error
108123
pass

0 commit comments

Comments
 (0)