Skip to content

Commit 65431db

Browse files
Updated to version 0.1.9 after adding feature for using proxy
1 parent e087c9f commit 65431db

File tree

6 files changed

+125
-75
lines changed

6 files changed

+125
-75
lines changed

README.MD

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
<h3> Installing from source: </h3>
1919

2020
```
21-
git clone https://github.com/shaikhsajid1111/facebook_page_scraper
21+
git clone https://github.com/shaikhsajid1111/facebook_page_scraper
2222
```
2323

2424
<h4> Inside project's directory </h4>
@@ -47,12 +47,12 @@ from facebook_page_scraper import Facebook_scraper
4747
page_name = "facebookai"
4848
posts_count = 10
4949
browser = "firefox"
50-
51-
facebook_ai = Facebook_scraper(page_name,posts_count,browser)
50+
proxy = "IP:PORT"  # if the proxy requires authentication, use user:password@IP:PORT
51+
facebook_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy)
5252

5353
```
5454

55-
<h3> Parameters for <code>Facebook_scraper(page_name,posts_count,browser) </code> class </h3>
55+
<h3> Parameters for <code>Facebook_scraper(page_name,posts_count,browser,proxy) </code> class </h3>
5656
<table>
5757
<th>
5858
<tr>
@@ -98,6 +98,18 @@ which browser to use, either chrome or firefox. if not passed,default is chrome
9898
</td>
9999
</tr>
100100

101+
<tr>
102+
<td>
103+
proxy(optional)
104+
</td>
105+
<td>
106+
string
107+
</td>
108+
<td>
109+
Optional. Proxy to use while scraping, given as <code> IP:PORT </code>. If the proxy requires authentication, use the format <code> user:password@IP:PORT </code>
110+
</td>
111+
</tr>
112+
101113
</table>
102114
<br>
103115
<hr>
@@ -152,7 +164,7 @@ Output Structure for JSON format:
152164
``` javascript
153165
{
154166
"id": {
155-
"name": string,
167+
"name": string,
156168
"shares": integer,
157169
"reactions": {
158170
"likes": integer,
@@ -321,7 +333,7 @@ reactions
321333
dictionary
322334
</td>
323335
<td>
324-
dictionary containing reactions as keys and its count as value. Keys => <code> ["likes","loves","wow","cares","sad","angry","haha"] </code>
336+
dictionary containing reactions as keys and its count as value. Keys => <code> ["likes","loves","wow","cares","sad","angry","haha"] </code>
325337
</td>
326338
</tr>
327339

changelog.MD

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
11
<h1> Changelog </h1>
2+
<section>
3+
<h2> 0.1.9 </h2>
4+
<h3>Added</h3>
5+
<li>Added support for using a proxy while scraping</li>
6+
7+
<br>
8+
29

310
<section>
411
<h2> 0.1.8 </h2>

facebook_page_scraper/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@
33
from .driver_utilities import Utilities
44
from .element_finder import Finder
55
from .scraping_utilities import Scraping_utilities
6+
67
__all__ = ["Initializer","Facebook_scraper","Utilities","Finder","Scraping_utilities"]

facebook_page_scraper/driver_initialization.py

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
#!/usr/bin/env python3
22
try:
3-
from selenium import webdriver
4-
#to add capabilities for chrome and firefox, import their Options with different aliases
5-
from selenium.webdriver.chrome.options import Options as ChromeOptions
3+
from seleniumwire import webdriver
4+
# to add capabilities for chrome and firefox, import their Options with different aliases
5+
from selenium.webdriver.chrome.options import Options as ChromeOptions
66
from selenium.webdriver.firefox.options import Options as FirefoxOptions
7-
#import webdriver for downloading respective driver for the browser
7+
# import webdriver for downloading respective driver for the browser
88
from webdriver_manager.chrome import ChromeDriverManager
99
from webdriver_manager.firefox import GeckoDriverManager
1010
except Exception as ex:
@@ -13,36 +13,62 @@
1313

1414
class Initializer:
1515

16-
def __init__(self,browser_name):
16+
def __init__(self, browser_name, proxy=None):
1717
self.browser_name = browser_name
18+
self.proxy = proxy
1819

19-
def set_properties(self,browser_option):
20+
def set_properties(self, browser_option):
2021
"""adds capabilities to the driver"""
21-
browser_option.add_argument('--headless') #runs browser in headless mode
22-
browser_option.add_argument('--no-sandbox')
23-
22+
browser_option.add_argument(
23+
'--headless') # runs browser in headless mode
24+
browser_option.add_argument('--no-sandbox')
2425
browser_option.add_argument("--disable-dev-shm-usage")
2526
browser_option.add_argument('--ignore-certificate-errors')
2627
browser_option.add_argument('--disable-gpu')
2728
browser_option.add_argument('--log-level=3')
2829
browser_option.add_argument('--disable-notifications')
2930
browser_option.add_argument('--disable-popup-blocking')
31+
32+
# browser_option.add_argument(
33+
# "--proxy-server=http://{}".format(self.proxy.replace(" ", "")))
34+
3035
return browser_option
3136

32-
def set_driver_for_browser(self,browser_name):
37+
def set_driver_for_browser(self, browser_name):
3338
"""expects browser name and returns a driver instance"""
34-
#if browser is suppose to be chrome
39+
# if the browser is supposed to be chrome
3540
if browser_name.lower() == "chrome":
3641
browser_option = ChromeOptions()
37-
#automatically installs chromedriver and initialize it and returns the instance
38-
return webdriver.Chrome(executable_path=ChromeDriverManager().install(),options=self.set_properties(browser_option))
42+
# automatically installs chromedriver, initializes it, and returns the instance
43+
if self.proxy is not None:
44+
options = {
45+
'https': 'https://{}'.format(self.proxy.replace(" ", "")),
46+
'http': 'http://{}'.format(self.proxy.replace(" ", "")),
47+
'no_proxy': 'localhost, 127.0.0.1'
48+
}
49+
print("Using: {}".format(self.proxy))
50+
return webdriver.Chrome(executable_path=ChromeDriverManager().install(),
51+
options=self.set_properties(browser_option), seleniumwire_options=options)
52+
53+
return webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=self.set_properties(browser_option))
3954
elif browser_name.lower() == "firefox":
4055
browser_option = FirefoxOptions()
41-
#automatically installs geckodriver and initialize it and returns the instance
42-
return webdriver.Firefox(executable_path=GeckoDriverManager().install(),options=self.set_properties(browser_option))
56+
if self.proxy is not None:
57+
options = {
58+
'https': 'https://{}'.format(self.proxy.replace(" ", "")),
59+
'http': 'http://{}'.format(self.proxy.replace(" ", "")),
60+
'no_proxy': 'localhost, 127.0.0.1'
61+
}
62+
print("Using: {}".format(self.proxy))
63+
return webdriver.Firefox(executable_path=GeckoDriverManager().install(),
64+
options=self.set_properties(browser_option), seleniumwire_options=options)
65+
66+
# automatically installs geckodriver, initializes it, and returns the instance
67+
return webdriver.Firefox(executable_path=GeckoDriverManager().install(), options=self.set_properties(browser_option))
4368
else:
44-
#if browser_name is not chrome neither firefox than raise an exception
69+
# if browser_name is neither chrome nor firefox, then raise an exception
4570
raise Exception("Browser not supported!")
71+
4672
def init(self):
4773
"""returns driver instance"""
4874
driver = self.set_driver_for_browser(self.browser_name)

0 commit comments

Comments
 (0)