@@ -4,15 +4,33 @@
 
 class Fetcher:
 
-    def __init__(self, proxy_sites):
+    def __init__(self, proxy_sites: list):
+        # TODO: optimize fetch_proxy and workupload_downloader
+        """ Initialize fetcher class
+
+        Args:
+            proxy_sites (list): sites to scrape
+        """
         self.PROXY_SITES = proxy_sites
         self.PATTERN = r'(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3}):(\d{1,5})'
         self.FILE_NAME = 'proxy.zip'
 
-    def get_proxies_sites(self):
+    def get_proxies_sites(self) -> list:
+        """ PROXY_SITES getter
+
+        Returns:
+            list: proxy sites list
+        """
         return self.PROXY_SITES
 
-    def worker(self, site, result, cookies=None):
+    def worker(self, site: str, result: list, cookies: dict = None):
+        """ Request worker
+
+        Args:
+            site (str): http request url
+            result (list): output list that successful responses are appended to
+            cookies (dict, optional): used cookies. Defaults to None.
+        """
         res = requests.get(
             site,
             cookies=cookies,
@@ -22,74 +40,104 @@ def worker(self, site, result, cookies=None):
         if res.status_code == requests.codes.ok:
             result.append(res)
 
-    def workupload_downloader(self, download_id):
+    def workupload_downloader(self, download_id: str) -> any:
+        """ Workupload.com files downloader
+
+        Args:
+            download_id (str): file download ID
+
+        Yields:
+            requests.Response: the file download response
+        """
         token_id = []
         self.worker(
             f'https://workupload.com/file/{download_id}',
             token_id
         )
         token_id = token_id[0].cookies['token']
-        temp_result = []
+        swap_result = []
         self.worker(
             f'https://workupload.com/start/{download_id}',
-            temp_result,
+            swap_result,
             {'token': token_id}
         )
+
-        if temp_result[0].status_code == requests.codes.ok:
+        if swap_result[0].status_code == requests.codes.ok:
-            temp_result = []
+            swap_result = []
             self.worker(
                 f'https://workupload.com/api/file/getDownloadServer/{download_id}',
-                temp_result,
+                swap_result,
                 {'token': token_id}
             )
-            temp_result = temp_result[0].json()
-            if temp_result['success']:
+            swap_result = swap_result[0].json()
+
+            if swap_result['success']:
                 result = []
                 self.worker(
-                    temp_result['data']['url'],
+                    swap_result['data']['url'],
                     result,
                     {'token': token_id}
                 )
+
                 yield result[0]
 
-    def fetch_proxy(self, site):
+    def fetch_proxy(self, site: str) -> any:
+        """ Fetch proxies
+
+        Args:
+            site (str): site to scrape
+
+        Yields:
+            str: proxy in ip:port form (False is yielded if a request step fails)
+        """
         try:
             res = []
             th = threading.Thread(target=self.worker, args=(site, res,))
             th.start()
             th.join()
+
             for _ in res:
                 _ = _.text
                 html = BeautifulSoup(_, 'html.parser')
                 posts = html.find_all('h3', {'class': 'post-title'})
+
                 if len(posts) > 0:
+
                     for post in posts:
                         post = post.select('.post-title a')[0]['href']
                         fetched = re.findall(self.PATTERN, requests.get(post).text)
+
                         if len(fetched) > 0:
+
                             for proxy in fetched:
                                 yield '.'.join(proxy[:len(proxy)-1]) + f':{proxy[len(proxy)-1]}'
                         else:
                             post = requests.get(post).text
                             download_id = post.split('="https://workupload.com/')[1].split('"')[0].split('/')[1]
+
                             for _ in self.workupload_downloader(download_id):
+
                                 with open(self.FILE_NAME, 'wb') as zip_file:
                                     zip_file.write(_.content)
                                     zip_file.close()
                                 archive = zipfile.ZipFile('proxy.zip')
+
                                 for _ in archive.namelist():
+
                                     if '.txt' in _:
                                         fetched = re.findall(self.PATTERN, archive.read(_).decode())
+
                                         for proxy in fetched:
                                             yield '.'.join(proxy[:len(proxy)-1]) + f':{proxy[len(proxy)-1]}'
+
                                 archive.close()
                                 os.remove(self.FILE_NAME)
                 else:
                     fetched = re.findall(self.PATTERN, _)
+
                     for proxy in fetched:
                         yield '.'.join(proxy[:len(proxy)-1]) + f':{proxy[len(proxy)-1]}'
         except Exception as e:
             yield False
 
-
-
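For reference, a minimal usage sketch of the class this commit touches. The imports and the target URL are assumptions, since the module header and the call site sit outside this diff:

import os
import re
import threading
import zipfile

import requests
from bs4 import BeautifulSoup

# ... class Fetcher as defined in the diff above ...

if __name__ == '__main__':
    # hypothetical scrape target: any blog whose posts carry the
    # 'post-title' class that fetch_proxy looks for
    fetcher = Fetcher(['https://example-proxy-blog.com/'])

    for site in fetcher.get_proxies_sites():
        for proxy in fetcher.fetch_proxy(site):
            if proxy is False:
                break  # fetch_proxy yields False if any request step raises
            print(proxy)  # e.g. '203.0.113.7:8080'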
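The yield expressions rebuild an ip:port string from the 5-tuple that re.findall returns for PATTERN; a standalone check of that reassembly, using made-up addresses:

import re

# same pattern as self.PATTERN above: four octet groups plus a port group
PATTERN = r'(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3}):(\d{1,5})'

text = 'alive 10.0.0.1:3128 and 203.0.113.7:8080'
for proxy in re.findall(PATTERN, text):
    # proxy is e.g. ('10', '0', '0', '1', '3128'): octets first, port last
    print('.'.join(proxy[:len(proxy)-1]) + f':{proxy[len(proxy)-1]}')
# prints 10.0.0.1:3128, then 203.0.113.7:8080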