1
- from websites .resources .data import WEBSITES
2
-
3
-
4
1
class DataExtractor :
5
2
"""
6
- Use to extract, cleanse and amend incorrect website data collection.
3
+ Use to extract, cleanse, sum and amend incorrect website data collection.
7
4
"""
8
5
def __init__ (self , data ):
9
6
self .data = data
10
7
11
8
def find_items(self, value=4):
    """
    Find and return a new list of items where key "value" is greater than or equal to parameter value.
    Items whose "value" is missing, None or empty are skipped; a numeric 0 is treated as real data.
    :param value: int, value to find items for, default = 4.
    :return: list(dict), list of dictionaries matching the above filtering rule.
    """
    matches = []
    for item in self.data:
        candidate = item.get('value')
        # Skip absent/empty values, but do not let truthiness discard a genuine zero
        if candidate is None or (not candidate and candidate != 0):
            continue
        if candidate >= value:
            matches.append(item)
    return matches
17
15
18
16
def amend_domain_values (self , prefix = 'www.' ):
19
17
"""
20
18
Fixes missing parts of the domain names.
21
- :param prefix: str, prefix to add to the domain name. Default = 'www'.
19
+ :param prefix: str, prefix to add to the domain name, default = 'www'.
22
20
:return: amended: list(dict), amended list of web records.
23
21
"""
24
22
amended = []
@@ -38,19 +36,19 @@ def cleanse_data(self):
38
36
for item in self .data :
39
37
url = item .get ('url' )
40
38
secure = item .get ('secure' )
41
- # https marked as secure = False
42
- if url and url .startswith ('https:' ) and not item .get ('secure' ):
43
- item ['secure' ] = True
44
- # http marked as secure = True
45
- elif url and url .startswith ('http:' ) and item .get ('secure' ):
46
- item ['secure' ] = False
39
+ if url :
40
+ # https marked as secure = False
41
+ if url .startswith ('https:' ) and not secure :
42
+ item ['secure' ] = True
43
+ # http marked as secure = True
44
+ elif url .startswith ('http:' ) and secure :
45
+ item ['secure' ] = False
47
46
amended .append (item )
48
47
return amended
49
48
50
-
51
-
52
- # data_extractor = DataExtractor(WEBSITES)
53
- # print(data_extractor.amend_domain_values())
54
- # print(data_extractor.find_items(4))
55
- # print(len(data_extractor.find_items(4)))
56
- # print(data_extractor.cleanse_data())
49
def get_value_sum(self):
    """
    Returns sum of all value keys in the data set.
    Records where "value" is missing or None contribute nothing to the total.
    :return: int, sum of all value keys in the data set.
    """
    values = (item.get('value') for item in self.data)
    # A key that is present but set to None would make sum() raise TypeError, so filter it out
    return sum(amount for amount in values if amount is not None)
0 commit comments