1
- from websites .resources .data import WEBSITES
2
-
3
-
4
1
class DataExtractor :
5
2
"""
6
- Use to extract, cleanse and amend incorrect website data collection.
3
+ Use to extract, cleanse, sum and amend incorrect website data collection.
7
4
"""
8
5
def __init__ (self , data ):
9
6
self .data = data
10
7
11
8
def find_items(self, value=4):
    """
    Find and return a new list of items where key "value" is greater than or equal to parameter value.

    Items whose "value" key is missing or None are skipped. A stored value of 0 is a
    real value and is compared against the threshold like any other number.
    :param value: int, minimum "value" an item must have to be returned, default = 4.
    :return: list(dict), list of dictionaries matching the above filtering rule.
    """
    matches = []
    for item in self.data:
        item_value = item.get('value')
        # Test for None explicitly: a plain truthiness check would wrongly
        # drop items whose value is 0 when the threshold is 0 or negative.
        if item_value is not None and item_value >= value:
            matches.append(item)
    return matches
17
15
18
16
def amend_domain_values (self , prefix = 'www.' ):
19
17
"""
20
18
Fixes missing parts of the domain names.
21
- :param prefix: str, prefix to add to the domain name. Default = 'www'.
19
+ :param prefix: str, prefix to add to the domain name, default = 'www.'.
22
20
:return: amended: list(dict), amended list of web records.
23
21
"""
24
22
amended = []
@@ -39,18 +37,17 @@ def cleanse_data(self):
39
37
url = item .get ('url' )
40
38
secure = item .get ('secure' )
41
39
# https marked as secure = False
42
- if url and url .startswith ('https:' ) and not item . get ( ' secure' ) :
40
+ if url and url .startswith ('https:' ) and not secure :
43
41
item ['secure' ] = True
44
42
# http marked as secure = True
45
- elif url and url .startswith ('http:' ) and item . get ( ' secure' ) :
43
+ elif url and url .startswith ('http:' ) and secure :
46
44
item ['secure' ] = False
47
45
amended .append (item )
48
46
return amended
49
47
50
-
51
-
52
- # data_extractor = DataExtractor(WEBSITES)
53
- # print(data_extractor.amend_domain_values())
54
- # print(data_extractor.find_items(4))
55
- # print(len(data_extractor.find_items(4)))
56
- # print(data_extractor.cleanse_data())
48
def get_value_sum(self):
    """
    Returns sum of all value keys in the data set.

    Records with a missing "value" key, or with the key explicitly set to None,
    contribute 0 to the total instead of raising a TypeError.
    :return: int, sum of all value keys in the data set.
    """
    # `.get('value', 0)` only covers a missing key; `or 0` also covers a key
    # that is present but holds None.
    return sum(item.get('value') or 0 for item in self.data)
0 commit comments