Skip to content

Commit 179aea4

Browse files
author
Tomasz-Kluczkowski
committed
Add cleanse_data and tests.
Used test_data containing all possible states for ease of testing.
1 parent 304c48b commit 179aea4

File tree

2 files changed

+89
-3
lines changed

2 files changed

+89
-3
lines changed

data_extractor.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,29 @@ def amend_domain_values(self, prefix='www.'):
2828
amended.append(item)
2929
return amended
3030

31+
def cleanse_data(self):
    """
    Fix errors in "secure" key values so that they agree with the url scheme.

    Every record whose url starts with https is set to "secure": True and every record whose url starts with
    http is set to "secure": False. The scheme is matched case-insensitively, and a missing or non-boolean
    "secure" value is normalised too. Records whose url is missing, is not a string, or uses any other scheme
    are passed through untouched.

    Note: the records held in self.data are updated in place; the returned list contains those same dicts.

    :return: amended: list(dict), amended list of web records.
    """
    amended = []
    for item in self.data:
        url = item.get('url')
        # Guard against missing / non-string urls so one bad record cannot abort the whole pass.
        if isinstance(url, str):
            # URI schemes are case-insensitive, so compare against a lower-cased copy.
            lowered = url.lower()
            # The trailing ':' keeps 'https:' from also matching the 'http:' check.
            if lowered.startswith('https:'):
                item['secure'] = True
            elif lowered.startswith('http:'):
                item['secure'] = False
        amended.append(item)
    return amended
49+
50+
3151

32-
data_extractor = DataExtractor(WEBSITES)
52+
# data_extractor = DataExtractor(WEBSITES)
3353
# print(data_extractor.amend_domain_values())
34-
print(data_extractor.find_items(4))
35-
print(len(data_extractor.find_items(4)))
54+
# print(data_extractor.find_items(4))
55+
# print(len(data_extractor.find_items(4)))
56+
# print(data_extractor.cleanse_data())

tests/test_data_extractor.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,3 +107,68 @@ def test_amend_domain_values_retains_original_if_prefix_matching(self):
107107
]
108108
_data_extractor = DataExtractor(test_data)
109109
assert _data_extractor.amend_domain_values() == test_data
110+
111+
def test_cleanse_data(self):
    """
    cleanse_data should return records whose "secure" flag agrees with the url scheme.

    The fixture covers all four combinations of scheme and flag: https/False and http/True (both wrong),
    then http/False and https/True (both already right).
    """
    google = {
        'name': 'Google',
        'url': 'https://www.google.co.uk',
        'domain': 'google.co.uk',
        'secure': False,
        'value': 5
    }
    facebook = {
        'name': 'Facebook',
        'url': 'http://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
        'domain': 'facebook.com',
        'secure': True,
        'value': 4
    }
    bing = {
        'name': 'Bing',
        'url': 'http://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
        'domain': 'bing.com',
        'secure': False,
        'value': 3
    }
    duck_duck_go = {
        'name': 'Duck Duck Go',
        'url': 'https://duckduckgo.com/?q=plane&t=h_&ia=web',
        'domain': 'duckduckgo.com',
        'secure': True,
        'value': 2
    }
    records = [google, facebook, bing, duck_duck_go]

    # Expected "secure" flag per record, in fixture order. The copies are taken before the call so that
    # nothing cleanse_data does to the input records can leak into the expectation.
    corrected_flags = [True, False, False, True]
    expected = [dict(record, secure=flag) for record, flag in zip(records, corrected_flags)]

    _data_extractor = DataExtractor(records)
    assert _data_extractor.cleanse_data() == expected

0 commit comments

Comments
 (0)