
Commit 5d3c451

Max committed:
Updated validation_split.py and test_validation_split.py to work with Lenskit Datasets.
Also updated the requirements in setup.py to use smac 2.3 and numpy >2.2
Parent: 7788bb7 · Commit: 5d3c451

3 files changed: 115 additions, 149 deletions


lkauto/utils/validation_split.py

Lines changed: 70 additions & 144 deletions
@@ -1,30 +1,21 @@
+from typing import Iterator
+
 import pandas as pd
-import numpy as np
-from lenskit.crossfold import partition_rows
+from lenskit.splitting import crossfold_records, crossfold_users, sample_records, SampleFrac, TTSplit
+from lenskit.data import Dataset


-def validation_split(data: pd.DataFrame, strategie: str = 'user_based', num_folds: int = 1,
-                     frac: float = 0.25, random_state=42) -> dict:
+def validation_split(data: Dataset, strategy: str = 'user_based', num_folds: int = 1,
+                     frac: float = 0.25, random_state=42) -> Iterator[TTSplit]:
     """
-    Returns a dictionary with the indices of the train and validation split for the given data.
-    The dictionary has the following structure:
-    {
-        0: {  # fold 0
-            "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-            "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-        },
-        1: {  # fold 1
-            "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-            "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        }
-    }
+    Returns the Train-Test-Split for the given Dataset

     Parameters
     ----------
-    data : pd.DataFrame
-        Pandas Dataframe with the data to be split.
-    strategie : str
-        cross validation strategie (user_based or row_based)
+    data : Dataset
+        Lenskit Dataset with the data to be split.
+    strategy : str
+        cross validation strategy (user_based or row_based)
     num_folds : int
         number of folds for the validation split cross validation
     frac : float
@@ -35,37 +26,26 @@ def validation_split(data: pd.DataFrame, strategie: str = 'user_based', num_fold

     Returns
     -------
-    dict
-        dictionary with the indices of the train and validation split for the given data.
+    Iterator[TTSplit]
+        The Train-Test-Split for the given Dataset
     """
-    # decide which validation split strategie to use
-    if strategie == 'user_based':
+    # decide which validation split strategy to use
+    if strategy == 'user_based':
         return user_based_validation_split(data=data, num_folds=num_folds, frac=frac, random_state=random_state)
-    elif strategie == 'row_based':
+    elif strategy == 'row_based':
         return row_based_validation_split(data=data, num_folds=num_folds, frac=frac, random_state=random_state)
     else:
-        raise ValueError(f"Unknown validation split strategie: {strategie}")
+        raise ValueError(f"Unknown validation split strategy: {strategy}")


-def row_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: float = 0.25, random_state=42) -> dict:
+def row_based_validation_split(data: Dataset, num_folds: int = 1, frac: float = 0.25, random_state=42) -> Iterator[TTSplit]:
     """
-    Returns a dictionary with the indices of the train and validation split for the given data.
-    The dictionary has the following structure:
-    {
-        0: {  # fold 0
-            "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-            "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-        },
-        1: {  # fold 1
-            "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-            "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        }
-    }
+    Returns a Train-Test-Split for the given data.

     Parameters
     ----------
-    data : pd.DataFrame
-        Pandas Dataframe with the data to be split.
+    data : Dataset
+        Lenskit Dataset with the data to be split.
     num_folds : int
         number of folds for the validation split cross validation
     frac : float
@@ -76,46 +56,23 @@ def row_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: flo

     Returns
     -------
-    dict
-        dictionary with the indices of the train and validation split for the given data.
+    Iterator[TTSplit]
+        Train-Test-Split for the given data.
     """
-    # initialize a dictionary with the indices of the train and validation split for the given data
-    fold_indices = {i: {"train": np.array([]), "validation": np.array([])} for i in
-                    range(num_folds)}
-    # if num_folds < 2, we use a holdout validation split
+
     if num_folds < 2:
-        fold_indices = __holdout_validation_split(fold_indices=fold_indices,
-                                                  data=data,
-                                                  frac=frac,
-                                                  random_state=random_state)
-    # if num_folds > 1, we use a cross validation split
+        return __holdout_validation_split(data=data, frac=frac, random_state=random_state)
     else:
-        fold_indices = __row_based_k_fold_validation_split(fold_indices=fold_indices,
-                                                           data=data,
-                                                           num_folds=num_folds,
-                                                           random_state=random_state)
-    return fold_indices
+        return __row_based_k_fold_validation_split(data=data, num_folds=num_folds, random_state=random_state)


-def user_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: float = 0.25, random_state=42) -> dict:
+def user_based_validation_split(data: Dataset, num_folds: int = 1, frac: float = 0.25, random_state=42) -> Iterator[
+        TTSplit]:
     """
-    Returns a dictionary with the indices of the train and validation split for the given data.
-    The dictionary has the following structure:
-    {
-        0: {  # fold 0
-            "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-            "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-        },
-        1: {  # fold 1
-            "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-            "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        }
-    }
-
     Parameters
     ----------
-    data : pd.DataFrame
-        Pandas Dataframe with the data to be split.
+    data : Dataset
+        Lenskit Dataset with the data to be split.
     num_folds : int
         number of folds for the validation split cross validation
     frac : float
@@ -126,40 +83,25 @@ def user_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: fl

     Returns
     -------
-    dict
-        dictionary with the indices of the train and validation split for the given data.
+    Iterator[TTSplit]
+        Train-Test-Split for the given data.
     """
-    # initialize a dictionary with the indices of the train and validation split for the given data
-    fold_indices = {i: {"train": np.array([]), "validation": np.array([])} for i in
-                    range(num_folds)}
-
-    # group by users and then sample from each user
-    for user, items in data.groupby("user"):
-        # if num_folds < 2, we use a holdout validation split
-        if num_folds < 2:
-            fold_indices = __holdout_validation_split(fold_indices=fold_indices,
-                                                      data=items,
-                                                      random_state=random_state,
-                                                      frac=frac)
-        # if num_folds > 1, we use a cross validation split
-        else:
-            fold_indices = __user_based_crossfold_validation_split(fold_indices=fold_indices,
-                                                                   data=items,
-                                                                   num_folds=num_folds)
-
-    return fold_indices
-
-
-def __holdout_validation_split(fold_indices: dict, data: pd.DataFrame, frac: float, random_state=42):
+
+    if num_folds < 2:
+        return __holdout_validation_split(data=data, frac=frac, random_state=random_state)
+    else:
+        return __user_based_crossfold_validation_split(data=data, num_folds=num_folds)
+
+
+
+def __holdout_validation_split(data: Dataset, frac: float, random_state=42):
     """
-    Returns a dictionary with the indices of the train and validation split for the given data.
+    Returns a Train-Test-Split for the given data.

     Parameters
     ----------
-    fold_indices : dict
-        dictionary with the indices of the train and validation split for the given data.
-    data : pd.DataFrame
-        Pandas Dataframe with the data to be split.
+    data : Dataset
+        Lenskit Dataset with the data to be split.
     frac : float
         fraction of the dataset to be used for the validation split. If num_folds > 1, the fraction value
         will be ignored.
@@ -168,64 +110,48 @@ def __holdout_validation_split(fold_indices: dict, data: pd.DataFrame, frac: flo

     Returns
     -------
-    dict
+    Iterator[TTSplit]
+        Train-Test-Split for the given data. Should only contain one fold.
     """
-    # sample the validation set
-    validation = data.sample(frac=frac, random_state=random_state)
-    # get the train set by dropping the validation set
-    train = data.drop(validation.index)
-    # append the indices of the train and validation set to the dictionary
-    fold_indices[0]['train'] = np.append(fold_indices[0]["train"], train.index)
-    fold_indices[0]['validation'] = np.append(fold_indices[0]["validation"], validation.index)
-    # return the dictionary
-    return fold_indices
-
-
-def __row_based_k_fold_validation_split(fold_indices: dict, data: pd.DataFrame, num_folds: int, random_state):
+
+    splits = sample_records(data=data, size=int(data.interaction_count * frac), rng=random_state)
+
+    if hasattr(splits, "__iter__"):
+        return splits
+    else:
+        return iter([splits])
+
+
+def __row_based_k_fold_validation_split(data: Dataset, num_folds: int, random_state):
     """
-    Returns a dictionary with the indices of the row based cv train and validation split for the given data.
+    Returns a Train-Test-Split for the given data.

     Parameters
     ----------
-    fold_indices : dict
-        dictionary with the indices of the train and validation split for the given data.
-    data : pd.DataFrame
-        Pandas Dataframe with the data to be split.
+    data : Dataset
+        Lenskit Dataset with the data to be split.
     """
-    # generate the indices of the train and validation split for the given data
-    for i, splits in enumerate(partition_rows(data, partitions=num_folds, rng_spec=random_state)):
-        fold_indices[i]['train'] = np.append(fold_indices[i]["train"], splits[0].index)
-        fold_indices[i]['validation'] = np.append(fold_indices[i]["train"], splits[1].index)
-    return fold_indices
+
+    splits = crossfold_records(data=data, partitions=num_folds, rng=random_state)
+    return splits


-def __user_based_crossfold_validation_split(fold_indices, data, num_folds) -> dict:
+
+def __user_based_crossfold_validation_split(data: Dataset, num_folds) -> Iterator[TTSplit]:
     """
-    Returns a dictionary with the indices of the user based cv train and validation split for the given data.
+    Returns a Train-Test-Split for the given data.

     Parameters
     ----------
-    fold_indices : dict
-        dictionary with the indices of the train and validation split for the given data.
-    data : pd.DataFrame
+    data : Dataset
         Pandas Dataframe with the data to be split.
     num_folds : int
         number of folds for the validation split cross validation

     Returns
     -------
-    dict
+    Iterator[TTSplit]
+        Train-Test-Split for the given data.
     """
-    # generate splits of equal size
-    splits = np.array_split(data, num_folds)
-    # go through each split
-    for i in range(len(splits)):
-        # the split denoted by i is the test set, so all other splits are the train set
-        train = pd.concat(splits[:i] + splits[i + 1:], axis=0, ignore_index=False)
-        # the test data is simply the index we are currently observing
-        test = splits[i]
-        # append the indices to the dictionary
-        fold_indices[i]["train"] = np.append(fold_indices[i]["train"], train.index)
-        fold_indices[i]["validation"] = np.append(fold_indices[i]["validation"], test.index)
-
-    return fold_indices
+
+    return crossfold_users(data=data, partitions=num_folds, method=SampleFrac(0.2))
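
The new helpers return LensKit TTSplit objects instead of index dictionaries, so callers iterate over folds and read fold.train / fold.test directly. A minimal usage sketch under that assumption; the ratings frame and the printed values are illustrative, not part of the commit, and only calls that appear in this diff (from_interactions_df, validation_split, interaction_count, to_df) are used:

import pandas as pd

from lenskit.data import from_interactions_df
from lkauto.utils.validation_split import validation_split

# hypothetical interaction frame; column names follow the test fixture below
ratings = pd.DataFrame({"user": [1, 1, 1, 2, 2, 2],
                        "item": [1, 2, 3, 1, 2, 3],
                        "rating": [1, 2, 3, 1, 2, 3]})
ds = from_interactions_df(ratings)

# num_folds=1 routes to __holdout_validation_split, which wraps the single split in an
# iterator, so the same loop works for holdout and for k-fold cross-validation
for fold in validation_split(data=ds, strategy="user_based", num_folds=1, frac=0.25, random_state=42):
    train = fold.train              # training Dataset (train.interaction_count gives its size)
    test_df = fold.test.to_df()     # held-out interactions as a DataFrame
    print(train.interaction_count, len(test_df))

Normalizing the holdout result to an iterator is what lets validation_split advertise a single Iterator[TTSplit] return type for both strategies.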

setup.py

Lines changed: 3 additions & 3 deletions
@@ -13,10 +13,10 @@
     python_requires=">=3.8, <=3.9",
     packages=find_packages(),
     install_requires=[
-        "smac~=1.4",
+        "smac~=2.3",
         "matplotlib~=3.6",
         "lenskit>=0.14.2",
-        "numpy==1.21.6",
+        "numpy>2.2",
         "tables~=3.8",
         "typing~=3.5"
     ],
@@ -30,4 +30,4 @@
     long_description_content_type="text/markdown",
     url="https://github.com/ISG-Siegen/lenskit-auto",
     project_urls={"Documentation": "https://lenskit-auto.readthedocs.io"},
-)
+)
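
Re-assembled for readability, the dependency block after this change reads as follows. This is just the hunk above laid out linearly; the package name and the setup() arguments not shown in this diff are assumptions or omitted:

from setuptools import setup, find_packages

setup(
    name="lkauto",                    # assumption: the real name is not visible in this hunk
    python_requires=">=3.8, <=3.9",
    packages=find_packages(),
    install_requires=[
        "smac~=2.3",
        "matplotlib~=3.6",
        "lenskit>=0.14.2",
        "numpy>2.2",
        "tables~=3.8",
        "typing~=3.5"
    ],
)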

tests/utils/test_validation_split.py

Lines changed: 42 additions & 2 deletions
@@ -4,21 +4,61 @@
 import pandas as pd

 from lkauto.utils.validation_split import validation_split
+from lenskit.data import from_interactions_df


 class TestValidationSplit(unittest.TestCase):

     def setUp(self):
-        self.df = pd.DataFrame(np.ones((100, 3)), columns=["user", "item", "rating", ])
+        self.df = pd.DataFrame(np.array([[1, 1, 1],
+                                         [1, 2, 2],
+                                         [1, 3, 3],
+                                         [2, 1, 1],
+                                         [2, 2, 2],
+                                         [2, 3, 3],
+                                         [3, 1, 1],
+                                         [3, 2, 2],
+                                         [3, 3, 3],
+                                         [4, 1, 1],
+                                         [4, 2, 2],
+                                         [4, 3, 3],
+                                         [5, 1, 1],
+                                         [5, 2, 2],
+                                         [5, 3, 3]]), columns=["user", "item", "rating", ])
+        self.ds = from_interactions_df(self.df)

+    """
     def test_validationSplit_givenValidDataFrame_correctSplitTrainAndValidationDataframesReturnedExpected(self):
-        val_fold_indices = validation_split(data=self.df, frac=0.25, random_state=42)
+        val_fold_indices = validation_split(data=self.ds, frac=0.25, random_state=42)

         validation_train = self.df.loc[val_fold_indices[0]["train"], :]
         validation_test = self.df.loc[val_fold_indices[0]["validation"], :]

         self.assertTrue(validation_train.shape == (75, 3))
         self.assertTrue(validation_test.shape == (25, 3))
+    """
+
+    def test_validationSplit_givenValidDataset_1Fold_UserBased(self):
+        splits = validation_split(data=self.ds, strategy="user_based", frac=0.2, num_folds=1, random_state=42)
+
+        fold = next(splits)
+        test_sample_fold = fold.test
+        train_sample_fold = fold.train
+
+        self.assertTrue(test_sample_fold.to_df().shape[0] == 3)
+        self.assertTrue(train_sample_fold.interaction_count == 12)
+
+    def test_validationSplit_givenValidDataset_3Fold_RowBased(self):
+        splits = validation_split(data=self.ds, strategy="row_based", frac=0.2, num_folds=3, random_state=42)
+
+        fold = next(splits)
+        test_sample_fold = fold.test
+        train_sample_fold = fold.train
+
+        self.assertTrue(test_sample_fold.to_df().shape[0] == 5)
+        self.assertTrue(train_sample_fold.interaction_count == 10)
+
+


 if __name__ == '__main__':
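
The assertion values in the two new tests follow from the setUp fixture: 15 interactions in total, 5 users with 3 items each. A quick sketch of that arithmetic, mirroring the code paths exercised above (illustrative only, not part of the commit):

n_interactions = 15                              # setUp fixture: 5 users x 3 items

# user_based with num_folds=1 -> __holdout_validation_split -> sample_records(size=int(15 * 0.2))
holdout_test = int(n_interactions * 0.2)         # 3 test records
holdout_train = n_interactions - holdout_test    # 12 training records

# row_based with num_folds=3 -> crossfold_records partitions the records into 3 folds
fold_test = n_interactions // 3                  # 5 test records per fold
fold_train = n_interactions - fold_test          # 10 training records per fold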
