+ from typing import Iterator
+
import pandas as pd
+
import numpy as np
# from lenskit.crossfold import partition_rows
from lenskit.splitting import crossfold_records

- def validation_split(data: pd.DataFrame, strategie: str = 'user_based', num_folds: int = 1,
-                      frac: float = 0.25, random_state=42) -> dict:
+ from lenskit.splitting import crossfold_records, crossfold_users, sample_records, SampleFrac, TTSplit
+ from lenskit.data import Dataset
+
+
+ def validation_split(data: Dataset, strategy: str = 'user_based', num_folds: int = 1,
+                      frac: float = 0.25, random_state=42) -> Iterator[TTSplit]:
    """
-     Returns a dictionary with the indices of the train and validation split for the given data.
-     The dictionary has the following structure:
-     {
-         0: { # fold 0
-             "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-         },
-         1: { # fold 1
-             "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-             "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-         }
-     }
+     Returns the Train-Test-Split for the given Dataset

    Parameters
    ----------
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
-     strategie : str
-         cross validation strategie (user_based or row_based)
+     data : Dataset
+         Lenskit Dataset with the data to be split.
+     strategy : str
+         cross validation strategy (user_based or row_based)
    num_folds : int
        number of folds for the validation split cross validation
    frac : float
@@ -35,37 +31,26 @@ def validation_split(data: pd.DataFrame, strategie: str = 'user_based', num_fold

    Returns
    -------
-     dict
-         dictionary with the indices of the train and validation split for the given data.
+     Iterator[TTSplit]
+         The Train-Test-Split for the given Dataset
    """
-     # decide which validation split strategie to use
-     if strategie == 'user_based':
+     # decide which validation split strategy to use
+     if strategy == 'user_based':
        return user_based_validation_split(data=data, num_folds=num_folds, frac=frac, random_state=random_state)
-     elif strategie == 'row_based':
+     elif strategy == 'row_based':
        return row_based_validation_split(data=data, num_folds=num_folds, frac=frac, random_state=random_state)
    else:
-         raise ValueError(f"Unknown validation split strategie: {strategie}")
+         raise ValueError(f"Unknown validation split strategy: {strategy}")


- def row_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: float = 0.25, random_state=42) -> dict:
+ def row_based_validation_split(data: Dataset, num_folds: int = 1, frac: float = 0.25, random_state=42) -> Iterator[TTSplit]:
    """
-     Returns a dictionary with the indices of the train and validation split for the given data.
-     The dictionary has the following structure:
-     {
-         0: { # fold 0
-             "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-         },
-         1: { # fold 1
-             "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-             "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-         }
-     }
+     Returns a Train-Test-Split for the given data.

    Parameters
    ----------
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    num_folds : int
        number of folds for the validation split cross validation
    frac : float
@@ -76,46 +61,23 @@ def row_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: flo

    Returns
    -------
-     dict
-         dictionary with the indices of the train and validation split for the given data.
+     Iterator[TTSplit]
+         Train-Test-Split for the given data.
    """
-     # initialize a dictionary with the indices of the train and validation split for the given data
-     fold_indices = {i: {"train": np.array([]), "validation": np.array([])} for i in
-                     range(num_folds)}
-     # if num_folds < 2, we use a holdout validation split
+
    if num_folds < 2:
-         fold_indices = __holdout_validation_split(fold_indices=fold_indices,
-                                                   data=data,
-                                                   frac=frac,
-                                                   random_state=random_state)
-     # if num_folds > 1, we use a cross validation split
+         return __holdout_validation_split(data=data, frac=frac, random_state=random_state)
    else:
-         fold_indices = __row_based_k_fold_validation_split(fold_indices=fold_indices,
-                                                            data=data,
-                                                            num_folds=num_folds,
-                                                            random_state=random_state)
-     return fold_indices
+         return __row_based_k_fold_validation_split(data=data, num_folds=num_folds, random_state=random_state)


- def user_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: float = 0.25, random_state=42) -> dict:
+ def user_based_validation_split(data: Dataset, num_folds: int = 1, frac: float = 0.25, random_state=42) -> Iterator[
+         TTSplit]:
    """
-     Returns a dictionary with the indices of the train and validation split for the given data.
-     The dictionary has the following structure:
-     {
-         0: { # fold 0
-             "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-         },
-         1: { # fold 1
-             "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-             "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-         }
-     }
-
    Parameters
    ----------
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    num_folds : int
        number of folds for the validation split cross validation
    frac : float
@@ -126,40 +88,25 @@ def user_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: fl

    Returns
    -------
-     dict
-         dictionary with the indices of the train and validation split for the given data.
+     Iterator[TTSplit]
+         Train-Test-Split for the given data.
    """
-     # initialize a dictionary with the indices of the train and validation split for the given data
-     fold_indices = {i: {"train": np.array([]), "validation": np.array([])} for i in
-                     range(num_folds)}
-
-     # group by users and then sample from each user
-     for user, items in data.groupby("user"):
-         # if num_folds < 2, we use a holdout validation split
-         if num_folds < 2:
-             fold_indices = __holdout_validation_split(fold_indices=fold_indices,
-                                                       data=items,
-                                                       random_state=random_state,
-                                                       frac=frac)
-         # if num_folds > 1, we use a cross validation split
-         else:
-             fold_indices = __user_based_crossfold_validation_split(fold_indices=fold_indices,
-                                                                    data=items,
-                                                                    num_folds=num_folds)

-     return fold_indices
+     if num_folds < 2:
+         return __holdout_validation_split(data=data, frac=frac, random_state=random_state)
+     else:
+         return __user_based_crossfold_validation_split(data=data, num_folds=num_folds)


- def __holdout_validation_split(fold_indices: dict, data: pd.DataFrame, frac: float, random_state=42):
+
+ def __holdout_validation_split(data: Dataset, frac: float, random_state=42):
    """
-     Returns a dictionary with the indices of the train and validation split for the given data.
+     Returns a Train-Test-Split for the given data.

    Parameters
    ----------
-     fold_indices : dict
-         dictionary with the indices of the train and validation split for the given data.
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    frac : float
        fraction of the dataset to be used for the validation split. If num_folds > 1, the fraction value
        will be ignored.
@@ -168,64 +115,55 @@ def __holdout_validation_split(fold_indices: dict, data: pd.DataFrame, frac: flo

    Returns
    -------
-     dict
+     Iterator[TTSplit]
+         Train-Test-Split for the given data. Should only contain one fold.
    """
-     # sample the validation set
-     validation = data.sample(frac=frac, random_state=random_state)
-     # get the train set by dropping the validation set
-     train = data.drop(validation.index)
-     # append the indices of the train and validation set to the dictionary
-     fold_indices[0]['train'] = np.append(fold_indices[0]["train"], train.index)
-     fold_indices[0]['validation'] = np.append(fold_indices[0]["validation"], validation.index)
-     # return the dictionary
-     return fold_indices
+
+     splits = sample_records(data=data, size=int(data.interaction_count * frac), rng=random_state)
+
+     if hasattr(splits, "__iter__"):
+         return splits
+     else:
+         return iter([splits])


- def __row_based_k_fold_validation_split(fold_indices: dict, data: pd.DataFrame, num_folds: int, random_state):
+ def __row_based_k_fold_validation_split(data: Dataset, num_folds: int, random_state):
    """
-     Returns a dictionary with the indices of the row based cv train and validation split for the given data.
+     Returns a Train-Test-Split for the given data.

    Parameters
    ----------
-     fold_indices : dict
-         dictionary with the indices of the train and validation split for the given data.
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    """
+
-     # generate the indices of the train and validation split for the given data
-     for i, splits in enumerate(crossfold_records(data, partitions=num_folds, rng_spec=random_state)):
-         fold_indices[i]['train'] = np.append(fold_indices[i]["train"], splits[0].index)
-         fold_indices[i]['validation'] = np.append(fold_indices[i]["validation"], splits[1].index)
-     return fold_indices

+     splits = crossfold_records(data=data, partitions=num_folds, rng=random_state)
+     return splits
+

- def __user_based_crossfold_validation_split(fold_indices, data, num_folds) -> dict:
+
+
+ def __user_based_crossfold_validation_split(data: Dataset, num_folds) -> Iterator[TTSplit]:
    """
-     Returns a dictionary with the indices of the user based cv train and validation split for the given data.
+     Returns a Train-Test-Split for the given data.

    Parameters
    ----------
-     fold_indices : dict
-         dictionary with the indices of the train and validation split for the given data.
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    num_folds : int
        number of folds for the validation split cross validation

    Returns
    -------
-     dict
+     Iterator[TTSplit]
+         Train-Test-Split for the given data.
    """
-     # generate splits of equal size
-     splits = np.array_split(data, num_folds)
-     # go through each split
-     for i in range(len(splits)):
-         # the split denoted by i is the test set, so all other splits are the train set
-         train = pd.concat(splits[:i] + splits[i + 1:], axis=0, ignore_index=False)
-         # the test data is simply the index we are currently observing
-         test = splits[i]
-         # append the indices to the dictionary
-         fold_indices[i]["train"] = np.append(fold_indices[i]["train"], train.index)
-         fold_indices[i]["validation"] = np.append(fold_indices[i]["validation"], test.index)

-     return fold_indices
+     return crossfold_users(data=data, partitions=num_folds, method=SampleFrac(0.2))
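For reference, a minimal usage sketch of the refactored validation_split (not part of the commit above). It assumes the current LensKit Dataset/splitting API, where from_interactions_df builds a Dataset from a pandas frame and each yielded TTSplit carries a train Dataset; the toy ratings frame and its column names are illustrative only.

# Usage sketch (illustrative, not part of the commit): build a small Dataset and
# iterate over the TTSplit folds produced by validation_split.
import pandas as pd
from lenskit.data import from_interactions_df  # assumed helper from the LensKit data API

ratings = pd.DataFrame({
    "user_id": [1, 1, 1, 2, 2, 2, 3, 3, 3],
    "item_id": [10, 20, 30, 10, 20, 40, 20, 30, 40],
    "rating":  [4.0, 3.5, 5.0, 2.0, 4.5, 3.0, 4.0, 2.5, 3.5],
})
dataset = from_interactions_df(ratings)

# single row-based holdout: the returned iterator yields exactly one TTSplit
for split in validation_split(dataset, strategy="row_based", num_folds=1, frac=0.25):
    print("holdout train interactions:", split.train.interaction_count)

# user-based 2-fold cross-validation: one TTSplit per fold
for fold, split in enumerate(validation_split(dataset, strategy="user_based", num_folds=2)):
    print(f"fold {fold}: train interactions = {split.train.interaction_count}")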