+ from typing import Iterator
+
import pandas as pd
- import numpy as np
- from lenskit.crossfold import partition_rows
+ from lenskit.splitting import crossfold_records, crossfold_users, sample_records, SampleFrac, TTSplit
+ from lenskit.data import Dataset


- def validation_split(data: pd.DataFrame, strategie: str = 'user_based', num_folds: int = 1,
-                      frac: float = 0.25, random_state=42) -> dict:
+ def validation_split(data: Dataset, strategy: str = 'user_based', num_folds: int = 1,
+                      frac: float = 0.25, random_state=42) -> Iterator[TTSplit]:
    """
-     Returns a dictionary with the indices of the train and validation split for the given data.
-     The dictionary has the following structure:
-     {
-         0: { # fold 0
-             "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-         },
-         1: { # fold 1
-             "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-             "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-         }
-     }
+     Returns the Train-Test-Split for the given Dataset.

    Parameters
    ----------
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
-     strategie : str
-         cross validation strategie (user_based or row_based)
+     data : Dataset
+         Lenskit Dataset with the data to be split.
+     strategy : str
+         cross validation strategy (user_based or row_based)
    num_folds : int
        number of folds for the validation split cross validation
    frac : float
@@ -35,37 +26,26 @@ def validation_split(data: pd.DataFrame, strategie: str = 'user_based', num_fold

    Returns
    -------
-     dict
-         dictionary with the indices of the train and validation split for the given data.
+     Iterator[TTSplit]
+         The Train-Test-Split for the given Dataset.
    """
-     # decide which validation split strategie to use
-     if strategie == 'user_based':
+     # decide which validation split strategy to use
+     if strategy == 'user_based':
        return user_based_validation_split(data=data, num_folds=num_folds, frac=frac, random_state=random_state)
-     elif strategie == 'row_based':
+     elif strategy == 'row_based':
        return row_based_validation_split(data=data, num_folds=num_folds, frac=frac, random_state=random_state)
    else:
-         raise ValueError(f"Unknown validation split strategie: {strategie}")
+         raise ValueError(f"Unknown validation split strategy: {strategy}")


- def row_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: float = 0.25, random_state=42) -> dict:
+ def row_based_validation_split(data: Dataset, num_folds: int = 1, frac: float = 0.25, random_state=42) -> Iterator[TTSplit]:
    """
-     Returns a dictionary with the indices of the train and validation split for the given data.
-     The dictionary has the following structure:
-     {
-         0: { # fold 0
-             "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-         },
-         1: { # fold 1
-             "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-             "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-         }
-     }
+     Returns a Train-Test-Split for the given data.

    Parameters
    ----------
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    num_folds : int
        number of folds for the validation split cross validation
    frac : float
@@ -76,46 +56,23 @@ def row_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: flo

    Returns
    -------
-     dict
-         dictionary with the indices of the train and validation split for the given data.
+     Iterator[TTSplit]
+         Train-Test-Split for the given data.
    """
-     # initialize a dictionary with the indices of the train and validation split for the given data
-     fold_indices = {i: {"train": np.array([]), "validation": np.array([])} for i in
-                     range(num_folds)}
-     # if num_folds < 2, we use a holdout validation split
+
    if num_folds < 2:
-         fold_indices = __holdout_validation_split(fold_indices=fold_indices,
-                                                   data=data,
-                                                   frac=frac,
-                                                   random_state=random_state)
-     # if num_folds > 1, we use a cross validation split
+         return __holdout_validation_split(data=data, frac=frac, random_state=random_state)
    else:
-         fold_indices = __row_based_k_fold_validation_split(fold_indices=fold_indices,
-                                                            data=data,
-                                                            num_folds=num_folds,
-                                                            random_state=random_state)
-     return fold_indices
+         return __row_based_k_fold_validation_split(data=data, num_folds=num_folds, random_state=random_state)


- def user_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: float = 0.25, random_state=42) -> dict:
+ def user_based_validation_split(data: Dataset, num_folds: int = 1, frac: float = 0.25, random_state=42) -> Iterator[
+         TTSplit]:
    """
-     Returns a dictionary with the indices of the train and validation split for the given data.
-     The dictionary has the following structure:
-     {
-         0: { # fold 0
-             "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-         },
-         1: { # fold 1
-             "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-             "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-         }
-     }
-
    Parameters
    ----------
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    num_folds : int
        number of folds for the validation split cross validation
    frac : float
@@ -126,40 +83,25 @@ def user_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: fl

    Returns
    -------
-     dict
-         dictionary with the indices of the train and validation split for the given data.
+     Iterator[TTSplit]
+         Train-Test-Split for the given data.
    """
-     # initialize a dictionary with the indices of the train and validation split for the given data
-     fold_indices = {i: {"train": np.array([]), "validation": np.array([])} for i in
-                     range(num_folds)}
-
-     # group by users and then sample from each user
-     for user, items in data.groupby("user"):
-         # if num_folds < 2, we use a holdout validation split
-         if num_folds < 2:
-             fold_indices = __holdout_validation_split(fold_indices=fold_indices,
-                                                       data=items,
-                                                       random_state=random_state,
-                                                       frac=frac)
-         # if num_folds > 1, we use a cross validation split
-         else:
-             fold_indices = __user_based_crossfold_validation_split(fold_indices=fold_indices,
-                                                                    data=items,
-                                                                    num_folds=num_folds)
-
-     return fold_indices
-
-
- def __holdout_validation_split(fold_indices: dict, data: pd.DataFrame, frac: float, random_state=42):
+
+     if num_folds < 2:
+         return __holdout_validation_split(data=data, frac=frac, random_state=random_state)
+     else:
+         return __user_based_crossfold_validation_split(data=data, num_folds=num_folds)
+
+
+
+ def __holdout_validation_split(data: Dataset, frac: float, random_state=42):
    """
-     Returns a dictionary with the indices of the train and validation split for the given data.
+     Returns a Train-Test-Split for the given data.

    Parameters
    ----------
-     fold_indices : dict
-         dictionary with the indices of the train and validation split for the given data.
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    frac : float
        fraction of the dataset to be used for the validation split. If num_folds > 1, the fraction value
        will be ignored.
@@ -168,64 +110,48 @@ def __holdout_validation_split(fold_indices: dict, data: pd.DataFrame, frac: flo

    Returns
    -------
-     dict
+     Iterator[TTSplit]
+         Train-Test-Split for the given data. Contains only one fold.
    """
-     # sample the validation set
-     validation = data.sample(frac=frac, random_state=random_state)
-     # get the train set by dropping the validation set
-     train = data.drop(validation.index)
-     # append the indices of the train and validation set to the dictionary
-     fold_indices[0]['train'] = np.append(fold_indices[0]["train"], train.index)
-     fold_indices[0]['validation'] = np.append(fold_indices[0]["validation"], validation.index)
-     # return the dictionary
-     return fold_indices
-
-
- def __row_based_k_fold_validation_split(fold_indices: dict, data: pd.DataFrame, num_folds: int, random_state):
+
+     splits = sample_records(data=data, size=int(data.interaction_count * frac), rng=random_state)
+
+     # normalize the result to an iterator; a single TTSplit is wrapped in a one-element iterator
+     if hasattr(splits, "__iter__"):
+         return splits
+     else:
+         return iter([splits])
+
+
+ def __row_based_k_fold_validation_split(data: Dataset, num_folds: int, random_state):
    """
-     Returns a dictionary with the indices of the row based cv train and validation split for the given data.
+     Returns a Train-Test-Split for the given data.

    Parameters
    ----------
-     fold_indices : dict
-         dictionary with the indices of the train and validation split for the given data.
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    """
-     # generate the indices of the train and validation split for the given data
-     for i, splits in enumerate(partition_rows(data, partitions=num_folds, rng_spec=random_state)):
-         fold_indices[i]['train'] = np.append(fold_indices[i]["train"], splits[0].index)
-         fold_indices[i]['validation'] = np.append(fold_indices[i]["train"], splits[1].index)
-     return fold_indices
+
+     splits = crossfold_records(data=data, partitions=num_folds, rng=random_state)
+     return splits


- def __user_based_crossfold_validation_split(fold_indices, data, num_folds) -> dict:
+
+ def __user_based_crossfold_validation_split(data: Dataset, num_folds) -> Iterator[TTSplit]:
    """
-     Returns a dictionary with the indices of the user based cv train and validation split for the given data.
+     Returns a Train-Test-Split for the given data.

    Parameters
    ----------
-     fold_indices : dict
-         dictionary with the indices of the train and validation split for the given data.
-     data : pd.DataFrame
+     data : Dataset
        Lenskit Dataset with the data to be split.
    num_folds : int
        number of folds for the validation split cross validation

    Returns
    -------
-     dict
+     Iterator[TTSplit]
+         Train-Test-Split for the given data.
    """
-     # generate splits of equal size
-     splits = np.array_split(data, num_folds)
-     # go through each split
-     for i in range(len(splits)):
-         # the split denoted by i is the test set, so all other splits are the train set
-         train = pd.concat(splits[:i] + splits[i + 1:], axis=0, ignore_index=False)
-         # the test data is simply the index we are currently observing
-         test = splits[i]
-         # append the indices to the dictionary
-         fold_indices[i]["train"] = np.append(fold_indices[i]["train"], train.index)
-         fold_indices[i]["validation"] = np.append(fold_indices[i]["validation"], test.index)
-
-     return fold_indices
+
+     return crossfold_users(data=data, partitions=num_folds, method=SampleFrac(0.2))
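
For context, here is a minimal usage sketch of the rewritten helpers; it is not part of the diff. It assumes the current LensKit data API, in particular `lenskit.data.from_interactions_df` for building a `Dataset` from a pandas frame and the `train`/`test` attributes on the returned `TTSplit` objects; the column names and sample values are purely illustrative, and `validation_split` is assumed to be importable from the module changed in this PR.

# hypothetical usage sketch; validation_split comes from the module modified above
import pandas as pd
from lenskit.data import from_interactions_df

# small illustrative interaction frame: 4 users x 5 items
ratings = pd.DataFrame({
    "user_id": [u for u in range(1, 5) for _ in range(5)],
    "item_id": [i for _ in range(4) for i in range(101, 106)],
    "rating": [3.0 + (n % 3) * 0.5 for n in range(20)],
})
data = from_interactions_df(ratings)

# holdout (num_folds=1): a single TTSplit with roughly 25% of the records held out
for split in validation_split(data, strategy='row_based', num_folds=1, frac=0.25):
    print(split.train.interaction_count)  # split.test holds the held-out items

# user-based cross-validation: the test users of each fold contribute a sample of their items
for fold, split in enumerate(validation_split(data, strategy='user_based', num_folds=2)):
    print(fold, split.train.interaction_count)

Note that the user-based cross-validation path ignores `frac` and always holds out 20% of each test user's interactions, since `SampleFrac(0.2)` is hard-coded in `__user_based_crossfold_validation_split`.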