Skip to content

Commit 7d62904

Browse files
committed
updated bayesian_optimization.py, explicit_evaler.py
2 parents 4f4a57a + 46bcfa6 commit 7d62904

File tree

7 files changed

+164
-146
lines changed

7 files changed

+164
-146
lines changed

lkauto/explicit/explicit_evaler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def __init__(self,
7373
self.top_n_runs = pd.DataFrame(columns=['run_id', 'model', 'error'])
7474
if self.validation is None:
7575
self.val_fold_indices = validation_split(data=self.train,
76-
strategie=self.split_strategie,
76+
strategy=self.split_strategie,
7777
num_folds=self.split_folds,
7878
frac=self.split_frac,
7979
random_state=self.random_state)

lkauto/optimization_strategies/bayesian_optimization.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import pandas as pd
22
from ConfigSpace import Configuration, ConfigurationSpace
3+
34
# from smac.facade.smac_hpo_facade import SMAC4HPO
45
# from smac.scenario.scenario import Scenario
56

6-
from smac import HyperparameterOptimizationFacade, Scenario
7+
from smac import HyperparameterOptimizationFacade
78
from smac.initial_design import RandomInitialDesign
89
from smac.intensifier import Intensifier
10+
from smac.scenario import Scenario
911

1012
from lkauto.explicit.explicit_evaler import ExplicitEvaler
1113
from lkauto.implicit.implicit_evaler import ImplicitEvaler

lkauto/utils/get_default_configuration_space.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import pandas as pd
2+
3+
from typing import Iterator, Union
24
from ConfigSpace import Categorical
35
from ConfigSpace import ConfigurationSpace
6+
from lenskit.data import Dataset
7+
from lenskit.splitting import TTSplit
48

59
from lkauto.algorithms.als import BiasedMF
610
from lkauto.algorithms.als import ImplicitMF
@@ -11,13 +15,13 @@
1115
from lkauto.algorithms.user_knn import UserUser
1216

1317

14-
def get_default_configuration_space(data: pd.DataFrame,
18+
def get_default_configuration_space(data: Union[Dataset, Iterator[TTSplit]],
1519
val_fold_indices,
1620
feedback: str,
17-
validation: pd.DataFrame = None,
21+
validation: Iterator[TTSplit] = None,
1822
random_state=42) -> ConfigurationSpace:
1923
"""
20-
returns the default configuration space for all included rating predictions algorithms
24+
returns the default configuration space for all included rating prediction algorithms
2125
2226
Parameters
2327
----------
@@ -42,6 +46,7 @@ def get_default_configuration_space(data: pd.DataFrame,
4246

4347
# get minimum number of items and users for the given train split
4448

49+
"""
4550
num_items = 0
4651
num_users = 0
4752
if validation is None:
@@ -57,6 +62,22 @@ def get_default_configuration_space(data: pd.DataFrame,
5762
num_items = data['item'].nunique()
5863
if data['user'].nunique() < num_users or num_users == 0:
5964
num_users = data['user'].nunique()
65+
"""
66+
67+
num_items = 0
68+
num_users = 0
69+
70+
if validation is None and not isinstance(data, Dataset):
71+
for fold in data:
72+
if fold.train.item_count < num_items or num_items == 0:
73+
num_items = fold.train.item_count
74+
if fold.train.user_count < num_users or num_users == 0:
75+
num_users = fold.train.user_count
76+
else:
77+
if data.item_count < num_items or num_items == 0:
78+
num_items = data.item_count
79+
if data.user_count < num_users or num_users == 0:
80+
num_users = data.user_count
6081

6182
# define configuration space
6283
cs = ConfigurationSpace(

lkauto/utils/validation_split.py

Lines changed: 69 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,26 @@
1+
from typing import Iterator
2+
13
import pandas as pd
4+
25
import numpy as np
36
# from lenskit.crossfold import partition_rows
47
from lenskit.splitting import crossfold_records
58

6-
def validation_split(data: pd.DataFrame, strategie: str = 'user_based', num_folds: int = 1,
7-
frac: float = 0.25, random_state=42) -> dict:
9+
from lenskit.splitting import crossfold_records, crossfold_users, sample_records, SampleFrac, TTSplit
10+
from lenskit.data import Dataset
11+
12+
13+
def validation_split(data: Dataset, strategy: str = 'user_based', num_folds: int = 1,
14+
frac: float = 0.25, random_state=42) -> Iterator[TTSplit]:
815
"""
9-
Returns a dictionary with the indices of the train and validation split for the given data.
10-
The dictionary has the following structure:
11-
{
12-
0: { # fold 0
13-
"train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
14-
"validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
15-
},
16-
1: { # fold 1
17-
"train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
18-
"validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
19-
}
20-
}
16+
Returns the Train-Test-Split for the given Dataset
2117
2218
Parameters
2319
----------
24-
data : pd.DataFrame
25-
Pandas Dataframe with the data to be split.
26-
strategie : str
27-
cross validation strategie (user_based or row_based)
20+
data : Dataset
21+
Lenskit Dataset with the data to be split.
22+
strategy : str
23+
cross validation strategy (user_based or row_based)
2824
num_folds : int
2925
number of folds for the validation split cross validation
3026
frac : float
@@ -35,37 +31,26 @@ def validation_split(data: pd.DataFrame, strategie: str = 'user_based', num_fold
3531
3632
Returns
3733
-------
38-
dict
39-
dictionary with the indices of the train and validation split for the given data.
34+
Iterator[TTSplit]
35+
The Train-Test-Split for the given Dataset
4036
"""
41-
# decide which validation split strategie to use
42-
if strategie == 'user_based':
37+
# decide which validation split strategy to use
38+
if strategy == 'user_based':
4339
return user_based_validation_split(data=data, num_folds=num_folds, frac=frac, random_state=random_state)
44-
elif strategie == 'row_based':
40+
elif strategy == 'row_based':
4541
return row_based_validation_split(data=data, num_folds=num_folds, frac=frac, random_state=random_state)
4642
else:
47-
raise ValueError(f"Unknown validation split strategie: {strategie}")
43+
raise ValueError(f"Unknown validation split strategy: {strategy}")
4844

4945

50-
def row_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: float = 0.25, random_state=42) -> dict:
46+
def row_based_validation_split(data: Dataset, num_folds: int = 1, frac: float = 0.25, random_state=42) -> Iterator[TTSplit]:
5147
"""
52-
Returns a dictionary with the indices of the train and validation split for the given data.
53-
The dictionary has the following structure:
54-
{
55-
0: { # fold 0
56-
"train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
57-
"validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
58-
},
59-
1: { # fold 1
60-
"train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
61-
"validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
62-
}
63-
}
48+
Returns a Train-Test-Split for the given data.
6449
6550
Parameters
6651
----------
67-
data : pd.DataFrame
68-
Pandas Dataframe with the data to be split.
52+
data : Dataset
53+
Lenskit Dataset with the data to be split.
6954
num_folds : int
7055
number of folds for the validation split cross validation
7156
frac : float
@@ -76,46 +61,23 @@ def row_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: flo
7661
7762
Returns
7863
-------
79-
dict
80-
dictionary with the indices of the train and validation split for the given data.
64+
Iterator[TTSplit]
65+
Train-Test-Split for the given data.
8166
"""
82-
# initialize a dictionary with the indices of the train and validation split for the given data
83-
fold_indices = {i: {"train": np.array([]), "validation": np.array([])} for i in
84-
range(num_folds)}
85-
# if num_folds < 2, we use a holdout validation split
67+
8668
if num_folds < 2:
87-
fold_indices = __holdout_validation_split(fold_indices=fold_indices,
88-
data=data,
89-
frac=frac,
90-
random_state=random_state)
91-
# if num_folds > 1, we use a cross validation split
69+
return __holdout_validation_split(data=data, frac=frac, random_state=random_state)
9270
else:
93-
fold_indices = __row_based_k_fold_validation_split(fold_indices=fold_indices,
94-
data=data,
95-
num_folds=num_folds,
96-
random_state=random_state)
97-
return fold_indices
71+
return __row_based_k_fold_validation_split(data=data, num_folds=num_folds, random_state=random_state)
9872

9973

100-
def user_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: float = 0.25, random_state=42) -> dict:
74+
def user_based_validation_split(data: Dataset, num_folds: int = 1, frac: float = 0.25, random_state=42) -> Iterator[
75+
TTSplit]:
10176
"""
102-
Returns a dictionary with the indices of the train and validation split for the given data.
103-
The dictionary has the following structure:
104-
{
105-
0: { # fold 0
106-
"train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
107-
"validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
108-
},
109-
1: { # fold 1
110-
"train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
111-
"validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
112-
}
113-
}
114-
11577
Parameters
11678
----------
117-
data : pd.DataFrame
118-
Pandas Dataframe with the data to be split.
79+
data : Dataset
80+
Lenskit Dataset with the data to be split.
11981
num_folds : int
12082
number of folds for the validation split cross validation
12183
frac : float
@@ -126,40 +88,25 @@ def user_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: fl
12688
12789
Returns
12890
-------
129-
dict
130-
dictionary with the indices of the train and validation split for the given data.
91+
Iterator[TTSplit]
92+
Train-Test-Split for the given data.
13193
"""
132-
# initialize a dictionary with the indices of the train and validation split for the given data
133-
fold_indices = {i: {"train": np.array([]), "validation": np.array([])} for i in
134-
range(num_folds)}
135-
136-
# group by users and then sample from each user
137-
for user, items in data.groupby("user"):
138-
# if num_folds < 2, we use a holdout validation split
139-
if num_folds < 2:
140-
fold_indices = __holdout_validation_split(fold_indices=fold_indices,
141-
data=items,
142-
random_state=random_state,
143-
frac=frac)
144-
# if num_folds > 1, we use a cross validation split
145-
else:
146-
fold_indices = __user_based_crossfold_validation_split(fold_indices=fold_indices,
147-
data=items,
148-
num_folds=num_folds)
14994

150-
return fold_indices
95+
if num_folds < 2:
96+
return __holdout_validation_split(data=data, frac=frac, random_state=random_state)
97+
else:
98+
return __user_based_crossfold_validation_split(data=data, num_folds=num_folds)
15199

152100

153-
def __holdout_validation_split(fold_indices: dict, data: pd.DataFrame, frac: float, random_state=42):
101+
102+
def __holdout_validation_split(data: Dataset, frac: float, random_state=42):
154103
"""
155-
Returns a dictionary with the indices of the train and validation split for the given data.
104+
Returns a Train-Test-Split for the given data.
156105
157106
Parameters
158107
----------
159-
fold_indices : dict
160-
dictionary with the indices of the train and validation split for the given data.
161-
data : pd.DataFrame
162-
Pandas Dataframe with the data to be split.
108+
data : Dataset
109+
Lenskit Dataset with the data to be split.
163110
frac : float
164111
fraction of the dataset to be used for the validation split. If num_folds > 1, the fraction value
165112
will be ignored.
@@ -168,64 +115,55 @@ def __holdout_validation_split(fold_indices: dict, data: pd.DataFrame, frac: flo
168115
169116
Returns
170117
-------
171-
dict
118+
Iterator[TTSplit]
119+
Train-Test-Split for the given data. Should only contain one fold.
172120
"""
173-
# sample the validation set
174-
validation = data.sample(frac=frac, random_state=random_state)
175-
# get the train set by dropping the validation set
176-
train = data.drop(validation.index)
177-
# append the indices of the train and validation set to the dictionary
178-
fold_indices[0]['train'] = np.append(fold_indices[0]["train"], train.index)
179-
fold_indices[0]['validation'] = np.append(fold_indices[0]["validation"], validation.index)
180-
# return the dictionary
181-
return fold_indices
121+
122+
splits = sample_records(data=data, size=int(data.interaction_count * frac), rng=random_state)
123+
124+
if hasattr(splits, "__iter__"):
125+
return splits
126+
else:
127+
return iter([splits])
182128

183129

184-
def __row_based_k_fold_validation_split(fold_indices: dict, data: pd.DataFrame, num_folds: int, random_state):
130+
def __row_based_k_fold_validation_split(data: Dataset, num_folds: int, random_state):
185131
"""
186-
Returns a dictionary with the indices of the row based cv train and validation split for the given data.
132+
Returns a Train-Test-Split for the given data.
187133
188134
Parameters
189135
----------
190-
fold_indices : dict
191-
dictionary with the indices of the train and validation split for the given data.
192-
data : pd.DataFrame
193-
Pandas Dataframe with the data to be split.
136+
data : Dataset
137+
Lenskit Dataset with the data to be split.
194138
"""
139+
195140
# generate the indices of the train and validation split for the given data
196141
for i, splits in enumerate(crossfold_records(data, partitions=num_folds, rng_spec=random_state)):
197142
fold_indices[i]['train'] = np.append(fold_indices[i]["train"], splits[0].index)
198143
fold_indices[i]['validation'] = np.append(fold_indices[i]["validation"], splits[1].index)
199144
return fold_indices
200145

146+
splits = crossfold_records(data=data, partitions=num_folds, rng=random_state)
147+
return splits
148+
201149

202-
def __user_based_crossfold_validation_split(fold_indices, data, num_folds) -> dict:
150+
151+
152+
def __user_based_crossfold_validation_split(data: Dataset, num_folds) -> Iterator[TTSplit]:
203153
"""
204-
Returns a dictionary with the indices of the user based cv train and validation split for the given data.
154+
Returns a Train-Test-Split for the given data.
205155
206156
Parameters
207157
----------
208-
fold_indices : dict
209-
dictionary with the indices of the train and validation split for the given data.
210-
data : pd.DataFrame
158+
data : Dataset
211159
Lenskit Dataset with the data to be split.
212160
num_folds : int
213161
number of folds for the validation split cross validation
214162
215163
Returns
216164
-------
217-
dict
165+
Iterator[TTSplit]
166+
Train-Test-Split for the given data.
218167
"""
219-
# generate splits of equal size
220-
splits = np.array_split(data, num_folds)
221-
# go through each split
222-
for i in range(len(splits)):
223-
# the split denoted by i is the test set, so all other splits are the train set
224-
train = pd.concat(splits[:i] + splits[i + 1:], axis=0, ignore_index=False)
225-
# the test data is simply the index we are currently observing
226-
test = splits[i]
227-
# append the indices to the dictionary
228-
fold_indices[i]["train"] = np.append(fold_indices[i]["train"], train.index)
229-
fold_indices[i]["validation"] = np.append(fold_indices[i]["validation"], test.index)
230168

231-
return fold_indices
169+
return crossfold_users(data=data, partitions=num_folds, method=SampleFrac(0.2))

setup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@
1313
python_requires=">=3.8, <=3.9",
1414
packages=find_packages(),
1515
install_requires=[
16-
"smac~=1.4",
16+
"smac~=2.3",
1717
"matplotlib~=3.6",
1818
"lenskit>=0.14.2",
19-
"numpy==1.21.6",
19+
"numpy>2.2",
2020
"tables~=3.8",
2121
"typing~=3.5"
2222
],
@@ -30,4 +30,4 @@
3030
long_description_content_type="text/markdown",
3131
url="https://github.com/ISG-Siegen/lenskit-auto",
3232
project_urls={"Documentation": "https://lenskit-auto.readthedocs.io"},
33-
)
33+
)

0 commit comments

Comments
 (0)