Skip to content

Commit ab104ad

Browse files
author
Max
committed
Updated explicit_evaler.py to use Datasets instead of DataFrames (fixes the previous incorrect conversion)
Also updated the ExplicitEvaler construction calls in the other files
1 parent 0864783 commit ab104ad

File tree

6 files changed

+21
-45
lines changed

6 files changed

+21
-45
lines changed

lkauto/explicit/explicit_evaler.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
import numpy as np
44
import pandas as pd
55

6-
from typing import Iterator
7-
from lenskit.data import Dataset
6+
from typing import Iterator, Union
7+
from lenskit.data import Dataset, ItemListCollection
88
from lenskit.pipeline import predict_pipeline, topn_pipeline
99
from lenskit.batch import recommend
1010
from lenskit.metrics import RunAnalysis
@@ -56,12 +56,10 @@ class ExplicitEvaler:
5656
"""
5757

5858
def __init__(self,
59-
data: Dataset,
60-
train: pd.DataFrame,
59+
train: Dataset,
6160
optimization_metric,
6261
filer: Filer,
63-
ttsplits: Iterator[TTSplit] = None,
64-
validation=None,
62+
validation: ItemListCollection = None,
6563
random_state=42,
6664
split_folds: int = 1,
6765
split_strategie: str = 'user_based',
@@ -70,10 +68,8 @@ def __init__(self,
7068
minimize_error_metric_val: bool = True,
7169
) -> None:
7270
self.logger = logging.getLogger('lenskit-auto')
73-
self.data = data
7471
self.train = train
7572
self.filer = filer
76-
self.ttsplits = ttsplits
7773
self.validation = validation
7874
self.random_state = random_state
7975
self.split_folds = split_folds
@@ -84,14 +80,14 @@ def __init__(self,
8480
self.run_id = 0
8581
self.ensemble_size = ensemble_size
8682
self.top_n_runs = pd.DataFrame(columns=['run_id', 'model', 'error'])
87-
if self.ttsplits is None:
88-
self.train_test_splits = validation_split(data=self.data,
83+
if self.validation is None:
84+
self.train_test_splits = validation_split(data=self.train,
8985
strategy=self.split_strategie,
9086
num_folds=self.split_folds,
9187
frac=self.split_frac,
9288
random_state=self.random_state)
9389
else:
94-
self.train_test_splits = self.ttsplits
90+
self.train_test_splits = iter([TTSplit(train, validation)])
9591

9692
def evaluate(self, config_space: ConfigurationSpace) -> float:
9793
""" evaluates model defined in config_space

lkauto/lkauto.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,13 @@
1919
from lenskit.metrics.predict import RMSE
2020
from lenskit.metrics import NDCG
2121
from lenskit.pipeline import Component
22+
from lenskit.data import Dataset, ItemListCollection
2223

2324
from typing import Tuple
2425

2526

26-
def get_best_prediction_model(train: pd.DataFrame,
27-
validation: pd.DataFrame = None,
27+
def get_best_prediction_model(train: Dataset,
28+
validation: ItemListCollection = None,
2829
cs: ConfigurationSpace = None,
2930
optimization_metric=RMSE,
3031
optimization_strategie: str = 'bayesian',
@@ -166,7 +167,7 @@ def get_best_prediction_model(train: pd.DataFrame,
166167
drop_na_values=drop_na_values,
167168
drop_duplicates=drop_duplicates)
168169

169-
# decide which optimization strategie to use
170+
# decide which optimization strategy to use
170171
if optimization_strategie == 'bayesian':
171172
incumbent, top_n_runs = bayesian_optimization(train=train,
172173
cs=cs,

lkauto/optimization_strategies/bayesian_optimization.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
from smac.intensifier import Intensifier
1010
from smac.scenario import Scenario
1111

12+
from lenskit.data import Dataset, ItemListCollection
13+
1214
from lkauto.explicit.explicit_evaler import ExplicitEvaler
1315
from lkauto.implicit.implicit_evaler import ImplicitEvaler
1416
from lkauto.utils.filer import Filer
@@ -19,9 +21,9 @@
1921
import logging
2022

2123

22-
def bayesian_optimization(train: pd.DataFrame,
24+
def bayesian_optimization(train: Dataset,
2325
user_feedback: str,
24-
validation: pd.DataFrame = None,
26+
validation: ItemListCollection = None,
2527
cs: ConfigurationSpace = None,
2628
optimization_metric=None,
2729
time_limit_in_sec: int = 2700,
@@ -118,7 +120,7 @@ def bayesian_optimization(train: pd.DataFrame,
118120
if cs is None:
119121
logger.debug('initializing default ConfigurationSpace')
120122
cs = get_default_configuration_space(data=train,
121-
val_fold_indices=evaler.val_fold_indices,
123+
val_fold_indices=evaler.train_test_splits,
122124
validation=validation,
123125
feedback='explicit',
124126
random_state=random_state)

lkauto/optimization_strategies/random_search.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
from ConfigSpace import ConfigurationSpace, Configuration
66

7+
from lenskit.data import Dataset, ItemListCollection
8+
79
from lkauto.explicit.explicit_evaler import ExplicitEvaler
810
from lkauto.implicit.implicit_evaler import ImplicitEvaler
911
from lkauto.utils.get_default_configurations import get_default_configurations
@@ -15,11 +17,11 @@
1517

1618

1719
def random_search(cs: ConfigurationSpace,
18-
train: pd.DataFrame,
20+
train: Dataset,
1921
user_feedback: str,
2022
optimization_metric,
2123
filer: Filer,
22-
validation: pd.DataFrame = None,
24+
validation: ItemListCollection = None,
2325
time_limit_in_sec: int = 3600,
2426
num_evaluations: int = None,
2527
split_folds: int = 1,
@@ -117,7 +119,7 @@ def random_search(cs: ConfigurationSpace,
117119
if cs is None:
118120
logger.debug('initializing default ConfigurationSpace')
119121
cs = get_default_configuration_space(data=train,
120-
val_fold_indices=evaler.val_fold_indices,
122+
val_fold_indices=evaler.train_test_splits,
121123
validation=validation,
122124
feedback='explicit',
123125
random_state=random_state)

lkauto/utils/get_default_configuration_space.py

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -45,25 +45,6 @@ def get_default_configuration_space(data: Union[Dataset, Iterator[TTSplit]],
4545
raise ValueError("Unknown feedback type: {}".format(feedback))
4646

4747
# get minimum number of items and users for the given train split
48-
49-
"""
50-
num_items = 0
51-
num_users = 0
52-
if validation is None:
53-
val_fold_indices = val_fold_indices
54-
for fold in range(len(val_fold_indices)):
55-
tmp = data.loc[val_fold_indices[fold]["train"], :]
56-
if tmp['item'].nunique() < num_items or num_items == 0:
57-
num_items = tmp['item'].nunique()
58-
if tmp['user'].nunique() < num_users or num_users == 0:
59-
num_users = tmp['user'].nunique()
60-
else:
61-
if data['item'].nunique() < num_items or num_items == 0:
62-
num_items = data['item'].nunique()
63-
if data['user'].nunique() < num_users or num_users == 0:
64-
num_users = data['user'].nunique()
65-
"""
66-
6748
num_items = 0
6849
num_users = 0
6950

lkauto/utils/validation_split.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -137,12 +137,6 @@ def __row_based_k_fold_validation_split(data: Dataset, num_folds: int, random_st
137137
Lenskit Dataset with the data to be split.
138138
"""
139139

140-
# generate the indices of the train and validation split for the given data
141-
for i, splits in enumerate(crossfold_records(data, partitions=num_folds, rng_spec=random_state)):
142-
fold_indices[i]['train'] = np.append(fold_indices[i]["train"], splits[0].index)
143-
fold_indices[i]['validation'] = np.append(fold_indices[i]["validation"], splits[1].index)
144-
return fold_indices
145-
146140
splits = crossfold_records(data=data, partitions=num_folds, rng=random_state)
147141
return splits
148142

0 commit comments

Comments
 (0)