Skip to content

Commit b97cd4f

Browse files
committed
Merge branch 'update_lkauto' of https://github.com/ISG-Siegen/lenskit-auto into update_lkauto
2 parents e1475a6 + ab104ad commit b97cd4f

File tree

7 files changed

+94
-81
lines changed

7 files changed

+94
-81
lines changed

lkauto/explicit/explicit_evaler.py

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,16 @@
1+
import logging
2+
13
import numpy as np
24
import pandas as pd
5+
6+
from typing import Iterator, Union
7+
from lenskit.data import Dataset, ItemListCollection
8+
from lenskit.pipeline import predict_pipeline, topn_pipeline
9+
from lenskit.batch import recommend
10+
from lenskit.metrics import RunAnalysis
11+
from lenskit.splitting import TTSplit
312
from ConfigSpace import ConfigurationSpace
4-
import logging
13+
from sklearn.model_selection import train_test_split
514

615
from lkauto.utils.filer import Filer
716
from lkauto.utils.get_model_from_cs import get_model_from_cs
@@ -47,10 +56,10 @@ class ExplicitEvaler:
4756
"""
4857

4958
def __init__(self,
50-
train: pd.DataFrame,
59+
train: Dataset,
5160
optimization_metric,
5261
filer: Filer,
53-
validation=None,
62+
validation: ItemListCollection = None,
5463
random_state=42,
5564
split_folds: int = 1,
5665
split_strategie: str = 'user_based',
@@ -72,13 +81,13 @@ def __init__(self,
7281
self.ensemble_size = ensemble_size
7382
self.top_n_runs = pd.DataFrame(columns=['run_id', 'model', 'error'])
7483
if self.validation is None:
75-
self.val_fold_indices = validation_split(data=self.train,
76-
strategy=self.split_strategie,
77-
num_folds=self.split_folds,
78-
frac=self.split_frac,
79-
random_state=self.random_state)
84+
self.train_test_splits = validation_split(data=self.train,
85+
strategy=self.split_strategie,
86+
num_folds=self.split_folds,
87+
frac=self.split_frac,
88+
random_state=self.random_state)
8089
else:
81-
self.val_fold_indices = None
90+
self.train_test_splits = iter([TTSplit(train, validation)])
8291

8392
def evaluate(self, config_space: ConfigurationSpace) -> float:
8493
""" evaluates model defined in config_space
@@ -104,34 +113,54 @@ def evaluate(self, config_space: ConfigurationSpace) -> float:
104113
# get model from configuration space
105114
model = get_model_from_cs(config_space, feedback='explicit')
106115

116+
'''
107117
# loop over validation folds
108118
for fold in range(self.split_folds):
109119
if self.validation is None:
110120
# get validation split by fold index
111-
validation_train = self.train.loc[self.val_fold_indices[fold]["train"], :]
112-
validation_test = self.train.loc[self.val_fold_indices[fold]["validation"], :]
121+
validation_train = self.train.loc[self.train_test_splits[fold]["train"], :]
122+
validation_test = self.train.loc[self.train_test_splits[fold]["validation"], :]
113123
else:
114124
validation_train = self.train
115125
validation_test = self.validation
116126
117127
# split validation data into X and y
118-
X_validation_test = validation_test.copy()
128+
x_validation_test = validation_test.copy()
119129
y_validation_test = validation_test.copy()
120130
121131
# process validation split
122-
X_validation_test = X_validation_test.drop('rating', inplace=False, axis=1)
132+
x_validation_test = x_validation_test.drop('rating', inplace=False, axis=1)
123133
y_validation_test = y_validation_test[['rating']].iloc[:, 0]
124134
135+
125136
# fit and predict model from configuration
126137
model.fit(validation_train)
127-
predictions = model.predict(X_validation_test)
128-
predictions.index = X_validation_test.index
138+
predictions = model.predict(x_validation_test)
139+
predictions.index = x_validation_test.index
129140
130141
# calculate error_metric and append to numpy array
131142
error_metric = np.append(error_metric,
132143
self.optimization_metric(predictions, y_validation_test, missing='ignore'))
133144
134145
validation_data = pd.concat([validation_data, predictions], axis=0)
146+
'''
147+
148+
for fold in self.train_test_splits:
149+
validation_train = fold.train
150+
validation_test = fold.test
151+
152+
pipeline = predict_pipeline(scorer=model)
153+
fit_pipeline = pipeline.clone()
154+
fit_pipeline.train(data=validation_train)
155+
156+
recs = recommend(fit_pipeline, validation_test.keys())
157+
158+
run_analysis = RunAnalysis()
159+
run_analysis.add_metric(self.optimization_metric)
160+
error_results = run_analysis.measure(recs, validation_test)
161+
162+
error_metric = np.append(error_metric, error_results)
163+
validation_data = pd.concat([validation_data, recs], ignore_index=True)
135164

136165
# Save validation data for reproducibility and ensembling
137166
self.top_n_runs = update_top_n_runs(config_space=config_space,

lkauto/lkauto.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,13 @@
1919
from lenskit.metrics.predict import RMSE
2020
from lenskit.metrics import NDCG
2121
from lenskit.pipeline import Component
22+
from lenskit.data import Dataset, ItemListCollection
2223

2324
from typing import Tuple
2425

2526

26-
def get_best_prediction_model(train: pd.DataFrame,
27-
validation: pd.DataFrame = None,
27+
def get_best_prediction_model(train: Dataset,
28+
validation: ItemListCollection = None,
2829
cs: ConfigurationSpace = None,
2930
optimization_metric=RMSE,
3031
optimization_strategie: str = 'bayesian',
@@ -166,7 +167,7 @@ def get_best_prediction_model(train: pd.DataFrame,
166167
drop_na_values=drop_na_values,
167168
drop_duplicates=drop_duplicates)
168169

169-
# decide which optimization strategie to use
170+
# decide which optimization strategy to use
170171
if optimization_strategie == 'bayesian':
171172
incumbent, top_n_runs = bayesian_optimization(train=train,
172173
cs=cs,

lkauto/optimization_strategies/bayesian_optimization.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
from smac.intensifier import Intensifier
1010
from smac.scenario import Scenario
1111

12+
from lenskit.data import Dataset, ItemListCollection
13+
1214
from lkauto.explicit.explicit_evaler import ExplicitEvaler
1315
from lkauto.implicit.implicit_evaler import ImplicitEvaler
1416
from lkauto.utils.filer import Filer
@@ -19,9 +21,9 @@
1921
import logging
2022

2123

22-
def bayesian_optimization(train: pd.DataFrame,
24+
def bayesian_optimization(train: Dataset,
2325
user_feedback: str,
24-
validation: pd.DataFrame = None,
26+
validation: ItemListCollection = None,
2527
cs: ConfigurationSpace = None,
2628
optimization_metric=None,
2729
time_limit_in_sec: int = 2700,
@@ -118,7 +120,7 @@ def bayesian_optimization(train: pd.DataFrame,
118120
if cs is None:
119121
logger.debug('initializing default ConfigurationSpace')
120122
cs = get_default_configuration_space(data=train,
121-
val_fold_indices=evaler.val_fold_indices,
123+
val_fold_indices=evaler.train_test_splits,
122124
validation=validation,
123125
feedback='explicit',
124126
random_state=random_state)

lkauto/optimization_strategies/random_search.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
from ConfigSpace import ConfigurationSpace, Configuration
66

7+
from lenskit.data import Dataset, ItemListCollection
8+
79
from lkauto.explicit.explicit_evaler import ExplicitEvaler
810
from lkauto.implicit.implicit_evaler import ImplicitEvaler
911
from lkauto.utils.get_default_configurations import get_default_configurations
@@ -15,11 +17,11 @@
1517

1618

1719
def random_search(cs: ConfigurationSpace,
18-
train: pd.DataFrame,
20+
train: Dataset,
1921
user_feedback: str,
2022
optimization_metric,
2123
filer: Filer,
22-
validation: pd.DataFrame = None,
24+
validation: ItemListCollection = None,
2325
time_limit_in_sec: int = 3600,
2426
num_evaluations: int = None,
2527
split_folds: int = 1,
@@ -117,7 +119,7 @@ def random_search(cs: ConfigurationSpace,
117119
if cs is None:
118120
logger.debug('initializing default ConfigurationSpace')
119121
cs = get_default_configuration_space(data=train,
120-
val_fold_indices=evaler.val_fold_indices,
122+
val_fold_indices=evaler.train_test_splits,
121123
validation=validation,
122124
feedback='explicit',
123125
random_state=random_state)

lkauto/preprocessing/pruning.py

Lines changed: 36 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,63 @@
1-
import pandas as pd
1+
#import pandas as pd
2+
from lenskit.data import Dataset,from_interactions_df
23

3-
4-
def min_ratings_per_user(df: pd.DataFrame, num_ratings: int, count_duplicates: bool = False):
4+
def min_ratings_per_user(dataset: Dataset, num_ratings: int, count_duplicates: bool = False):
55
"""Prune users with less than num_ratings ratings
66
77
Parameters
88
----------
9-
df: pd.DataFrame
10-
Dataframe with columns "user", "item", "rating"
9+
dataset: Dataset
10+
LensKit Dataset object containing user-item interactions with ratings
1111
num_ratings: int
1212
Minimum number of ratings per user
1313
count_duplicates: bool = False
1414
If True, all ratings are counted, otherwise only unique ratings are counted
1515
1616
Returns
1717
-------
18-
pd.DataFrame
19-
Dataframe with columns "user", "item", "rating"
18+
Dataset
19+
Filtered Dataset with only users meeting the minimum rating threshold
20+
the Dataset will contain the columns "user_id", "item_id", "rating"
2021
"""
21-
# get all relevant user_ids
22-
uids = (
23-
df['user']
24-
if count_duplicates
25-
else df.drop_duplicates(['user', 'item'])['user']
26-
)
27-
cnt_items_per_user = uids.value_counts()
28-
users_of_interest = list(cnt_items_per_user[cnt_items_per_user >= num_ratings].index)
22+
# get the user statistics from the dataset
23+
user_stats = dataset.user_stats()
24+
if count_duplicates:
25+
valid_users = user_stats[user_stats['count'] >= num_ratings].index # count: total number of ratings (including duplicates)
26+
else:
27+
valid_users = user_stats[user_stats['item_count'] >= num_ratings].index # item_count: number of unique items rated
28+
# convert the interaction table to a pandas DataFrame and filter by valid users
29+
users_of_interest = dataset.interaction_table(format='pandas', original_ids=True)
30+
users_of_interest = users_of_interest[users_of_interest['user_id'].isin(valid_users)]
31+
return from_interactions_df(users_of_interest)
32+
2933

30-
return df[df['user'].isin(users_of_interest)]
3134

3235

33-
def max_ratings_per_user(df: pd.DataFrame, num_ratings: int, count_duplicates: bool = False):
36+
def max_ratings_per_user(dataset: Dataset, num_ratings: int, count_duplicates: bool = False):
3437
"""Prune users with more than num_ratings ratings
3538
3639
Parameters
3740
----------
38-
df: pd.DataFrame
39-
Dataframe with columns "user", "item", "rating"
41+
dataset: Dataset
42+
LensKit Dataset object containing user-item interactions with ratings
4043
num_ratings: int
41-
Minimum number of ratings per user
44+
Maximum number of ratings per user
4245
count_duplicates: bool = False
4346
If True, all ratings are counted, otherwise only unique ratings are counted
4447
4548
Returns
4649
-------
47-
pd.DataFrame
48-
Dataframe with columns "user", "item", "rating"
50+
Dataset
51+
Filtered Dataset with only users at or below the maximum rating threshold
52+
the Dataset will contain the columns "user_id", "item_id", "rating"
4953
"""
50-
# get all relevant user_ids
51-
uids = (
52-
df['user']
53-
if count_duplicates
54-
else df.drop_duplicates(['user', 'item'])['user']
55-
)
56-
cnt_items_per_user = uids.value_counts()
57-
users_of_interest = list(cnt_items_per_user[cnt_items_per_user <= num_ratings].index)
58-
59-
return df[df['user'].isin(users_of_interest)]
54+
55+
user_stats = dataset.user_stats()
56+
if count_duplicates:
57+
valid_users = user_stats[user_stats['count'] <= num_ratings].index # count: total number of ratings (including duplicates)
58+
else:
59+
valid_users = user_stats[user_stats['item_count'] <= num_ratings].index # item_count: number of unique items rated
60+
# convert the interaction table to a pandas DataFrame and filter by valid users
61+
users_of_interest = dataset.interaction_table(format='pandas', original_ids=True)
62+
users_of_interest = users_of_interest[users_of_interest['user_id'].isin(valid_users)]
63+
return from_interactions_df(users_of_interest)

lkauto/utils/get_default_configuration_space.py

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -45,25 +45,6 @@ def get_default_configuration_space(data: Union[Dataset, Iterator[TTSplit]],
4545
raise ValueError("Unknown feedback type: {}".format(feedback))
4646

4747
# get minimum number of items and users for the given train split
48-
49-
"""
50-
num_items = 0
51-
num_users = 0
52-
if validation is None:
53-
val_fold_indices = val_fold_indices
54-
for fold in range(len(val_fold_indices)):
55-
tmp = data.loc[val_fold_indices[fold]["train"], :]
56-
if tmp['item'].nunique() < num_items or num_items == 0:
57-
num_items = tmp['item'].nunique()
58-
if tmp['user'].nunique() < num_users or num_users == 0:
59-
num_users = tmp['user'].nunique()
60-
else:
61-
if data['item'].nunique() < num_items or num_items == 0:
62-
num_items = data['item'].nunique()
63-
if data['user'].nunique() < num_users or num_users == 0:
64-
num_users = data['user'].nunique()
65-
"""
66-
6748
num_items = 0
6849
num_users = 0
6950

lkauto/utils/validation_split.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -137,12 +137,6 @@ def __row_based_k_fold_validation_split(data: Dataset, num_folds: int, random_st
137137
Lenskit Dataset with the data to be split.
138138
"""
139139

140-
# generate the indices of the train and validation split for the given data
141-
for i, splits in enumerate(crossfold_records(data, partitions=num_folds, rng_spec=random_state)):
142-
fold_indices[i]['train'] = np.append(fold_indices[i]["train"], splits[0].index)
143-
fold_indices[i]['validation'] = np.append(fold_indices[i]["validation"], splits[1].index)
144-
return fold_indices
145-
146140
splits = crossfold_records(data=data, partitions=num_folds, rng=random_state)
147141
return splits
148142

0 commit comments

Comments
 (0)