+ from typing import Iterator
+
import pandas as pd
+
import numpy as np
# from lenskit.crossfold import partition_rows
from lenskit.splitting import crossfold_records

- def validation_split(data: pd.DataFrame, strategie: str = 'user_based', num_folds: int = 1,
-                      frac: float = 0.25, random_state=42) -> dict:
+ from lenskit.splitting import crossfold_records, crossfold_users, sample_records, SampleFrac, TTSplit
+ from lenskit.data import Dataset
+
+
+ def validation_split(data: Dataset, strategy: str = 'user_based', num_folds: int = 1,
+                      frac: float = 0.25, random_state=42) -> Iterator[TTSplit]:
    """
-     Returns a dictionary with the indices of the train and validation split for the given data.
-     The dictionary has the following structure:
-     {
-         0: { # fold 0
-             "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-         },
-         1: { # fold 1
-             "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-             "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-         }
-     }
+     Returns the Train-Test-Split for the given Dataset

    Parameters
    ----------
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
-     strategie : str
-         cross validation strategie (user_based or row_based)
+     data : Dataset
+         Lenskit Dataset with the data to be split.
+     strategy : str
+         cross validation strategy (user_based or row_based)
    num_folds : int
        number of folds for the validation split cross validation
    frac : float
@@ -35,37 +31,26 @@ def validation_split(data: pd.DataFrame, strategie: str = 'user_based', num_fold

    Returns
    -------
-     dict
-         dictionary with the indices of the train and validation split for the given data.
+     Iterator[TTSplit]
+         The Train-Test-Split for the given Dataset
    """
-     # decide which validation split strategie to use
-     if strategie == 'user_based':
+     # decide which validation split strategy to use
+     if strategy == 'user_based':
        return user_based_validation_split(data=data, num_folds=num_folds, frac=frac, random_state=random_state)
-     elif strategie == 'row_based':
+     elif strategy == 'row_based':
        return row_based_validation_split(data=data, num_folds=num_folds, frac=frac, random_state=random_state)
    else:
-         raise ValueError(f"Unknown validation split strategie: {strategie}")
+         raise ValueError(f"Unknown validation split strategy: {strategy}")


- def row_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: float = 0.25, random_state=42) -> dict:
+ def row_based_validation_split(data: Dataset, num_folds: int = 1, frac: float = 0.25, random_state=42) -> Iterator[TTSplit]:
    """
-     Returns a dictionary with the indices of the train and validation split for the given data.
-     The dictionary has the following structure:
-     {
-         0: { # fold 0
-             "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-         },
-         1: { # fold 1
-             "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-             "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-         }
-     }
+     Returns a Train-Test-Split for the given data.

    Parameters
    ----------
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    num_folds : int
        number of folds for the validation split cross validation
    frac : float
@@ -76,46 +61,23 @@ def row_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: flo

    Returns
    -------
-     dict
-         dictionary with the indices of the train and validation split for the given data.
+     Iterator[TTSplit]
+         Train-Test-Split for the given data.
    """
-     # initialize a dictionary with the indices of the train and validation split for the given data
-     fold_indices = {i: {"train": np.array([]), "validation": np.array([])} for i in
-                     range(num_folds)}
-     # if num_folds < 2, we use a holdout validation split
+
    if num_folds < 2:
-         fold_indices = __holdout_validation_split(fold_indices=fold_indices,
-                                                   data=data,
-                                                   frac=frac,
-                                                   random_state=random_state)
-     # if num_folds > 1, we use a cross validation split
+         return __holdout_validation_split(data=data, frac=frac, random_state=random_state)
    else:
-         fold_indices = __row_based_k_fold_validation_split(fold_indices=fold_indices,
-                                                            data=data,
-                                                            num_folds=num_folds,
-                                                            random_state=random_state)
-     return fold_indices
+         return __row_based_k_fold_validation_split(data=data, num_folds=num_folds, random_state=random_state)


- def user_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: float = 0.25, random_state=42) -> dict:
+ def user_based_validation_split(data: Dataset, num_folds: int = 1, frac: float = 0.25, random_state=42) -> Iterator[
+         TTSplit]:
    """
-     Returns a dictionary with the indices of the train and validation split for the given data.
-     The dictionary has the following structure:
-     {
-         0: { # fold 0
-             "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-         },
-         1: { # fold 1
-             "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-             "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-         }
-     }
-
    Parameters
    ----------
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    num_folds : int
        number of folds for the validation split cross validation
    frac : float
@@ -126,40 +88,25 @@ def user_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: fl

    Returns
    -------
-     dict
-         dictionary with the indices of the train and validation split for the given data.
+     Iterator[TTSplit]
+         Train-Test-Split for the given data.
    """
-     # initialize a dictionary with the indices of the train and validation split for the given data
-     fold_indices = {i: {"train": np.array([]), "validation": np.array([])} for i in
-                     range(num_folds)}
-
-     # group by users and then sample from each user
-     for user, items in data.groupby("user"):
-         # if num_folds < 2, we use a holdout validation split
-         if num_folds < 2:
-             fold_indices = __holdout_validation_split(fold_indices=fold_indices,
-                                                       data=items,
-                                                       random_state=random_state,
-                                                       frac=frac)
-         # if num_folds > 1, we use a cross validation split
-         else:
-             fold_indices = __user_based_crossfold_validation_split(fold_indices=fold_indices,
-                                                                    data=items,
-                                                                    num_folds=num_folds)

-     return fold_indices
+     if num_folds < 2:
+         return __holdout_validation_split(data=data, frac=frac, random_state=random_state)
+     else:
+         return __user_based_crossfold_validation_split(data=data, num_folds=num_folds)


- def __holdout_validation_split(fold_indices: dict, data: pd.DataFrame, frac: float, random_state=42):
+
+ def __holdout_validation_split(data: Dataset, frac: float, random_state=42):
    """
-     Returns a dictionary with the indices of the train and validation split for the given data.
+     Returns a Train-Test-Split for the given data.

    Parameters
    ----------
-     fold_indices : dict
-         dictionary with the indices of the train and validation split for the given data.
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    frac : float
        fraction of the dataset to be used for the validation split. If num_folds > 1, the fraction value
        will be ignored.
@@ -168,64 +115,55 @@ def __holdout_validation_split(fold_indices: dict, data: pd.DataFrame, frac: flo

    Returns
    -------
-     dict
+     Iterator[TTSplit]
+         Train-Test-Split for the given data. Should only contain one fold.
    """
-     # sample the validation set
-     validation = data.sample(frac=frac, random_state=random_state)
-     # get the train set by dropping the validation set
-     train = data.drop(validation.index)
-     # append the indices of the train and validation set to the dictionary
-     fold_indices[0]['train'] = np.append(fold_indices[0]["train"], train.index)
-     fold_indices[0]['validation'] = np.append(fold_indices[0]["validation"], validation.index)
-     # return the dictionary
-     return fold_indices
+
+     splits = sample_records(data=data, size=int(data.interaction_count * frac), rng=random_state)
+
+     if hasattr(splits, "__iter__"):
+         return splits
+     else:
+         return iter([splits])


- def __row_based_k_fold_validation_split(fold_indices: dict, data: pd.DataFrame, num_folds: int, random_state):
+ def __row_based_k_fold_validation_split(data: Dataset, num_folds: int, random_state):
    """
-     Returns a dictionary with the indices of the row based cv train and validation split for the given data.
+     Returns a Train-Test-Split for the given data.

    Parameters
    ----------
-     fold_indices : dict
-         dictionary with the indices of the train and validation split for the given data.
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    """
+
-     # generate the indices of the train and validation split for the given data
-     for i, splits in enumerate(crossfold_records(data, partitions=num_folds, rng_spec=random_state)):
-         fold_indices[i]['train'] = np.append(fold_indices[i]["train"], splits[0].index)
-         fold_indices[i]['validation'] = np.append(fold_indices[i]["validation"], splits[1].index)
-     return fold_indices

+     splits = crossfold_records(data=data, partitions=num_folds, rng=random_state)
+     return splits
+

- def __user_based_crossfold_validation_split(fold_indices, data, num_folds) -> dict:
+
+
+ def __user_based_crossfold_validation_split(data: Dataset, num_folds) -> Iterator[TTSplit]:
    """
-     Returns a dictionary with the indices of the user based cv train and validation split for the given data.
+     Returns a Train-Test-Split for the given data.

    Parameters
    ----------
-     fold_indices : dict
-         dictionary with the indices of the train and validation split for the given data.
-     data : pd.DataFrame
-         Pandas Dataframe with the data to be split.
+     data : Dataset
+         Lenskit Dataset with the data to be split.
    num_folds : int
        number of folds for the validation split cross validation

    Returns
    -------
-     dict
+     Iterator[TTSplit]
+         Train-Test-Split for the given data.
    """
-     # generate splits of equal size
-     splits = np.array_split(data, num_folds)
-     # go through each split
-     for i in range(len(splits)):
-         # the split denoted by i is the test set, so all other splits are the train set
-         train = pd.concat(splits[:i] + splits[i + 1:], axis=0, ignore_index=False)
-         # the test data is simply the index we are currently observing
-         test = splits[i]
-         # append the indices to the dictionary
-         fold_indices[i]["train"] = np.append(fold_indices[i]["train"], train.index)
-         fold_indices[i]["validation"] = np.append(fold_indices[i]["validation"], test.index)

-     return fold_indices
+     return crossfold_users(data=data, partitions=num_folds, method=SampleFrac(0.2))
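For reference, a minimal usage sketch of the refactored validation_split (not part of the commit above). It assumes the current LensKit Dataset/splitting API, where from_interactions_df builds a Dataset from a pandas frame and each yielded TTSplit carries a train Dataset; the toy ratings frame and its column names are illustrative only.

# Usage sketch (illustrative, not part of the commit): build a small Dataset and
# iterate over the TTSplit folds produced by validation_split.
import pandas as pd
from lenskit.data import from_interactions_df  # assumed helper from the LensKit data API

ratings = pd.DataFrame({
    "user_id": [1, 1, 1, 2, 2, 2, 3, 3, 3],
    "item_id": [10, 20, 30, 10, 20, 40, 20, 30, 40],
    "rating":  [4.0, 3.5, 5.0, 2.0, 4.5, 3.0, 4.0, 2.5, 3.5],
})
dataset = from_interactions_df(ratings)

# single row-based holdout: the returned iterator yields exactly one TTSplit
for split in validation_split(dataset, strategy="row_based", num_folds=1, frac=0.25):
    print("holdout train interactions:", split.train.interaction_count)

# user-based 2-fold cross-validation: one TTSplit per fold
for fold, split in enumerate(validation_split(dataset, strategy="user_based", num_folds=2)):
    print(f"fold {fold}: train interactions = {split.train.interaction_count}")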