
Commit 5d3c451

Max committed:
Updated validation_split.py and test_validation_split.py to work with Lenskit Datasets.
Also updated the requirements in setup.py to use smac 2.3 and numpy >2.2
Parent: 7788bb7 · Commit: 5d3c451

3 files changed: 115 additions, 149 deletions


lkauto/utils/validation_split.py

Lines changed: 70 additions & 144 deletions
@@ -1,30 +1,21 @@
+from typing import Iterator
+
 import pandas as pd
-import numpy as np
-from lenskit.crossfold import partition_rows
+from lenskit.splitting import crossfold_records, crossfold_users, sample_records, SampleFrac, TTSplit
+from lenskit.data import Dataset


-def validation_split(data: pd.DataFrame, strategie: str = 'user_based', num_folds: int = 1,
-                     frac: float = 0.25, random_state=42) -> dict:
+def validation_split(data: Dataset, strategy: str = 'user_based', num_folds: int = 1,
+                     frac: float = 0.25, random_state=42) -> Iterator[TTSplit]:
     """
-    Returns a dictionary with the indices of the train and validation split for the given data.
-    The dictionary has the following structure:
-    {
-        0: {  # fold 0
-            "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-            "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-        },
-        1: {  # fold 1
-            "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-            "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        }
-    }
+    Returns the Train-Test-Split for the given Dataset

     Parameters
     ----------
-    data : pd.DataFrame
-        Pandas Dataframe with the data to be split.
-    strategie : str
-        cross validation strategie (user_based or row_based)
+    data : Dataset
+        Lenskit Dataset with the data to be split.
+    strategy : str
+        cross validation strategy (user_based or row_based)
     num_folds : int
         number of folds for the validation split cross validation
     frac : float
@@ -35,37 +26,26 @@ def validation_split(data: pd.DataFrame, strategie: str = 'user_based', num_fold

     Returns
     -------
-    dict
-        dictionary with the indices of the train and validation split for the given data.
+    Iterator[TTSplit]
+        The Train-Test-Split for the given Dataset
     """
-    # decide which validation split strategie to use
-    if strategie == 'user_based':
+    # decide which validation split strategy to use
+    if strategy == 'user_based':
         return user_based_validation_split(data=data, num_folds=num_folds, frac=frac, random_state=random_state)
-    elif strategie == 'row_based':
+    elif strategy == 'row_based':
         return row_based_validation_split(data=data, num_folds=num_folds, frac=frac, random_state=random_state)
     else:
-        raise ValueError(f"Unknown validation split strategie: {strategie}")
+        raise ValueError(f"Unknown validation split strategy: {strategy}")


-def row_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: float = 0.25, random_state=42) -> dict:
+def row_based_validation_split(data: Dataset, num_folds: int = 1, frac: float = 0.25, random_state=42) -> Iterator[TTSplit]:
     """
-    Returns a dictionary with the indices of the train and validation split for the given data.
-    The dictionary has the following structure:
-    {
-        0: {  # fold 0
-            "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-            "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-        },
-        1: {  # fold 1
-            "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-            "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        }
-    }
+    Returns a Train-Test-Split for the given data.

     Parameters
     ----------
-    data : pd.DataFrame
-        Pandas Dataframe with the data to be split.
+    data : Dataset
+        Lenskit Dataset with the data to be split.
     num_folds : int
         number of folds for the validation split cross validation
     frac : float
@@ -76,46 +56,23 @@ def row_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: flo

     Returns
     -------
-    dict
-        dictionary with the indices of the train and validation split for the given data.
+    Iterator[TTSplit]
+        Train-Test-Split for the given data.
     """
-    # initialize a dictionary with the indices of the train and validation split for the given data
-    fold_indices = {i: {"train": np.array([]), "validation": np.array([])} for i in
-                    range(num_folds)}
-    # if num_folds < 2, we use a holdout validation split
+
     if num_folds < 2:
-        fold_indices = __holdout_validation_split(fold_indices=fold_indices,
-                                                  data=data,
-                                                  frac=frac,
-                                                  random_state=random_state)
-    # if num_folds > 1, we use a cross validation split
+        return __holdout_validation_split(data=data, frac=frac, random_state=random_state)
     else:
-        fold_indices = __row_based_k_fold_validation_split(fold_indices=fold_indices,
-                                                           data=data,
-                                                           num_folds=num_folds,
-                                                           random_state=random_state)
-    return fold_indices
+        return __row_based_k_fold_validation_split(data=data, num_folds=num_folds, random_state=random_state)


-def user_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: float = 0.25, random_state=42) -> dict:
+def user_based_validation_split(data: Dataset, num_folds: int = 1, frac: float = 0.25, random_state=42) -> Iterator[
+        TTSplit]:
     """
-    Returns a dictionary with the indices of the train and validation split for the given data.
-    The dictionary has the following structure:
-    {
-        0: {  # fold 0
-            "train": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-            "validation": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
-        },
-        1: {  # fold 1
-            "train": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-            "validation": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        }
-    }
-
     Parameters
     ----------
-    data : pd.DataFrame
-        Pandas Dataframe with the data to be split.
+    data : Dataset
+        Lenskit Dataset with the data to be split.
     num_folds : int
         number of folds for the validation split cross validation
     frac : float
@@ -126,40 +83,25 @@ def user_based_validation_split(data: pd.DataFrame, num_folds: int = 1, frac: fl

     Returns
     -------
-    dict
-        dictionary with the indices of the train and validation split for the given data.
+    Iterator[TTSplit]
+        Train-Test-Split for the given data.
     """
-    # initialize a dictionary with the indices of the train and validation split for the given data
-    fold_indices = {i: {"train": np.array([]), "validation": np.array([])} for i in
-                    range(num_folds)}
-
-    # group by users and then sample from each user
-    for user, items in data.groupby("user"):
-        # if num_folds < 2, we use a holdout validation split
-        if num_folds < 2:
-            fold_indices = __holdout_validation_split(fold_indices=fold_indices,
-                                                      data=items,
-                                                      random_state=random_state,
-                                                      frac=frac)
-        # if num_folds > 1, we use a cross validation split
-        else:
-            fold_indices = __user_based_crossfold_validation_split(fold_indices=fold_indices,
-                                                                   data=items,
-                                                                   num_folds=num_folds)
-
-    return fold_indices
-
-
-def __holdout_validation_split(fold_indices: dict, data: pd.DataFrame, frac: float, random_state=42):
+
+    if num_folds < 2:
+        return __holdout_validation_split(data=data, frac=frac, random_state=random_state)
+    else:
+        return __user_based_crossfold_validation_split(data=data, num_folds=num_folds)
+
+
+
+def __holdout_validation_split(data: Dataset, frac: float, random_state=42):
     """
-    Returns a dictionary with the indices of the train and validation split for the given data.
+    Returns a Train-Test-Split for the given data.

     Parameters
     ----------
-    fold_indices : dict
-        dictionary with the indices of the train and validation split for the given data.
-    data : pd.DataFrame
-        Pandas Dataframe with the data to be split.
+    data : Dataset
+        Lenskit Dataset with the data to be split.
     frac : float
         fraction of the dataset to be used for the validation split. If num_folds > 1, the fraction value
         will be ignored.
@@ -168,64 +110,48 @@ def __holdout_validation_split(fold_indices: dict, data: pd.DataFrame, frac: flo

     Returns
     -------
-    dict
+    Iterator[TTSplit]
+        Train-Test-Split for the given data. Should only contain one fold.
     """
-    # sample the validation set
-    validation = data.sample(frac=frac, random_state=random_state)
-    # get the train set by dropping the validation set
-    train = data.drop(validation.index)
-    # append the indices of the train and validation set to the dictionary
-    fold_indices[0]['train'] = np.append(fold_indices[0]["train"], train.index)
-    fold_indices[0]['validation'] = np.append(fold_indices[0]["validation"], validation.index)
-    # return the dictionary
-    return fold_indices
-
-
-def __row_based_k_fold_validation_split(fold_indices: dict, data: pd.DataFrame, num_folds: int, random_state):
+
+    splits = sample_records(data=data, size=int(data.interaction_count * frac), rng=random_state)
+
+    if hasattr(splits, "__iter__"):
+        return splits
+    else:
+        return iter([splits])
+
+
+def __row_based_k_fold_validation_split(data: Dataset, num_folds: int, random_state):
     """
-    Returns a dictionary with the indices of the row based cv train and validation split for the given data.
+    Returns a Train-Test-Split for the given data.

     Parameters
     ----------
-    fold_indices : dict
-        dictionary with the indices of the train and validation split for the given data.
-    data : pd.DataFrame
-        Pandas Dataframe with the data to be split.
+    data : Dataset
+        Lenskit Dataset with the data to be split.
     """
-    # generate the indices of the train and validation split for the given data
-    for i, splits in enumerate(partition_rows(data, partitions=num_folds, rng_spec=random_state)):
-        fold_indices[i]['train'] = np.append(fold_indices[i]["train"], splits[0].index)
-        fold_indices[i]['validation'] = np.append(fold_indices[i]["train"], splits[1].index)
-    return fold_indices
+
+    splits = crossfold_records(data=data, partitions=num_folds, rng=random_state)
+    return splits


-def __user_based_crossfold_validation_split(fold_indices, data, num_folds) -> dict:
+
+def __user_based_crossfold_validation_split(data: Dataset, num_folds) -> Iterator[TTSplit]:
     """
-    Returns a dictionary with the indices of the user based cv train and validation split for the given data.
+    Returns a Train-Test-Split for the given data.

     Parameters
     ----------
-    fold_indices : dict
-        dictionary with the indices of the train and validation split for the given data.
-    data : pd.DataFrame
+    data : Dataset
         Pandas Dataframe with the data to be split.
     num_folds : int
         number of folds for the validation split cross validation

     Returns
     -------
-    dict
+    Iterator[TTSplit]
+        Train-Test-Split for the given data.
     """
-    # generate splits of equal size
-    splits = np.array_split(data, num_folds)
-    # go through each split
-    for i in range(len(splits)):
-        # the split denoted by i is the test set, so all other splits are the train set
-        train = pd.concat(splits[:i] + splits[i + 1:], axis=0, ignore_index=False)
-        # the test data is simply the index we are currently observing
-        test = splits[i]
-        # append the indices to the dictionary
-        fold_indices[i]["train"] = np.append(fold_indices[i]["train"], train.index)
-        fold_indices[i]["validation"] = np.append(fold_indices[i]["validation"], test.index)
-
-    return fold_indices
+
+    return crossfold_users(data=data, partitions=num_folds, method=SampleFrac(0.2))
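
The new helpers return LensKit TTSplit objects instead of index dictionaries, so callers iterate over folds and read fold.train / fold.test directly. A minimal usage sketch under that assumption; the ratings frame and the printed values are illustrative, not part of the commit, and only calls that appear in this diff (from_interactions_df, validation_split, interaction_count, to_df) are used:

import pandas as pd

from lenskit.data import from_interactions_df
from lkauto.utils.validation_split import validation_split

# hypothetical interaction frame; column names follow the test fixture below
ratings = pd.DataFrame({"user": [1, 1, 1, 2, 2, 2],
                        "item": [1, 2, 3, 1, 2, 3],
                        "rating": [1, 2, 3, 1, 2, 3]})
ds = from_interactions_df(ratings)

# num_folds=1 routes to __holdout_validation_split, which wraps the single split in an
# iterator, so the same loop works for holdout and for k-fold cross-validation
for fold in validation_split(data=ds, strategy="user_based", num_folds=1, frac=0.25, random_state=42):
    train = fold.train              # training Dataset (train.interaction_count gives its size)
    test_df = fold.test.to_df()     # held-out interactions as a DataFrame
    print(train.interaction_count, len(test_df))

Normalizing the holdout result to an iterator is what lets validation_split advertise a single Iterator[TTSplit] return type for both strategies.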

setup.py

Lines changed: 3 additions & 3 deletions
@@ -13,10 +13,10 @@
     python_requires=">=3.8, <=3.9",
     packages=find_packages(),
     install_requires=[
-        "smac~=1.4",
+        "smac~=2.3",
         "matplotlib~=3.6",
         "lenskit>=0.14.2",
-        "numpy==1.21.6",
+        "numpy>2.2",
         "tables~=3.8",
         "typing~=3.5"
     ],
@@ -30,4 +30,4 @@
     long_description_content_type="text/markdown",
     url="https://github.com/ISG-Siegen/lenskit-auto",
     project_urls={"Documentation": "https://lenskit-auto.readthedocs.io"},
-)
+)
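
Re-assembled for readability, the dependency block after this change reads as follows. This is just the hunk above laid out linearly; the package name and the setup() arguments not shown in this diff are assumptions or omitted:

from setuptools import setup, find_packages

setup(
    name="lkauto",                    # assumption: the real name is not visible in this hunk
    python_requires=">=3.8, <=3.9",
    packages=find_packages(),
    install_requires=[
        "smac~=2.3",
        "matplotlib~=3.6",
        "lenskit>=0.14.2",
        "numpy>2.2",
        "tables~=3.8",
        "typing~=3.5"
    ],
)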

tests/utils/test_validation_split.py

Lines changed: 42 additions & 2 deletions
@@ -4,21 +4,61 @@
 import pandas as pd

 from lkauto.utils.validation_split import validation_split
+from lenskit.data import from_interactions_df


 class TestValidationSplit(unittest.TestCase):

     def setUp(self):
-        self.df = pd.DataFrame(np.ones((100, 3)), columns=["user", "item", "rating", ])
+        self.df = pd.DataFrame(np.array([[1, 1, 1],
+                                         [1, 2, 2],
+                                         [1, 3, 3],
+                                         [2, 1, 1],
+                                         [2, 2, 2],
+                                         [2, 3, 3],
+                                         [3, 1, 1],
+                                         [3, 2, 2],
+                                         [3, 3, 3],
+                                         [4, 1, 1],
+                                         [4, 2, 2],
+                                         [4, 3, 3],
+                                         [5, 1, 1],
+                                         [5, 2, 2],
+                                         [5, 3, 3]]), columns=["user", "item", "rating", ])
+        self.ds = from_interactions_df(self.df)

+    """
     def test_validationSplit_givenValidDataFrame_correctSplitTrainAndValidationDataframesReturnedExpected(self):
-        val_fold_indices = validation_split(data=self.df, frac=0.25, random_state=42)
+        val_fold_indices = validation_split(data=self.ds, frac=0.25, random_state=42)

         validation_train = self.df.loc[val_fold_indices[0]["train"], :]
         validation_test = self.df.loc[val_fold_indices[0]["validation"], :]

         self.assertTrue(validation_train.shape == (75, 3))
         self.assertTrue(validation_test.shape == (25, 3))
+    """
+
+    def test_validationSplit_givenValidDataset_1Fold_UserBased(self):
+        splits = validation_split(data=self.ds, strategy="user_based", frac=0.2, num_folds=1, random_state=42)
+
+        fold = next(splits)
+        test_sample_fold = fold.test
+        train_sample_fold = fold.train
+
+        self.assertTrue(test_sample_fold.to_df().shape[0] == 3)
+        self.assertTrue(train_sample_fold.interaction_count == 12)
+
+    def test_validationSplit_givenValidDataset_3Fold_RowBased(self):
+        splits = validation_split(data=self.ds, strategy="row_based", frac=0.2, num_folds=3, random_state=42)
+
+        fold = next(splits)
+        test_sample_fold = fold.test
+        train_sample_fold = fold.train
+
+        self.assertTrue(test_sample_fold.to_df().shape[0] == 5)
+        self.assertTrue(train_sample_fold.interaction_count == 10)
+
+


 if __name__ == '__main__':
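
The assertion values in the two new tests follow from the setUp fixture: 15 interactions in total, 5 users with 3 items each. A quick sketch of that arithmetic, mirroring the code paths exercised above (illustrative only, not part of the commit):

n_interactions = 15                              # setUp fixture: 5 users x 3 items

# user_based with num_folds=1 -> __holdout_validation_split -> sample_records(size=int(15 * 0.2))
holdout_test = int(n_interactions * 0.2)         # 3 test records
holdout_train = n_interactions - holdout_test    # 12 training records

# row_based with num_folds=3 -> crossfold_records partitions the records into 3 folds
fold_test = n_interactions // 3                  # 5 test records per fold
fold_train = n_interactions - fold_test          # 10 training records per fold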
