Merge pull request #92 from nlesc-nano/dev

felipeZ · web-flow · commit b75afeae6421 · 2021-06-18T14:48:58.000+02:00
Minor clean
diff --git a/scripts/predict_gp.py b/scripts/predict_gp.py
@@ -30,7 +30,6 @@
 researcher.load_model("swan_chk.pt")
 
 fingers = data.fingerprints
-print("shape fingers: ", fingers.shape)
 predicted = researcher.predict(fingers)
 df = pd.DataFrame(
     {"smiles": data.dataframe.smiles, "mean": predicted.mean, "lower": predicted.lower, "upper": predicted.upper})
diff --git a/swan/modeller/base_modeller.py b/swan/modeller/base_modeller.py
@@ -67,3 +67,34 @@ def load_model(self, path_model: Optional[PathLike]) -> None:
     def save_model(self, *args, **kwargs):
         """Store the trained model."""
         raise NotImplementedError
+
+    def store_trainset_in_state(self, indices: T_co, ntrain: int, store_features: bool = True) -> None:
+        """Store the partition (indices, smiles, labels and features) into the state file.
+
+        Parameters
+        ----------
+        indices
+            Indices of the samples, the first ``ntrain`` entries are used as the training set
+            and the remaining ones as the validation set.
+        ntrain
+            Number of samples used for training.
+        store_features
+            Whether the train/validation features are also stored.
+        """
+        def to_numpy(data):
+            # ``Tensor.numpy`` fails for tensors that require grad or live on the GPU
+            if isinstance(data, torch.Tensor):
+                return data.detach().cpu().numpy()
+            return data
+
+        self.state.store_array("indices", indices, "int")
+        self.state.store_array("ntrain", ntrain, "int")
+        self.state.store_array("smiles_train", self.smiles[indices[:ntrain]], dtype="str")
+        self.state.store_array("smiles_validate", self.smiles[indices[ntrain:]], dtype="str")
+
+        # The labels are always stored, the features only on request
+        names = ("labels", "features") if store_features else ("labels",)
+        for name in names:
+            for suffix in ("trainset", "validset"):
+                key = f"{name}_{suffix}"
+                self.state.store_array(key, to_numpy(getattr(self, key)))
diff --git a/swan/modeller/gp_modeller.py b/swan/modeller/gp_modeller.py
@@ -68,16 +68,7 @@ def split_data(self, partition: SplitDataset) -> None:
             self.labels_validset = partition.labels_validset
             warnings.warn("The labels have not been scaled. Is this the intended behavior?", UserWarning)
 
-        indices = partition.indices
-        ntrain = partition.ntrain
-        self.state.store_array("smiles_train", self.smiles[indices[:ntrain]], dtype="str")
-        self.state.store_array("smiles_validate", self.smiles[indices[ntrain:]], dtype="str")
-        self.state.store_array("features_trainset", self.features_trainset.numpy())
-        self.state.store_array("features_validset", self.features_validset.numpy())
-        self.state.store_array("labels_trainset", self.labels_trainset.numpy())
-        self.state.store_array("labels_validset", self.labels_validset.numpy())
-        self.state.store_array("indices", indices, "int")
-        self.state.store_array("ntrain", ntrain, "int")
+        self.store_trainset_in_state(partition.indices, partition.ntrain)
 
     def train_model(self,
                     nepoch: int,
@@ -174,8 +165,6 @@ def predict(self, inp_data: Tensor) -> GPMultivariate:
         self.network.likelihood.eval()
 
         with torch.no_grad(), gp.settings.fast_pred_var():
-            first = self.network(inp_data)
-            print(first.mean)
             output = self.network.likelihood(self.network(inp_data))
         return self._create_result_object(output)
 
diff --git a/swan/modeller/scikit_modeller.py b/swan/modeller/scikit_modeller.py
@@ -69,10 +69,7 @@ def split_data(self, frac: Tuple[float, float]) -> None:
         self.labels_validset = partition.labels_validset
 
         # Split the smiles using the same partition than the features
-        indices = partition.indices
-        ntrain = partition.ntrain
-        self.state.store_array("smiles_train", self.smiles[indices[:ntrain]], dtype="str")
-        self.state.store_array("smiles_validate", self.smiles[indices[ntrain:]], dtype="str")
+        self.store_trainset_in_state(partition.indices, partition.ntrain)
 
     def save_model(self):
         """Store the trained model."""
@@ -105,7 +102,7 @@ def predict(self, inp_data: np.ndarray) -> np.ndarray:
         -------
         Array containing the predicted results
         """
-        return self.model.predict(inp_data)
+        return self.inverse_transform(self.model.predict(inp_data))
 
     def inverse_transform(self, arr: np.ndarray) -> np.ndarray:
         """Unscale ``arr`` using the fitted scaler.
diff --git a/swan/modeller/torch_modeller.py b/swan/modeller/torch_modeller.py
@@ -121,10 +121,12 @@ def split_data(self, frac: Tuple[float, float], batch_size: int):
         """
         # create the dataloader
         indices_train, indices_validate = self.data.create_data_loader(frac=frac, batch_size=batch_size)
-
-        # Store the smiles used for training and validation
-        self.state.store_array("smiles_train", self.smiles[indices_train], dtype="str")
-        self.state.store_array("smiles_validate", self.smiles[indices_validate], dtype="str")
+        self.labels_trainset = self.data.labels[indices_train]
+        self.labels_validset = self.data.labels[indices_validate]
+
+        # The training indices go first, hence ``ntrain`` must be the number of *training* samples
+        indices = np.concatenate((indices_train, indices_validate))
+        self.store_trainset_in_state(indices, len(indices_train), store_features=False)
 
     def train_model(self,
                     nepoch: int,
diff --git a/swan/state/state.py b/swan/state/state.py
@@ -1,7 +1,7 @@
 """Module to interact with HDF5."""
 
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, List, Optional, Union
 
 import h5py
 import numpy as np
@@ -22,7 +22,7 @@ def __init__(self, path_hdf5: Optional[PathLike] = None, replace_state: bool = F
         if not self.path.exists():
             self.path.touch()
 
-    def has_data(self, data: ArrayLike) -> bool:
+    def has_data(self, data: Union[str, List[str]]) -> bool:
         """Search if the node exists in the HDF5 file.
 
         Parameters
diff --git a/tests/test_mpnn.py b/tests/test_mpnn.py
@@ -18,7 +18,7 @@ def setUp(self):
         self.data = PATH_TEST / "thousand.csv"
         self.data = TorchGeometricGraphData(self.data, properties=["Hardness (eta)"])
         self.net = MPNN()
-        self.modeller = TorchModeller(self.net, self.data)
+        self.modeller = TorchModeller(self.net, self.data, replace_state=True)
 
     def test_train_data_mpnn(self):
 
diff --git a/tests/test_scikit_models.py b/tests/test_scikit_models.py
@@ -7,30 +7,45 @@
 
 from .utils_test import PATH_TEST
 
-DATA = FingerprintsData(PATH_TEST / "thousand.csv", properties=["Hardness (eta)"], sanitize=False)
-DATA.scale_labels()
-
 
 def run_test(model: str, **kwargs):
     """Run the training and validation step for the given model."""
-    modeller = SKModeller(model, DATA)
+    data = FingerprintsData(PATH_TEST / "thousand.csv", properties=["Hardness (eta)"], sanitize=False)
+    data.scale_labels()
+    modeller = SKModeller(model, data)
     modeller.train_model()
     predicted, expected = modeller.validate_model()
     reg = stats.linregress(predicted.flatten(), expected.flatten())
     assert not np.isnan(reg.rvalue)
 
 
+def run_prediction(model: str):
+    """Load a stored model and check that none of its predictions is NaN."""
+    data = FingerprintsData(PATH_TEST / "smiles.csv", sanitize=False)
+    modeller = SKModeller(model, data)
+    modeller.load_model("swan_skmodeller.pkl")
+    modeller.data.load_scale()
+    predicted = modeller.predict(data.fingerprints)
+    assert not np.isnan(predicted).any()
+
+
 def test_decision_tree():
     """Check the interface to the Decisiontree class."""
-    run_test("decision_tree")
+    model = "decision_tree"
+    run_test(model)
+    run_prediction(model)
 
 
 def test_svm():
     """Check the interface to the support vector machine."""
-    run_test("svm")
+    model = "svm"
+    run_test(model)
+    run_prediction(model)
 
 
 def test_gaussian_process():
     """Check the interface to the support vector machine."""
     kernel = ConstantKernel(constant_value=10)
-    run_test("gaussian_process", kernel=kernel)
+    model = "gaussian_process"
+    run_test(model, kernel=kernel)
+    run_prediction(model)
diff --git a/tests/test_se3_transformer.py b/tests/test_se3_transformer.py
@@ -15,12 +15,12 @@
 torch.set_default_dtype(torch.float32)
 
 CSV_FILE = PATH_TEST / "thousand.csv"
-DATA = DGLGraphData(CSV_FILE, properties=["Hardness (eta)"])
+DATA = DGLGraphData(CSV_FILE, properties=["Hardness (eta)"], optimize_molecule=True)
 
 
 def run_modeller(net: torch.nn.Module):
     """Run a given model."""
-    modeller = TorchModeller(net, DATA, use_cuda=False, replace_state=False)
+    modeller = TorchModeller(net, DATA, use_cuda=False, replace_state=True)
 
     modeller.data.scale_labels()
     modeller.train_model(nepoch=1, batch_size=64)
diff --git a/tests/test_state.py b/tests/test_state.py
@@ -25,11 +25,14 @@ def test_state(tmp_path: Path, capsys):
     out, _ = capsys.readouterr()
     assert "Available data" in out
 
+    assert not any(state.has_data(f"non_existing_{i}") for i in range(2))
+
 
 def test_state_unknown_key(tmp_path: Path):
     """Check that an error is raised if there is not data."""
     path_hdf5 = tmp_path / "swan_state.h5"
-    state = StateH5(path_hdf5)
+    path_hdf5.touch()
+    state = StateH5(path_hdf5, replace_state=True)
 
     with pytest.raises(KeyError):
         state.retrieve_data("nonexisting property")
@@ -46,3 +49,5 @@ def store_smiles_in_state(tmp_path: Path):
     state.store_array("smiles", smiles, "str")
     data = [x.decode() for x in state.retrieve_data("smiles")]
     assert data == smiles.tolist()
+
+