creating integration test cases

rafaelgreca · rafaelgreca · commit ad24076a7140 · 2024-11-12T21:29:25.000-03:00
diff --git a/data/download_data.sh b/data/download_data.sh
@@ -19,7 +19,7 @@ function parse_yaml {
 }
 
 # setting important variables
-eval $(parse_yaml ../credentials.yaml "CONFIG_")
+eval $(parse_yaml ../config/credentials.yaml "CONFIG_")
 
 # defining important variables
 export KAGGLE_USERNAME="$CONFIG_KAGGLE_USERNAME"
diff --git a/src/model/inference.py b/src/model/inference.py
@@ -62,8 +62,12 @@ def predict(self, x: np.ndarray, transform_to_str: bool = True) -> np.ndarray:
         """
         prediction = self.model.predict(x)
 
+        print(prediction.shape)
+
         if transform_to_str:
             prediction = label_encoder.inverse_transform(prediction)
+        else:
+            prediction = np.max(prediction, axis=1)
 
         logger.info(f"Prediction: {prediction}.")
         return prediction
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
diff --git a/tests/integration/test_data_processing.py b/tests/integration/test_data_processing.py
@@ -0,0 +1,30 @@
+import pathlib
+
+import pandas as pd
+import numpy as np
+
+from src.config.settings import general_settings
+from src.config.model import model_settings
+from src.data.processing import data_processing_inference, load_dataset
+
+
+# Load the raw dataset that was used to train the model. This happens once,
+# when the module is imported; the test below works on a copy of it.
+# Wrapping DATA_PATH in `Path(...)` keeps this working whether the setting
+# is a `pathlib.Path` or a plain string.
+dataset = load_dataset(
+    path=pathlib.Path(general_settings.DATA_PATH) / general_settings.RAW_FILE_NAME
+)
+
+def test_data_processing_pipeline():
+    """
+    Integration check: the raw dataset, minus the target column, must pass through
+    `data_processing_inference` and yield an array with one column per model feature.
+    """
+    features_frame = dataset.copy().drop(columns=general_settings.TARGET_COLUMN)
+
+    processed = data_processing_inference(dataframe=features_frame)
+
+    assert isinstance(features_frame, pd.DataFrame)
+    assert isinstance(processed, np.ndarray)
+    assert processed.shape[1] == len(model_settings.FEATURES)
diff --git a/tests/integration/test_model_inference.py b/tests/integration/test_model_inference.py
@@ -0,0 +1,52 @@
+import pathlib
+
+import pandas as pd
+import numpy as np
+
+from src.config.settings import general_settings
+from src.config.model import model_settings
+from src.data.processing import data_processing_inference, load_dataset
+from src.model.inference import ModelServe
+
+# Load the raw dataset that was used to train the model. This happens once,
+# when the module is imported; the test below works on a copy of it.
+# Wrapping DATA_PATH in `Path(...)` keeps this working whether the setting
+# is a `pathlib.Path` or a plain string.
+dataset = load_dataset(
+    path=pathlib.Path(general_settings.DATA_PATH) / general_settings.RAW_FILE_NAME
+)
+
+def test_model_inference_pipeline():
+    """
+    Testing the integration of the entire model inference pipeline.
+
+    The raw dataset (without the target column) is pushed through the
+    inference data processing, the configured model is loaded through
+    `ModelServe`, and the numeric predictions are checked for type,
+    length and dtype.
+    """
+    _dataset = dataset.copy()
+    _dataset = _dataset.drop(columns=general_settings.TARGET_COLUMN)
+
+    X = data_processing_inference(dataframe=_dataset)
+
+    assert isinstance(_dataset, pd.DataFrame)
+    assert isinstance(X, np.ndarray)
+    assert X.shape[1] == len(model_settings.FEATURES)
+
+    loaded_model = ModelServe(
+        model_name=model_settings.MODEL_NAME,
+        model_flavor=model_settings.MODEL_FLAVOR,
+        model_version=model_settings.VERSION,
+    )
+    loaded_model.load()
+
+    assert loaded_model.model is not None
+
+    predictions = loaded_model.predict(X, transform_to_str=False)
+
+    assert isinstance(predictions, np.ndarray)
+    assert predictions.shape[0] == X.shape[0]
+    # Compare the dtype itself. `isinstance(dtype, type(np.dtype(...)))` only
+    # compares dtype *classes*; on NumPy releases without per-dtype classes
+    # that class is the shared `numpy.dtype`, so any dtype would pass.
+    assert predictions.dtype == np.dtype("float64")
+
+    # TODO: also cover `transform_to_str=True` once that path is fixed.
+    # NOTE(review): the disabled draft below was corrected in one place: the
+    # last check now inspects the element itself instead of `type(...)`,
+    # which can never be a `str`. `List` is not imported in this module, so
+    # it must be imported (or the expected container type confirmed) before
+    # the draft is re-enabled.
+    # predictions = loaded_model.predict(X, transform_to_str=True)
+
+    # assert isinstance(predictions, List)
+    # assert len(predictions) == X.shape[0]
+    # assert isinstance(predictions[0], str)
diff --git a/tests/unit/test_model_functions.py b/tests/unit/test_model_functions.py
@@ -104,7 +104,7 @@ def test_model_performance() -> None:
     )
     y_train = np.max(y_train, axis=1)
 
-    train_predictions = np.max(loaded_model.predict(X_train, transform_to_str=False), axis=1)
+    train_predictions = loaded_model.predict(X_train, transform_to_str=False)
     train_score = f1_score(y_true=y_train, y_pred=train_predictions, average="weighted")
 
     X_valid = load_feature(
@@ -117,7 +117,7 @@ def test_model_performance() -> None:
     )
     y_valid = np.max(y_valid, axis=1)
 
-    valid_predictions = np.max(loaded_model.predict(X_valid, transform_to_str=False), axis=1)
+    valid_predictions = loaded_model.predict(X_valid, transform_to_str=False)
     valid_score = f1_score(y_true=y_valid, y_pred=valid_predictions, average="weighted")
 
     assert train_score == train_score

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@ function parse_yaml {`
`19`	`19`	`}`
`20`	`20`
`21`	`21`	`# setting important variables`
`22`		`-eval $(parse_yaml ../credentials.yaml "CONFIG_")`
	`22`	`+eval $(parse_yaml ../config/credentials.yaml "CONFIG_")`
`23`	`23`
`24`	`24`	`# defining important variables`
`25`	`25`	`export KAGGLE_USERNAME="$CONFIG_KAGGLE_USERNAME"`
Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,7 @@ def test_model_performance() -> None:`
`104`	`104`	`)`
`105`	`105`	`y_train = np.max(y_train, axis=1)`
`106`	`106`
`107`		`- train_predictions = np.max(loaded_model.predict(X_train, transform_to_str=False), axis=1)`
	`107`	`+ train_predictions = loaded_model.predict(X_train, transform_to_str=False)`
`108`	`108`	`train_score = f1_score(y_true=y_train, y_pred=train_predictions, average="weighted")`
`109`	`109`
`110`	`110`	`X_valid = load_feature(`
`@@ -117,7 +117,7 @@ def test_model_performance() -> None:`
`117`	`117`	`)`
`118`	`118`	`y_valid = np.max(y_valid, axis=1)`
`119`	`119`
`120`		`- valid_predictions = np.max(loaded_model.predict(X_valid, transform_to_str=False), axis=1)`
	`120`	`+ valid_predictions = loaded_model.predict(X_valid, transform_to_str=False)`
`121`	`121`	`valid_score = f1_score(y_true=y_valid, y_pred=valid_predictions, average="weighted")`
`122`	`122`
`123`	`123`	`assert train_score == train_score`