
Commit baf96d7

adding logging for the code source

1 parent 665be88 · commit baf96d7

7 files changed: +84 −30 lines

.gitignore
Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
 ipynb_checkpoints/
-mlruns/
-mlartifacts/
+mlruns
+mlartifacts
 *.csv
 
 # Byte-compiled / optimized / DLL files
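Note on this change: dropping the trailing slash broadens the patterns — "mlruns/" matches only directories, while "mlruns" matches both files and directories of that name (and likewise for "mlartifacts").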

config/settings.yaml
Lines changed: 3 additions & 1 deletion

@@ -11,4 +11,6 @@ DATA_PATH: '/media/greca/HD/GitHub/e2e-mlops-project/data/'
 RAW_FILE_NAME: 'Original_ObesityDataSet.csv'
 ARTIFACTS_PATH: '/media/greca/HD/GitHub/e2e-mlops-project/models/artifacts/'
 FEATURES_PATH: '/media/greca/HD/GitHub/e2e-mlops-project/models/features/'
-TARGET_COLUMN: 'NObeyesdad'
+TARGET_COLUMN: 'NObeyesdad'
+LOG_LEVEL: 'INFO'
+LOG_PATH: '/media/greca/HD/GitHub/e2e-mlops-project/'
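The TARGET_COLUMN line is removed and re-added with identical text; this usually indicates a newline-at-end-of-file change, since two new keys were appended after what used to be the last line.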

src/config/settings.py
Lines changed: 14 additions & 0 deletions

@@ -1,4 +1,7 @@
+import os
 from pathlib import Path
+
+from loguru import logger
 from pydantic import BaseModel, DirectoryPath
 
 from .utils import read_yaml_credentials_file
@@ -15,6 +18,8 @@ class GeneralSettings(BaseModel):
     ARTIFACTS_PATH: DirectoryPath
     FEATURES_PATH: DirectoryPath
     TARGET_COLUMN: str
+    LOG_LEVEL: str
+    LOG_PATH: DirectoryPath
 
 general_settings = GeneralSettings(
     **read_yaml_credentials_file(
@@ -25,3 +30,12 @@ class GeneralSettings(BaseModel):
         file_name="settings.yaml",
    )
 )
+
+os.makedirs(general_settings.LOG_PATH, exist_ok=True)
+logger.remove()
+logger.add(
+    Path.joinpath(general_settings.LOG_PATH, "logs", "app.log"),
+    rotation="1 day",
+    retention="7 days",
+    compression="zip"
+)
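Note: the new LOG_LEVEL setting is parsed and validated here but never passed to the sink, so the file handler keeps loguru's default minimum level (DEBUG). A minimal sketch of how it could be wired in — the keyword names are loguru's documented parameters, and the level line is the assumed addition, not part of this commit:

    logger.add(
        general_settings.LOG_PATH / "logs" / "app.log",  # pathlib's "/" operator; same target as above
        level=general_settings.LOG_LEVEL,  # e.g. 'INFO' from config/settings.yaml (assumed wiring)
        rotation="1 day",      # start a new log file daily
        retention="7 days",    # keep one week of rotated files
        compression="zip",     # zip files on rotation
    )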

src/data/processing.py
Lines changed: 24 additions & 2 deletions

@@ -1,12 +1,16 @@
-import pandas as pd
-import numpy as np
 import pathlib
 from typing import List, Dict, Tuple
+
+import numpy as np
+import pandas as pd
+from loguru import logger
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
+
 from .utils import custom_combiner, load_feature
 from ..config.settings import general_settings
 from ..config.model import model_settings
 
+
 def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
     """Applies the data processing pipeline.
 
@@ -17,18 +21,26 @@ def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
         Tuple[np.ndarray, np.ndarray]: the features and labels array, respectively.
     """
     # First step) removing duplicates, changing the height unit, removing outliers
+    logger.info("Removing duplicates from the dataset.")
     dataframe = _remove_duplicates(dataframe)
+
+    logger.info("Changing the height units to centimeters.")
     dataframe = _change_height_units(dataframe)
+
+    logger.info("Removing outliers from the dataset.")
     dataframe = _remove_outliers(dataframe)
 
     # Feature engineering step)
     # Creating the BMI feature
+    logger.info("Creating a new column for the BMI values from the data samples.")
     dataframe = _create_bmi_feature(dataframe)
 
     # Creating the BMR feature
+    logger.info("Creating a new column for the BMR values from the data samples.")
     dataframe = _create_bmr_feature(dataframe)
 
     # Creating the IS feature
+    logger.info("Creating a new column for the IS values from the data samples.")
     dataframe = _create_is_feature(dataframe)
 
     # Feature transformation step)
@@ -41,9 +53,11 @@ def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
         "NCP",
         "CH2O",
     ]
+    logger.info(f"Dropping the columns {columns_to_drop}.")
     dataframe = _drop_features(dataframe=dataframe, features=columns_to_drop)
 
     # Transforming the AGE and IS columns into a categorical columns
+    logger.info("Categorizing the numerical columns ('Age' and 'IS').")
     dataframe = _categorize_numerical_columns(dataframe)
 
     # Transforming (Log Transformation) numerical columns
@@ -69,12 +83,15 @@ def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
 
     # Selecting only the features that are important for the model
     dataframe = dataframe[model_settings.FEATURES + [general_settings.TARGET_COLUMN]]
+    logger.info(f"Filtering the features columns, keeping only {model_settings.FEATURES} columns.")
 
     # Splitting the data into X (features) and y (label)
+    logger.info("Splitting the dataset into X and y arrays.")
     X = dataframe.drop(columns=[general_settings.TARGET_COLUMN]).values
     y = dataframe[general_settings.TARGET_COLUMN].values
 
     # Encoding the labels array
+    logger.info(f"Encoding the target column ({general_settings.TARGET_COLUMN}).")
     label_encoder = load_feature(
         path=general_settings.ARTIFACTS_PATH,
         feature_name='label_ohe'
@@ -252,6 +269,7 @@ def _transform_numerical_columns(
         pd.DataFrame: the dataframe with all numerical columns transformed.
     """
     numerical_columns = dataframe.select_dtypes(exclude="object").columns.tolist()
+    logger.info(f"Applying Log Transformation to the {numerical_columns} columns.")
 
     for nc in numerical_columns:
         dataframe[nc] = np.log1p(dataframe[nc].values + epsilon)
@@ -273,6 +291,7 @@ def _scale_numerical_columns(
         pd.DataFrame: the dataframe with all numerical columns encoded.
     """
     numerical_columns = dataframe.select_dtypes(exclude="object").columns.tolist()
+    logger.info(f"Scaling the {numerical_columns} columns.")
 
     for nc in numerical_columns:
         dataframe[nc] = sc[nc].transform(dataframe[nc].values.reshape(-1, 1))
@@ -297,6 +316,8 @@ def _encode_categorical_columns(
     """
     categorical_columns = dataframe.select_dtypes(include="object").columns.tolist()
     categorical_columns.remove(target_column)
+    logger.info(f"Encoding the {categorical_columns} columns.")
+
     new_dataframe = pd.DataFrame()
 
     for cc in categorical_columns:
@@ -334,4 +355,5 @@ def load_dataset(path: pathlib.Path) -> pd.DataFrame:
     Returns:
         pd.DataFrame: the dataframe.
     """
+    logger.info(f"Loading dataset from path {path}.")
     return pd.read_csv(path, sep=",")
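A side note on the message style used throughout this file: f-strings are evaluated eagerly, so the interpolated column lists are built even when a sink would filter INFO out. loguru's documented mechanism for deferring expensive arguments is opt(lazy=True) with callables — a sketch, not part of this commit:

    # Eager: the f-string is rendered at the call site in every case.
    logger.info(f"Scaling the {numerical_columns} columns.")

    # Deferred: the lambda runs only if some sink actually accepts the record.
    logger.opt(lazy=True).info("Scaling the {} columns.", lambda: numerical_columns)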

src/data/utils.py
Lines changed: 5 additions & 2 deletions

@@ -1,10 +1,11 @@
 import os
 import pathlib
 import joblib
-import numpy as np
 from typing import Union
 
 import boto3
+import numpy as np
+from loguru import logger
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
 
 from ..config.aws import aws_credentials
@@ -24,6 +25,7 @@ def load_feature(
     Returns:
         Union[np.ndarray, StandardScaler, OneHotEncoder]: the feature's content.
     """
+    logger.info(f"Loading feature/encoder/scaler from file {path}.")
     return joblib.load(pathlib.PosixPath.joinpath(path, f"{feature_name}.pkl"))
 
 
@@ -40,7 +42,7 @@ def custom_combiner(feature, category) -> str:
     """
     return str(category)
 
-
+@logger.catch
 def download_dataset(
     name: str,
 ) -> None:
@@ -52,6 +54,7 @@ def download_dataset(
     kaggle_user = kaggle_credentials.KAGGLE_USERNAME
     kaggle_key = kaggle_credentials.KAGGLE_KEY
     path = '../data/'
+    logger.info(f"Downloading dataset {name} and saving into the folder {path}.")
 
     # Downloading data using the Kaggle API through the terminal
     os.system(f'export KAGGLE_USERNAME={kaggle_user}; export KAGGLE_KEY={kaggle_key};')
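Note on the new @logger.catch decorator: by default it logs the exception with a full traceback and does not re-raise (reraise=False), so a failing download_dataset now returns None silently to its caller. If propagation is still wanted, the decorator accepts a reraise flag — a sketch, not part of this commit:

    @logger.catch(reraise=True)  # log the traceback, then let the exception propagate
    def download_dataset(name: str) -> None:
        ...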

src/main.py
Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
+import pathlib
+
+from .data.processing import load_dataset, data_processing
+from .data.utils import custom_combiner
+from .config.settings import general_settings
+from .config.model import model_settings
+from .model import Model
+
+if __name__ == "__main__":
+    dataset = load_dataset(
+        path=pathlib.Path.joinpath(general_settings.DATA_PATH, general_settings.RAW_FILE_NAME)
+    )
+    X, y = data_processing(dataset)
+
+    loaded_model = Model(
+        model_name=model_settings.MODEL_NAME,
+        model_flavor=model_settings.MODEL_FLAVOR,
+        model_version=model_settings.VERSION,
+    )
+    loaded_model.load()
+
+
+    print(loaded_model.predict(X))
+    print(loaded_model.score(X, y))
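Note: because src/main.py uses package-relative imports (from .data.processing ...), it presumably has to be executed as a module from the repository root — e.g. python -m src.main — rather than as a plain script, which would raise an ImportError.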

src/model.py
Lines changed: 12 additions & 23 deletions

@@ -1,12 +1,10 @@
 import mlflow
 import numpy as np
-import pathlib
+from loguru import logger
 from sklearn.metrics import f1_score
 
 from .config.model import model_settings
-from .data.processing import load_dataset, data_processing
-from .data.utils import custom_combiner
-from .config.settings import general_settings
+
 
 class Model:
     """The trained model's class.
@@ -29,17 +27,21 @@ def __init__(
         self.model_version = model_version
         self.model = None
 
+    @logger.catch
     def load(self) -> None:
         """Loads the trained model.
 
         Raises:
             NotImplementedError: raises NotImplementedError if the model's
             flavor value is not 'xgboost'.
         """
+        logger.info(f"Loading the model {model_settings.MODEL_NAME} from run ID {model_settings.RUN_ID}.")
+
         if self.model_flavor == "xgboost":
             model_uri = f"runs:/{model_settings.RUN_ID}/{model_settings.MODEL_NAME}"
             self.model = mlflow.xgboost.load_model(model_uri)
         else:
+            logger.critical(f"Couldn't load the model using the flavor {model_settings.MODEL_FLAVOR}.")
             raise NotImplementedError()
 
     def predict(self, x: np.ndarray) -> np.ndarray:
@@ -51,7 +53,9 @@ def predict(self, x: np.ndarray) -> np.ndarray:
         Returns:
             np.ndarray: the predictions array.
         """
-        return np.argmax(self.model.predict(x), axis=1)
+        prediction = np.argmax(self.model.predict(x), axis=1)
+        logger.info(f"Prediction: {prediction}.")
+        return prediction
 
     def score(self, x: np.ndarray, y: np.ndarray) -> float:
         """Calculates the F1-Score of a trained model given a pair of features
@@ -67,21 +71,6 @@ def score(self, x: np.ndarray, y: np.ndarray) -> float:
         prediction = self.predict(x).reshape(-1)
         _y = np.argmax(y, axis=1).reshape(-1)
 
-        return f1_score(y_true=_y, y_pred=prediction, average="weighted")
-
-# if __name__ == "__main__":
-#     dataset = load_dataset(
-#         path=pathlib.Path.joinpath(general_settings.DATA_PATH, general_settings.RAW_FILE_NAME)
-#     )
-#     X, y = data_processing(dataset)
-
-#     loaded_model = Model(
-#         model_name=model_settings.MODEL_NAME,
-#         model_flavor=model_settings.MODEL_FLAVOR,
-#         model_version=model_settings.VERSION,
-#     )
-#     loaded_model.load()
-
-#     print(loaded_model.predict(X))
-#     print(loaded_model.score(X, y))
+        score = f1_score(y_true=_y, y_pred=prediction, average="weighted")
+        logger.info(f"Achieved a weighted F1-Score of {score}.")
+        return score
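One caveat on the predict() change: logging the full prediction array at INFO can bloat logs/app.log for large batches. A hedged alternative, using the same names as the method above, is to log only a summary:

    prediction = np.argmax(self.model.predict(x), axis=1)
    logger.info(f"Generated {prediction.shape[0]} predictions.")  # summary instead of the full array
    return prediction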
