
Commit baf96d7

adding logging for the code source

1 parent 665be88 · commit baf96d7

7 files changed: +84 −30 lines

.gitignore
Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
 ipynb_checkpoints/
-mlruns/
-mlartifacts/
+mlruns
+mlartifacts
 *.csv
 
 # Byte-compiled / optimized / DLL files
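Note on this change: dropping the trailing slash broadens the patterns — "mlruns/" matches only directories, while "mlruns" matches both files and directories of that name (and likewise for "mlartifacts").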

config/settings.yaml
Lines changed: 3 additions & 1 deletion

@@ -11,4 +11,6 @@ DATA_PATH: '/media/greca/HD/GitHub/e2e-mlops-project/data/'
 RAW_FILE_NAME: 'Original_ObesityDataSet.csv'
 ARTIFACTS_PATH: '/media/greca/HD/GitHub/e2e-mlops-project/models/artifacts/'
 FEATURES_PATH: '/media/greca/HD/GitHub/e2e-mlops-project/models/features/'
-TARGET_COLUMN: 'NObeyesdad'
+TARGET_COLUMN: 'NObeyesdad'
+LOG_LEVEL: 'INFO'
+LOG_PATH: '/media/greca/HD/GitHub/e2e-mlops-project/'
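The TARGET_COLUMN line is removed and re-added with identical text; this usually indicates a newline-at-end-of-file change, since two new keys were appended after what used to be the last line.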

src/config/settings.py
Lines changed: 14 additions & 0 deletions

@@ -1,4 +1,7 @@
+import os
 from pathlib import Path
+
+from loguru import logger
 from pydantic import BaseModel, DirectoryPath
 
 from .utils import read_yaml_credentials_file
@@ -15,6 +18,8 @@ class GeneralSettings(BaseModel):
     ARTIFACTS_PATH: DirectoryPath
     FEATURES_PATH: DirectoryPath
     TARGET_COLUMN: str
+    LOG_LEVEL: str
+    LOG_PATH: DirectoryPath
 
 general_settings = GeneralSettings(
     **read_yaml_credentials_file(
@@ -25,3 +30,12 @@ class GeneralSettings(BaseModel):
         file_name="settings.yaml",
    )
 )
+
+os.makedirs(general_settings.LOG_PATH, exist_ok=True)
+logger.remove()
+logger.add(
+    Path.joinpath(general_settings.LOG_PATH, "logs", "app.log"),
+    rotation="1 day",
+    retention="7 days",
+    compression="zip"
+)
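Note: the new LOG_LEVEL setting is parsed and validated here but never passed to the sink, so the file handler keeps loguru's default minimum level (DEBUG). A minimal sketch of how it could be wired in — the keyword names are loguru's documented parameters, and the level line is the assumed addition, not part of this commit:

    logger.add(
        general_settings.LOG_PATH / "logs" / "app.log",  # pathlib's "/" operator; same target as above
        level=general_settings.LOG_LEVEL,  # e.g. 'INFO' from config/settings.yaml (assumed wiring)
        rotation="1 day",      # start a new log file daily
        retention="7 days",    # keep one week of rotated files
        compression="zip",     # zip files on rotation
    )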

src/data/processing.py
Lines changed: 24 additions & 2 deletions

@@ -1,12 +1,16 @@
-import pandas as pd
-import numpy as np
 import pathlib
 from typing import List, Dict, Tuple
+
+import numpy as np
+import pandas as pd
+from loguru import logger
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
+
 from .utils import custom_combiner, load_feature
 from ..config.settings import general_settings
 from ..config.model import model_settings
 
+
 def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
     """Applies the data processing pipeline.
 
@@ -17,18 +21,26 @@ def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
         Tuple[np.ndarray, np.ndarray]: the features and labels array, respectively.
     """
     # First step) removing duplicates, changing the height unit, removing outliers
+    logger.info("Removing duplicates from the dataset.")
     dataframe = _remove_duplicates(dataframe)
+
+    logger.info("Changing the height units to centimeters.")
     dataframe = _change_height_units(dataframe)
+
+    logger.info("Removing outliers from the dataset.")
     dataframe = _remove_outliers(dataframe)
 
     # Feature engineering step)
     # Creating the BMI feature
+    logger.info("Creating a new column for the BMI values from the data samples.")
     dataframe = _create_bmi_feature(dataframe)
 
     # Creating the BMR feature
+    logger.info("Creating a new column for the BMR values from the data samples.")
     dataframe = _create_bmr_feature(dataframe)
 
     # Creating the IS feature
+    logger.info("Creating a new column for the IS values from the data samples.")
     dataframe = _create_is_feature(dataframe)
 
     # Feature transformation step)
@@ -41,9 +53,11 @@ def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
         "NCP",
         "CH2O",
     ]
+    logger.info(f"Dropping the columns {columns_to_drop}.")
     dataframe = _drop_features(dataframe=dataframe, features=columns_to_drop)
 
     # Transforming the AGE and IS columns into a categorical columns
+    logger.info("Categorizing the numerical columns ('Age' and 'IS').")
     dataframe = _categorize_numerical_columns(dataframe)
 
     # Transforming (Log Transformation) numerical columns
@@ -69,12 +83,15 @@ def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
 
     # Selecting only the features that are important for the model
     dataframe = dataframe[model_settings.FEATURES + [general_settings.TARGET_COLUMN]]
+    logger.info(f"Filtering the features columns, keeping only {model_settings.FEATURES} columns.")
 
     # Splitting the data into X (features) and y (label)
+    logger.info("Splitting the dataset into X and y arrays.")
     X = dataframe.drop(columns=[general_settings.TARGET_COLUMN]).values
     y = dataframe[general_settings.TARGET_COLUMN].values
 
     # Encoding the labels array
+    logger.info(f"Encoding the target column ({general_settings.TARGET_COLUMN}).")
     label_encoder = load_feature(
         path=general_settings.ARTIFACTS_PATH,
         feature_name='label_ohe'
@@ -252,6 +269,7 @@ def _transform_numerical_columns(
         pd.DataFrame: the dataframe with all numerical columns transformed.
     """
     numerical_columns = dataframe.select_dtypes(exclude="object").columns.tolist()
+    logger.info(f"Applying Log Transformation to the {numerical_columns} columns.")
 
     for nc in numerical_columns:
         dataframe[nc] = np.log1p(dataframe[nc].values + epsilon)
@@ -273,6 +291,7 @@ def _scale_numerical_columns(
         pd.DataFrame: the dataframe with all numerical columns encoded.
     """
     numerical_columns = dataframe.select_dtypes(exclude="object").columns.tolist()
+    logger.info(f"Scaling the {numerical_columns} columns.")
 
     for nc in numerical_columns:
         dataframe[nc] = sc[nc].transform(dataframe[nc].values.reshape(-1, 1))
@@ -297,6 +316,8 @@ def _encode_categorical_columns(
     """
     categorical_columns = dataframe.select_dtypes(include="object").columns.tolist()
     categorical_columns.remove(target_column)
+    logger.info(f"Encoding the {categorical_columns} columns.")
+
     new_dataframe = pd.DataFrame()
 
     for cc in categorical_columns:
@@ -334,4 +355,5 @@ def load_dataset(path: pathlib.Path) -> pd.DataFrame:
     Returns:
         pd.DataFrame: the dataframe.
     """
+    logger.info(f"Loading dataset from path {path}.")
     return pd.read_csv(path, sep=",")
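A side note on the message style used throughout this file: f-strings are evaluated eagerly, so the interpolated column lists are built even when a sink would filter INFO out. loguru's documented mechanism for deferring expensive arguments is opt(lazy=True) with callables — a sketch, not part of this commit:

    # Eager: the f-string is rendered at the call site in every case.
    logger.info(f"Scaling the {numerical_columns} columns.")

    # Deferred: the lambda runs only if some sink actually accepts the record.
    logger.opt(lazy=True).info("Scaling the {} columns.", lambda: numerical_columns)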

src/data/utils.py
Lines changed: 5 additions & 2 deletions

@@ -1,10 +1,11 @@
 import os
 import pathlib
 import joblib
-import numpy as np
 from typing import Union
 
 import boto3
+import numpy as np
+from loguru import logger
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
 
 from ..config.aws import aws_credentials
@@ -24,6 +25,7 @@ def load_feature(
     Returns:
         Union[np.ndarray, StandardScaler, OneHotEncoder]: the feature's content.
     """
+    logger.info(f"Loading feature/encoder/scaler from file {path}.")
     return joblib.load(pathlib.PosixPath.joinpath(path, f"{feature_name}.pkl"))
 
 
@@ -40,7 +42,7 @@ def custom_combiner(feature, category) -> str:
     """
     return str(category)
 
-
+@logger.catch
 def download_dataset(
     name: str,
 ) -> None:
@@ -52,6 +54,7 @@ def download_dataset(
     kaggle_user = kaggle_credentials.KAGGLE_USERNAME
     kaggle_key = kaggle_credentials.KAGGLE_KEY
     path = '../data/'
+    logger.info(f"Downloading dataset {name} and saving into the folder {path}.")
 
     # Downloading data using the Kaggle API through the terminal
     os.system(f'export KAGGLE_USERNAME={kaggle_user}; export KAGGLE_KEY={kaggle_key};')
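Note on the new @logger.catch decorator: by default it logs the exception with a full traceback and does not re-raise (reraise=False), so a failing download_dataset now returns None silently to its caller. If propagation is still wanted, the decorator accepts a reraise flag — a sketch, not part of this commit:

    @logger.catch(reraise=True)  # log the traceback, then let the exception propagate
    def download_dataset(name: str) -> None:
        ...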

src/main.py
Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
+import pathlib
+
+from .data.processing import load_dataset, data_processing
+from .data.utils import custom_combiner
+from .config.settings import general_settings
+from .config.model import model_settings
+from .model import Model
+
+if __name__ == "__main__":
+    dataset = load_dataset(
+        path=pathlib.Path.joinpath(general_settings.DATA_PATH, general_settings.RAW_FILE_NAME)
+    )
+    X, y = data_processing(dataset)
+
+    loaded_model = Model(
+        model_name=model_settings.MODEL_NAME,
+        model_flavor=model_settings.MODEL_FLAVOR,
+        model_version=model_settings.VERSION,
+    )
+    loaded_model.load()
+
+
+    print(loaded_model.predict(X))
+    print(loaded_model.score(X, y))
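Note: because src/main.py uses package-relative imports (from .data.processing ...), it presumably has to be executed as a module from the repository root — e.g. python -m src.main — rather than as a plain script, which would raise an ImportError.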

src/model.py
Lines changed: 12 additions & 23 deletions

@@ -1,12 +1,10 @@
 import mlflow
 import numpy as np
-import pathlib
+from loguru import logger
 from sklearn.metrics import f1_score
 
 from .config.model import model_settings
-from .data.processing import load_dataset, data_processing
-from .data.utils import custom_combiner
-from .config.settings import general_settings
+
 
 class Model:
     """The trained model's class.
@@ -29,17 +27,21 @@ def __init__(
         self.model_version = model_version
         self.model = None
 
+    @logger.catch
     def load(self) -> None:
         """Loads the trained model.
 
         Raises:
             NotImplementedError: raises NotImplementedError if the model's
             flavor value is not 'xgboost'.
         """
+        logger.info(f"Loading the model {model_settings.MODEL_NAME} from run ID {model_settings.RUN_ID}.")
+
         if self.model_flavor == "xgboost":
             model_uri = f"runs:/{model_settings.RUN_ID}/{model_settings.MODEL_NAME}"
             self.model = mlflow.xgboost.load_model(model_uri)
         else:
+            logger.critical(f"Couldn't load the model using the flavor {model_settings.MODEL_FLAVOR}.")
             raise NotImplementedError()
 
     def predict(self, x: np.ndarray) -> np.ndarray:
@@ -51,7 +53,9 @@ def predict(self, x: np.ndarray) -> np.ndarray:
         Returns:
             np.ndarray: the predictions array.
         """
-        return np.argmax(self.model.predict(x), axis=1)
+        prediction = np.argmax(self.model.predict(x), axis=1)
+        logger.info(f"Prediction: {prediction}.")
+        return prediction
 
     def score(self, x: np.ndarray, y: np.ndarray) -> float:
         """Calculates the F1-Score of a trained model given a pair of features
@@ -67,21 +71,6 @@ def score(self, x: np.ndarray, y: np.ndarray) -> float:
         prediction = self.predict(x).reshape(-1)
         _y = np.argmax(y, axis=1).reshape(-1)
 
-        return f1_score(y_true=_y, y_pred=prediction, average="weighted")
-
-# if __name__ == "__main__":
-#     dataset = load_dataset(
-#         path=pathlib.Path.joinpath(general_settings.DATA_PATH, general_settings.RAW_FILE_NAME)
-#     )
-#     X, y = data_processing(dataset)
-
-#     loaded_model = Model(
-#         model_name=model_settings.MODEL_NAME,
-#         model_flavor=model_settings.MODEL_FLAVOR,
-#         model_version=model_settings.VERSION,
-#     )
-#     loaded_model.load()
-
-#     print(loaded_model.predict(X))
-#     print(loaded_model.score(X, y))
+        score = f1_score(y_true=_y, y_pred=prediction, average="weighted")
+        logger.info(f"Achieved a weighted F1-Score of {score}.")
+        return score
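One caveat on the predict() change: logging the full prediction array at INFO can bloat logs/app.log for large batches. A hedged alternative, using the same names as the method above, is to log only a summary:

    prediction = np.argmax(self.model.predict(x), axis=1)
    logger.info(f"Generated {prediction.shape[0]} predictions.")  # summary instead of the full array
    return prediction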
