Skip to content

Commit 2e0e4ed

Browse files
Now every CSV is normalized on its own and then concatenated
1 parent 6c36ba4 commit 2e0e4ed

File tree

3 files changed

+26
-19
lines changed

3 files changed

+26
-19
lines changed

src/main.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import matplotlib.pyplot as plt
22

3-
from src.preprocessing.loader import load_raw_timeseries
4-
from src.preprocessing.scaling import discretize_power, normalize_power
3+
from src.preprocessing.loader import load_timeseries
54

65

76
def plot_state_distribution(df):
@@ -18,13 +17,13 @@ def plot_state_distribution(df):
1817

1918

2019
def main():
21-
df = load_raw_timeseries()
20+
df = load_timeseries()
2221
print(df)
23-
df = normalize_power(df)
24-
print(df)
25-
df = discretize_power(df)
26-
print(df)
27-
plot_state_distribution(df)
22+
df_norm = load_timeseries(normalize=True)
23+
print(df_norm)
24+
df_disc = load_timeseries(normalize=True, discretize=True)
25+
print(df_disc)
26+
plot_state_distribution(df_disc)
2827

2928

3029
if __name__ == "__main__":

src/preprocessing/loader.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,19 @@
33

44
import pandas as pd
55

6+
from .scaling import discretize_power, normalize_power
7+
68
RAW_DATA_DIR = Path(__file__).resolve().parents[2] / "data" / "raw"
79

8-
__all__ = ["load_raw_timeseries", "RAW_DATA_DIR"]
10+
__all__ = ["load_timeseries", "RAW_DATA_DIR"]
911

1012

11-
def load_raw_timeseries(
13+
def load_timeseries(
1214
*,
1315
value_dtype: str = "float32",
16+
normalize: bool = False,
17+
discretize: bool = False,
18+
eps: float = 1e-12,
1419
) -> pd.DataFrame:
1520

1621
# Collect all .csv files under data/raw
@@ -19,19 +24,22 @@ def load_raw_timeseries(
1924
raise FileNotFoundError(f"No CSV files found in {RAW_DATA_DIR}.")
2025

2126
# Read columns (ts, value) from each file
22-
frames = [
23-
pd.read_csv(
27+
frames: List[pd.DataFrame] = []
28+
for path in csv_files:
29+
df = pd.read_csv(
2430
path,
2531
usecols=["ts", "value"],
2632
dtype={"value": value_dtype},
2733
parse_dates=["ts"],
2834
)
29-
for path in csv_files
30-
]
35+
df = df.rename(columns={"ts": "timestamp", "value": "power"})
36+
37+
if normalize:
38+
df = normalize_power(df, col="power", eps=eps)
3139

32-
df = pd.concat(frames, ignore_index=True)
40+
if discretize:
41+
df = discretize_power(df, col="power", state_col="state")
3342

34-
# Rename columns to the meaningful names
35-
df = df.rename(columns={"ts": "timestamp", "value": "power"})
43+
frames.append(df)
3644

37-
return df
45+
return pd.concat(frames, ignore_index=True)

src/preprocessing/scaling.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def normalize_power(
1313

1414
p_min = df[col].min()
1515
p_max = df[col].max()
16-
denom = (p_max - p_min) or eps # schützt vor ZeroDivisionError
16+
denom = (p_max - p_min) or eps # anti zero division
1717

1818
df[col] = (df[col] - p_min) / denom
1919
return df

0 commit comments

Comments (0)