Skip to content

Commit 2e0e4ed

Browse files
Now every CSV is normalized on its own and then concatenated
1 parent 6c36ba4 commit 2e0e4ed

File tree

3 files changed

+26
-19
lines changed

3 files changed

+26
-19
lines changed

src/main.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import matplotlib.pyplot as plt
22

3-
from src.preprocessing.loader import load_raw_timeseries
4-
from src.preprocessing.scaling import discretize_power, normalize_power
3+
from src.preprocessing.loader import load_timeseries
54

65

76
def plot_state_distribution(df):
@@ -18,13 +17,13 @@ def plot_state_distribution(df):
1817

1918

2019
def main():
21-
df = load_raw_timeseries()
20+
df = load_timeseries()
2221
print(df)
23-
df = normalize_power(df)
24-
print(df)
25-
df = discretize_power(df)
26-
print(df)
27-
plot_state_distribution(df)
22+
df_norm = load_timeseries(normalize=True)
23+
print(df_norm)
24+
df_disc = load_timeseries(normalize=True, discretize=True)
25+
print(df_disc)
26+
plot_state_distribution(df_disc)
2827

2928

3029
if __name__ == "__main__":

src/preprocessing/loader.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,19 @@
33

44
import pandas as pd
55

6+
from .scaling import discretize_power, normalize_power
7+
68
RAW_DATA_DIR = Path(__file__).resolve().parents[2] / "data" / "raw"
79

8-
__all__ = ["load_raw_timeseries", "RAW_DATA_DIR"]
10+
__all__ = ["load_timeseries", "RAW_DATA_DIR"]
911

1012

11-
def load_raw_timeseries(
13+
def load_timeseries(
1214
*,
1315
value_dtype: str = "float32",
16+
normalize: bool = False,
17+
discretize: bool = False,
18+
eps: float = 1e-12,
1419
) -> pd.DataFrame:
1520

1621
# Collect all .csv files under data/raw
@@ -19,19 +24,22 @@ def load_raw_timeseries(
1924
raise FileNotFoundError(f"No CSV files found in {RAW_DATA_DIR}.")
2025

2126
# Read columns (ts, value) from each file
22-
frames = [
23-
pd.read_csv(
27+
frames: List[pd.DataFrame] = []
28+
for path in csv_files:
29+
df = pd.read_csv(
2430
path,
2531
usecols=["ts", "value"],
2632
dtype={"value": value_dtype},
2733
parse_dates=["ts"],
2834
)
29-
for path in csv_files
30-
]
35+
df = df.rename(columns={"ts": "timestamp", "value": "power"})
36+
37+
if normalize:
38+
df = normalize_power(df, col="power", eps=eps)
3139

32-
df = pd.concat(frames, ignore_index=True)
40+
if discretize:
41+
df = discretize_power(df, col="power", state_col="state")
3342

34-
# Rename columns to the meaningful names
35-
df = df.rename(columns={"ts": "timestamp", "value": "power"})
43+
frames.append(df)
3644

37-
return df
45+
return pd.concat(frames, ignore_index=True)

src/preprocessing/scaling.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def normalize_power(
1313

1414
p_min = df[col].min()
1515
p_max = df[col].max()
16-
denom = (p_max - p_min) or eps # schützt vor ZeroDivisionError
16+
denom = (p_max - p_min) or eps # anti zero division
1717

1818
df[col] = (df[col] - p_min) / denom
1919
return df

0 commit comments

Comments (0)