diff --git a/CHANGELOG.md b/CHANGELOG.md index aa13fe6..04964c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - `CODEOWNERS` file and dependabot automation [#7](https://github.com/ie3-institute/simonaMarkovLoad/issues/7) +- Added full Markov-model pipeline [#10](https://github.com/ie3-institute/simonaMarkovLoad/issues/10) ### Changed - Compute instantaneous kW from cumulative kWh via 15-minute differencing [#1](https://github.com/ie3-institute/simonaMarkovLoad/issues/1) diff --git a/config.yml b/config.yml index 735f499..76b8ecb 100644 --- a/config.yml +++ b/config.yml @@ -3,3 +3,7 @@ input: timestamp_col: "Zeitstempel" value_col: "Messwert" factor: 4 # 15-min → kW depends on sampling + +model: + n_states: 10 + laplace_alpha: 1.0 diff --git a/src/config.py b/src/config.py index 107c7f1..a43f3e9 100644 --- a/src/config.py +++ b/src/config.py @@ -1,4 +1,3 @@ -# src/config.py import os from pathlib import Path diff --git a/src/main.py b/src/main.py index 8e867e3..099795a 100644 --- a/src/main.py +++ b/src/main.py @@ -1,30 +1,31 @@ import matplotlib.pyplot as plt +import numpy as np +from src.markov.transition_counts import build_transition_counts +from src.markov.transitions import build_transition_matrices from src.preprocessing.loader import load_timeseries +df = load_timeseries(normalize=True, discretize=True) -def plot_state_distribution(df): - counts = df["state"].value_counts().sort_index() +counts = build_transition_counts(df) +probs = build_transition_matrices(df, alpha=1.0) - plt.figure() - plt.bar(counts.index, counts.values) - plt.xlabel("State") - plt.ylabel("Anzahl Einträge") - plt.title("Verteilung der Einträge nach State") - plt.xticks(counts.index) - plt.show() +print("counts shape :", counts.shape) +print("probs shape :", probs.shape) -def main(): - df = load_timeseries() - print(df) - df_norm = load_timeseries(normalize=True) - print(df_norm) - df_disc = load_timeseries(normalize=True, discretize=True) - print(df_disc) - plot_state_distribution(df_disc) +active_buckets = np.where(counts.sum(axis=(1, 2)) > 0)[0] +bucket = int(active_buckets[0]) if active_buckets.size else 0 +print(f"\nUsing bucket {bucket}") -if __name__ == "__main__": - main() +print("row sums :", probs[bucket].sum(axis=1)) + +plt.imshow(probs[bucket], aspect="auto") +plt.title(f"Bucket {bucket} – transition probabilities") +plt.xlabel("state t+1") +plt.ylabel("state t") +plt.colorbar() +plt.tight_layout() +plt.show() diff --git a/src/markov/__init__.py b/src/markov/__init__.py index e69de29..af47ed6 100644 --- a/src/markov/__init__.py +++ b/src/markov/__init__.py @@ -0,0 +1,10 @@ +from .buckets import NUM_BUCKETS, bucket_id +from .transition_counts import build_transition_counts +from .transitions import build_transition_matrices + +__all__ = [ + "bucket_id", + "NUM_BUCKETS", + "build_transition_counts", + "build_transition_matrices", +] diff --git a/src/markov/_core.py b/src/markov/_core.py new file mode 100644 index 0000000..04c7d8f --- /dev/null +++ b/src/markov/_core.py @@ -0,0 +1,21 @@ +import numpy as np +import pandas as pd + +from src.config import CONFIG + +from .buckets import NUM_BUCKETS + +N_STATES = int(CONFIG["model"]["n_states"]) + + +def _transition_counts( + df: pd.DataFrame, *, state_col="state", bucket_col="bucket", dtype=np.uint32 +) -> np.ndarray: + df = df.sort_values("timestamp") + s_t = df[state_col].to_numpy(dtype=int)[:-1] + s_tp1 = df[state_col].to_numpy(dtype=int)[1:] + buckets = df[bucket_col].to_numpy(dtype=int)[:-1] + + counts = np.zeros((NUM_BUCKETS, N_STATES, N_STATES), dtype=dtype) + np.add.at(counts, (buckets, s_t, s_tp1), 1) + return counts diff --git a/src/markov/buckets.py b/src/markov/buckets.py new file mode 100644 index 0000000..29797d3 --- /dev/null +++ b/src/markov/buckets.py @@ -0,0 +1,44 @@ +import pandas as pd + +_MONTH_FACTOR = 96 * 2 +_WEEKEND_FACTOR = 96 +_NUM_MONTHS = 12 +_NUM_QH = 96 +NUM_BUCKETS = _NUM_MONTHS * 2 * _NUM_QH # 2 304 + + +def _is_weekend(ts): + if isinstance(ts, pd.Series): + return ts.dt.dayofweek >= 5 + if isinstance(ts, pd.DatetimeIndex): + return ts.dayofweek >= 5 + return ts.dayofweek >= 5 + + +def bucket_id(ts): + if isinstance(ts, pd.Series): + weekend = _is_weekend(ts).astype(int) + qh = ts.dt.hour * 4 + ts.dt.minute // 15 + month = ts.dt.month - 1 + elif isinstance(ts, pd.DatetimeIndex): + weekend = _is_weekend(ts).astype(int) + qh = ts.hour * 4 + ts.minute // 15 + month = ts.month - 1 + else: + weekend = int(_is_weekend(ts)) + qh = ts.hour * 4 + ts.minute // 15 + month = ts.month - 1 + + return month * _MONTH_FACTOR + weekend * _WEEKEND_FACTOR + qh + + +def assign_buckets( + df: pd.DataFrame, + *, + ts_col: str = "timestamp", + bucket_col: str = "bucket", + inplace: bool = False, +) -> pd.DataFrame: + tgt = df if inplace else df.copy() + tgt[bucket_col] = bucket_id(tgt[ts_col]).astype("uint16") + return tgt diff --git a/src/markov/model.py b/src/markov/model.py deleted file mode 100644 index 7f2c744..0000000 --- a/src/markov/model.py +++ /dev/null @@ -1 +0,0 @@ -print("K2") diff --git a/src/markov/transition_counts.py b/src/markov/transition_counts.py new file mode 100644 index 0000000..5e9d756 --- /dev/null +++ b/src/markov/transition_counts.py @@ -0,0 +1,31 @@ +import numpy as np +import pandas as pd + +from src.config import CONFIG + +from .buckets import NUM_BUCKETS + +N_STATES = int(CONFIG["model"]["n_states"]) + + +def build_transition_counts( + df: pd.DataFrame, + *, + state_col: str = "state", + bucket_col: str = "bucket", + dtype=np.uint32, +) -> np.ndarray: + """ + Absolute transition counts: + C[b, i, j] = # of times state_t=i → state_{t+1}=j in bucket b + Shape = (2 304, 10, 10). + """ + df = df.sort_values("timestamp") + + s_t = df[state_col].to_numpy(dtype=int)[:-1] + s_tp1 = df[state_col].to_numpy(dtype=int)[1:] + buckets = df[bucket_col].to_numpy(dtype=int)[:-1] + + counts = np.zeros((NUM_BUCKETS, N_STATES, N_STATES), dtype=dtype) + np.add.at(counts, (buckets, s_t, s_tp1), 1) + return counts diff --git a/src/markov/transitions.py b/src/markov/transitions.py new file mode 100644 index 0000000..08095c0 --- /dev/null +++ b/src/markov/transitions.py @@ -0,0 +1,15 @@ +import numpy as np +import pandas as pd + +from src.config import CONFIG + +from ._core import _transition_counts + +alpha = float(CONFIG["model"]["laplace_alpha"]) + + +def build_transition_matrices(df: pd.DataFrame, *, dtype=np.float32) -> np.ndarray: + counts = _transition_counts(df, dtype=dtype) + counts += alpha + counts /= counts.sum(axis=2, keepdims=True) + return counts.astype(dtype) diff --git a/src/preprocessing/loader.py b/src/preprocessing/loader.py index d73fb14..8a9c1cc 100644 --- a/src/preprocessing/loader.py +++ b/src/preprocessing/loader.py @@ -5,6 +5,7 @@ from src.config import CONFIG +from ..markov.buckets import assign_buckets from .scaling import discretize_power, normalize_power RAW_DATA_DIR = Path(__file__).resolve().parents[2] / "data" / "raw" @@ -53,6 +54,8 @@ def load_timeseries( if discretize: df = discretize_power(df, col="power", state_col="state") + df = assign_buckets(df, inplace=True) + frames.append(df) return pd.concat(frames, ignore_index=True) diff --git a/tests/test_buckets.py b/tests/test_buckets.py new file mode 100644 index 0000000..165dc2f --- /dev/null +++ b/tests/test_buckets.py @@ -0,0 +1,38 @@ +import numpy as np +import pandas as pd + +from src.markov.buckets import NUM_BUCKETS, assign_buckets, bucket_id + + +def _manual_bucket(ts): + month_factor = 96 * 2 + weekend_factor = 96 + month = ts.month - 1 + weekend = int(ts.dayofweek >= 5) + qh = ts.hour * 4 + ts.minute // 15 + return month * month_factor + weekend * weekend_factor + qh + + +def test_bucket_id_single_weekday_weekend(): + ts_weekday = pd.Timestamp("2025-06-02 00:00") + ts_weekend = pd.Timestamp("2025-06-07 00:00") + + assert bucket_id(ts_weekday) == _manual_bucket(ts_weekday) + assert bucket_id(ts_weekend) == _manual_bucket(ts_weekend) + + +def test_bucket_id_series(): + s = pd.to_datetime(["2025-06-02 12:30", "2025-06-07 23:45"]) + out = bucket_id(s) + expected = s.map(_manual_bucket) + + assert np.array_equal(out.to_numpy(), expected.to_numpy()) + + +def test_assign_buckets_adds_column_and_range(): + ts = pd.date_range("2025-03-01", periods=4, freq="15min") + df = pd.DataFrame({"timestamp": ts}) + out = assign_buckets(df.copy()) + + assert "bucket" in out.columns + assert out["bucket"].between(0, NUM_BUCKETS - 1).all() diff --git a/tests/test_loader.py b/tests/test_loader.py index 6d4dc1c..81dad26 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -4,54 +4,64 @@ import pytest import src.preprocessing.loader as loader_module - -# Import the module under test +from src.markov.buckets import bucket_id from src.preprocessing.loader import load_timeseries -def create_sample_csv(path: Path): - # Write 21 dummy metadata lines - with open(path, "w", encoding="utf-8") as f: +def _create_sample_csv(path: Path) -> None: + """Write 3 kWh readings; first diff will be NaN and dropped.""" + with path.open("w", encoding="utf-8") as f: for _ in range(21): f.write("metadata line\n") - # Header and sample data f.write("Zeitstempel,Messwert\n") f.write("2021-01-01 00:00:00,0.0\n") f.write("2021-01-01 00:15:00,0.5\n") f.write("2021-01-01 00:30:00,1.0\n") -def test_load_timeseries(tmp_path, monkeypatch): - # Setup a temporary data/raw directory +def test_load_timeseries_full(tmp_path, monkeypatch): + """Loader must return timestamp, power state, bucket.""" raw_dir = tmp_path / "data" / "raw" raw_dir.mkdir(parents=True) csv_file = raw_dir / "sample.csv" - create_sample_csv(csv_file) + _create_sample_csv(csv_file) - # Monkeypatch the RAW_DATA_DIR in the loader module monkeypatch.setattr(loader_module, "RAW_DATA_DIR", raw_dir) + loader_module.CONFIG["input"].update( + { + "timestamp_col": "Zeitstempel", + "value_col": "Messwert", + "factor": 4.0, + } + ) - # Execute loader (skip normalize and discretize) - df = load_timeseries() + df = load_timeseries(normalize=True, discretize=True) - # Expect two rows (first diff yields NaN and is dropped) - assert df.shape == (2, 2) - # Columns should be timestamp and power - assert list(df.columns) == ["timestamp", "power"] + # two rows (first diff removed) & five columns + assert df.shape == (2, 4) + assert list(df.columns) == ["timestamp", "power", "state", "bucket"] - # Check timestamps are parsed correctly - expected_timestamps = pd.Series( - pd.to_datetime(["2021-01-01 00:15:00", "2021-01-01 00:30:00"]), name="timestamp" - ) - pd.testing.assert_series_equal( - df["timestamp"], expected_timestamps, check_names=False + # timestamps + expected_ts = pd.Series( + pd.to_datetime(["2021-01-01 00:15:00", "2021-01-01 00:30:00"]), + name="timestamp", ) + pd.testing.assert_series_equal(df["timestamp"], expected_ts, check_names=False) - # Check power calculation: diff * 4 - # Values: (0.5 - 0.0)*4 = 2.0, (1.0 - 0.5)*4 = 2.0 - expected_power = pd.Series([2.0, 2.0], name="power", dtype="float32") + # normalized power column: both values become 0.0 + expected_power = pd.Series([0.0, 0.0], name="power", dtype="float32") pd.testing.assert_series_equal(df["power"], expected_power) + # state column: both values land in bin 0 + expected_state = pd.Series([0, 0], name="state", dtype="int64") + pd.testing.assert_series_equal(df["state"], expected_state) + + # bucket IDs + expected_bucket = pd.Series( + [bucket_id(ts) for ts in expected_ts], name="bucket", dtype="uint16" + ) + pd.testing.assert_series_equal(df["bucket"], expected_bucket) + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/test_transition_counts.py b/tests/test_transition_counts.py new file mode 100644 index 0000000..e408ccd --- /dev/null +++ b/tests/test_transition_counts.py @@ -0,0 +1,21 @@ +import pandas as pd + +from src.markov.buckets import NUM_BUCKETS, bucket_id +from src.markov.transition_counts import N_STATES, build_transition_counts + + +def test_build_transition_counts_basic(): + ts = pd.to_datetime(["2025-01-06 00:00", "2025-01-06 00:15", "2025-01-06 00:30"]) + states = [0, 1, 1] + buckets = [bucket_id(t) for t in ts] + + df = pd.DataFrame({"timestamp": ts, "state": states, "bucket": buckets}) + + counts = build_transition_counts(df) + + assert counts.shape == (NUM_BUCKETS, N_STATES, N_STATES) + + assert counts[buckets[0], 0, 1] == 1 + assert counts[buckets[1], 1, 1] == 1 + + assert counts.sum() == 2 diff --git a/tests/test_transitions.py b/tests/test_transitions.py new file mode 100644 index 0000000..a04cea7 --- /dev/null +++ b/tests/test_transitions.py @@ -0,0 +1,20 @@ +import numpy as np +import pandas as pd + +from src.markov.buckets import NUM_BUCKETS, bucket_id +from src.markov.transition_counts import N_STATES +from src.markov.transitions import build_transition_matrices + + +def test_build_transition_matrices_row_sums(): + ts = pd.to_datetime(["2025-02-03 00:00", "2025-02-03 00:15", "2025-02-03 00:30"]) + states = [2, 3, 2] + buckets = [bucket_id(t) for t in ts] + + df = pd.DataFrame({"timestamp": ts, "state": states, "bucket": buckets}) + + probs = build_transition_matrices(df) + + assert probs.shape == (NUM_BUCKETS, N_STATES, N_STATES) + + assert np.allclose(probs.sum(axis=2), 1.0, atol=1e-6)