Added full Markov-model pipeline (#11)

PhilippSchmelter · web-flow · commit 4b79f062155f · 2025-06-28T01:14:29.000+02:00
* added full markov model

* comments

* use of config

* test loader

* added tests
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 - `CODEOWNERS` file and dependabot automation [#7](https://github.com/ie3-institute/simonaMarkovLoad/issues/7)
+- Added full Markov-model pipeline [#10](https://github.com/ie3-institute/simonaMarkovLoad/issues/10)
 
 ### Changed
 - Compute instantaneous kW from cumulative kWh via 15-minute differencing [#1](https://github.com/ie3-institute/simonaMarkovLoad/issues/1)
diff --git a/config.yml b/config.yml
@@ -3,3 +3,7 @@ input:
   timestamp_col: "Zeitstempel"
   value_col: "Messwert"
   factor: 4         # 15-min → kW depends on sampling
+
+model:
+  n_states: 10
+  laplace_alpha: 1.0
diff --git a/src/config.py b/src/config.py
@@ -1,4 +1,3 @@
-# src/config.py
 import os
 from pathlib import Path
 
diff --git a/src/main.py b/src/main.py
@@ -1,30 +1,31 @@
 import matplotlib.pyplot as plt
+import numpy as np
 
+from src.markov.transition_counts import build_transition_counts
+from src.markov.transitions import build_transition_matrices
 from src.preprocessing.loader import load_timeseries
 
+df = load_timeseries(normalize=True, discretize=True)
 
-def plot_state_distribution(df):
 
-    counts = df["state"].value_counts().sort_index()
+counts = build_transition_counts(df)
+probs = build_transition_matrices(df, alpha=1.0)
 
-    plt.figure()
-    plt.bar(counts.index, counts.values)
-    plt.xlabel("State")
-    plt.ylabel("Anzahl Einträge")
-    plt.title("Verteilung der Einträge nach State")
-    plt.xticks(counts.index)
-    plt.show()
+print("counts shape :", counts.shape)
+print("probs  shape :", probs.shape)
 
 
-def main():
-    df = load_timeseries()
-    print(df)
-    df_norm = load_timeseries(normalize=True)
-    print(df_norm)
-    df_disc = load_timeseries(normalize=True, discretize=True)
-    print(df_disc)
-    plot_state_distribution(df_disc)
+active_buckets = np.where(counts.sum(axis=(1, 2)) > 0)[0]
+bucket = int(active_buckets[0]) if active_buckets.size else 0
+print(f"\nUsing bucket {bucket}")
 
 
-if __name__ == "__main__":
-    main()
+print("row sums :", probs[bucket].sum(axis=1))
+
+plt.imshow(probs[bucket], aspect="auto")
+plt.title(f"Bucket {bucket} – transition probabilities")
+plt.xlabel("state t+1")
+plt.ylabel("state t")
+plt.colorbar()
+plt.tight_layout()
+plt.show()
diff --git a/src/markov/__init__.py b/src/markov/__init__.py
@@ -0,0 +1,10 @@
+from .buckets import NUM_BUCKETS, bucket_id
+from .transition_counts import build_transition_counts
+from .transitions import build_transition_matrices
+
+__all__ = [
+    "bucket_id",
+    "NUM_BUCKETS",
+    "build_transition_counts",
+    "build_transition_matrices",
+]
diff --git a/src/markov/_core.py b/src/markov/_core.py
@@ -0,0 +1,21 @@
+import numpy as np
+import pandas as pd
+
+from src.config import CONFIG
+
+from .buckets import NUM_BUCKETS
+
+N_STATES = int(CONFIG["model"]["n_states"])
+
+
+def _transition_counts(
+    df: pd.DataFrame, *, state_col="state", bucket_col="bucket", dtype=np.uint32
+) -> np.ndarray:
+    df = df.sort_values("timestamp")
+    s_t = df[state_col].to_numpy(dtype=int)[:-1]
+    s_tp1 = df[state_col].to_numpy(dtype=int)[1:]
+    buckets = df[bucket_col].to_numpy(dtype=int)[:-1]
+
+    counts = np.zeros((NUM_BUCKETS, N_STATES, N_STATES), dtype=dtype)
+    np.add.at(counts, (buckets, s_t, s_tp1), 1)
+    return counts
diff --git a/src/markov/buckets.py b/src/markov/buckets.py
@@ -0,0 +1,44 @@
+import pandas as pd
+
+_MONTH_FACTOR = 96 * 2
+_WEEKEND_FACTOR = 96
+_NUM_MONTHS = 12
+_NUM_QH = 96
+NUM_BUCKETS = _NUM_MONTHS * 2 * _NUM_QH  # 2 304
+
+
+def _is_weekend(ts):
+    if isinstance(ts, pd.Series):
+        return ts.dt.dayofweek >= 5
+    if isinstance(ts, pd.DatetimeIndex):
+        return ts.dayofweek >= 5
+    return ts.dayofweek >= 5
+
+
+def bucket_id(ts):
+    if isinstance(ts, pd.Series):
+        weekend = _is_weekend(ts).astype(int)
+        qh = ts.dt.hour * 4 + ts.dt.minute // 15
+        month = ts.dt.month - 1
+    elif isinstance(ts, pd.DatetimeIndex):
+        weekend = _is_weekend(ts).astype(int)
+        qh = ts.hour * 4 + ts.minute // 15
+        month = ts.month - 1
+    else:
+        weekend = int(_is_weekend(ts))
+        qh = ts.hour * 4 + ts.minute // 15
+        month = ts.month - 1
+
+    return month * _MONTH_FACTOR + weekend * _WEEKEND_FACTOR + qh
+
+
+def assign_buckets(
+    df: pd.DataFrame,
+    *,
+    ts_col: str = "timestamp",
+    bucket_col: str = "bucket",
+    inplace: bool = False,
+) -> pd.DataFrame:
+    tgt = df if inplace else df.copy()
+    tgt[bucket_col] = bucket_id(tgt[ts_col]).astype("uint16")
+    return tgt
diff --git a/src/markov/model.py b/src/markov/model.py
diff --git a/src/markov/transition_counts.py b/src/markov/transition_counts.py
@@ -0,0 +1,31 @@
+import numpy as np
+import pandas as pd
+
+from src.config import CONFIG
+
+from .buckets import NUM_BUCKETS
+
+N_STATES = int(CONFIG["model"]["n_states"])
+
+
+def build_transition_counts(
+    df: pd.DataFrame,
+    *,
+    state_col: str = "state",
+    bucket_col: str = "bucket",
+    dtype=np.uint32,
+) -> np.ndarray:
+    """
+    Absolute transition counts:
+        C[b, i, j] = # of times state_t=i  → state_{t+1}=j in bucket b
+    Shape = (2 304, 10, 10).
+    """
+    df = df.sort_values("timestamp")
+
+    s_t = df[state_col].to_numpy(dtype=int)[:-1]
+    s_tp1 = df[state_col].to_numpy(dtype=int)[1:]
+    buckets = df[bucket_col].to_numpy(dtype=int)[:-1]
+
+    counts = np.zeros((NUM_BUCKETS, N_STATES, N_STATES), dtype=dtype)
+    np.add.at(counts, (buckets, s_t, s_tp1), 1)
+    return counts
diff --git a/src/markov/transitions.py b/src/markov/transitions.py
@@ -0,0 +1,15 @@
+import numpy as np
+import pandas as pd
+
+from src.config import CONFIG
+
+from ._core import _transition_counts
+
+alpha = float(CONFIG["model"]["laplace_alpha"])
+
+
+def build_transition_matrices(df: pd.DataFrame, *, dtype=np.float32) -> np.ndarray:
+    counts = _transition_counts(df, dtype=dtype)
+    counts += alpha
+    counts /= counts.sum(axis=2, keepdims=True)
+    return counts.astype(dtype)
diff --git a/src/preprocessing/loader.py b/src/preprocessing/loader.py
@@ -5,6 +5,7 @@
 
 from src.config import CONFIG
 
+from ..markov.buckets import assign_buckets
 from .scaling import discretize_power, normalize_power
 
 RAW_DATA_DIR = Path(__file__).resolve().parents[2] / "data" / "raw"
@@ -53,6 +54,8 @@ def load_timeseries(
         if discretize:
             df = discretize_power(df, col="power", state_col="state")
 
+        df = assign_buckets(df, inplace=True)
+
         frames.append(df)
 
     return pd.concat(frames, ignore_index=True)
diff --git a/tests/test_buckets.py b/tests/test_buckets.py
@@ -0,0 +1,38 @@
+import numpy as np
+import pandas as pd
+
+from src.markov.buckets import NUM_BUCKETS, assign_buckets, bucket_id
+
+
+def _manual_bucket(ts):
+    month_factor = 96 * 2
+    weekend_factor = 96
+    month = ts.month - 1
+    weekend = int(ts.dayofweek >= 5)
+    qh = ts.hour * 4 + ts.minute // 15
+    return month * month_factor + weekend * weekend_factor + qh
+
+
+def test_bucket_id_single_weekday_weekend():
+    ts_weekday = pd.Timestamp("2025-06-02 00:00")
+    ts_weekend = pd.Timestamp("2025-06-07 00:00")
+
+    assert bucket_id(ts_weekday) == _manual_bucket(ts_weekday)
+    assert bucket_id(ts_weekend) == _manual_bucket(ts_weekend)
+
+
+def test_bucket_id_series():
+    s = pd.to_datetime(["2025-06-02 12:30", "2025-06-07 23:45"])
+    out = bucket_id(s)
+    expected = s.map(_manual_bucket)
+
+    assert np.array_equal(out.to_numpy(), expected.to_numpy())
+
+
+def test_assign_buckets_adds_column_and_range():
+    ts = pd.date_range("2025-03-01", periods=4, freq="15min")
+    df = pd.DataFrame({"timestamp": ts})
+    out = assign_buckets(df.copy())
+
+    assert "bucket" in out.columns
+    assert out["bucket"].between(0, NUM_BUCKETS - 1).all()
diff --git a/tests/test_loader.py b/tests/test_loader.py
@@ -4,54 +4,64 @@
 import pytest
 
 import src.preprocessing.loader as loader_module
-
-# Import the module under test
+from src.markov.buckets import bucket_id
 from src.preprocessing.loader import load_timeseries
 
 
-def create_sample_csv(path: Path):
-    # Write 21 dummy metadata lines
-    with open(path, "w", encoding="utf-8") as f:
+def _create_sample_csv(path: Path) -> None:
+    """Write 3 kWh readings; first diff will be NaN and dropped."""
+    with path.open("w", encoding="utf-8") as f:
         for _ in range(21):
             f.write("metadata line\n")
-        # Header and sample data
         f.write("Zeitstempel,Messwert\n")
         f.write("2021-01-01 00:00:00,0.0\n")
         f.write("2021-01-01 00:15:00,0.5\n")
         f.write("2021-01-01 00:30:00,1.0\n")
 
 
-def test_load_timeseries(tmp_path, monkeypatch):
-    # Setup a temporary data/raw directory
+def test_load_timeseries_full(tmp_path, monkeypatch):
+    """Loader must return timestamp, power state, bucket."""
     raw_dir = tmp_path / "data" / "raw"
     raw_dir.mkdir(parents=True)
     csv_file = raw_dir / "sample.csv"
-    create_sample_csv(csv_file)
+    _create_sample_csv(csv_file)
 
-    # Monkeypatch the RAW_DATA_DIR in the loader module
     monkeypatch.setattr(loader_module, "RAW_DATA_DIR", raw_dir)
+    loader_module.CONFIG["input"].update(
+        {
+            "timestamp_col": "Zeitstempel",
+            "value_col": "Messwert",
+            "factor": 4.0,
+        }
+    )
 
-    # Execute loader (skip normalize and discretize)
-    df = load_timeseries()
+    df = load_timeseries(normalize=True, discretize=True)
 
-    # Expect two rows (first diff yields NaN and is dropped)
-    assert df.shape == (2, 2)
-    # Columns should be timestamp and power
-    assert list(df.columns) == ["timestamp", "power"]
+    # two rows (first diff removed) & five columns
+    assert df.shape == (2, 4)
+    assert list(df.columns) == ["timestamp", "power", "state", "bucket"]
 
-    # Check timestamps are parsed correctly
-    expected_timestamps = pd.Series(
-        pd.to_datetime(["2021-01-01 00:15:00", "2021-01-01 00:30:00"]), name="timestamp"
-    )
-    pd.testing.assert_series_equal(
-        df["timestamp"], expected_timestamps, check_names=False
+    # timestamps
+    expected_ts = pd.Series(
+        pd.to_datetime(["2021-01-01 00:15:00", "2021-01-01 00:30:00"]),
+        name="timestamp",
     )
+    pd.testing.assert_series_equal(df["timestamp"], expected_ts, check_names=False)
 
-    # Check power calculation: diff * 4
-    # Values: (0.5 - 0.0)*4 = 2.0, (1.0 - 0.5)*4 = 2.0
-    expected_power = pd.Series([2.0, 2.0], name="power", dtype="float32")
+    # normalized power column: both values become 0.0
+    expected_power = pd.Series([0.0, 0.0], name="power", dtype="float32")
     pd.testing.assert_series_equal(df["power"], expected_power)
 
+    # state column: both values land in bin 0
+    expected_state = pd.Series([0, 0], name="state", dtype="int64")
+    pd.testing.assert_series_equal(df["state"], expected_state)
+
+    # bucket IDs
+    expected_bucket = pd.Series(
+        [bucket_id(ts) for ts in expected_ts], name="bucket", dtype="uint16"
+    )
+    pd.testing.assert_series_equal(df["bucket"], expected_bucket)
+
 
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/tests/test_transition_counts.py b/tests/test_transition_counts.py
@@ -0,0 +1,21 @@
+import pandas as pd
+
+from src.markov.buckets import NUM_BUCKETS, bucket_id
+from src.markov.transition_counts import N_STATES, build_transition_counts
+
+
+def test_build_transition_counts_basic():
+    ts = pd.to_datetime(["2025-01-06 00:00", "2025-01-06 00:15", "2025-01-06 00:30"])
+    states = [0, 1, 1]
+    buckets = [bucket_id(t) for t in ts]
+
+    df = pd.DataFrame({"timestamp": ts, "state": states, "bucket": buckets})
+
+    counts = build_transition_counts(df)
+
+    assert counts.shape == (NUM_BUCKETS, N_STATES, N_STATES)
+
+    assert counts[buckets[0], 0, 1] == 1
+    assert counts[buckets[1], 1, 1] == 1
+
+    assert counts.sum() == 2
diff --git a/tests/test_transitions.py b/tests/test_transitions.py
@@ -0,0 +1,20 @@
+import numpy as np
+import pandas as pd
+
+from src.markov.buckets import NUM_BUCKETS, bucket_id
+from src.markov.transition_counts import N_STATES
+from src.markov.transitions import build_transition_matrices
+
+
+def test_build_transition_matrices_row_sums():
+    ts = pd.to_datetime(["2025-02-03 00:00", "2025-02-03 00:15", "2025-02-03 00:30"])
+    states = [2, 3, 2]
+    buckets = [bucket_id(t) for t in ts]
+
+    df = pd.DataFrame({"timestamp": ts, "state": states, "bucket": buckets})
+
+    probs = build_transition_matrices(df)
+
+    assert probs.shape == (NUM_BUCKETS, N_STATES, N_STATES)
+
+    assert np.allclose(probs.sum(axis=2), 1.0, atol=1e-6)

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,3 @@`
`1`		`-# src/config.py`
`2`	`1`	`import os`
`3`	`2`	`from pathlib import Path`
`4`	`3`