Skip to content

Added full Markov-model pipeline #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added
- `CODEOWNERS` file and dependabot automation [#7](https://github.com/ie3-institute/simonaMarkovLoad/issues/7)
- Added full Markov-model pipeline [#10](https://github.com/ie3-institute/simonaMarkovLoad/issues/10)

### Changed
- Compute instantaneous kW from cumulative kWh via 15-minute differencing [#1](https://github.com/ie3-institute/simonaMarkovLoad/issues/1)
4 changes: 4 additions & 0 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@ input:
timestamp_col: "Zeitstempel"
value_col: "Messwert"
factor: 4 # 15-min → kW depends on sampling

model:
n_states: 10
laplace_alpha: 1.0
1 change: 0 additions & 1 deletion src/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# src/config.py
import os
from pathlib import Path

Expand Down
39 changes: 20 additions & 19 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,31 @@
import matplotlib.pyplot as plt
import numpy as np

from src.markov.transition_counts import build_transition_counts
from src.markov.transitions import build_transition_matrices
from src.preprocessing.loader import load_timeseries

df = load_timeseries(normalize=True, discretize=True)

def plot_state_distribution(df):

counts = df["state"].value_counts().sort_index()
counts = build_transition_counts(df)
probs = build_transition_matrices(df, alpha=1.0)

plt.figure()
plt.bar(counts.index, counts.values)
plt.xlabel("State")
plt.ylabel("Anzahl Einträge")
plt.title("Verteilung der Einträge nach State")
plt.xticks(counts.index)
plt.show()
print("counts shape :", counts.shape)
print("probs shape :", probs.shape)


def main():
df = load_timeseries()
print(df)
df_norm = load_timeseries(normalize=True)
print(df_norm)
df_disc = load_timeseries(normalize=True, discretize=True)
print(df_disc)
plot_state_distribution(df_disc)
active_buckets = np.where(counts.sum(axis=(1, 2)) > 0)[0]
bucket = int(active_buckets[0]) if active_buckets.size else 0
print(f"\nUsing bucket {bucket}")


if __name__ == "__main__":
main()
print("row sums :", probs[bucket].sum(axis=1))

plt.imshow(probs[bucket], aspect="auto")
plt.title(f"Bucket {bucket} – transition probabilities")
plt.xlabel("state t+1")
plt.ylabel("state t")
plt.colorbar()
plt.tight_layout()
plt.show()
10 changes: 10 additions & 0 deletions src/markov/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from .buckets import NUM_BUCKETS, bucket_id
from .transition_counts import build_transition_counts
from .transitions import build_transition_matrices

__all__ = [
"bucket_id",
"NUM_BUCKETS",
"build_transition_counts",
"build_transition_matrices",
]
21 changes: 21 additions & 0 deletions src/markov/_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import numpy as np
import pandas as pd

from src.config import CONFIG

from .buckets import NUM_BUCKETS

N_STATES = int(CONFIG["model"]["n_states"])


def _transition_counts(
df: pd.DataFrame, *, state_col="state", bucket_col="bucket", dtype=np.uint32
) -> np.ndarray:
df = df.sort_values("timestamp")
s_t = df[state_col].to_numpy(dtype=int)[:-1]
s_tp1 = df[state_col].to_numpy(dtype=int)[1:]
buckets = df[bucket_col].to_numpy(dtype=int)[:-1]

counts = np.zeros((NUM_BUCKETS, N_STATES, N_STATES), dtype=dtype)
np.add.at(counts, (buckets, s_t, s_tp1), 1)
return counts
44 changes: 44 additions & 0 deletions src/markov/buckets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pandas as pd

_MONTH_FACTOR = 96 * 2
_WEEKEND_FACTOR = 96
_NUM_MONTHS = 12
_NUM_QH = 96
NUM_BUCKETS = _NUM_MONTHS * 2 * _NUM_QH # 2 304


def _is_weekend(ts):
if isinstance(ts, pd.Series):
return ts.dt.dayofweek >= 5
if isinstance(ts, pd.DatetimeIndex):
return ts.dayofweek >= 5
return ts.dayofweek >= 5


def bucket_id(ts):
if isinstance(ts, pd.Series):
weekend = _is_weekend(ts).astype(int)
qh = ts.dt.hour * 4 + ts.dt.minute // 15
month = ts.dt.month - 1
elif isinstance(ts, pd.DatetimeIndex):
weekend = _is_weekend(ts).astype(int)
qh = ts.hour * 4 + ts.minute // 15
month = ts.month - 1
else:
weekend = int(_is_weekend(ts))
qh = ts.hour * 4 + ts.minute // 15
month = ts.month - 1

return month * _MONTH_FACTOR + weekend * _WEEKEND_FACTOR + qh


def assign_buckets(
df: pd.DataFrame,
*,
ts_col: str = "timestamp",
bucket_col: str = "bucket",
inplace: bool = False,
) -> pd.DataFrame:
tgt = df if inplace else df.copy()
tgt[bucket_col] = bucket_id(tgt[ts_col]).astype("uint16")
return tgt
1 change: 0 additions & 1 deletion src/markov/model.py

This file was deleted.

31 changes: 31 additions & 0 deletions src/markov/transition_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import numpy as np
import pandas as pd

from src.config import CONFIG

from .buckets import NUM_BUCKETS

N_STATES = int(CONFIG["model"]["n_states"])


def build_transition_counts(
df: pd.DataFrame,
*,
state_col: str = "state",
bucket_col: str = "bucket",
dtype=np.uint32,
) -> np.ndarray:
"""
Absolute transition counts:
C[b, i, j] = # of times state_t=i → state_{t+1}=j in bucket b
Shape = (2 304, 10, 10).
"""
df = df.sort_values("timestamp")

s_t = df[state_col].to_numpy(dtype=int)[:-1]
s_tp1 = df[state_col].to_numpy(dtype=int)[1:]
buckets = df[bucket_col].to_numpy(dtype=int)[:-1]

counts = np.zeros((NUM_BUCKETS, N_STATES, N_STATES), dtype=dtype)
np.add.at(counts, (buckets, s_t, s_tp1), 1)
return counts
15 changes: 15 additions & 0 deletions src/markov/transitions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import numpy as np
import pandas as pd

from src.config import CONFIG

from ._core import _transition_counts

alpha = float(CONFIG["model"]["laplace_alpha"])


def build_transition_matrices(df: pd.DataFrame, *, dtype=np.float32) -> np.ndarray:
counts = _transition_counts(df, dtype=dtype)
counts += alpha
counts /= counts.sum(axis=2, keepdims=True)
return counts.astype(dtype)
3 changes: 3 additions & 0 deletions src/preprocessing/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from src.config import CONFIG

from ..markov.buckets import assign_buckets
from .scaling import discretize_power, normalize_power

RAW_DATA_DIR = Path(__file__).resolve().parents[2] / "data" / "raw"
Expand Down Expand Up @@ -53,6 +54,8 @@ def load_timeseries(
if discretize:
df = discretize_power(df, col="power", state_col="state")

df = assign_buckets(df, inplace=True)

frames.append(df)

return pd.concat(frames, ignore_index=True)
38 changes: 38 additions & 0 deletions tests/test_buckets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import numpy as np
import pandas as pd

from src.markov.buckets import NUM_BUCKETS, assign_buckets, bucket_id


def _manual_bucket(ts):
month_factor = 96 * 2
weekend_factor = 96
month = ts.month - 1
weekend = int(ts.dayofweek >= 5)
qh = ts.hour * 4 + ts.minute // 15
return month * month_factor + weekend * weekend_factor + qh


def test_bucket_id_single_weekday_weekend():
ts_weekday = pd.Timestamp("2025-06-02 00:00")
ts_weekend = pd.Timestamp("2025-06-07 00:00")

assert bucket_id(ts_weekday) == _manual_bucket(ts_weekday)
assert bucket_id(ts_weekend) == _manual_bucket(ts_weekend)


def test_bucket_id_series():
s = pd.to_datetime(["2025-06-02 12:30", "2025-06-07 23:45"])
out = bucket_id(s)
expected = s.map(_manual_bucket)

assert np.array_equal(out.to_numpy(), expected.to_numpy())


def test_assign_buckets_adds_column_and_range():
ts = pd.date_range("2025-03-01", periods=4, freq="15min")
df = pd.DataFrame({"timestamp": ts})
out = assign_buckets(df.copy())

assert "bucket" in out.columns
assert out["bucket"].between(0, NUM_BUCKETS - 1).all()
60 changes: 35 additions & 25 deletions tests/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,54 +4,64 @@
import pytest

import src.preprocessing.loader as loader_module

# Import the module under test
from src.markov.buckets import bucket_id
from src.preprocessing.loader import load_timeseries


def create_sample_csv(path: Path):
# Write 21 dummy metadata lines
with open(path, "w", encoding="utf-8") as f:
def _create_sample_csv(path: Path) -> None:
"""Write 3 kWh readings; first diff will be NaN and dropped."""
with path.open("w", encoding="utf-8") as f:
for _ in range(21):
f.write("metadata line\n")
# Header and sample data
f.write("Zeitstempel,Messwert\n")
f.write("2021-01-01 00:00:00,0.0\n")
f.write("2021-01-01 00:15:00,0.5\n")
f.write("2021-01-01 00:30:00,1.0\n")


def test_load_timeseries(tmp_path, monkeypatch):
# Setup a temporary data/raw directory
def test_load_timeseries_full(tmp_path, monkeypatch):
"""Loader must return timestamp, power state, bucket."""
raw_dir = tmp_path / "data" / "raw"
raw_dir.mkdir(parents=True)
csv_file = raw_dir / "sample.csv"
create_sample_csv(csv_file)
_create_sample_csv(csv_file)

# Monkeypatch the RAW_DATA_DIR in the loader module
monkeypatch.setattr(loader_module, "RAW_DATA_DIR", raw_dir)
loader_module.CONFIG["input"].update(
{
"timestamp_col": "Zeitstempel",
"value_col": "Messwert",
"factor": 4.0,
}
)

# Execute loader (skip normalize and discretize)
df = load_timeseries()
df = load_timeseries(normalize=True, discretize=True)

# Expect two rows (first diff yields NaN and is dropped)
assert df.shape == (2, 2)
# Columns should be timestamp and power
assert list(df.columns) == ["timestamp", "power"]
# two rows (first diff removed) & five columns
assert df.shape == (2, 4)
assert list(df.columns) == ["timestamp", "power", "state", "bucket"]

# Check timestamps are parsed correctly
expected_timestamps = pd.Series(
pd.to_datetime(["2021-01-01 00:15:00", "2021-01-01 00:30:00"]), name="timestamp"
)
pd.testing.assert_series_equal(
df["timestamp"], expected_timestamps, check_names=False
# timestamps
expected_ts = pd.Series(
pd.to_datetime(["2021-01-01 00:15:00", "2021-01-01 00:30:00"]),
name="timestamp",
)
pd.testing.assert_series_equal(df["timestamp"], expected_ts, check_names=False)

# Check power calculation: diff * 4
# Values: (0.5 - 0.0)*4 = 2.0, (1.0 - 0.5)*4 = 2.0
expected_power = pd.Series([2.0, 2.0], name="power", dtype="float32")
# normalized power column: both values become 0.0
expected_power = pd.Series([0.0, 0.0], name="power", dtype="float32")
pd.testing.assert_series_equal(df["power"], expected_power)

# state column: both values land in bin 0
expected_state = pd.Series([0, 0], name="state", dtype="int64")
pd.testing.assert_series_equal(df["state"], expected_state)

# bucket IDs
expected_bucket = pd.Series(
[bucket_id(ts) for ts in expected_ts], name="bucket", dtype="uint16"
)
pd.testing.assert_series_equal(df["bucket"], expected_bucket)


if __name__ == "__main__":
pytest.main([__file__])
21 changes: 21 additions & 0 deletions tests/test_transition_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import pandas as pd

from src.markov.buckets import NUM_BUCKETS, bucket_id
from src.markov.transition_counts import N_STATES, build_transition_counts


def test_build_transition_counts_basic():
ts = pd.to_datetime(["2025-01-06 00:00", "2025-01-06 00:15", "2025-01-06 00:30"])
states = [0, 1, 1]
buckets = [bucket_id(t) for t in ts]

df = pd.DataFrame({"timestamp": ts, "state": states, "bucket": buckets})

counts = build_transition_counts(df)

assert counts.shape == (NUM_BUCKETS, N_STATES, N_STATES)

assert counts[buckets[0], 0, 1] == 1
assert counts[buckets[1], 1, 1] == 1

assert counts.sum() == 2
20 changes: 20 additions & 0 deletions tests/test_transitions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import numpy as np
import pandas as pd

from src.markov.buckets import NUM_BUCKETS, bucket_id
from src.markov.transition_counts import N_STATES
from src.markov.transitions import build_transition_matrices


def test_build_transition_matrices_row_sums():
ts = pd.to_datetime(["2025-02-03 00:00", "2025-02-03 00:15", "2025-02-03 00:30"])
states = [2, 3, 2]
buckets = [bucket_id(t) for t in ts]

df = pd.DataFrame({"timestamp": ts, "state": states, "bucket": buckets})

probs = build_transition_matrices(df)

assert probs.shape == (NUM_BUCKETS, N_STATES, N_STATES)

assert np.allclose(probs.sum(axis=2), 1.0, atol=1e-6)