Commit f47870f

current gmm state
1 parent 4b79f06 commit f47870f

File tree

3 files changed: +248, -16 lines

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - `CODEOWNERS` file and dependabot automation [#7](https://github.com/ie3-institute/simonaMarkovLoad/issues/7)
 - Added full Markov-model pipeline [#10](https://github.com/ie3-institute/simonaMarkovLoad/issues/10)
+- Added GMM feature to project [#12](https://github.com/ie3-institute/simonaMarkovLoad/issues/12)
 
 ### Changed
 - Compute instantaneous kW from cumulative kWh via 15-minute differencing [#1](https://github.com/ie3-institute/simonaMarkovLoad/issues/1)

src/main.py

Lines changed: 118 additions & 16 deletions
@@ -1,31 +1,133 @@
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
+from matplotlib.colors import Normalize
 
+from src.markov.buckets import assign_buckets, bucket_id
+from src.markov.gmm import fit_gmms, sample_value
 from src.markov.transition_counts import build_transition_counts
 from src.markov.transitions import build_transition_matrices
 from src.preprocessing.loader import load_timeseries
 
-df = load_timeseries(normalize=True, discretize=True)
+SIM_DAYS = 3
 
 
-counts = build_transition_counts(df)
-probs = build_transition_matrices(df, alpha=1.0)
+def _detect_value_col(df: pd.DataFrame) -> str:
+    candidate_cols = ["x", "value", "load", "power", "p_norm", "load_norm"]
+    for c in candidate_cols:
+        if c in df.columns and np.issubdtype(df[c].dtype, np.number):
+            return c
+    raise KeyError("No numeric load column found – please inspect the dataframe.")
 
-print("counts shape :", counts.shape)
-print("probs shape :", probs.shape)
 
+def _simulate_series(
+    probs: np.ndarray,
+    gmms,
+    start_ts: pd.Timestamp,
+    start_state: int,
+    periods: int,
+    rng: np.random.Generator | None = None,
+) -> pd.DataFrame:
+    """Generate synthetic 15‑min series (timestamp, state, x)."""
+    rng = np.random.default_rng() if rng is None else rng
+    timestamps = pd.date_range(start_ts, periods=periods, freq="15min")
+    states = np.empty(periods, dtype=int)
+    xs = np.empty(periods, dtype=float)
 
-active_buckets = np.where(counts.sum(axis=(1, 2)) > 0)[0]
-bucket = int(active_buckets[0]) if active_buckets.size else 0
-print(f"\nUsing bucket {bucket}")
+    s = start_state
+    for i, ts in enumerate(timestamps):
+        b = bucket_id(ts)
+        s = rng.choice(probs.shape[1], p=probs[b, s])
+        states[i] = s
+        xs[i] = sample_value(gmms, b, s, rng=rng)
+    return pd.DataFrame({"timestamp": timestamps, "state": states, "x_sim": xs})
 
 
-print("row sums :", probs[bucket].sum(axis=1))
+def main() -> None:
+    df = load_timeseries(normalize=True, discretize=True)
+    if "bucket" not in df.columns:
+        df = assign_buckets(df)
 
-plt.imshow(probs[bucket], aspect="auto")
-plt.title(f"Bucket {bucket} – transition probabilities")
-plt.xlabel("state t+1")
-plt.ylabel("state t")
-plt.colorbar()
-plt.tight_layout()
-plt.show()
+    value_col = _detect_value_col(df)
+    print("Using load column:", value_col)
+
+    counts = build_transition_counts(df)
+    probs = build_transition_matrices(df)
+
+    _plot_first_25_buckets(counts, probs)
+
+    print("Fitting GMMs … (this may take a moment)")
+    gmms = fit_gmms(df, value_col=value_col)
+
+    periods = SIM_DAYS * 96
+    sim_df = _simulate_series(
+        probs,
+        gmms,
+        start_ts=df["timestamp"].min().normalize(),
+        start_state=int(df["state"].iloc[0]),
+        periods=periods,
+    )
+
+    _plot_simulation_diagnostics(df, sim_df, value_col)
+
+
+def _plot_first_25_buckets(counts: np.ndarray, probs: np.ndarray) -> None:
+    """Heat‑map grid for buckets 0‑24."""
+    buckets = list(range(25))
+    fig, axes = plt.subplots(5, 5, figsize=(15, 15), sharex=True, sharey=True)
+    vmax = probs[buckets].max()
+    norm = Normalize(vmin=0, vmax=vmax)
+
+    for idx, b in enumerate(buckets):
+        ax = axes.flat[idx]
+        if counts[b].sum() == 0:
+            ax.axis("off")
+            continue
+        im = ax.imshow(probs[b], aspect="auto", origin="lower", norm=norm)
+        ax.set_title(f"Bucket {b}", fontsize=8)
+        ax.set_xticks([])
+        ax.set_yticks([])
+
+    for ax in axes.flat[len(buckets) :]:
+        ax.axis("off")
+
+    fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.6, label="p")
+    fig.suptitle("Transition probabilities – buckets 0‑24", fontsize=14)
+    fig.tight_layout(rect=[0, 0, 0.97, 0.96])
+    plt.show()
+
+
+def _plot_simulation_diagnostics(
+    df: pd.DataFrame, sim: pd.DataFrame, value_col: str
+) -> None:
+    first_day = sim.iloc[:96]
+    plt.figure(figsize=(10, 3))
+    plt.plot(first_day["timestamp"], first_day["x_sim"], marker=".")
+    plt.title("Simulated power – first day")
+    plt.ylabel("normalised load x")
+    plt.tight_layout()
+    plt.show()
+
+    plt.figure(figsize=(6, 4))
+    plt.hist(df[value_col], bins=50, alpha=0.6, density=True, label="original")
+    plt.hist(sim["x_sim"], bins=50, alpha=0.6, density=True, label="simulated")
+    plt.title("Original vs simulated load distribution")
+    plt.xlabel("normalised load x")
+    plt.ylabel("density")
+    plt.legend()
+    plt.tight_layout()
+    plt.show()
+
+    sim["hour"] = sim["timestamp"].dt.hour
+    plt.figure(figsize=(10, 4))
+    sim.boxplot(column="x_sim", by="hour", grid=False)
+    plt.suptitle("")
+    plt.title("Simulated power by hour of day")
+    plt.xlabel("hour of day")
+    plt.ylabel("normalised load x")
+    plt.tight_layout()
+    plt.show()
+
+
+if __name__ == "__main__":
+    main()
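
Note: the core of `_simulate_series` above is an ordinary Markov-chain draw: at every 15-minute step the next state is sampled from the row of the current bucket's transition matrix that belongs to the current state, and a load value is then drawn from the matching GMM via `sample_value`. A minimal, self-contained sketch of the state-sampling part with a toy 3-state matrix (hypothetical probabilities, not values from this repository):

import numpy as np

rng = np.random.default_rng(7)

# Toy transition matrix for one bucket: row i holds P(next state | current state i).
P = np.array(
    [
        [0.80, 0.15, 0.05],
        [0.20, 0.60, 0.20],
        [0.10, 0.30, 0.60],
    ]
)

state = 0
trajectory = []
for _ in range(96):  # one simulated day in 15-minute steps
    state = rng.choice(P.shape[0], p=P[state])  # same call pattern as in _simulate_series
    trajectory.append(int(state))

print(trajectory[:10])

`_simulate_series` layers the GMM value draw on top of this trajectory and switches the matrix per step via `bucket_id(ts)`.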

src/markov/gmm.py

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
+import math
+from typing import Dict, List, Tuple
+
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.mixture import GaussianMixture
+
+try:
+    # rich progress bar for terminals / notebooks; optional dependency
+    from tqdm.auto import tqdm  # type: ignore
+except ImportError:
+    tqdm = None  # type: ignore
+
+from .buckets import NUM_BUCKETS
+from .transition_counts import N_STATES
+
+__all__ = [
+    "GaussianBucketModels",
+    "fit_gmms",
+    "sample_value",
+]
+
+
+GmmTuple = Tuple[np.ndarray, np.ndarray, np.ndarray]
+GaussianBucketModels = List[List[GmmTuple]]
+
+
+def _fit_single(
+    x: np.ndarray,
+    *,
+    min_samples: int = 30,
+    k_candidates: Tuple[int, ...] = (1, 2, 3),
+    random_state: int | None = None,
+) -> GmmTuple:
+    """Fit 1‑3 component spherical GMM; fallback to Normal if too few samples."""
+    if len(x) < min_samples:
+        mean = float(np.mean(x))
+        var = float(np.var(x) + 1e-6)
+        return (np.array([1.0]), np.array([mean]), np.array([var]))
+
+    best_bic = math.inf
+    best_gmm: GaussianMixture | None = None
+    for k in k_candidates:
+        gmm = GaussianMixture(
+            n_components=k,
+            covariance_type="spherical",
+            n_init=1,  # GaussianMixture expects an integer n_init (no "auto")
+            random_state=random_state,
+        ).fit(x.reshape(-1, 1))
+        bic = gmm.bic(x.reshape(-1, 1))
+        if bic < best_bic:
+            best_bic = bic
+            best_gmm = gmm
+
+    weights = best_gmm.weights_
+    means = best_gmm.means_.ravel()
+    variances = best_gmm.covariances_.ravel()
+    return (weights, means, variances)
+
+
+def fit_gmms(
+    df: pd.DataFrame,
+    *,
+    value_col: str = "x",
+    bucket_col: str = "bucket",
+    state_col: str = "state",
+    min_samples: int = 30,
+    k_candidates: Tuple[int, ...] = (1, 2, 3),
+    n_jobs: int = -1,
+    random_state: int | None = None,
+    verbose: int = 0,
+) -> GaussianBucketModels:
+    """Return list [bucket][state] -> (weights, means, variances)."""
+
+    samples: Dict[Tuple[int, int], List[float]] = {}
+    for _, row in df[[bucket_col, state_col, value_col]].iterrows():
+        samples.setdefault((row[bucket_col], row[state_col]), []).append(row[value_col])
+
+    tasks = [
+        ((b, s), samples.get((b, s), []))
+        for b in range(NUM_BUCKETS)
+        for s in range(N_STATES)
+    ]
+
+    iterable = tasks
+    if verbose > 0 and tqdm is not None:
+        iterable = tqdm(tasks, desc="Fitting GMMs", unit="model")
+
+    def _worker(item: Tuple[Tuple[int, int], List[float]]):
+        (b, s), x_list = item
+        x = np.asarray(x_list, dtype=float)
+        return (
+            b,
+            s,
+            _fit_single(
+                x,
+                min_samples=min_samples,
+                k_candidates=k_candidates,
+                random_state=random_state,
+            ),
+        )
+
+    results = joblib.Parallel(n_jobs=n_jobs, verbose=verbose)(
+        joblib.delayed(_worker)(task) for task in iterable
+    )
+
+    gmms: GaussianBucketModels = [
+        [None for _ in range(N_STATES)] for _ in range(NUM_BUCKETS)
+    ]
+    for b, s, gmm_tuple in results:
+        gmms[b][s] = gmm_tuple
+    return gmms
+
+
+_rng = np.random.default_rng()
+
+
+def sample_value(
+    gmms: GaussianBucketModels,
+    bucket: int,
+    state: int,
+    rng: np.random.Generator | None = None,
+) -> float:
+    """Draw a normalised load value from the GMM for (bucket, state)."""
+    weights, means, vars_ = gmms[bucket][state]
+    rng = _rng if rng is None else rng
+    comp = rng.choice(len(weights), p=weights)
+    return float(rng.normal(means[comp], math.sqrt(vars_[comp])))
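
Note: a minimal usage sketch of the new module on a toy frame, not code from this commit. It assumes the default column names of `fit_gmms` (`bucket`, `state`, `x`), uses hypothetical bucket/state ids that are presumed to lie within the repository's `NUM_BUCKETS` and `N_STATES`, and substitutes a Beta-distributed stand-in for the normalised load:

import numpy as np
import pandas as pd

from src.markov.gmm import fit_gmms, sample_value

rng = np.random.default_rng(42)

# Toy frame with the three columns fit_gmms expects by default.
toy = pd.DataFrame(
    {
        "bucket": rng.integers(0, 4, size=500),  # hypothetical bucket ids
        "state": rng.integers(0, 3, size=500),   # hypothetical state ids
        "x": rng.beta(2.0, 5.0, size=500),       # stand-in for normalised load
    }
)

gmms = fit_gmms(toy, value_col="x", n_jobs=1, verbose=1)  # [bucket][state] -> (weights, means, variances)
draw = sample_value(gmms, bucket=0, state=1, rng=rng)
print(f"sampled normalised load: {draw:.3f}")

Cells with fewer than `min_samples` observations take the single-Normal fallback in `_fit_single`, so on a toy frame like this only the populated cells get a BIC-selected 1-3 component mixture.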
