@@ -412,7 +412,12 @@ class GrassiaIIGeometricRV(RandomVariable):
412
412
@classmethod
413
413
def rng_fn (cls , rng , r , alpha , time_covariate_vector , size ):
414
414
# Aggregate time covariates for each sample before broadcasting
415
- exp_time_covar = np .exp (time_covariate_vector ).sum (axis = 0 )
415
+ time_cov = np .asarray (time_covariate_vector )
416
+ if np .ndim (time_cov ) == 0 :
417
+ exp_time_covar = np .asarray (1.0 )
418
+ else :
419
+ # Collapse all time/feature axes to a scalar multiplier for RNG
420
+ exp_time_covar = np .asarray (np .exp (time_cov ).sum ())
416
421
417
422
# Determine output size
418
423
if size is None :
@@ -428,6 +433,11 @@ def rng_fn(cls, rng, r, alpha, time_covariate_vector, size):
428
433
lam_covar = lam * exp_time_covar
429
434
430
435
p = 1 - np .exp (- lam_covar )
436
+ # TODO: This is a hack to ensure valid probability in (0, 1]
437
+ # We should find a better way to do this.
438
+ # Ensure valid probability in (0, 1]
439
+ tiny = np .finfo (p .dtype ).tiny
440
+ p = np .clip (p , tiny , 1.0 )
431
441
samples = rng .geometric (p )
432
442
# samples = np.ceil(np.log(1 - rng.uniform(size=size)) / (-lam_covar))
433
443
@@ -500,24 +510,29 @@ def dist(cls, r, alpha, time_covariate_vector=None, *args, **kwargs):
500
510
501
511
if time_covariate_vector is None :
502
512
time_covariate_vector = pt .constant (0.0 )
513
+ time_covariate_vector = pt .as_tensor_variable (time_covariate_vector )
514
+ # Normalize covariate to be 1D over time
515
+ if time_covariate_vector .ndim == 0 :
516
+ time_covariate_vector = pt .reshape (time_covariate_vector , (1 ,))
517
+ elif time_covariate_vector .ndim > 1 :
518
+ feature_axes = tuple (range (time_covariate_vector .ndim - 1 ))
519
+ time_covariate_vector = pt .sum (time_covariate_vector , axis = feature_axes )
503
520
504
521
return super ().dist ([r , alpha , time_covariate_vector ], * args , ** kwargs )
505
522
506
523
def logp (value , r , alpha , time_covariate_vector ):
507
- logp = pt .log (
508
- pt .pow (alpha / (alpha + C_t (value - 1 , time_covariate_vector )), r )
509
- - pt .pow (alpha / (alpha + C_t (value , time_covariate_vector )), r )
510
- )
511
-
512
- # Handle invalid values
513
- logp = pt .switch (
514
- pt .or_ (
515
- value < 1 , # Value must be >= 1
516
- pt .isnan (logp ), # Handle NaN cases
517
- ),
518
- - np .inf ,
519
- logp ,
520
- )
524
+ v = pt .as_tensor_variable (value )
525
+ ct_prev = C_t (v - 1 , time_covariate_vector )
526
+ ct_curr = C_t (v , time_covariate_vector )
527
+ logS_prev = r * (pt .log (alpha ) - pt .log (alpha + ct_prev ))
528
+ logS_curr = r * (pt .log (alpha ) - pt .log (alpha + ct_curr ))
529
+ # Compute log(exp(logS_prev) - exp(logS_curr)) stably
530
+ max_logS = pt .maximum (logS_prev , logS_curr )
531
+ diff = pt .exp (logS_prev - max_logS ) - pt .exp (logS_curr - max_logS )
532
+ logp = max_logS + pt .log (diff )
533
+
534
+ # Handle invalid / out-of-domain values
535
+ logp = pt .switch (value < 1 , - np .inf , logp )
521
536
522
537
return check_parameters (
523
538
logp ,
@@ -527,9 +542,15 @@ def logp(value, r, alpha, time_covariate_vector):
527
542
)
528
543
529
544
def logcdf (value , r , alpha , time_covariate_vector ):
530
- logcdf = r * (
531
- pt .log (C_t (value , time_covariate_vector ))
532
- - pt .log (alpha + C_t (value , time_covariate_vector ))
545
+ # Log CDF: log(1 - (alpha / (alpha + C(t)))**r)
546
+ t = pt .as_tensor_variable (value )
547
+ ct = C_t (t , time_covariate_vector )
548
+ logS = r * (pt .log (alpha ) - pt .log (alpha + ct ))
549
+ # Numerically stable log(1 - exp(logS))
550
+ logcdf = pt .switch (
551
+ pt .lt (logS , np .log (0.5 )),
552
+ pt .log1p (- pt .exp (logS )),
553
+ pt .log (- pt .expm1 (logS )),
533
554
)
534
555
535
556
return check_parameters (
@@ -550,7 +571,6 @@ def support_point(rv, size, r, alpha, time_covariate_vector):
550
571
When time_covariate_vector is provided, it affects the expected value through
551
572
the exponential link function: exp(time_covariate_vector).
552
573
"""
553
-
554
574
base_lambda = r / alpha
555
575
556
576
# Approximate expected value of geometric distribution
@@ -560,8 +580,11 @@ def support_point(rv, size, r, alpha, time_covariate_vector):
560
580
1.0 / (1.0 - pt .exp (- base_lambda )), # Full expression for larger lambda
561
581
)
562
582
563
- # Apply time covariates if provided
564
- mean = mean * pt .exp (time_covariate_vector .sum (axis = 0 ))
583
+ # Apply time covariates if provided: multiply by exp(sum over axis=0)
584
+ # This yields a scalar for 1D covariates and a time-length vector for 2D (features x time)
585
+ tcv = pt .as_tensor_variable (time_covariate_vector )
586
+ if tcv .ndim != 0 :
587
+ mean = mean * pt .exp (tcv .sum (axis = 0 ))
565
588
566
589
# Round up to nearest integer and ensure >= 1
567
590
mean = pt .maximum (pt .ceil (mean ), 1.0 )
@@ -575,14 +598,27 @@ def support_point(rv, size, r, alpha, time_covariate_vector):
575
598
576
599
def C_t(t: pt.TensorVariable, time_covariate_vector: pt.TensorVariable) -> pt.TensorVariable:
    """Utility for processing time-varying covariates in GrassiaIIGeometric distribution.

    Computes the cumulative covariate multiplier C(t): the sum of per-time-step
    exp(covariate) contributions over the first ``t`` time steps. When no
    covariates were supplied (scalar placeholder), degenerates to ``t`` itself,
    which corresponds to a constant unit multiplier per time step.

    Parameters
    ----------
    t : pt.TensorVariable
        Time index (scalar or 1D). Cast to int64 internally; the mask uses an
        exclusive upper bound, so ``t`` counts whole completed time steps.
        NOTE(review): assumes ``t >= 0`` — negative values yield an empty mask
        and a zero sum; confirm callers guarantee this.
    time_covariate_vector : pt.TensorVariable
        Either a scalar placeholder (no covariates), a 1D vector indexed by
        time, or a higher-rank tensor whose axis 0 is summed out — presumably
        a feature axis with time as the remaining axis; verify against callers.

    Returns
    -------
    pt.TensorVariable
        Cumulative sum of exp(covariate) over time steps 0..t-1, squeezed to a
        scalar when ``t`` was scalar.
    """
    # No covariates: a scalar placeholder was passed, so each time step
    # contributes 1 and the cumulative value is just t.
    if time_covariate_vector.ndim == 0:
        return t

    # Per-time-step contribution exp(covariate_t).
    if time_covariate_vector.ndim == 1:
        per_time_sum = pt.exp(time_covariate_vector)
    else:
        # Rank >= 2: collapse axis 0 (the non-time axis here) so the result is
        # indexed by time along its leading remaining axis.
        per_time_sum = pt.sum(pt.exp(time_covariate_vector), axis=0)

    # Build the cumulative sum up to each t with a broadcasted mask instead of
    # advanced indexing (keeps the graph free of take/gather ops).
    time_length = pt.shape(per_time_sum)[0]
    # Ensure t is integer-valued and at least 1D so it broadcasts against the
    # time axis below.
    t_vec = pt.cast(t, "int64")
    t_vec = pt.shape_padleft(t_vec) if t_vec.ndim == 0 else t_vec
    # Time indices [0, 1, ..., T-1].
    time_idx = pt.arange(time_length, dtype="int64")
    # mask[i, j] = (j < t_i): selects the first t complete time steps
    # (exclusive upper bound). For t > T every step is selected, so the sum
    # implicitly saturates at the full observed horizon.
    mask = pt.lt(time_idx, pt.shape_padright(t_vec, 1))
    # Sum the masked per-time contributions over the time axis; shapes are
    # (1, T) * (n, T) -> (n, T) -> (n,) by broadcasting.
    base_sum = pt.sum(pt.shape_padleft(per_time_sum) * mask, axis=-1)
    # If the original t was scalar, return a scalar.
    return pt.squeeze(base_sum)
0 commit comments