
Commit 1fb9777

add beta distribution
1 parent 10d6710 commit 1fb9777

3 files changed: +104 additions, 0 deletions


ngboost/distns/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -1,4 +1,5 @@
 """NGBoost distributions"""
+from .beta import Beta
 from .categorical import Bernoulli, k_categorical
 from .cauchy import Cauchy
 from .distn import ClassificationDistn, Distn, RegressionDistn
@@ -15,6 +16,7 @@

 __all__ = [
     "Bernoulli",
+    "Beta",
     "k_categorical",
     "Cauchy",
     "ClassificationDistn",

ngboost/distns/beta.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+"""The NGBoost Beta distribution and scores"""
+import numpy as np
+from scipy.special import digamma, polygamma
+from scipy.stats import beta as dist
+
+from ngboost.distns.distn import RegressionDistn
+from ngboost.scores import LogScore
+
+
+class BetaLogScore(LogScore):
+    """Log score for the Beta distribution."""
+
+    def score(self, Y):
+        """Calculate the log score (negative log-likelihood) for the Beta distribution."""
+        return -self.dist.logpdf(Y)
+
+    def d_score(self, Y):
+        """Calculate the derivative of the log score with respect to the internal parameters."""
+        D = np.zeros((len(Y), 2))  # first col is dS/d(log(a)), second col is dS/d(log(b))
+        D[:, 0] = -self.a * (digamma(self.a + self.b) - digamma(self.a) + np.log(Y))
+        D[:, 1] = -self.b * (digamma(self.a + self.b) - digamma(self.b) + np.log(1 - Y))
+        return D
+
+    def metric(self):
+        """Return the Fisher information matrix for the Beta distribution."""
+        FI = np.zeros((self.a.shape[0], 2, 2))
+        trigamma_a_b = polygamma(1, self.a + self.b)
+        FI[:, 0, 0] = self.a**2 * (polygamma(1, self.a) - trigamma_a_b)
+        FI[:, 0, 1] = -self.a * self.b * trigamma_a_b
+        FI[:, 1, 0] = -self.a * self.b * trigamma_a_b
+        FI[:, 1, 1] = self.b**2 * (polygamma(1, self.b) - trigamma_a_b)
+        return FI
+
+
+class Beta(RegressionDistn):
+    """
+    Implements the Beta distribution for NGBoost.
+
+    The Beta distribution has two shape parameters, a and b, which are fit
+    internally on the log scale. The scipy loc and scale parameters are held
+    constant for this implementation. LogScore is supported for the Beta
+    distribution.
+    """
+
+    n_params = 2
+    scores = [BetaLogScore]
+
+    def __init__(self, params):
+        self._params = params
+
+        # create other objects that will be useful later
+        self.log_a = params[0]
+        self.log_b = params[1]
+        self.a = np.exp(params[0])  # since params[0] is log(a)
+        self.b = np.exp(params[1])  # since params[1] is log(b)
+        self.dist = dist(a=self.a, b=self.b)
+
+    @staticmethod
+    def fit(Y):
+        """Fit the distribution to the data."""
+        # use scipy's beta distribution to fit a and b, holding loc and scale fixed
+        a, b, _loc, _scale = dist.fit(Y, floc=0, fscale=1)
+        return np.array([np.log(a), np.log(b)])
+
+    def sample(self, m):
+        """Draw m samples from the distribution."""
+        return np.array([self.dist.rvs() for _ in range(m)])
+
+    def __getattr__(self, name):
+        # delegate to the frozen scipy distribution; gives us access to
+        # Beta.mean(), which RegressionDistn.predict() requires
+        if name in dir(self.dist):
+            return getattr(self.dist, name)
+        return None
+
+    @property
+    def params(self):
+        """Return the user-facing parameters of the Beta distribution."""
+        return {"a": self.a, "b": self.b}
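For reference, the expressions implemented in d_score and metric follow from the Beta log-density; this derivation is added here for clarity and is not part of the commit. Writing psi for the digamma function and psi_1 for the trigamma function, with internal parameters (log a, log b):

    S(y) = -\log p(y \mid a, b) = \log B(a, b) - (a - 1)\log y - (b - 1)\log(1 - y)

    \frac{\partial S}{\partial \log a} = -a\bigl(\psi(a + b) - \psi(a) + \log y\bigr),
    \qquad
    \frac{\partial S}{\partial \log b} = -b\bigl(\psi(a + b) - \psi(b) + \log(1 - y)\bigr)

    \mathcal{I}_{(\log a,\, \log b)} =
    \begin{pmatrix}
    a^{2}\bigl(\psi_1(a) - \psi_1(a + b)\bigr) & -ab\,\psi_1(a + b) \\
    -ab\,\psi_1(a + b) & b^{2}\bigl(\psi_1(b) - \psi_1(a + b)\bigr)
    \end{pmatrix}

The Fisher information in the log parameters is J^T I(a, b) J with J = diag(a, b), which is exactly the matrix that metric() assembles row by row.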

tests/test_distns.py

Lines changed: 29 additions & 0 deletions
@@ -9,6 +9,7 @@
 from ngboost import NGBClassifier, NGBRegressor, NGBSurvival
 from ngboost.distns import (
     Bernoulli,
+    Beta,
     Cauchy,
     Distn,
     Exponential,
@@ -166,6 +167,34 @@ def test_bernoulli(learner, classification_data: Tuple4Array):
     # test properties of output


+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "learner",
+    [
+        DecisionTreeRegressor(criterion="friedman_mse", max_depth=5),
+        DecisionTreeRegressor(criterion="friedman_mse", max_depth=3),
+    ],
+)
+def test_beta(learner, regression_data: Tuple4Array):
+    X_reg_train, X_reg_test, Y_reg_train, Y_reg_test = regression_data
+
+    # scale the target to the (0, 1) range for the Beta distribution
+    min_value = min(Y_reg_train.min(), Y_reg_test.min())
+    max_value = max(Y_reg_train.max(), Y_reg_test.max())
+    Y_reg_train = (Y_reg_train - min_value) / (max_value - min_value)
+    Y_reg_train = np.clip(Y_reg_train, 1e-5, 1 - 1e-5)  # avoid log(0) issues
+    Y_reg_test = (Y_reg_test - min_value) / (max_value - min_value)
+    Y_reg_test = np.clip(Y_reg_test, 1e-5, 1 - 1e-5)  # avoid log(0) issues
+
+    # test early stopping features
+    # test other args, n_trees, LR, minibatching- args as fixture
+    ngb = NGBRegressor(Dist=Beta, Score=LogScore, Base=learner, verbose=False)
+    ngb.fit(X_reg_train, Y_reg_train)
+    y_pred = ngb.predict(X_reg_test)
+    y_dist = ngb.pred_dist(X_reg_test)
+    # test properties of output
+
+
 @pytest.mark.slow
 @pytest.mark.parametrize("k", [2, 4, 7])
 @pytest.mark.parametrize(
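A minimal end-to-end sketch of the new distribution in use, mirroring the test above; the synthetic data, seed, and hyperparameters here are illustrative assumptions, not part of the commit:

    # illustrative sketch: fit NGBoost with the new Beta distribution
    import numpy as np
    from sklearn.model_selection import train_test_split

    from ngboost import NGBRegressor
    from ngboost.distns import Beta
    from ngboost.scores import LogScore

    # synthetic target already in (0, 1); clip to dodge log(0), as the test does
    rng = np.random.default_rng(0)
    X = rng.normal(size=(500, 3))
    y = np.clip(rng.beta(a=2.0, b=5.0, size=500), 1e-5, 1 - 1e-5)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ngb = NGBRegressor(Dist=Beta, Score=LogScore, verbose=False)
    ngb.fit(X_train, y_train)

    y_pred = ngb.predict(X_test)    # point prediction via the Beta mean
    y_dist = ngb.pred_dist(X_test)  # full predictive distribution
    print(y_dist.params["a"][:3], y_dist.params["b"][:3])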
