Add beta distribution #391

Open
wants to merge 3 commits into master
Changes from 2 commits

2 changes: 2 additions & 0 deletions ngboost/distns/__init__.py
@@ -1,4 +1,5 @@
"""NGBoost distributions"""
from .beta import Beta
from .categorical import Bernoulli, k_categorical
from .cauchy import Cauchy
from .distn import ClassificationDistn, Distn, RegressionDistn
@@ -15,6 +16,7 @@

__all__ = [
"Bernoulli",
"Beta",
"k_categorical",
"Cauchy",
"ClassificationDistn",
82 changes: 82 additions & 0 deletions ngboost/distns/beta.py
@@ -0,0 +1,82 @@
"""The NGBoost Beta distribution and scores"""
import numpy as np
from scipy.special import digamma, polygamma
from scipy.stats import beta as dist

from ngboost.distns.distn import RegressionDistn
from ngboost.scores import LogScore


class BetaLogScore(LogScore):
"""Log score for the Beta distribution."""

def score(self, Y):
"""Calculate the log score for the Beta distribution."""
return -self.dist.logpdf(Y)

def d_score(self, Y):
"""Calculate the derivative of the log score with respect to the parameters."""
D = np.zeros(
(len(Y), 2)
) # first col is dS/d(log(a)), second col is dS/d(log(b))
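# chain rule: dS/d(log a) = a * dS/da, with dS/da = -(digamma(a+b) - digamma(a) + log Y);
# the same pattern holds for b with log(1 - Y)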
D[:, 0] = -self.a * (digamma(self.a + self.b) - digamma(self.a) + np.log(Y))
D[:, 1] = -self.b * (digamma(self.a + self.b) - digamma(self.b) + np.log(1 - Y))
return D

def metric(self):
"""Return the Fisher Information matrix for the Beta distribution."""
FI = np.zeros((self.a.shape[0], 2, 2))
trigamma_a_b = polygamma(1, self.a + self.b)
FI[:, 0, 0] = self.a**2 * (polygamma(1, self.a) - trigamma_a_b)
FI[:, 0, 1] = -self.a * self.b * trigamma_a_b
FI[:, 1, 0] = -self.a * self.b * trigamma_a_b
FI[:, 1, 1] = self.b**2 * (polygamma(1, self.b) - trigamma_a_b)
return FI


class Beta(RegressionDistn):
"""
Implements the Beta distribution for NGBoost.

The Beta distribution has two shape parameters, a and b, which are fitted
internally on the log scale so that both stay positive.
The scipy loc and scale parameters are held fixed at 0 and 1 in this implementation.
LogScore is the only score supported for the Beta distribution.
"""

n_params = 2
scores = [BetaLogScore]  # only the log score is implemented for Beta

# pylint: disable=super-init-not-called
def __init__(self, params):
self._params = params

# create other objects that will be useful later
self.log_a = params[0]
self.log_b = params[1]
self.a = np.exp(params[0]) # since params[0] is log(a)
self.b = np.exp(params[1]) # since params[1] is log(b)
@BaerVervergaert (Author), Jul 6, 2025:

Might need to introduce clipping here because sometimes the algorithm overflows and sets value a or b to 0.
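
One possible sketch of such clipping in __init__ (hypothetical, not part of this diff; the bounds below are only illustrative):

self.a = np.clip(np.exp(params[0]), 1e-10, 1e10)  # guard against underflow/overflow in exp
self.b = np.clip(np.exp(params[1]), 1e-10, 1e10)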

self.dist = dist(a=self.a, b=self.b)

@staticmethod
def fit(Y):
"""Fit the distribution to the data."""
# Use scipy's beta distribution to fit the parameters
# pylint: disable=unused-variable
a, b, loc, scale = dist.fit(Y, floc=0, fscale=1)
return np.array([np.log(a), np.log(b)])

def sample(self, m):
"""Sample from the distribution."""
return np.array([self.dist.rvs() for i in range(m)])

def __getattr__(
self, name
): # gives us access to Beta.mean() required for RegressionDist.predict()
if name in dir(self.dist):
return getattr(self.dist, name)
return None

@property
def params(self):
"""Return the parameters of the Beta distribution."""
return {"a": self.a, "b": self.b}
29 changes: 29 additions & 0 deletions tests/test_distns.py
@@ -9,6 +9,7 @@
from ngboost import NGBClassifier, NGBRegressor, NGBSurvival
from ngboost.distns import (
Bernoulli,
Beta,
Cauchy,
Distn,
Exponential,
@@ -166,6 +167,34 @@ def test_bernoulli(learner, classification_data: Tuple4Array):
# test properties of output


@pytest.mark.slow
@pytest.mark.parametrize(
"learner",
[
DecisionTreeRegressor(criterion="friedman_mse", max_depth=5),
DecisionTreeRegressor(criterion="friedman_mse", max_depth=3),
],
)
def test_beta(learner, regression_data: Tuple4Array):
X_reg_train, X_reg_test, Y_reg_train, Y_reg_test = regression_data

# Scale the target to (0, 1) range for Beta distribution
min_value = min(Y_reg_train.min(), Y_reg_test.min())
max_value = max(Y_reg_train.max(), Y_reg_test.max())
Y_reg_train = (Y_reg_train - min_value) / (max_value - min_value)
Y_reg_train = np.clip(Y_reg_train, 1e-5, 1 - 1e-5) # Avoid log(0) issues
Y_reg_test = (Y_reg_test - min_value) / (max_value - min_value)
Y_reg_test = np.clip(Y_reg_test, 1e-5, 1 - 1e-5) # Avoid log(0) issues

# test early stopping features
# test other args, n_trees, LR, minibatching- args as fixture
ngb = NGBRegressor(Dist=Beta, Score=LogScore, Base=learner, verbose=False)
ngb.fit(X_reg_train, Y_reg_train)
y_pred = ngb.predict(X_reg_test)
y_dist = ngb.pred_dist(X_reg_test)
# test properties of output


@pytest.mark.slow
@pytest.mark.parametrize("k", [2, 4, 7])
@pytest.mark.parametrize(