diff --git a/ngboost/distns/__init__.py b/ngboost/distns/__init__.py
index 2a574c8..40858c0 100644
--- a/ngboost/distns/__init__.py
+++ b/ngboost/distns/__init__.py
@@ -1,4 +1,5 @@
 """NGBoost distributions"""
+from .beta import Beta
 from .categorical import Bernoulli, k_categorical
 from .cauchy import Cauchy
 from .distn import ClassificationDistn, Distn, RegressionDistn
@@ -15,6 +16,7 @@
 __all__ = [
     "Bernoulli",
+    "Beta",
     "k_categorical",
     "Cauchy",
     "ClassificationDistn",
     "Distn",
diff --git a/ngboost/distns/beta.py b/ngboost/distns/beta.py
new file mode 100644
index 0000000..d3c1edb
--- /dev/null
+++ b/ngboost/distns/beta.py
@@ -0,0 +1,82 @@
+"""The NGBoost Beta distribution and scores"""
+import numpy as np
+from scipy.special import digamma, polygamma
+from scipy.stats import beta as dist
+
+from ngboost.distns.distn import RegressionDistn
+from ngboost.scores import LogScore
+
+
+class BetaLogScore(LogScore):
+    """Log score for the Beta distribution."""
+
+    def score(self, Y):
+        """Calculate the log score for the Beta distribution."""
+        return -self.dist.logpdf(Y)
+
+    def d_score(self, Y):
+        """Calculate the derivative of the log score with respect to the parameters."""
+        D = np.zeros(
+            (len(Y), 2)
+        )  # first col is dS/d(log(a)), second col is dS/d(log(b))
+        D[:, 0] = -self.a * (digamma(self.a + self.b) - digamma(self.a) + np.log(Y))
+        D[:, 1] = -self.b * (digamma(self.a + self.b) - digamma(self.b) + np.log(1 - Y))
+        return D
+
+    def metric(self):
+        """Return the Fisher Information matrix for the Beta distribution."""
+        FI = np.zeros((self.a.shape[0], 2, 2))
+        trigamma_a_b = polygamma(1, self.a + self.b)
+        FI[:, 0, 0] = self.a**2 * (polygamma(1, self.a) - trigamma_a_b)
+        FI[:, 0, 1] = -self.a * self.b * trigamma_a_b
+        FI[:, 1, 0] = -self.a * self.b * trigamma_a_b
+        FI[:, 1, 1] = self.b**2 * (polygamma(1, self.b) - trigamma_a_b)
+        return FI
+
+
+class Beta(RegressionDistn):
+    """
+    Implements the Beta distribution for NGBoost.
+
+    The Beta distribution has two parameters, a and b.
+    The scipy loc and scale parameters are held constant for this implementation.
+    LogScore is supported for the Beta distribution.
+    """
+
+    n_params = 2
+    scores = [BetaLogScore]
+
+    # pylint: disable=super-init-not-called
+    def __init__(self, params):
+        self._params = params
+
+        # create other objects that will be useful later
+        self.log_a = params[0]
+        self.log_b = params[1]
+        self.a = np.exp(params[0])  # since params[0] is log(a)
+        self.b = np.exp(params[1])  # since params[1] is log(b)
+        self.dist = dist(a=self.a, b=self.b)
+
+    @staticmethod
+    def fit(Y):
+        """Fit the distribution to the data."""
+        # Use scipy's beta distribution to fit the parameters
+        # pylint: disable=unused-variable
+        a, b, loc, scale = dist.fit(Y, floc=0, fscale=1)
+        return np.array([np.log(a), np.log(b)])
+
+    def sample(self, m):
+        """Sample from the distribution."""
+        return np.array([self.dist.rvs() for i in range(m)])
+
+    def __getattr__(
+        self, name
+    ):  # gives us access to Beta.mean() required for RegressionDistn.predict()
+        if name in dir(self.dist):
+            return getattr(self.dist, name)
+        return None
+
+    @property
+    def params(self):
+        """Return the parameters of the Beta distribution."""
+        return {"a": self.a, "b": self.b}
diff --git a/tests/test_distns.py b/tests/test_distns.py
index 4b75d00..8e5ef51 100644
--- a/tests/test_distns.py
+++ b/tests/test_distns.py
@@ -9,6 +9,7 @@
 from ngboost import NGBClassifier, NGBRegressor, NGBSurvival
 from ngboost.distns import (
     Bernoulli,
+    Beta,
     Cauchy,
     Distn,
     Exponential,
@@ -166,6 +167,34 @@ def test_bernoulli(learner, classification_data: Tuple4Array):
     # test properties of output
 
 
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "learner",
+    [
+        DecisionTreeRegressor(criterion="friedman_mse", max_depth=5),
+        DecisionTreeRegressor(criterion="friedman_mse", max_depth=3),
+    ],
+)
+def test_beta(learner, regression_data: Tuple4Array):
+    X_reg_train, X_reg_test, Y_reg_train, Y_reg_test = regression_data
+
+    # Scale the target to the (0, 1) range required by the Beta distribution
+    min_value = min(Y_reg_train.min(), Y_reg_test.min())
+    max_value = max(Y_reg_train.max(), Y_reg_test.max())
+    Y_reg_train = (Y_reg_train - min_value) / (max_value - min_value)
+    Y_reg_train = np.clip(Y_reg_train, 1e-5, 1 - 1e-5)  # avoid log(0) issues
+    Y_reg_test = (Y_reg_test - min_value) / (max_value - min_value)
+    Y_reg_test = np.clip(Y_reg_test, 1e-5, 1 - 1e-5)  # avoid log(0) issues
+
+    # test early stopping features
+    # test other args, n_trees, LR, minibatching- args as fixture
+    ngb = NGBRegressor(Dist=Beta, Score=LogScore, Base=learner, verbose=False)
+    ngb.fit(X_reg_train, Y_reg_train)
+    y_pred = ngb.predict(X_reg_test)
+    y_dist = ngb.pred_dist(X_reg_test)
+    # test properties of output
+
+
 @pytest.mark.slow
 @pytest.mark.parametrize("k", [2, 4, 7])
 @pytest.mark.parametrize(