diff --git a/README.md b/README.md
index 22896cc51..a5cf7a219 100644
--- a/README.md
+++ b/README.md
@@ -107,65 +107,6 @@ jupytext --to notebook example.py
 jupytext --to py:percent example.ipynb
 ```
 
-# Simple example
-
-Let us import some dependencies and simulate a toy dataset $\mathcal{D}$.
-
-```python
-from jax import config
-
-config.update("jax_enable_x64", True)
-
-import gpjax as gpx
-from jax import grad, jit
-import jax.numpy as jnp
-import jax.random as jr
-import optax as ox
-
-key = jr.key(123)
-
-f = lambda x: 10 * jnp.sin(x)
-
-n = 50
-x = jr.uniform(key=key, minval=-3.0, maxval=3.0, shape=(n,1)).sort()
-y = f(x) + jr.normal(key, shape=(n,1))
-D = gpx.Dataset(X=x, y=y)
-
-# Construct the prior
-meanf = gpx.mean_functions.Zero()
-kernel = gpx.kernels.RBF()
-prior = gpx.gps.Prior(mean_function=meanf, kernel = kernel)
-
-# Define a likelihood
-likelihood = gpx.likelihoods.Gaussian(num_datapoints = n)
-
-# Construct the posterior
-posterior = prior * likelihood
-
-# Define an optimiser
-optimiser = ox.adam(learning_rate=1e-2)
-
-# Obtain Type 2 MLEs of the hyperparameters
-opt_posterior, history = gpx.fit(
-    model=posterior,
-    objective=lambda p, d: -gpx.objectives.conjugate_mll(p, d),
-    train_data=D,
-    optim=optimiser,
-    num_iters=500,
-    safe=True,
-    key=key,
-)
-
-# Infer the predictive posterior distribution
-xtest = jnp.linspace(-3., 3., 100).reshape(-1, 1)
-latent_dist = opt_posterior(xtest, D)
-predictive_dist = opt_posterior.likelihood(latent_dist)
-
-# Obtain the predictive mean and standard deviation
-pred_mean = predictive_dist.mean()
-pred_std = predictive_dist.stddev()
-```
-
 # Installation
 
 ## Stable version
diff --git a/docs/scripts/sharp_bits_figure.py b/docs/scripts/sharp_bits_figure.py
index ef0622d52..53700aac8 100644
--- a/docs/scripts/sharp_bits_figure.py
+++ b/docs/scripts/sharp_bits_figure.py
@@ -69,12 +69,12 @@
 plt.savefig("../_static/step_size_figure.png", bbox_inches="tight")
 
 # %%
-import tensorflow_probability.substrates.jax.bijectors as tfb
+import numpyro.distributions.transforms as npt
 
-bij = tfb.Exp()
+bij = npt.ExpTransform()
 
 x = np.linspace(0.05, 3.0, 6)
-y = np.asarray(bij.inverse(x))
+y = np.asarray(bij.inv(x))
 lval = 0.5
 rval = 0.52
 
diff --git a/docs/sharp_bits.md b/docs/sharp_bits.md
index 9f165e41c..43b445583 100644
--- a/docs/sharp_bits.md
+++ b/docs/sharp_bits.md
@@ -80,7 +80,7 @@ this value that we apply gradient updates to. When we wish to recover the constr
 value, we apply the inverse of the bijector, which is the exponential function in this
 case. This gives us back the blue cross.
 
-In GPJax, we supply bijective functions using [Tensorflow Probability](https://www.tensorflow.org/probability/api_docs/python/tfp/substrates/jax/bijectors).
+In GPJax, we supply bijective functions using [Numpyro](https://num.pyro.ai/en/stable/distributions.html#transforms).
 
 
 ## Positive-definiteness
diff --git a/examples/barycentres.py b/examples/barycentres.py
index 6aba09aea..132c0fabf 100644
--- a/examples/barycentres.py
+++ b/examples/barycentres.py
@@ -8,7 +8,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax
 #     language: python
@@ -41,7 +41,7 @@
 import jax.scipy.linalg as jsl
 from jaxtyping import install_import_hook
 import matplotlib.pyplot as plt
-import tensorflow_probability.substrates.jax.distributions as tfd
+import numpyro.distributions as npd
 
 from examples.utils import use_mpl_style
 
@@ -161,7 +161,7 @@
 
 
 # %%
-def fit_gp(x: jax.Array, y: jax.Array) -> tfd.MultivariateNormalFullCovariance:
+def fit_gp(x: jax.Array, y: jax.Array) -> npd.MultivariateNormal:
     if y.ndim == 1:
         y = y.reshape(-1, 1)
     D = gpx.Dataset(X=x, y=y)
@@ -204,9 +204,9 @@ def sqrtm(A: jax.Array):
 
 
 def wasserstein_barycentres(
-    distributions: tp.List[tfd.MultivariateNormalFullCovariance], weights: jax.Array
+    distributions: tp.List[npd.MultivariateNormal], weights: jax.Array
 ):
-    covariances = [d.covariance() for d in distributions]
+    covariances = [d.covariance_matrix for d in distributions]
     cov_stack = jnp.stack(covariances)
     stack_sqrt = jax.vmap(sqrtm)(cov_stack)
 
@@ -231,7 +231,7 @@ def step(covariance_candidate: jax.Array, idx: None):
 # %%
 weights = jnp.ones((n_datasets,)) / n_datasets
 
-means = jnp.stack([d.mean() for d in posterior_preds])
+means = jnp.stack([d.mean for d in posterior_preds])
 barycentre_mean = jnp.tensordot(weights, means, axes=1)
 
 step_fn = jax.jit(wasserstein_barycentres(posterior_preds, weights))
@@ -242,7 +242,7 @@ def step(covariance_candidate: jax.Array, idx: None):
 )
 L = jnp.linalg.cholesky(barycentre_covariance)
 
-barycentre_process = tfd.MultivariateNormalTriL(barycentre_mean, L)
+barycentre_process = npd.MultivariateNormal(barycentre_mean, scale_tril=L)
 
 # %% [markdown]
 # ## Plotting the result
@@ -254,7 +254,7 @@ def step(covariance_candidate: jax.Array, idx: None):
 
 # %%
 def plot(
-    dist: tfd.MultivariateNormalTriL,
+    dist: npd.MultivariateNormal,
     ax,
     color: str,
     label: str = None,
@@ -262,8 +262,8 @@ def plot(
     linewidth: float = 1.0,
     zorder: int = 0,
 ):
-    mu = dist.mean()
-    sigma = dist.stddev()
+    mu = dist.mean
+    sigma = jnp.sqrt(dist.variance)
     ax.plot(xtest, mu, linewidth=linewidth, color=color, label=label, zorder=zorder)
     ax.fill_between(
         xtest.squeeze(),
diff --git a/examples/classification.py b/examples/classification.py
index b8a8ec4e8..96d0d3613 100644
--- a/examples/classification.py
+++ b/examples/classification.py
@@ -8,7 +8,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax
 #     language: python
@@ -37,8 +37,8 @@
     install_import_hook,
 )
 import matplotlib.pyplot as plt
+import numpyro.distributions as npd
 import optax as ox
-import tensorflow_probability.substrates.jax as tfp
 
 from examples.utils import use_mpl_style
 from gpjax.lower_cholesky import lower_cholesky
@@ -50,7 +50,6 @@
     import gpjax as gpx
 
 
-tfd = tfp.distributions
 identity_matrix = jnp.eye
 
 # set the default style for plotting
@@ -120,7 +119,6 @@
 # Optax's optimisers.
 
 # %%
-
 optimiser = ox.adam(learning_rate=0.01)
 
 opt_posterior, history = gpx.fit(
@@ -140,8 +138,8 @@
 map_latent_dist = opt_posterior.predict(xtest, train_data=D)
 predictive_dist = opt_posterior.likelihood(map_latent_dist)
 
-predictive_mean = predictive_dist.mean()
-predictive_std = predictive_dist.stddev()
+predictive_mean = predictive_dist.mean
+predictive_std = jnp.sqrt(predictive_dist.variance)
 
 fig, ax = plt.subplots()
 ax.scatter(x, y, label="Observations", color=cols[0])
@@ -215,8 +213,6 @@
 # datapoints below.
 
 # %%
-
-
 gram, cross_covariance = (kernel.gram, kernel.cross_covariance)
 jitter = 1e-6
 
@@ -246,7 +242,7 @@ def loss(params, D):
 L_inv = jsp.linalg.solve_triangular(L, identity_matrix(D.n), lower=True)
 H_inv = jsp.linalg.solve_triangular(L.T, L_inv, lower=False)
 LH = jnp.linalg.cholesky(H_inv)
-laplace_approximation = tfd.MultivariateNormalTriL(f_hat.squeeze(), LH)
+laplace_approximation = npd.MultivariateNormal(f_hat.squeeze(), scale_tril=LH)
 
 
 # %% [markdown]
@@ -265,7 +261,7 @@ def loss(params, D):
 
 
 # %%
-def construct_laplace(test_inputs: Float[Array, "N D"]) -> tfd.MultivariateNormalTriL:
+def construct_laplace(test_inputs: Float[Array, "N D"]) -> npd.MultivariateNormal:
     map_latent_dist = opt_posterior.predict(xtest, train_data=D)
 
     Kxt = opt_posterior.prior.kernel.cross_covariance(x, test_inputs)
@@ -279,10 +275,10 @@ def construct_laplace(test_inputs: Float[Array, "N D"]) -> tfd.MultivariateNorma
     # Ktx Kxx⁻¹[ H⁻¹ ] Kxx⁻¹ Kxt
     laplace_cov_term = jnp.matmul(jnp.matmul(Kxx_inv_Kxt.T, H_inv), Kxx_inv_Kxt)
 
-    mean = map_latent_dist.mean()
-    covariance = map_latent_dist.covariance() + laplace_cov_term
+    mean = map_latent_dist.mean
+    covariance = map_latent_dist.covariance_matrix + laplace_cov_term
     L = jnp.linalg.cholesky(covariance)
-    return tfd.MultivariateNormalTriL(jnp.atleast_1d(mean.squeeze()), L)
+    return npd.MultivariateNormal(jnp.atleast_1d(mean.squeeze()), scale_tril=L)
 
 
 # %% [markdown]
@@ -291,8 +287,8 @@ def construct_laplace(test_inputs: Float[Array, "N D"]) -> tfd.MultivariateNorma
 laplace_latent_dist = construct_laplace(xtest)
 predictive_dist = opt_posterior.likelihood(laplace_latent_dist)
 
-predictive_mean = predictive_dist.mean()
-predictive_std = predictive_dist.stddev()
+predictive_mean = predictive_dist.mean
+predictive_std = jnp.sqrt(predictive_dist.variance)
 
 fig, ax = plt.subplots()
 ax.scatter(x, y, label="Observations", color=cols[0])
diff --git a/examples/collapsed_vi.py b/examples/collapsed_vi.py
index b6337ea48..08d7408c9 100644
--- a/examples/collapsed_vi.py
+++ b/examples/collapsed_vi.py
@@ -7,7 +7,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax_beartype
 #     language: python
@@ -161,10 +161,10 @@
 
 inducing_points = opt_posterior.inducing_inputs.value
 
-samples = latent_dist.sample(seed=key, sample_shape=(20,))
+samples = latent_dist.sample(key=key, sample_shape=(20,))
 
-predictive_mean = predictive_dist.mean()
-predictive_std = predictive_dist.stddev()
+predictive_mean = predictive_dist.mean
+predictive_std = jnp.sqrt(predictive_dist.variance)
 
 fig, ax = plt.subplots()
 
diff --git a/examples/constructing_new_kernels.py b/examples/constructing_new_kernels.py
index 58cf3672d..7d7c3363e 100644
--- a/examples/constructing_new_kernels.py
+++ b/examples/constructing_new_kernels.py
@@ -8,7 +8,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax
 #     language: python
@@ -24,6 +24,7 @@
 # %%
 # Enable Float64 for more stable matrix inversions.
 from jax import config
+from jax.nn import softplus
 import jax.numpy as jnp
 import jax.random as jr
 from jaxtyping import (
@@ -32,7 +33,9 @@
     install_import_hook,
 )
 import matplotlib.pyplot as plt
-import tensorflow_probability.substrates.jax as tfp
+import numpyro.distributions as npd
+from numpyro.distributions import constraints
+import numpyro.distributions.transforms as npt
 
 from examples.utils import use_mpl_style
 from gpjax.kernels.computations import DenseKernelComputation
@@ -225,9 +228,27 @@ def angular_distance(x, y, c):
     return jnp.abs((x - y + c) % (c * 2) - c)
 
 
-bij = tfb.SoftClip(low=jnp.array(4.0, dtype=jnp.float64))
+class ShiftedSoftplusTransform(npt.ParameterFreeTransform):
+    r"""
+    Transform from unconstrained space to the domain [4, infinity) via
+    :math:`y = 4 + \log(1 + \exp(x))`. The inverse is computed as
+    :math:`x = \log(\exp(y - 4) - 1)`.
+    """
 
-DEFAULT_BIJECTION["polar"] = bij
+    domain = constraints.real
+    codomain = constraints.interval(4.0, jnp.inf)  # updated codomain
+
+    def __call__(self, x):
+        return 4.0 + softplus(x)  # shift the softplus output by 4
+
+    def _inverse(self, y):
+        return npt._softplus_inv(y - 4.0)  # subtract the shift in the inverse
+
+    def log_abs_det_jacobian(self, x, y, intermediates=None):
+        return -softplus(-x)
+
+
+DEFAULT_BIJECTION["polar"] = ShiftedSoftplusTransform()
 
 
 class Polar(gpx.kernels.AbstractKernel):
@@ -307,7 +328,7 @@ def __call__(
 
 # %%
 posterior_rv = opt_posterior.likelihood(opt_posterior.predict(angles, train_data=D))
-mu = posterior_rv.mean()
+mu = posterior_rv.mean
 one_sigma = posterior_rv.stddev()
 
 # %%
diff --git a/examples/deep_kernels.py b/examples/deep_kernels.py
index 5b3b945f2..d40f844b3 100644
--- a/examples/deep_kernels.py
+++ b/examples/deep_kernels.py
@@ -8,7 +8,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax
 #     language: python
@@ -238,8 +238,8 @@ def __call__(self, x: jax.Array) -> jax.Array:
 latent_dist = opt_posterior(xtest, train_data=D)
 predictive_dist = opt_posterior.likelihood(latent_dist)
 
-predictive_mean = predictive_dist.mean()
-predictive_std = predictive_dist.stddev()
+predictive_mean = predictive_dist.mean
+predictive_std = jnp.sqrt(predictive_dist.variance)
 
 fig, ax = plt.subplots()
 ax.plot(x, y, "o", label="Observations", color=cols[0])
diff --git a/examples/graph_kernels.py b/examples/graph_kernels.py
index 71c8c65d7..07e06e7e7 100644
--- a/examples/graph_kernels.py
+++ b/examples/graph_kernels.py
@@ -8,7 +8,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax
 #     language: python
@@ -124,7 +124,7 @@
 prior = gpx.gps.Prior(mean_function=gpx.mean_functions.Zero(), kernel=true_kernel)
 
 fx = prior(x)
-y = fx.sample(seed=key, sample_shape=(1,)).reshape(-1, 1)
+y = fx.sample(key=key, sample_shape=(1,)).reshape(-1, 1)
 
 D = gpx.Dataset(X=x, y=y)
 
@@ -194,8 +194,8 @@
 initial_dist = likelihood(posterior(x, D))
 predictive_dist = opt_posterior.likelihood(opt_posterior(x, D))
 
-initial_mean = initial_dist.mean()
-learned_mean = predictive_dist.mean()
+initial_mean = initial_dist.mean
+learned_mean = predictive_dist.mean
 
 rmse = lambda ytrue, ypred: jnp.sum(jnp.sqrt(jnp.square(ytrue - ypred)))
 
diff --git a/examples/intro_to_gps.py b/examples/intro_to_gps.py
index 5a3666771..45d6aeb44 100644
--- a/examples/intro_to_gps.py
+++ b/examples/intro_to_gps.py
@@ -7,7 +7,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax
 #     language: python
@@ -127,9 +127,9 @@
 import jax.random as jr
 import matplotlib as mpl
 import matplotlib.pyplot as plt
+import numpyro.distributions as npd
 import pandas as pd
 import seaborn as sns
-import tensorflow_probability.substrates.jax as tfp
 
 from examples.utils import (
     confidence_ellipse,
@@ -143,11 +143,10 @@
 
 
 cols = mpl.rcParams["axes.prop_cycle"].by_key()["color"]
-tfd = tfp.distributions
 
-ud1 = tfd.Normal(0.0, 1.0)
-ud2 = tfd.Normal(-1.0, 0.5)
-ud3 = tfd.Normal(0.25, 1.5)
+ud1 = npd.Normal(0.0, 1.0)
+ud2 = npd.Normal(-1.0, 0.5)
+ud3 = npd.Normal(0.25, 1.5)
 
 xs = jnp.linspace(-5.0, 5.0, 500)
 
@@ -156,7 +155,7 @@
     ax.plot(
         xs,
         jnp.exp(d.log_prob(xs)),
-        label=f"$\\mathcal{{N}}({{{float(d.mean())}}},\\  {{{float(d.stddev())}}}^2)$",
+        label=f"$\\mathcal{{N}}({{{float(d.mean)}}},\\  {{{float(jnp.sqrt(d.variance))}}}^2)$",
     )
     ax.fill_between(xs, jnp.zeros_like(xs), jnp.exp(d.log_prob(xs)), alpha=0.2)
 ax.legend(loc="best")
@@ -190,12 +189,12 @@
 # %%
 key = jr.key(123)
 
-d1 = tfd.MultivariateNormalDiag(loc=jnp.zeros(2), scale_diag=jnp.ones(2))
-d2 = tfd.MultivariateNormalTriL(
-    jnp.zeros(2), jnp.linalg.cholesky(jnp.array([[1.0, 0.9], [0.9, 1.0]]))
+d1 = npd.MultivariateNormal(loc=jnp.zeros(2), covariance_matrix=jnp.diag(jnp.ones(2)))
+d2 = npd.MultivariateNormal(
+    jnp.zeros(2), scale_tril=jnp.linalg.cholesky(jnp.array([[1.0, 0.9], [0.9, 1.0]]))
 )
-d3 = tfd.MultivariateNormalTriL(
-    jnp.zeros(2), jnp.linalg.cholesky(jnp.array([[1.0, -0.5], [-0.5, 1.0]]))
+d3 = npd.MultivariateNormal(
+    jnp.zeros(2), scale_tril=jnp.linalg.cholesky(jnp.array([[1.0, -0.5], [-0.5, 1.0]]))
 )
 
 dists = [d1, d2, d3]
@@ -215,13 +214,21 @@
 cmap = mpl.colors.LinearSegmentedColormap.from_list("custom", ["white", cols[1]], N=256)
 
 for a, t, d in zip([ax0, ax1, ax2], titles, dists, strict=False):
-    d_prob = d.prob(jnp.hstack([xx.reshape(-1, 1), yy.reshape(-1, 1)])).reshape(
-        xx.shape
+    d_prob = jnp.exp(
+        d.log_prob(jnp.hstack([xx.reshape(-1, 1), yy.reshape(-1, 1)]))
+    ).reshape(xx.shape)
+    cntf = a.contourf(
+        xx,
+        yy,
+        jnp.exp(d_prob),
+        levels=20,
+        antialiased=True,
+        cmap=cmap,
+        edgecolor="face",
     )
-    cntf = a.contourf(xx, yy, jnp.exp(d_prob), levels=20, antialiased=True, cmap=cmap, edgecolor="face")
     a.set_xlim(-2.75, 2.75)
     a.set_ylim(-2.75, 2.75)
-    samples = d.sample(seed=key, sample_shape=(5000,))
+    samples = d.sample(key=key, sample_shape=(5000,))
     xsample, ysample = samples[:, 0], samples[:, 1]
     confidence_ellipse(
         xsample, ysample, a, edgecolor="#3f3f3f", n_std=1.0, linestyle="--", alpha=0.8
@@ -274,13 +281,13 @@
 
 # %%
 n = 1000
-x = tfd.Normal(loc=0.0, scale=1.0).sample(seed=key, sample_shape=(n,))
+x = npd.Normal(loc=0.0, scale=1.0).sample(key, sample_shape=(n,))
 key, subkey = jr.split(key)
-y = tfd.Normal(loc=0.25, scale=0.5).sample(seed=subkey, sample_shape=(n,))
+y = npd.Normal(loc=0.25, scale=0.5).sample(subkey, sample_shape=(n,))
 key, subkey = jr.split(subkey)
-xfull = tfd.Normal(loc=0.0, scale=1.0).sample(seed=subkey, sample_shape=(n * 10,))
+xfull = npd.Normal(loc=0.0, scale=1.0).sample(subkey, sample_shape=(n * 10,))
 key, subkey = jr.split(subkey)
-yfull = tfd.Normal(loc=0.25, scale=0.5).sample(seed=subkey, sample_shape=(n * 10,))
+yfull = npd.Normal(loc=0.25, scale=0.5).sample(subkey, sample_shape=(n * 10,))
 key, subkey = jr.split(subkey)
 df = pd.DataFrame({"x": x, "y": y, "idx": jnp.ones(n)})
 
diff --git a/examples/intro_to_kernels.py b/examples/intro_to_kernels.py
index 7cf9038dc..49a2d9bb7 100644
--- a/examples/intro_to_kernels.py
+++ b/examples/intro_to_kernels.py
@@ -8,7 +8,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax
 #     language: python
@@ -204,7 +204,7 @@
 for k, ax in zip(kernels, axes.ravel(), strict=False):
     prior = gpx.gps.Prior(mean_function=meanf, kernel=k)
     rv = prior(x)
-    y = rv.sample(seed=key, sample_shape=(10,))
+    y = rv.sample(key=key, sample_shape=(10,))
     ax.plot(x, y.T, alpha=0.7)
     ax.set_title(k.name)
 
@@ -292,8 +292,8 @@ def forrester(x: Float[Array, "N"]) -> Float[Array, "N"]:  # noqa: F821
 
 # %%
 def plot_ribbon(ax, x, dist, color):
-    mean = dist.mean()
-    std = dist.stddev()
+    mean = dist.mean
+    std = jnp.sqrt(dist.variance)
     ax.plot(x, mean, label="Predictive mean", color=color)
     ax.fill_between(
         x.squeeze(),
@@ -311,8 +311,8 @@ def plot_ribbon(ax, x, dist, color):
 opt_latent_dist = opt_posterior.predict(test_x, train_data=D)
 opt_predictive_dist = opt_posterior.likelihood(opt_latent_dist)
 
-opt_predictive_mean = opt_predictive_dist.mean()
-opt_predictive_std = opt_predictive_dist.stddev()
+opt_predictive_mean = opt_predictive_dist.mean
+opt_predictive_std = jnp.sqrt(opt_predictive_dist.variance)
 
 fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(5, 6))
 ax1.plot(
@@ -369,12 +369,11 @@ def plot_ribbon(ax, x, dist, color):
 
 x = jnp.linspace(-3.0, 3.0, num=200).reshape(-1, 1)
 rv = prior(x)
-y = rv.sample(seed=key, sample_shape=(10,))
+y = rv.sample(key=key, sample_shape=(10,))
 
 fig, ax = plt.subplots()
 ax.plot(x, y.T, alpha=0.7)
 ax.set_title("Samples from the Periodic Kernel")
-plt.show()
 
 # %% [markdown]
 # In other scenarios, it may be known that the underlying function is *linear*, in which case the *linear* kernel would be a suitable choice:
@@ -392,12 +391,11 @@ def plot_ribbon(ax, x, dist, color):
 
 x = jnp.linspace(-3.0, 3.0, num=200).reshape(-1, 1)
 rv = prior(x)
-y = rv.sample(seed=key, sample_shape=(10,))
+y = rv.sample(key=key, sample_shape=(10,))
 
 fig, ax = plt.subplots()
 ax.plot(x, y.T, alpha=0.7)
 ax.set_title("Samples from the Linear Kernel")
-plt.show()
 
 # %% [markdown]
 # ## Composing Kernels
@@ -428,11 +426,10 @@ def plot_ribbon(ax, x, dist, color):
 
 x = jnp.linspace(-3.0, 3.0, num=200).reshape(-1, 1)
 rv = prior(x)
-y = rv.sample(seed=key, sample_shape=(10,))
+y = rv.sample(key=key, sample_shape=(10,))
 fig, ax = plt.subplots()
 ax.plot(x, y.T, alpha=0.7)
 ax.set_title("Samples from a GP Prior with Kernel = Linear + Periodic")
-plt.show()
 
 
 # %% [markdown]
@@ -453,11 +450,10 @@ def plot_ribbon(ax, x, dist, color):
 
 x = jnp.linspace(-3.0, 3.0, num=200).reshape(-1, 1)
 rv = prior(x)
-y = rv.sample(seed=key, sample_shape=(10,))
+y = rv.sample(key=key, sample_shape=(10,))
 fig, ax = plt.subplots()
 ax.plot(x, y.T, alpha=0.7)
 ax.set_title("Samples from a GP with Kernel = Linear x Periodic")
-plt.show()
 
 
 # %% [markdown]
@@ -498,7 +494,6 @@ def plot_ribbon(ax, x, dist, color):
 ax.set_title("CO2 Concentration in the Atmosphere")
 ax.set_xlabel("Year")
 ax.set_ylabel("CO2 Concentration (ppm)")
-plt.show()
 
 # %% [markdown]
 # Looking at the data, we can see that there is clearly a periodic trend, with a period of
@@ -569,8 +564,8 @@ def loss(posterior, data):
 latent_dist = opt_posterior.predict(test_x, train_data=D)
 predictive_dist = opt_posterior.likelihood(latent_dist)
 
-predictive_mean = predictive_dist.mean().reshape(-1, 1)
-predictive_std = predictive_dist.stddev().reshape(-1, 1)
+predictive_mean = predictive_dist.mean.reshape(-1, 1)
+predictive_std = jnp.sqrt(predictive_dist.variance).reshape(-1, 1)
 
 # %% [markdown]
 # Let's plot the model's predictions over this period of time:
diff --git a/examples/likelihoods_guide.py b/examples/likelihoods_guide.py
index 7e8c537f5..0f5a9cb52 100644
--- a/examples/likelihoods_guide.py
+++ b/examples/likelihoods_guide.py
@@ -7,7 +7,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax
 #     language: python
@@ -73,7 +73,6 @@
 import jax.numpy as jnp
 import jax.random as jr
 import matplotlib.pyplot as plt
-import tensorflow_probability.substrates.jax as tfp
 
 from examples.utils import use_mpl_style
 import gpjax as gpx
@@ -81,8 +80,6 @@
 config.update("jax_enable_x64", True)
 
 
-tfd = tfp.distributions
-
 # set the default style for plotting
 use_mpl_style()
 cols = plt.rcParams["axes.prop_cycle"].by_key()["color"]
@@ -162,13 +159,13 @@
 for ax in axes.ravel():
     subkey, _ = jr.split(subkey)
     ax.plot(
-        latent_dist.sample(sample_shape=(1,), seed=subkey).T,
+        latent_dist.sample(sample_shape=(1,), key=subkey).T,
         lw=1,
         color=cols[0],
         label="Latent samples",
     )
     ax.plot(
-        likelihood.predict(latent_dist).sample(sample_shape=(1,), seed=subkey).T,
+        likelihood.predict(latent_dist).sample(sample_shape=(1,), key=subkey).T,
         "o",
         markersize=5,
         alpha=0.3,
@@ -189,13 +186,13 @@
 for ax in axes.ravel():
     subkey, _ = jr.split(subkey)
     ax.plot(
-        latent_dist.sample(sample_shape=(1,), seed=subkey).T,
+        latent_dist.sample(sample_shape=(1,), key=subkey).T,
         lw=1,
         color=cols[0],
         label="Latent samples",
     )
     ax.plot(
-        likelihood.predict(latent_dist).sample(sample_shape=(1,), seed=subkey).T,
+        likelihood.predict(latent_dist).sample(sample_shape=(1,), key=subkey).T,
         "o",
         markersize=3,
         alpha=0.5,
@@ -260,7 +257,7 @@
 
 def q_moments(x):
     qx = q(x)
-    return qx.mean(), qx.variance()
+    return qx.mean, qx.variance
 
 
 mean, variance = jax.vmap(q_moments)(x[:, None])
diff --git a/examples/oceanmodelling.py b/examples/oceanmodelling.py
index 1c2d86ee8..4a1c56b7c 100644
--- a/examples/oceanmodelling.py
+++ b/examples/oceanmodelling.py
@@ -7,7 +7,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax
 #     language: python
@@ -17,11 +17,20 @@
 # %% [markdown]
 # # Gaussian Processes for Vector Fields and Ocean Current Modelling
 #
-# In this notebook, we use Gaussian processes to learn vector-valued functions. We will be
-# recreating the results by [Berlinghieri et al. (2023)](https://arxiv.org/pdf/2302.10364.pdf) by an
-# application to real-world ocean surface velocity data, collected via surface drifters.
+# In this notebook, we use Gaussian processes to learn vector-valued functions. We will
+# be recreating the results by [Berlinghieri et al.
+# (2023)](https://arxiv.org/pdf/2302.10364.pdf) by an application to real-world ocean
+# surface velocity data, collected via surface drifters.
 #
-# Surface drifters are measurement devices that measure the dynamics and circulation patterns of the world's oceans. Studying and predicting ocean currents are important to climate research, for example, forecasting and predicting oil spills, oceanographic surveying of eddies and upwelling, or providing information on the distribution of biomass in ecosystems. We will be using the [Gulf Drifters Open dataset](https://zenodo.org/record/4421585), which contains all publicly available surface drifter trajectories from the Gulf of Mexico spanning 28 years.
+# Surface drifters are measurement devices that measure the dynamics and circulation
+# patterns of the world's oceans. Studying and predicting ocean currents are important
+# to climate research, for example, forecasting and predicting oil spills, oceanographic
+# surveying of eddies and upwelling, or providing information on the distribution of
+# biomass in ecosystems. We will be using the [Gulf Drifters Open
+# dataset](https://zenodo.org/record/4421585), which contains all publicly available
+# surface drifter trajectories from the Gulf of Mexico spanning 28 years.
+#
+
 # %%
 from dataclasses import (
     dataclass,
@@ -41,8 +50,8 @@
 )
 from matplotlib import rcParams
 import matplotlib.pyplot as plt
+import numpyro.distributions as npd
 import pandas as pd
-import tensorflow_probability as tfp
 
 from examples.utils import use_mpl_style
 from gpjax.kernels.computations import DenseKernelComputation
@@ -64,15 +73,22 @@
 
 # %% [markdown]
 # ## Data loading and preprocessing
-# The real dataset has been binned into an $N=34\times16$ grid, equally spaced over the longitude-latitude interval $[-90.8,-83.8] \times [24.0,27.5]$. Each bin has a size $\approx 0.21\times0.21$, and contains the average velocity across all measurements that fall inside it.
+# The real dataset has been binned into an $N=34\times16$ grid, equally spaced over the
+# longitude-latitude interval $[-90.8,-83.8] \times [24.0,27.5]$. Each bin has a size
+# $\approx 0.21\times0.21$, and contains the average velocity across all measurements
+# that fall inside it.
 #
-# We will call this binned ocean data the ground truth, and label it with the vector field
-# $$
-# \mathbf{F} \equiv \mathbf{F}(\mathbf{x}),
-# $$
-# where $\mathbf{x} = (x^{(0)}$,$x^{(1)})^\text{T}$, with a vector basis in the standard Cartesian directions (dimensions will be indicated by superscripts).
+# We will call this binned ocean data the ground truth, and label it with the vector
+# field $$ \mathbf{F} \equiv \mathbf{F}(\mathbf{x}), $$ where $\mathbf{x} =
+# (x^{(0)}$,$x^{(1)})^\text{T}$, with a vector basis in the standard Cartesian
+# directions (dimensions will be indicated by superscripts).
 #
-# We shall label the ground truth $D_0=\left\{ \left(\mathbf{x}_{0,i} , \mathbf{y}_{0,i} \right)\right\}_{i=1}^N$, where $\mathbf{y}_{0,i}$ is the 2-dimensional velocity vector at the $i$-th location, $\mathbf{x}_{0,i}$. The training dataset contains simulated measurements from ocean drifters $D_T=\left\{\left(\mathbf{x}_{T,i}, \mathbf{y}_{T,i} \right)\right\}_{i=1}^{N_T}$, $N_T = 20$ in this case (the subscripts indicate the ground truth and the simulated measurements respectively).
+# We shall label the ground truth $D_0=\left\{ \left(\mathbf{x}_{0,i} , \mathbf{y}_{0,i}
+# \right)\right\}_{i=1}^N$, where $\mathbf{y}_{0,i}$ is the 2-dimensional velocity
+# vector at the $i$-th location, $\mathbf{x}_{0,i}$. The training dataset contains
+# simulated measurements from ocean drifters $D_T=\left\{\left(\mathbf{x}_{T,i},
+# \mathbf{y}_{T,i} \right)\right\}_{i=1}^{N_T}$, $N_T = 20$ in this case (the subscripts
+# indicate the ground truth and the simulated measurements respectively).
 #
 
 
@@ -136,46 +152,54 @@ def prepare_data(df):
     bbox_to_anchor=(0.5, -0.3),
     loc="lower center",
 )
-plt.show()
 
 # %% [markdown]
 # ## Problem Setting
-# We aim to obtain estimates for $\mathbf{F}$ at the set of points $\left\{ \mathbf{x}_{0,i} \right\}_{i=1}^N$ using Gaussian processes, followed by a comparison of the latent model to the ground truth $D_0$. Note that $D_0$ is not passed into any functions used  by GPJax, and is only used to compare against the two GP models at the end of the notebook.
-#
-# Since $\mathbf{F}$ is a vector-valued function, we require GPs that can directly learn vector-valued functions[<sup>1</sup>](#fn1). To implement this in GPJax, the problem can be changed to learn a scalar-valued function by 'massaging' the data into a  $2N\times2N$ problem, such that each dimension of our GP is associated with a *component* of $\mathbf{y}_{T,i}$.
-#
-# For a particular measurement $\mathbf{y}$ (training or testing) at location $\mathbf{x}$, the components $(y^{(0)}, y^{(1)})$ are described by the latent vector field $\mathbf{F}$, such that
-#
-# $$
-# \mathbf{y} = \mathbf{F}(\mathbf{x}) = \left(\begin{array}{l}
+# We aim to obtain estimates for $\mathbf{F}$ at the set of points $\left\{
+# \mathbf{x}_{0,i} \right\}_{i=1}^N$ using Gaussian processes, followed by a comparison
+# of the latent model to the ground truth $D_0$. Note that $D_0$ is not passed into any
+# functions used  by GPJax, and is only used to compare against the two GP models at the
+# end of the notebook.
+#
+# Since $\mathbf{F}$ is a vector-valued function, we require GPs that can directly learn
+# vector-valued functions[<sup>1</sup>](#fn1). To implement this in GPJax, the problem
+# can be changed to learn a scalar-valued function by 'massaging' the data into a
+# $2N\times2N$ problem, such that each dimension of our GP is associated with a
+# *component* of $\mathbf{y}_{T,i}$.
+#
+# For a particular measurement $\mathbf{y}$ (training or testing) at location
+# $\mathbf{x}$, the components $(y^{(0)}, y^{(1)})$ are described by the latent vector
+# field $\mathbf{F}$, such that
+#
+# $$ \mathbf{y} = \mathbf{F}(\mathbf{x}) = \left(\begin{array}{l}
 # f^{(0)}\left(\mathbf{x}\right) \\
-# f^{(1)}\left(\mathbf{x}\right)
-# \end{array}\right),
-# $$
+# f^{(1)}\left(\mathbf{x}\right) \end{array}\right), $$
 #
-# where each $f^{(z)}\left(\mathbf{x}\right), z \in \{0,1\}$ is a scalar-valued function.
+# where each $f^{(z)}\left(\mathbf{x}\right), z \in \{0,1\}$ is a scalar-valued
+# function.
 #
-# Now consider the scalar-valued function $g: \mathbb{R}^2 \times\{0,1\} \rightarrow \mathbb{R}$, such that
+# Now consider the scalar-valued function $g: \mathbb{R}^2 \times\{0,1\} \rightarrow
+# \mathbb{R}$, such that
 #
-# $$
-# g \left(\mathbf{x} , 0 \right) = f^{(0)} ( \mathbf{x} ), \text{and } g \left( \mathbf{x}, 1 \right)=f^{(1)}\left(\mathbf{x}\right).
-# $$
+# $$ g \left(\mathbf{x} , 0 \right) = f^{(0)} ( \mathbf{x} ), \text{and } g \left(
+# \mathbf{x}, 1 \right)=f^{(1)}\left(\mathbf{x}\right).  $$
 #
-# We have increased the input dimension by 1, from the 2D $\mathbf{x}$ to the 3D $\mathbf{X} = \left(\mathbf{x}, 0\right)$ or $\mathbf{X} = \left(\mathbf{x}, 1\right)$.
+# We have increased the input dimension by 1, from the 2D $\mathbf{x}$ to the 3D
+# $\mathbf{X} = \left(\mathbf{x}, 0\right)$ or $\mathbf{X} = \left(\mathbf{x},
+# 1\right)$.
 #
 # By choosing the value of the third dimension, 0 or 1, we may now incorporate this
-# information into the computation of the kernel.
-# We therefore make new 3D datasets $D_{T,3D} = \left\{\left( \mathbf{X}_{T,i},\mathbf{Y}_{T,i} \right) \right\} _{i=0}^{2N_T}$ and $D_{0,3D} = \left\{\left( \mathbf{X}_{0,i},\mathbf{Y}_{0,i} \right) \right\} _{i=0}^{2N}$ that incorporates this new labelling, such that for each dataset (indicated by the subscript $D = 0$ or $D=T$),
+# information into the computation of the kernel.  We therefore make new 3D datasets
+# $D_{T,3D} = \left\{\left( \mathbf{X}_{T,i},\mathbf{Y}_{T,i} \right) \right\}
+# _{i=0}^{2N_T}$ and $D_{0,3D} = \left\{\left( \mathbf{X}_{0,i},\mathbf{Y}_{0,i} \right)
+# \right\} _{i=0}^{2N}$ that incorporates this new labelling, such that for each dataset
+# (indicated by the subscript $D = 0$ or $D=T$),
 #
-# $$
-# X_{D,i} = \left( \mathbf{x}_{D,i}, z \right),
-# $$
+# $$ X_{D,i} = \left( \mathbf{x}_{D,i}, z \right), $$
 #
 # and
 #
-# $$
-# Y_{D,i} = y_{D,i}^{(z)},
-# $$
+# $$ Y_{D,i} = y_{D,i}^{(z)}, $$
 #
 # where $z = 0$ if $i$ is odd and $z=1$ if $i$ is even.
 
@@ -206,18 +230,34 @@ def dataset_3d(pos, vel):
 
 
 # %% [markdown]
-# ## Velocity (dimension) decomposition
-# Having labelled the data, we are now in a position to use GPJax to learn the function $g$, and hence $\mathbf{F}$. A naive approach to the problem is to apply a GP prior directly to the velocities of each dimension independently, which is called the *velocity* GP. For our prior, we choose an isotropic mean 0 over all dimensions of the GP, and a piecewise kernel that depends on the $z$ labels of the inputs, such that for two inputs $\mathbf{X} = \left( \mathbf{x}, z \right )$ and $\mathbf{X}^\prime = \left( \mathbf{x}^\prime, z^\prime \right )$,
 #
-# $$
-# k_{\text{vel}} \left(\mathbf{X}, \mathbf{X}^{\prime}\right)=
+# ## Velocity (dimension) decomposition
+# Having labelled the data, we are now in a position to use GPJax to learn the function
+# $g$, and hence $\mathbf{F}$. A naive approach to the problem is to apply a GP prior
+# directly to the velocities of each dimension independently, which is called the
+# *velocity* GP. For our prior, we choose an isotropic mean 0 over all dimensions of the
+# GP, and a piecewise kernel that depends on the $z$ labels of the inputs, such that for
+# two inputs $\mathbf{X} = \left( \mathbf{x}, z \right )$ and $\mathbf{X}^\prime =
+# \left( \mathbf{x}^\prime, z^\prime \right )$,
+#
+# $$ k_{\text{vel}} \left(\mathbf{X}, \mathbf{X}^{\prime}\right)=
 # \begin{cases}k^{(z)}\left(\mathbf{x}, \mathbf{x}^{\prime}\right) & \text { if }
-# z=z^{\prime} \\ 0 & \text { if } z \neq z^{\prime}, \end{cases}
-# $$
+# z=z^{\prime} \\ 0 & \text { if } z \neq z^{\prime}, \end{cases} $$
 #
-# where $k^{(z)}\left(\mathbf{x}, \mathbf{x}^{\prime}\right)$ are the user chosen kernels for each dimension. What this means is that there are no correlations between the $x^{(0)}$ and $x^{(1)}$ dimensions for all choices $\mathbf{X}$ and $\mathbf{X}^{\prime}$, since there are no off-diagonal elements in the Gram matrix populated by this choice.
+# where $k^{(z)}\left(\mathbf{x}, \mathbf{x}^{\prime}\right)$ are the user chosen
+# kernels for each dimension. What this means is that there are no correlations between
+# the $x^{(0)}$ and $x^{(1)}$ dimensions for all choices $\mathbf{X}$ and
+# $\mathbf{X}^{\prime}$, since there are no off-diagonal elements in the Gram matrix
+# populated by this choice.
 #
-# To implement this approach in GPJax, we define `VelocityKernel` in the following cell, following the steps outlined in the [custom kernels notebook](https://docs.jaxgaussianprocesses.com/_examples/constructing_new_kernels/#custom-kernel). This modular implementation takes the choice of user kernels as its class attributes: `kernel0` and `kernel1`. We must additionally pass the argument `active_dims = [0,1]`, which is an attribute of the base class `AbstractKernel`, into the chosen kernels. This is necessary such that the subsequent likelihood optimisation does not optimise over the artificial label dimension.
+# To implement this approach in GPJax, we define `VelocityKernel` in the following cell,
+# following the steps outlined in the [custom kernels
+# notebook](https://docs.jaxgaussianprocesses.com/_examples/constructing_new_kernels/#custom-kernel).
+# This modular implementation takes the choice of user kernels as its class attributes:
+# `kernel0` and `kernel1`. We must additionally pass the argument `active_dims = [0,1]`,
+# which is an attribute of the base class `AbstractKernel`, into the chosen kernels.
+# This is necessary such that the subsequent likelihood optimisation does not optimise
+# over the artificial label dimension.
 #
 
 
@@ -251,7 +291,9 @@ def __call__(
 
 # %% [markdown]
 # ### GPJax implementation
-# Next, we define the model in GPJax. The prior is defined using $k_{\text{vel}}\left(\mathbf{X}, \mathbf{X}^\prime \right)$ and 0 mean and 0 observation noise. We choose a Gaussian marginal log-likelihood (MLL).
+# Next, we define the model in GPJax. The prior is defined using
+# $k_{\text{vel}}\left(\mathbf{X}, \mathbf{X}^\prime \right)$ and 0 mean and 0
+# observation noise. We choose a Gaussian marginal log-likelihood (MLL).
 #
 
 
@@ -272,7 +314,12 @@ def initialise_gp(kernel, mean, dataset):
 
 
 # %% [markdown]
-# With a model now defined, we can proceed to optimise the hyperparameters of our likelihood over $D_0$. This is done by minimising the MLL using `BFGS`. We also plot its value at each step to visually confirm that we have found the minimum. See the  [introduction to Gaussian Processes](https://docs.jaxgaussianprocesses.com/_examples/intro_to_gps/) notebook for more information on optimising the MLL.
+# With a model now defined, we can proceed to optimise the hyperparameters
+# of our likelihood over $D_0$. This is done by minimising the MLL using `BFGS`. We also
+# plot its value at each step to visually confirm that we have found the minimum. See
+# the  [introduction to Gaussian
+# Processes](https://docs.jaxgaussianprocesses.com/_examples/intro_to_gps/) notebook for
+# more information on optimising the MLL.
 
 
 # %%
@@ -293,13 +340,16 @@ def optimise_mll(posterior, dataset, NIters=1000, key=key):
 
 # %% [markdown]
 # ### Comparison
-# We next obtain the latent distribution of the GP of $g$ at $\mathbf{x}_{0,i}$, then extract its mean and standard at the test locations, $\mathbf{F}_{\text{latent}}(\mathbf{x}_{0,i})$, as well as the standard deviation (we will use it at the very end).
+# We next obtain the latent distribution of the GP of $g$ at $\mathbf{x}_{0,i}$, then
+# extract its mean and standard at the test locations,
+# $\mathbf{F}_{\text{latent}}(\mathbf{x}_{0,i})$, as well as the standard deviation (we
+# will use it at the very end).
 
 
 # %%
 def latent_distribution(opt_posterior, pos_3d, dataset_train):
     latent = opt_posterior.predict(pos_3d, train_data=dataset_train)
-    latent_mean = latent.mean()
+    latent_mean = latent.mean
     latent_std = latent.stddev()
     return latent_mean, latent_std
 
@@ -313,7 +363,11 @@ def latent_distribution(opt_posterior, pos_3d, dataset_train):
 
 
 # %% [markdown]
-# We now replot the ground truth (testing data) $D_0$, the predicted latent vector field $\mathbf{F}_{\text{latent}}(\mathbf{x_i})$, and a heatmap of the residuals at each location $\mathbf{R}(\mathbf{x}_{0,i}) = \mathbf{y}_{0,i} - \mathbf{F}_{\text{latent}}(\mathbf{x}_{0,i})$, as well as $\left|\left|\mathbf{R}(\mathbf{x}_{0,i})\right|\right|$.
+# We now replot the ground truth (testing data) $D_0$, the predicted
+# latent vector field $\mathbf{F}_{\text{latent}}(\mathbf{x_i})$, and a heatmap of the
+# residuals at each location $\mathbf{R}(\mathbf{x}_{0,i}) = \mathbf{y}_{0,i} -
+# \mathbf{F}_{\text{latent}}(\mathbf{x}_{0,i})$, as well as
+# $\left|\left|\mathbf{R}(\mathbf{x}_{0,i})\right|\right|$.
 
 
 # %%
@@ -407,59 +461,76 @@ def plot_fields(
         bbox_to_anchor=(0.5, -0.03),
         loc="lower center",
     )
-    plt.show()
 
 
 plot_fields(dataset_ground_truth, dataset_train, dataset_latent_velocity)
 
 
 # %% [markdown]
-# From the latent estimate we can see the velocity GP struggles to reconstruct features of the ground truth. This is because our construction of the kernel placed an independent prior on each physical dimension, which cannot be assumed. Therefore, we need a different approach that can implicitly incorporate this dependence at a fundamental level. To achieve this we will require a *Helmholtz Decomposition*.
+# From the latent estimate we can see the velocity GP struggles to
+# reconstruct features of the ground truth. This is because our construction of the
+# kernel placed an independent prior on each physical dimension, which cannot be
+# assumed. Therefore, we need a different approach that can implicitly incorporate this
+# dependence at a fundamental level. To achieve this we will require a *Helmholtz
+# Decomposition*.
 
 
 # %% [markdown]
 # ## Helmholtz decomposition
-# In 2 dimensions, a twice continuously differentiable and compactly supported vector field $\mathbf{F}: \mathbb{R}^2 \rightarrow \mathbb{R}^2$ can be expressed as the sum of the gradient of a scalar potential $\Phi: \mathbb{R}^2 \rightarrow \mathbb{R}$, called the potential function, and the vorticity operator of another scalar potential $\Psi: \mathbb{R}^2 \rightarrow \mathbb{R}$, called the stream function ([Berlinghieri et al. (2023)](https://arxiv.org/pdf/2302.10364.pdf)) such that
-# $$
-# \mathbf{F}=\operatorname{grad} \Phi+\operatorname{rot} \Psi,
-# $$
-# where
-# $$
-# \operatorname{grad} \Phi:=\left[\begin{array}{l}
-# \partial \Phi / \partial x^{(0)} \\
-# \partial \Phi / \partial x^{(1)}
-# \end{array}\right] \text { and } \operatorname{rot} \Psi:=\left[\begin{array}{c}
-# \partial \Psi / \partial x^{(1)} \\
-# -\partial \Psi / \partial x^{(0)}
-# \end{array}\right].
-# $$
-#
-# This is reminiscent of a 3 dimensional [Helmholtz decomposition](https://en.wikipedia.org/wiki/Helmholtz_decomposition).
-#
-# The 2 dimensional decomposition motivates a different approach: placing priors on $\Psi$ and $\Phi$, allowing us to make assumptions directly about fundamental properties of $\mathbf{F}$. If we choose independent GP priors such that $\Phi \sim \mathcal{G P}\left(0, k_{\Phi}\right)$ and $\Psi \sim \mathcal{G P}\left(0, k_{\Psi}\right)$, then $\mathbf{F} \sim \mathcal{G P} \left(0, k_\text{Helm}\right)$ (since acting linear operations on a GPs give GPs).
-#
-# For $\mathbf{X}, \mathbf{X}^{\prime} \in \mathbb{R}^2 \times \left\{0,1\right\}$ and $z, z^\prime \in \{0,1\}$,
-#
-# $$
-# \boxed{ k_{\mathrm{Helm}}\left(\mathbf{x}, \mathbf{x}^{\prime}\right)_{z,z^\prime} =  \frac{\partial^2 k_{\Phi}\left(\mathbf{x}, \mathbf{x}^{\prime}\right)}{\partial x^{(z)} \partial\left(x^{\prime}\right)^{(z^\prime)}}+(-1)^{z+z^\prime} \frac{\partial^2 k_{\Psi}\left(\mathbf{x}, \mathbf{x}^{\prime}\right)}{\partial x^{(1-z)} \partial\left(x^{\prime}\right)^{(1-z^\prime)}}}.
-# $$
-#
-# where $x^{(z)}$ and $(x^\prime)^{(z^\prime)}$ are the $z$ and $z^\prime$ components of $\mathbf{X}$ and ${\mathbf{X}}^{\prime}$ respectively.
-#
-# We compute the second derivatives using `jax.hessian`. In the following implementation, for a kernel $k(\mathbf{x}, \mathbf{x}^{\prime})$, this computes the Hessian matrix with respect to the components of $\mathbf{x}$
-#
-# $$
-# \frac{\partial^2 k\left(\mathbf{x}, \mathbf{x}^{\prime}\right)}{\partial x^{(z)} \partial x^{(z^\prime)}}.
-# $$
-#
-# Note that we have operated $\dfrac{\partial}{\partial x^{(z)}}$, *not* $\dfrac{\partial}{\partial \left(x^\prime \right)^{(z)}}$, as the boxed equation suggests. This is not an issue if we choose stationary kernels $k(\mathbf{x}, \mathbf{x}^{\prime}) = k(\mathbf{x} - \mathbf{x}^{\prime})$ , as the partial derivatives with respect to the components have the following exchange symmetry:
-#
-# $$
-# \frac{\partial}{\partial x^{(z)}} = - \frac{\partial}{\partial \left( x^\prime \right)^{(z)}},
-# $$
+# In 2 dimensions, a twice continuously differentiable and compactly supported vector
+# field $\mathbf{F}: \mathbb{R}^2 \rightarrow \mathbb{R}^2$ can be expressed as the sum
+# of the gradient of a scalar potential $\Phi: \mathbb{R}^2 \rightarrow \mathbb{R}$,
+# called the potential function, and the vorticity operator of another scalar potential
+# $\Psi: \mathbb{R}^2 \rightarrow \mathbb{R}$, called the stream function ([Berlinghieri
+# et al. (2023)](https://arxiv.org/pdf/2302.10364.pdf)) such that $$
+# \mathbf{F}=\operatorname{grad} \Phi+\operatorname{rot} \Psi, $$ where $$
+# \operatorname{grad} \Phi:=\left[\begin{array}{l} \partial \Phi / \partial x^{(0)} \\
+# \partial \Phi / \partial x^{(1)} \end{array}\right] \text { and } \operatorname{rot}
+# \Psi:=\left[\begin{array}{c} \partial \Psi / \partial x^{(1)} \\
+# -\partial \Psi / \partial x^{(0)} \end{array}\right].  $$
+#
+# This is reminiscent of a 3 dimensional [Helmholtz
+# decomposition](https://en.wikipedia.org/wiki/Helmholtz_decomposition).
+#
+# The 2 dimensional decomposition motivates a different approach: placing priors on
+# $\Psi$ and $\Phi$, allowing us to make assumptions directly about fundamental
+# properties of $\mathbf{F}$. If we choose independent GP priors such that $\Phi \sim
+# \mathcal{G P}\left(0, k_{\Phi}\right)$ and $\Psi \sim \mathcal{G P}\left(0,
+# k_{\Psi}\right)$, then $\mathbf{F} \sim \mathcal{G P} \left(0, k_\text{Helm}\right)$
+# (since acting linear operations on a GPs give GPs).
+#
+# For $\mathbf{X}, \mathbf{X}^{\prime} \in \mathbb{R}^2 \times \left\{0,1\right\}$ and
+# $z, z^\prime \in \{0,1\}$,
+#
+# $$ \boxed{ k_{\mathrm{Helm}}\left(\mathbf{x}, \mathbf{x}^{\prime}\right)_{z,z^\prime}
+# =  \frac{\partial^2 k_{\Phi}\left(\mathbf{x}, \mathbf{x}^{\prime}\right)}{\partial
+# x^{(z)} \partial\left(x^{\prime}\right)^{(z^\prime)}}+(-1)^{z+z^\prime}
+# \frac{\partial^2 k_{\Psi}\left(\mathbf{x}, \mathbf{x}^{\prime}\right)}{\partial
+# x^{(1-z)} \partial\left(x^{\prime}\right)^{(1-z^\prime)}}}.  $$
+#
+# where $x^{(z)}$ and $(x^\prime)^{(z^\prime)}$ are the $z$ and $z^\prime$ components of
+# $\mathbf{X}$ and ${\mathbf{X}}^{\prime}$ respectively.
+#
+# We compute the second derivatives using `jax.hessian`. In the following
+# implementation, for a kernel $k(\mathbf{x}, \mathbf{x}^{\prime})$, this computes the
+# Hessian matrix with respect to the components of $\mathbf{x}$
+#
+# $$ \frac{\partial^2 k\left(\mathbf{x}, \mathbf{x}^{\prime}\right)}{\partial x^{(z)}
+# \partial x^{(z^\prime)}}.  $$
+#
+# Note that we have operated $\dfrac{\partial}{\partial x^{(z)}}$, *not*
+# $\dfrac{\partial}{\partial \left(x^\prime \right)^{(z)}}$, as the boxed equation
+# suggests. This is not an issue if we choose stationary kernels $k(\mathbf{x},
+# \mathbf{x}^{\prime}) = k(\mathbf{x} - \mathbf{x}^{\prime})$ , as the partial
+# derivatives with respect to the components have the following exchange symmetry:
+#
+# $$ \frac{\partial}{\partial x^{(z)}} = - \frac{\partial}{\partial \left( x^\prime
+# \right)^{(z)}}, $$
 #
 # for either $z$.
 # %%
+
+
 @dataclass
 class HelmholtzKernel(gpx.kernels.stationary.StationaryKernel):
     # initialise Phi and Psi kernels as any stationary kernel in gpJax
@@ -492,7 +563,8 @@ def __call__(
 
 # %% [markdown]
 # ### GPJax implementation
-# We repeat the same steps as with the velocity GP model, replacing `VelocityKernel` with `HelmholtzKernel`.
+# We repeat the same steps as with the velocity GP model, replacing `VelocityKernel`
+# with `HelmholtzKernel`.
 
 # %%
 # Redefine Gaussian process with Helmholtz kernel
@@ -504,7 +576,11 @@ def __call__(
 
 # %% [markdown]
 # ### Comparison
-# We again plot the ground truth (testing data) $D_0$, the predicted latent vector field $\mathbf{F}_{\text{latent}}(\mathbf{x}_{0,i})$, and a heatmap of the residuals at each location $R(\mathbf{x}_{0,i}) = \mathbf{y}_{0,i} - \mathbf{F}_{\text{latent}}(\mathbf{x}_{0,i})$ and $\left|\left|R(\mathbf{x}_{0,i})  \right|\right|$.
+# We again plot the ground truth (testing data) $D_0$, the predicted latent vector field
+# $\mathbf{F}_{\text{latent}}(\mathbf{x}_{0,i})$, and a heatmap of the residuals at each
+# location $R(\mathbf{x}_{0,i}) = \mathbf{y}_{0,i} -
+# \mathbf{F}_{\text{latent}}(\mathbf{x}_{0,i})$ and $\left|\left|R(\mathbf{x}_{0,i})
+# \right|\right|$.
 
 # %%
 # obtain latent distribution, extract x and y values over g
@@ -516,25 +592,37 @@ def __call__(
 plot_fields(dataset_ground_truth, dataset_train, dataset_latent_helmholtz)
 
 # %% [markdown]
-# Visually, the Helmholtz model performs better than the velocity model, preserving the local structure of the $\mathbf{F}$. Since we placed priors on $\Phi$ and $\Psi$, the construction of $\mathbf{F}$ allows for correlations between the dimensions (non-zero off-diagonal elements in the Gram matrix populated by $k_\text{Helm}\left(\mathbf{X},\mathbf{X}^{\prime}\right)$ ).
+# Visually, the Helmholtz model performs better than the velocity model, preserving the
+# local structure of the $\mathbf{F}$. Since we placed priors on $\Phi$ and $\Psi$, the
+# construction of $\mathbf{F}$ allows for correlations between the dimensions (non-zero
+# off-diagonal elements in the Gram matrix populated by
+# $k_\text{Helm}\left(\mathbf{X},\mathbf{X}^{\prime}\right)$ ).
 
 
 # %% [markdown]
 # ## Negative log predictive densities
-# Lastly, we directly compare the velocity and Helmholtz models by computing the [negative log predictive densities](https://en.wikipedia.org/wiki/Negative_log_predictive_density) for each model. This is a quantitative metric that measures the probability of the ground truth given the data.
+# Lastly, we directly compare the velocity and Helmholtz models by computing the
+# [negative log predictive
+# densities](https://en.wikipedia.org/wiki/Negative_log_predictive_density) for each
+# model. This is a quantitative metric that measures the probability of the ground truth
+# given the data.
 #
 # $$
 # \mathrm{NLPD}=-\sum_{i=1}^{2N} \log \left(  p\left(\mathcal{Y}_i = Y_{0,i} \mid \mathbf{X}_{i}\right) \right),
 # $$
 #
-# where each $p\left(\mathcal{Y}_i \mid \mathbf{X}_i \right)$ is the marginal Gaussian distribution over $\mathcal{Y}_i$ at each test location, and $Y_{i,0}$ is the $i$-th component of the (massaged) test data that we reserved at the beginning of the notebook in $D_0$. A smaller value is better, since the deviation of the ground truth and the model are small in this case.
+# where each $p\left(\mathcal{Y}_i \mid \mathbf{X}_i \right)$ is the marginal Gaussian
+# distribution over $\mathcal{Y}_i$ at each test location, and $Y_{i,0}$ is the $i$-th
+# component of the (massaged) test data that we reserved at the beginning of the
+# notebook in $D_0$. A smaller value is better, since the deviation of the ground truth
+# and the model are small in this case.
 
 
 # %%
 # ensure testing data alternates between x0 and x1 components
 def nlpd(mean, std, vel_test):
     vel_query = jnp.column_stack((vel_test[0], vel_test[1])).flatten()
-    normal = tfp.substrates.jax.distributions.Normal(loc=mean, scale=std)
+    normal = npd.Normal(loc=mean, scale=std)
     return -jnp.sum(normal.log_prob(vel_query))
 
 
diff --git a/examples/poisson.py b/examples/poisson.py
index c049ddb96..4b5815fbf 100644
--- a/examples/poisson.py
+++ b/examples/poisson.py
@@ -7,7 +7,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: base
 #     language: python
@@ -33,7 +33,6 @@
 from jaxtyping import install_import_hook
 import matplotlib as mpl
 import matplotlib.pyplot as plt
-import tensorflow_probability.substrates.jax as tfp
 
 from examples.utils import use_mpl_style
 
@@ -43,7 +42,6 @@
 
 # Enable Float64 for more stable matrix inversions.
 config.update("jax_enable_x64", True)
-tfd = tfp.distributions
 
 # set the default style for plotting
 use_mpl_style()
@@ -216,7 +214,7 @@ def one_step(state, rng_key):
     model = nnx.merge(graphdef, sample_params, *static_state)
     latent_dist = model.predict(xtest, train_data=D)
     predictive_dist = model.likelihood(latent_dist)
-    posterior_samples.append(predictive_dist.sample(seed=key, sample_shape=(10,)))
+    posterior_samples.append(predictive_dist.sample(key=key, sample_shape=(10,)))
 
 posterior_samples = jnp.vstack(posterior_samples)
 lower_ci, upper_ci = jnp.percentile(posterior_samples, jnp.array([2.5, 97.5]), axis=0)
diff --git a/examples/regression.py b/examples/regression.py
index 5633110cd..54968e9e4 100644
--- a/examples/regression.py
+++ b/examples/regression.py
@@ -7,7 +7,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax
 #     language: python
@@ -126,9 +126,9 @@
 # %%
 prior_dist = prior.predict(xtest)
 
-prior_mean = prior_dist.mean()
-prior_std = prior_dist.variance()
-samples = prior_dist.sample(seed=key, sample_shape=(20,))
+prior_mean = prior_dist.mean
+prior_std = prior_dist.variance
+samples = prior_dist.sample(key=key, sample_shape=(20,))
 
 
 fig, ax = plt.subplots()
@@ -217,8 +217,8 @@
 latent_dist = opt_posterior.predict(xtest, train_data=D)
 predictive_dist = opt_posterior.likelihood(latent_dist)
 
-predictive_mean = predictive_dist.mean()
-predictive_std = predictive_dist.stddev()
+predictive_mean = predictive_dist.mean
+predictive_std = jnp.sqrt(predictive_dist.variance)
 
 # %% [markdown]
 # With the predictions and their uncertainty acquired, we illustrate the GP's
diff --git a/examples/uncollapsed_vi.py b/examples/uncollapsed_vi.py
index c2c9e11d8..708173342 100644
--- a/examples/uncollapsed_vi.py
+++ b/examples/uncollapsed_vi.py
@@ -8,7 +8,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax_beartype
 #     language: python
@@ -39,7 +39,6 @@
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 import optax as ox
-import tensorflow_probability.substrates.jax as tfp
 
 from examples.utils import use_mpl_style
 
@@ -50,9 +49,6 @@
     import gpjax as gpx
     import gpjax.kernels as jk
 
-
-tfb = tfp.bijectors
-
 key = jr.key(123)
 
 # set the default style for plotting
@@ -112,7 +108,7 @@
 # On the other hand, sparse variational Gaussian processes (SVGPs)
 # [approximate the posterior, not the model](https://www.secondmind.ai/labs/sparse-gps-approximate-the-posterior-not-the-model/).
 # These provide a low-rank approximation scheme via variational inference. Here we
-# posit a family of densities parameterised by “variational parameters”.
+# posit a family of densities parameterised by "variational parameters".
 # We then seek to find the closest family member to the posterior by minimising the
 # Kullback-Leibler divergence over the variational parameters.
 # The fitted variational density then serves as a proxy for the exact posterior.
@@ -273,8 +269,8 @@
 latent_dist = opt_posterior(xtest)
 predictive_dist = opt_posterior.posterior.likelihood(latent_dist)
 
-meanf = predictive_dist.mean()
-sigma = predictive_dist.stddev()
+meanf = predictive_dist.mean
+sigma = jnp.sqrt(predictive_dist.variance)
 
 fig, ax = plt.subplots()
 ax.scatter(x, y, alpha=0.15, label="Training Data", color=cols[0])
@@ -298,68 +294,6 @@
 )
 ax.legend()
 
-# %% [markdown]
-# ## Custom transformations
-#
-# To train a covariance matrix, GPJax uses `tfb.FillTriangular` transformation by
-# default. `tfb.FillTriangular` fills a 1d vector into a lower triangular matrix.
-# Users can change this default transformation
-# with another valid transformation of their choice. For example, `Square`
-# transformation on the diagonal can also serve the purpose.
-
-# %%
-
-params_bijection = gpx.parameters.DEFAULT_BIJECTION.copy()
-params_bijection[gpx.parameters.LowerTriangular] = tfb.FillScaleTriL(
-    diag_bijector=tfb.Square(), diag_shift=jnp.array(q.jitter)
-)
-
-# %%
-opt_rep, history = gpx.fit(
-    model=q,
-    objective=lambda p, d: -gpx.objectives.elbo(p, d),
-    train_data=D,
-    optim=ox.adam(learning_rate=0.01),
-    num_iters=3000,
-    key=jr.key(42),
-    batch_size=128,
-)
-
-
-# %%
-latent_dist = opt_rep(xtest)
-predictive_dist = opt_rep.posterior.likelihood(latent_dist)
-
-meanf = predictive_dist.mean()
-sigma = predictive_dist.stddev()
-
-fig, ax = plt.subplots()
-ax.scatter(x, y, alpha=0.15, label="Training Data", color=cols[0])
-ax.plot(xtest, meanf, label="Posterior mean", color=cols[1])
-ax.fill_between(
-    xtest.flatten(),
-    meanf - 2 * sigma,
-    meanf + 2 * sigma,
-    alpha=0.3,
-    color=cols[1],
-    label="Two sigma",
-)
-ax.vlines(
-    opt_rep.inducing_inputs.value,
-    ymin=y.min(),
-    ymax=y.max(),
-    alpha=0.3,
-    linewidth=1,
-    label="Inducing point",
-    color=cols[2],
-)
-ax.legend()
-
-# %% [markdown]
-# We can see that `Square` transformation is able to get relatively better fit
-# compared to `Softplus` with the same number of iterations, but `Softplus` is
-# recommended over `Square` for stability of optimisation.
-
 # %% [markdown]
 # ## System configuration
 
diff --git a/examples/yacht.py b/examples/yacht.py
index a07ecfc18..2f4da30d3 100644
--- a/examples/yacht.py
+++ b/examples/yacht.py
@@ -7,7 +7,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.16.6
+#       jupytext_version: 1.16.7
 #   kernelspec:
 #     display_name: gpjax
 #     language: python
@@ -211,8 +211,8 @@
 latent_dist = opt_posterior(scaled_Xte, training_data)
 predictive_dist = likelihood(latent_dist)
 
-predictive_mean = predictive_dist.mean()
-predictive_stddev = predictive_dist.stddev()
+predictive_mean = predictive_dist.mean
+predictive_stddev = jnp.sqrt(predictive_dist.variance)
 
 # %% [markdown]
 # ## Evaluation
diff --git a/gpjax/__init__.py b/gpjax/__init__.py
index e9430cbee..744c8aa9f 100644
--- a/gpjax/__init__.py
+++ b/gpjax/__init__.py
@@ -39,7 +39,7 @@
 __description__ = "Didactic Gaussian processes in JAX"
 __url__ = "https://github.com/JaxGaussianProcesses/GPJax"
 __contributors__ = "https://github.com/JaxGaussianProcesses/GPJax/graphs/contributors"
-__version__ = "0.10.2"
+__version__ = "0.11.0"
 
 __all__ = [
     "base",
diff --git a/gpjax/distributions.py b/gpjax/distributions.py
index 3c6d7e82a..39e6aaceb 100644
--- a/gpjax/distributions.py
+++ b/gpjax/distributions.py
@@ -15,77 +15,76 @@
 
 
 from beartype.typing import (
-    Any,
     Optional,
-    Tuple,
-    TypeVar,
 )
 import cola
+from cola.linalg.decompositions import Cholesky
 from cola.ops import (
-    Identity,
     LinearOperator,
 )
 from jax import vmap
 import jax.numpy as jnp
 import jax.random as jr
 from jaxtyping import Float
-import tensorflow_probability.substrates.jax as tfp
+from numpyro.distributions import constraints
+from numpyro.distributions.distribution import Distribution
+from numpyro.distributions.util import is_prng_key
 
 from gpjax.lower_cholesky import lower_cholesky
 from gpjax.typing import (
     Array,
-    KeyArray,
     ScalarFloat,
 )
 
-tfd = tfp.distributions
-
-from cola.linalg.decompositions import Cholesky
-
 
-class GaussianDistribution(tfd.Distribution):
-    r"""Multivariate Gaussian distribution with a linear operator scale matrix."""
-
-    # TODO: Consider `distrax.transformed.Transformed` object. Can we create a LinearOperator to `distrax.bijector` representation
-    # and modify `distrax.MultivariateNormalFromBijector`?
-    # TODO: Consider natural and expectation parameterisations in future work.
-    # TODO: we don't really need to inherit from `tfd.Distribution` here
+class GaussianDistribution(Distribution):
+    support = constraints.real_vector
 
     def __init__(
         self,
-        loc: Optional[Float[Array, " N"]] = None,
-        scale: Optional[LinearOperator] = None,
-    ) -> None:
-        r"""Initialises the distribution.
-
-        Args:
-            loc: the mean of the distribution as an array of shape (n_points,).
-            scale: the scale matrix of the distribution as a LinearOperator object.
-        """
-        _check_loc_scale(loc, scale)
+        loc: Optional[Float[Array, " N"]],
+        scale: Optional[LinearOperator],
+        validate_args=None,
+    ):
+        self.loc = loc
+        self.scale = cola.PSD(scale)
+        batch_shape = ()
+        event_shape = jnp.shape(self.loc)
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
 
-        # Find dimensionality of the distribution.
-        if loc is not None:
-            num_dims = loc.shape[-1]
+    def sample(self, key, sample_shape=()):
+        assert is_prng_key(key)
+        # Obtain covariance root.
+        covariance_root = lower_cholesky(self.scale)
 
-        elif scale is not None:
-            num_dims = scale.shape[-1]
+        # Gather n samples from standard normal distribution Z = [z₁, ..., zₙ]ᵀ.
+        white_noise = jr.normal(
+            key, shape=sample_shape + self.batch_shape + self.event_shape
+        )
 
-        # Set the location to zero vector if unspecified.
-        if loc is None:
-            loc = jnp.zeros((num_dims,))
+        # xᵢ ~ N(loc, cov) <=> xᵢ = loc + sqrt zᵢ, where zᵢ ~ N(0, I).
+        def affine_transformation(_x):
+            return self.loc + covariance_root @ _x
 
-        # If not specified, set the scale to the identity matrix.
-        if scale is None:
-            scale = Identity(shape=(num_dims, num_dims), dtype=loc.dtype)
-
-        self.loc = loc
-        self.scale = cola.PSD(scale)
+        return vmap(affine_transformation)(white_noise)
 
+    @property
     def mean(self) -> Float[Array, " N"]:
         r"""Calculates the mean."""
         return self.loc
 
+    @property
+    def variance(self) -> Float[Array, " N"]:
+        r"""Calculates the variance."""
+        return cola.diag(self.scale)
+
+    def entropy(self) -> ScalarFloat:
+        r"""Calculates the entropy of the distribution."""
+        return 0.5 * (
+            self.event_shape[0] * (1.0 + jnp.log(2.0 * jnp.pi))
+            + cola.logdet(self.scale, Cholesky(), Cholesky())
+        )
+
     def median(self) -> Float[Array, " N"]:
         r"""Calculates the median."""
         return self.loc
@@ -98,25 +97,19 @@ def covariance(self) -> Float[Array, "N N"]:
         r"""Calculates the covariance matrix."""
         return self.scale.to_dense()
 
-    def variance(self) -> Float[Array, " N"]:
-        r"""Calculates the variance."""
-        return cola.diag(self.scale)
+    @property
+    def covariance_matrix(self) -> Float[Array, "N N"]:
+        r"""Calculates the covariance matrix."""
+        return self.covariance()
 
     def stddev(self) -> Float[Array, " N"]:
         r"""Calculates the standard deviation."""
         return jnp.sqrt(cola.diag(self.scale))
 
-    @property
-    def event_shape(self) -> Tuple:
-        r"""Returns the event shape."""
-        return self.loc.shape[-1:]
-
-    def entropy(self) -> ScalarFloat:
-        r"""Calculates the entropy of the distribution."""
-        return 0.5 * (
-            self.event_shape[0] * (1.0 + jnp.log(2.0 * jnp.pi))
-            + cola.logdet(self.scale, Cholesky(), Cholesky())
-        )
+    #     @property
+    #     def event_shape(self) -> Tuple:
+    #         r"""Returns the event shape."""
+    #         return self.loc.shape[-1:]
 
     def log_prob(self, y: Float[Array, " N"]) -> ScalarFloat:
         r"""Calculates the log pdf of the multivariate Gaussian.
@@ -141,42 +134,39 @@ def log_prob(self, y: Float[Array, " N"]) -> ScalarFloat:
             + diff.T @ cola.solve(sigma, diff, Cholesky())
         )
 
-    def _sample_n(self, key: KeyArray, n: int) -> Float[Array, "n N"]:
-        r"""Samples from the distribution.
+    #     def _sample_n(self, key: KeyArray, n: int) -> Float[Array, "n N"]:
+    #         r"""Samples from the distribution.
 
-        Args:
-            key (KeyArray): The key to use for sampling.
+    #         Args:
+    #             key (KeyArray): The key to use for sampling.
 
-        Returns:
-            The samples as an array of shape (n_samples, n_points).
-        """
-        # Obtain covariance root.
-        sqrt = lower_cholesky(self.scale)
+    #         Returns:
+    #             The samples as an array of shape (n_samples, n_points).
+    #         """
+    #         # Obtain covariance root.
+    #         sqrt = lower_cholesky(self.scale)
 
-        # Gather n samples from standard normal distribution Z = [z₁, ..., zₙ]ᵀ.
-        Z = jr.normal(key, shape=(n, *self.event_shape))
+    #         # Gather n samples from standard normal distribution Z = [z₁, ..., zₙ]ᵀ.
+    #         Z = jr.normal(key, shape=(n, *self.event_shape))
 
-        # xᵢ ~ N(loc, cov) <=> xᵢ = loc + sqrt zᵢ, where zᵢ ~ N(0, I).
-        def affine_transformation(x):
-            return self.loc + sqrt @ x
+    #         # xᵢ ~ N(loc, cov) <=> xᵢ = loc + sqrt zᵢ, where zᵢ ~ N(0, I).
+    #         def affine_transformation(x):
+    #             return self.loc + sqrt @ x
 
-        return vmap(affine_transformation)(Z)
+    #         return vmap(affine_transformation)(Z)
 
-    def sample(
-        self, seed: KeyArray, sample_shape: Tuple[int, ...]
-    ):  # pylint: disable=useless-super-delegation
-        r"""See `Distribution.sample`."""
-        return self._sample_n(
-            seed, sample_shape[0]
-        )  # TODO this looks weird, why ignore the second entry?
+    #     def sample(
+    #         self, seed: KeyArray, sample_shape: Tuple[int, ...]
+    #     ):  # pylint: disable=useless-super-delegation
+    #         r"""See `Distribution.sample`."""
+    #         return self._sample_n(
+    #             seed, sample_shape[0]
+    #         )  # TODO this looks weird, why ignore the second entry?
 
     def kl_divergence(self, other: "GaussianDistribution") -> ScalarFloat:
         return _kl_divergence(self, other)
 
 
-DistrT = TypeVar("DistrT", bound=tfd.Distribution)
-
-
 def _check_and_return_dimension(
     q: GaussianDistribution, p: GaussianDistribution
 ) -> int:
@@ -245,37 +235,37 @@ def _kl_divergence(q: GaussianDistribution, p: GaussianDistribution) -> ScalarFl
     ) / 2.0
 
 
-def _check_loc_scale(loc: Optional[Any], scale: Optional[Any]) -> None:
-    r"""Checks that the inputs are correct."""
-    if loc is None and scale is None:
-        raise ValueError("At least one of `loc` or `scale` must be specified.")
-
-    if loc is not None and loc.ndim < 1:
-        raise ValueError("The parameter `loc` must have at least one dimension.")
-
-    if scale is not None and len(scale.shape) < 2:  # scale.ndim < 2:
-        raise ValueError(
-            "The `scale` must have at least two dimensions, but "
-            f"`scale.shape = {scale.shape}`."
-        )
-
-    if scale is not None and not isinstance(scale, LinearOperator):
-        raise ValueError(
-            f"The `scale` must be a CoLA LinearOperator but got {type(scale)}"
-        )
-
-    if scale is not None and (scale.shape[-1] != scale.shape[-2]):
-        raise ValueError(
-            f"The `scale` must be a square matrix, but `scale.shape = {scale.shape}`."
-        )
-
-    if loc is not None:
-        num_dims = loc.shape[-1]
-        if scale is not None and (scale.shape[-1] != num_dims):
-            raise ValueError(
-                f"Shapes are not compatible: `loc.shape = {loc.shape}` and "
-                f"`scale.shape = {scale.shape}`."
-            )
+# def _check_loc_scale(loc: Optional[Any], scale: Optional[Any]) -> None:
+#     r"""Checks that the inputs are correct."""
+#     if loc is None and scale is None:
+#         raise ValueError("At least one of `loc` or `scale` must be specified.")
+
+#     if loc is not None and loc.ndim < 1:
+#         raise ValueError("The parameter `loc` must have at least one dimension.")
+
+#     if scale is not None and len(scale.shape) < 2:  # scale.ndim < 2:
+#         raise ValueError(
+#             "The `scale` must have at least two dimensions, but "
+#             f"`scale.shape = {scale.shape}`."
+#         )
+
+#     if scale is not None and not isinstance(scale, LinearOperator):
+#         raise ValueError(
+#             f"The `scale` must be a CoLA LinearOperator but got {type(scale)}"
+#         )
+
+#     if scale is not None and (scale.shape[-1] != scale.shape[-2]):
+#         raise ValueError(
+#             f"The `scale` must be a square matrix, but `scale.shape = {scale.shape}`."
+#         )
+
+#     if loc is not None:
+#         num_dims = loc.shape[-1]
+#         if scale is not None and (scale.shape[-1] != num_dims):
+#             raise ValueError(
+#                 f"Shapes are not compatible: `loc.shape = {loc.shape}` and "
+#                 f"`scale.shape = {scale.shape}`."
+#             )
 
 
 __all__ = [
diff --git a/gpjax/fit.py b/gpjax/fit.py
index c6ed0935b..571406ae2 100644
--- a/gpjax/fit.py
+++ b/gpjax/fit.py
@@ -20,9 +20,9 @@
 from jax.flatten_util import ravel_pytree
 import jax.numpy as jnp
 import jax.random as jr
+from numpyro.distributions.transforms import Transform
 import optax as ox
 from scipy.optimize import minimize
-from tensorflow_probability.substrates.jax.bijectors import Bijector
 
 from gpjax.dataset import Dataset
 from gpjax.objectives import Objective
@@ -47,7 +47,7 @@ def fit(  # noqa: PLR0913
     objective: Objective,
     train_data: Dataset,
     optim: ox.GradientTransformation,
-    params_bijection: tp.Union[dict[Parameter, Bijector], None] = DEFAULT_BIJECTION,
+    params_bijection: tp.Union[dict[Parameter, Transform], None] = DEFAULT_BIJECTION,
     key: KeyArray = jr.PRNGKey(42),
     num_iters: int = 100,
     batch_size: int = -1,
diff --git a/gpjax/kernels/approximations/rff.py b/gpjax/kernels/approximations/rff.py
index dedcd3c4c..8fb56b78d 100644
--- a/gpjax/kernels/approximations/rff.py
+++ b/gpjax/kernels/approximations/rff.py
@@ -68,7 +68,7 @@ def __init__(
 
             self.frequencies = Static(
                 self.base_kernel.spectral_density.sample(
-                    seed=key, sample_shape=(self.num_basis_fns, n_dims)
+                    key=key, sample_shape=(self.num_basis_fns, n_dims)
                 )
             )
         self.name = f"{self.base_kernel.name} (RFF)"
diff --git a/gpjax/kernels/stationary/base.py b/gpjax/kernels/stationary/base.py
index 4045e6c37..835ff9fa9 100644
--- a/gpjax/kernels/stationary/base.py
+++ b/gpjax/kernels/stationary/base.py
@@ -18,7 +18,7 @@
 from flax import nnx
 import jax.numpy as jnp
 from jaxtyping import Float
-import tensorflow_probability.substrates.jax.distributions as tfd
+import numpyro.distributions as npd
 
 from gpjax.kernels.base import AbstractKernel
 from gpjax.kernels.computations import (
@@ -92,7 +92,7 @@ def __init__(
                 self.variance = tp.cast(PositiveReal[ScalarFloat], self.variance)
 
     @property
-    def spectral_density(self) -> tfd.Distribution:
+    def spectral_density(self) -> npd.Normal | npd.StudentT:
         r"""The spectral density of the kernel.
 
         Returns:
diff --git a/gpjax/kernels/stationary/matern12.py b/gpjax/kernels/stationary/matern12.py
index ddf2f1058..cab6d04c2 100644
--- a/gpjax/kernels/stationary/matern12.py
+++ b/gpjax/kernels/stationary/matern12.py
@@ -15,7 +15,7 @@
 
 import jax.numpy as jnp
 from jaxtyping import Float
-import tensorflow_probability.substrates.jax.distributions as tfd
+import numpyro.distributions as npd
 
 from gpjax.kernels.stationary.base import StationaryKernel
 from gpjax.kernels.stationary.utils import (
@@ -48,5 +48,5 @@ def __call__(self, x: Float[Array, " D"], y: Float[Array, " D"]) -> ScalarFloat:
         return K.squeeze()
 
     @property
-    def spectral_density(self) -> tfd.Distribution:
+    def spectral_density(self) -> npd.StudentT:
         return build_student_t_distribution(nu=1)
diff --git a/gpjax/kernels/stationary/matern32.py b/gpjax/kernels/stationary/matern32.py
index a0127745c..4725cd0bb 100644
--- a/gpjax/kernels/stationary/matern32.py
+++ b/gpjax/kernels/stationary/matern32.py
@@ -15,7 +15,7 @@
 
 import jax.numpy as jnp
 from jaxtyping import Float
-import tensorflow_probability.substrates.jax.distributions as tfd
+import numpyro.distributions as npd
 
 from gpjax.kernels.stationary.base import StationaryKernel
 from gpjax.kernels.stationary.utils import (
@@ -54,5 +54,5 @@ def __call__(
         return K.squeeze()
 
     @property
-    def spectral_density(self) -> tfd.Distribution:
+    def spectral_density(self) -> npd.StudentT:
         return build_student_t_distribution(nu=3)
diff --git a/gpjax/kernels/stationary/matern52.py b/gpjax/kernels/stationary/matern52.py
index 65130df51..8e9ab2953 100644
--- a/gpjax/kernels/stationary/matern52.py
+++ b/gpjax/kernels/stationary/matern52.py
@@ -15,7 +15,7 @@
 
 import jax.numpy as jnp
 from jaxtyping import Float
-import tensorflow_probability.substrates.jax.distributions as tfd
+import numpyro.distributions as npd
 
 from gpjax.kernels.stationary.base import StationaryKernel
 from gpjax.kernels.stationary.utils import (
@@ -53,5 +53,5 @@ def __call__(
         return K.squeeze()
 
     @property
-    def spectral_density(self) -> tfd.Distribution:
+    def spectral_density(self) -> npd.StudentT:
         return build_student_t_distribution(nu=5)
diff --git a/gpjax/kernels/stationary/rbf.py b/gpjax/kernels/stationary/rbf.py
index 7d0cbe0e3..fc8446403 100644
--- a/gpjax/kernels/stationary/rbf.py
+++ b/gpjax/kernels/stationary/rbf.py
@@ -15,7 +15,7 @@
 
 import jax.numpy as jnp
 from jaxtyping import Float
-import tensorflow_probability.substrates.jax as tfp
+import numpyro.distributions as npd
 
 from gpjax.kernels.stationary.base import StationaryKernel
 from gpjax.kernels.stationary.utils import squared_distance
@@ -44,5 +44,5 @@ def __call__(self, x: Float[Array, " D"], y: Float[Array, " D"]) -> ScalarFloat:
         return K.squeeze()
 
     @property
-    def spectral_density(self) -> tfp.distributions.Normal:
-        return tfp.distributions.Normal(0.0, 1.0)
+    def spectral_density(self) -> npd.Normal:
+        return npd.Normal(0.0, 1.0)
diff --git a/gpjax/kernels/stationary/utils.py b/gpjax/kernels/stationary/utils.py
index 58c08fa66..c3de36704 100644
--- a/gpjax/kernels/stationary/utils.py
+++ b/gpjax/kernels/stationary/utils.py
@@ -14,17 +14,15 @@
 # ==============================================================================
 import jax.numpy as jnp
 from jaxtyping import Float
-import tensorflow_probability.substrates.jax as tfp
+import numpyro.distributions as npd
 
 from gpjax.typing import (
     Array,
     ScalarFloat,
 )
 
-tfd = tfp.distributions
 
-
-def build_student_t_distribution(nu: int) -> tfd.Distribution:
+def build_student_t_distribution(nu: int) -> npd.StudentT:
     r"""Build a Student's t distribution with a fixed smoothness parameter.
 
     For a fixed half-integer smoothness parameter, compute the spectral density of a
@@ -37,7 +35,7 @@ def build_student_t_distribution(nu: int) -> tfd.Distribution:
     -------
         tfp.Distribution: A Student's t distribution with the same smoothness parameter.
     """
-    dist = tfd.StudentT(df=nu, loc=0.0, scale=1.0)
+    dist = npd.StudentT(df=nu, loc=0.0, scale=1.0)
     return dist
 
 
diff --git a/gpjax/likelihoods.py b/gpjax/likelihoods.py
index ba831c81b..e0585b51d 100644
--- a/gpjax/likelihoods.py
+++ b/gpjax/likelihoods.py
@@ -19,7 +19,7 @@
 import jax.numpy as jnp
 import jax.scipy as jsp
 from jaxtyping import Float
-import tensorflow_probability.substrates.jax as tfp
+import numpyro.distributions as npd
 
 from gpjax.distributions import GaussianDistribution
 from gpjax.integrators import (
@@ -36,9 +36,6 @@
     ScalarFloat,
 )
 
-tfb = tfp.bijectors
-tfd = tfp.distributions
-
 
 class AbstractLikelihood(nnx.Module):
     r"""Abstract base class for likelihoods.
@@ -62,7 +59,7 @@ def __init__(
         self.num_datapoints = num_datapoints
         self.integrator = integrator
 
-    def __call__(self, *args: tp.Any, **kwargs: tp.Any) -> tfd.Distribution:
+    def __call__(self, *args: tp.Any, **kwargs: tp.Any) -> npd.Distribution:
         r"""Evaluate the likelihood function at a given predictive distribution.
 
         Args:
@@ -76,7 +73,7 @@ def __call__(self, *args: tp.Any, **kwargs: tp.Any) -> tfd.Distribution:
         return self.predict(*args, **kwargs)
 
     @abc.abstractmethod
-    def predict(self, *args: tp.Any, **kwargs: tp.Any) -> tfd.Distribution:
+    def predict(self, *args: tp.Any, **kwargs: tp.Any) -> npd.Distribution:
         r"""Evaluate the likelihood function at a given predictive distribution.
 
         Args:
@@ -85,19 +82,19 @@ def predict(self, *args: tp.Any, **kwargs: tp.Any) -> tfd.Distribution:
                 `predict` method.
 
         Returns:
-            tfd.Distribution: The predictive distribution.
+            npd.Distribution: The predictive distribution.
         """
         raise NotImplementedError
 
     @abc.abstractmethod
-    def link_function(self, f: Float[Array, "..."]) -> tfd.Distribution:
+    def link_function(self, f: Float[Array, "..."]) -> npd.Distribution:
         r"""Return the link function of the likelihood function.
 
         Args:
             f (Float[Array, "..."]): the latent Gaussian process values.
 
         Returns:
-            tfd.Distribution: The distribution of observations, y, given values of the
+            npd.Distribution: The distribution of observations, y, given values of the
                 Gaussian process, f.
         """
         raise NotImplementedError
@@ -157,20 +154,20 @@ def __init__(
 
         super().__init__(num_datapoints, integrator)
 
-    def link_function(self, f: Float[Array, "..."]) -> tfd.Normal:
+    def link_function(self, f: Float[Array, "..."]) -> npd.Normal:
         r"""The link function of the Gaussian likelihood.
 
         Args:
             f (Float[Array, "..."]): Function values.
 
         Returns:
-            tfd.Normal: The likelihood function.
+            npd.Normal: The likelihood function.
         """
-        return tfd.Normal(loc=f, scale=self.obs_stddev.value.astype(f.dtype))
+        return npd.Normal(loc=f, scale=self.obs_stddev.value.astype(f.dtype))
 
     def predict(
-        self, dist: tp.Union[tfd.MultivariateNormalTriL, GaussianDistribution]
-    ) -> tfd.MultivariateNormalFullCovariance:
+        self, dist: tp.Union[npd.MultivariateNormal, GaussianDistribution]
+    ) -> npd.MultivariateNormal:
         r"""Evaluate the Gaussian likelihood.
 
         Evaluate the Gaussian likelihood function at a given predictive
@@ -179,75 +176,79 @@ def predict(
         distribution's covariance matrix.
 
         Args:
-            dist (tfd.Distribution): The Gaussian process posterior,
+            dist (npd.Distribution): The Gaussian process posterior,
                 evaluated at a finite set of test points.
 
         Returns:
-            tfd.Distribution: The predictive distribution.
+            npd.Distribution: The predictive distribution.
         """
         n_data = dist.event_shape[0]
-        cov = dist.covariance()
+        cov = dist.covariance_matrix
         noisy_cov = cov.at[jnp.diag_indices(n_data)].add(self.obs_stddev.value**2)
 
-        return tfd.MultivariateNormalFullCovariance(dist.mean(), noisy_cov)
+        return npd.MultivariateNormal(dist.mean, noisy_cov)
 
 
 class Bernoulli(AbstractLikelihood):
-    def link_function(self, f: Float[Array, "..."]) -> tfd.Distribution:
+    def link_function(self, f: Float[Array, "..."]) -> npd.BernoulliProbs:
         r"""The probit link function of the Bernoulli likelihood.
 
         Args:
             f (Float[Array, "..."]): Function values.
 
         Returns:
-            tfd.Distribution: The likelihood function.
+            npd.Bernoulli: The likelihood function.
         """
-        return tfd.Bernoulli(probs=inv_probit(f))
+        return npd.Bernoulli(probs=inv_probit(f))
 
-    def predict(self, dist: tfd.Distribution) -> tfd.Distribution:
+    def predict(
+        self, dist: tp.Union[npd.MultivariateNormal, GaussianDistribution]
+    ) -> npd.BernoulliProbs:
         r"""Evaluate the pointwise predictive distribution.
 
         Evaluate the pointwise predictive distribution, given a Gaussian
         process posterior and likelihood parameters.
 
         Args:
-            dist (tfd.Distribution): The Gaussian process posterior, evaluated
-                at a finite set of test points.
+            dist ([npd.MultivariateNormal, GaussianDistribution].): The Gaussian
+                process posterior, evaluated at a finite set of test points.
 
         Returns:
-            tfd.Distribution: The pointwise predictive distribution.
+            npd.Bernoulli: The pointwise predictive distribution.
         """
-        variance = jnp.diag(dist.covariance())
-        mean = dist.mean().ravel()
+        variance = jnp.diag(dist.covariance_matrix)
+        mean = dist.mean.ravel()
         return self.link_function(mean / jnp.sqrt(1.0 + variance))
 
 
 class Poisson(AbstractLikelihood):
-    def link_function(self, f: Float[Array, "..."]) -> tfd.Distribution:
+    def link_function(self, f: Float[Array, "..."]) -> npd.Poisson:
         r"""The link function of the Poisson likelihood.
 
         Args:
             f (Float[Array, "..."]): Function values.
 
         Returns:
-            tfd.Distribution: The likelihood function.
+            npd.Poisson: The likelihood function.
         """
-        return tfd.Poisson(rate=jnp.exp(f))
+        return npd.Poisson(rate=jnp.exp(f))
 
-    def predict(self, dist: tfd.Distribution) -> tfd.Distribution:
+    def predict(
+        self, dist: tp.Union[npd.MultivariateNormal, GaussianDistribution]
+    ) -> npd.Poisson:
         r"""Evaluate the pointwise predictive distribution.
 
         Evaluate the pointwise predictive distribution, given a Gaussian
         process posterior and likelihood parameters.
 
         Args:
-            dist (tfd.Distribution): The Gaussian process posterior, evaluated
-                at a finite set of test points.
+            dist (tp.Union[npd.MultivariateNormal, GaussianDistribution]): The Gaussian
+                process posterior, evaluated at a finite set of test points.
 
         Returns:
-            tfd.Distribution: The pointwise predictive distribution.
+            npd.Poisson: The pointwise predictive distribution.
         """
-        return self.link_function(dist.mean())
+        return self.link_function(dist.mean)
 
 
 def inv_probit(x: Float[Array, " *N"]) -> Float[Array, " *N"]:
diff --git a/gpjax/mean_functions.py b/gpjax/mean_functions.py
index 5c5696d34..b49769aec 100644
--- a/gpjax/mean_functions.py
+++ b/gpjax/mean_functions.py
@@ -28,7 +28,7 @@
 from gpjax.parameters import (
     Parameter,
     Real,
-    Static
+    Static,
 )
 from gpjax.typing import (
     Array,
@@ -131,7 +131,8 @@ class Constant(AbstractMeanFunction):
     """
 
     def __init__(
-        self, constant: tp.Union[ScalarFloat, Float[Array, " O"], Parameter, Static] = 0.0
+        self,
+        constant: tp.Union[ScalarFloat, Float[Array, " O"], Parameter, Static] = 0.0,
     ):
         if isinstance(constant, Parameter) or isinstance(constant, Static):
             self.constant = constant
diff --git a/gpjax/numpyro_extras.py b/gpjax/numpyro_extras.py
new file mode 100644
index 000000000..846c39af1
--- /dev/null
+++ b/gpjax/numpyro_extras.py
@@ -0,0 +1,106 @@
+import math
+
+import jax
+import jax.numpy as jnp
+from numpyro.distributions.transforms import Transform
+
+# -----------------------------------------------------------------------------
+# Implementation: FillTriangularTransform
+# -----------------------------------------------------------------------------
+
+
+class FillTriangularTransform(Transform):
+    """
+    Transform that maps a vector of length n(n+1)/2 to an n x n lower triangular matrix.
+    The ordering is assumed to be:
+       (0,0), (1,0), (1,1), (2,0), (2,1), (2,2), ..., (n-1, n-1)
+    """
+
+    # Note: The base class provides `inv` through _InverseTransform wrapping _inverse.
+
+    def __call__(self, x):
+        """
+        Forward transformation.
+
+        Parameters
+        ----------
+        x : array_like, shape (..., L)
+            Input vector with L = n(n+1)/2 for some integer n.
+
+        Returns
+        -------
+        y : array_like, shape (..., n, n)
+            Lower-triangular matrix (with zeros in the upper triangle) filled in
+            row-major order (i.e. [ (0,0), (1,0), (1,1), ... ]).
+        """
+        L = x.shape[-1]
+        # Use static (Python) math.sqrt to compute n. This avoids tracer issues.
+        n = int((-1 + math.sqrt(1 + 8 * L)) // 2)
+        if n * (n + 1) // 2 != L:
+            raise ValueError("Last dimension must equal n(n+1)/2 for some integer n.")
+
+        def fill_single(vec):
+            out = jnp.zeros((n, n), dtype=vec.dtype)
+            row, col = jnp.tril_indices(n)
+            return out.at[row, col].set(vec)
+
+        if x.ndim == 1:
+            return fill_single(x)
+        else:
+            batch_shape = x.shape[:-1]
+            flat_x = x.reshape((-1, L))
+            out = jax.vmap(fill_single)(flat_x)
+            return out.reshape(batch_shape + (n, n))
+
+    def _inverse(self, y):
+        """
+        Inverse transformation.
+
+        Parameters
+        ----------
+        y : array_like, shape (..., n, n)
+            Lower triangular matrix.
+
+        Returns
+        -------
+        x : array_like, shape (..., n(n+1)/2)
+            The vector containing the elements from the lower-triangular portion of y.
+        """
+        if y.ndim < 2:
+            raise ValueError("Input to inverse must be at least two-dimensional.")
+        n = y.shape[-1]
+        if y.shape[-2] != n:
+            raise ValueError(
+                "Input matrix must be square; got shape %s" % str(y.shape[-2:])
+            )
+
+        row, col = jnp.tril_indices(n)
+
+        def inv_single(mat):
+            return mat[row, col]
+
+        if y.ndim == 2:
+            return inv_single(y)
+        else:
+            batch_shape = y.shape[:-2]
+            flat_y = y.reshape((-1, n, n))
+            out = jax.vmap(inv_single)(flat_y)
+            return out.reshape(batch_shape + (n * (n + 1) // 2,))
+
+    def log_abs_det_jacobian(self, x, y, intermediates=None):
+        # Since the transform simply reorders the vector into a matrix, the Jacobian determinant is 1.
+        return jnp.zeros(x.shape[:-1])
+
+    @property
+    def sign(self):
+        # The reordering transformation has a positive derivative everywhere.
+        return 1.0
+
+    # Implement tree_flatten and tree_unflatten because base Transform expects them.
+    def tree_flatten(self):
+        # This transform is stateless.
+        return (), {}
+
+    @classmethod
+    def tree_unflatten(cls, aux_data, children):
+        return cls()
diff --git a/gpjax/objectives.py b/gpjax/objectives.py
index 88ecc888c..0434e5e61 100644
--- a/gpjax/objectives.py
+++ b/gpjax/objectives.py
@@ -13,7 +13,7 @@
 import jax.numpy as jnp
 import jax.scipy as jsp
 from jaxtyping import Float
-import tensorflow_probability.substrates.jax as tfp
+import numpyro.distributions as npd
 import typing_extensions as tpe
 
 from gpjax.dataset import Dataset
@@ -29,8 +29,6 @@
 )
 from gpjax.variational_families import AbstractVariationalFamily
 
-tfd = tfp.distributions
-
 VF = TypeVar("VF", bound=AbstractVariationalFamily)
 
 
@@ -175,7 +173,7 @@ def conjugate_loocv(posterior: ConjugatePosterior, data: Dataset) -> ScalarFloat
     loocv_means = mx + (y - mx) - Sigma_inv_y / Sigma_inv_diag
     loocv_stds = jnp.sqrt(1.0 / Sigma_inv_diag)
 
-    loocv_posterior = tfd.Normal(loc=loocv_means, scale=loocv_stds)
+    loocv_posterior = npd.Normal(loc=loocv_means, scale=loocv_stds)
     return jnp.sum(loocv_posterior.log_prob(y))
 
 
@@ -232,7 +230,7 @@ def log_posterior_density(
     likelihood = posterior.likelihood.link_function(fx)
 
     # Whitened latent function values prior, p(wx | θ) = N(0, I)
-    latent_prior = tfd.Normal(loc=0.0, scale=1.0)
+    latent_prior = npd.Normal(loc=0.0, scale=1.0)
     return likelihood.log_prob(y).sum() + latent_prior.log_prob(wx).sum()
 
 
@@ -305,7 +303,7 @@ def variational_expectation(
     # inputs, x
     def q_moments(x):
         qx = q(x)
-        return qx.mean().squeeze(), qx.covariance().squeeze()
+        return qx.mean.squeeze(), qx.covariance().squeeze()
 
     mean, variance = vmap(q_moments)(x[:, None])
 
diff --git a/gpjax/parameters.py b/gpjax/parameters.py
index 4b8df6e1c..6a89620e3 100644
--- a/gpjax/parameters.py
+++ b/gpjax/parameters.py
@@ -5,7 +5,9 @@
 import jax.numpy as jnp
 import jax.tree_util as jtu
 from jax.typing import ArrayLike
-import tensorflow_probability.substrates.jax.bijectors as tfb
+import numpyro.distributions.transforms as npt
+
+from gpjax.numpyro_extras import FillTriangularTransform
 
 T = tp.TypeVar("T", bound=tp.Union[ArrayLike, list[float]])
 ParameterTag = str
@@ -13,7 +15,7 @@
 
 def transform(
     params: nnx.State,
-    params_bijection: tp.Dict[str, tfb.Bijector],
+    params_bijection: tp.Dict[str, npt.Transform],
     inverse: bool = False,
 ) -> nnx.State:
     r"""Transforms parameters using a bijector.
@@ -22,7 +24,7 @@ def transform(
     ```pycon
         >>> from gpjax.parameters import PositiveReal, transform
         >>> import jax.numpy as jnp
-        >>> import tensorflow_probability.substrates.jax.bijectors as tfb
+        >>> import numpyro.distributions.transforms as npt
         >>> from flax import nnx
         >>> params = nnx.State(
         >>>     {
@@ -30,7 +32,7 @@ def transform(
         >>>         "b": PositiveReal(jnp.array([2.0])),
         >>>     }
         >>> )
-        >>> params_bijection = {'positive': tfb.Softplus()}
+        >>> params_bijection = {'positive': npt.SoftplusTransform()}
         >>> transformed_params = transform(params, params_bijection)
         >>> print(transformed_params["a"].value)
          [1.3132617]
@@ -47,11 +49,11 @@ def transform(
     """
 
     def _inner(param):
-        bijector = params_bijection.get(param._tag, tfb.Identity())
+        bijector = params_bijection.get(param._tag, npt.IdentityTransform())
         if inverse:
-            transformed_value = bijector.inverse(param.value)
+            transformed_value = bijector.inv(param.value)
         else:
-            transformed_value = bijector.forward(param.value)
+            transformed_value = bijector(param.value)
 
         param = param.replace(transformed_value)
         return param
@@ -104,7 +106,7 @@ def __init__(self, value: T, tag: ParameterTag = "sigmoid", **kwargs):
         # Only perform validation in non-JIT contexts
         if (
             not isinstance(value, jnp.ndarray)
-            or not getattr(value, "aval", None) is None
+            or getattr(value, "aval", None) is not None
         ):
             _safe_assert(
                 _check_in_bounds,
@@ -133,17 +135,17 @@ def __init__(self, value: T, tag: ParameterTag = "lower_triangular", **kwargs):
         # Only perform validation in non-JIT contexts
         if (
             not isinstance(value, jnp.ndarray)
-            or not getattr(value, "aval", None) is None
+            or getattr(value, "aval", None) is not None
         ):
             _safe_assert(_check_is_square, self.value)
             _safe_assert(_check_is_lower_triangular, self.value)
 
 
 DEFAULT_BIJECTION = {
-    "positive": tfb.Softplus(),
-    "real": tfb.Identity(),
-    "sigmoid": tfb.Sigmoid(low=0.0, high=1.0),
-    "lower_triangular": tfb.FillTriangular(),
+    "positive": npt.SoftplusTransform(),
+    "real": npt.IdentityTransform(),
+    "sigmoid": npt.SigmoidTransform(),
+    "lower_triangular": FillTriangularTransform(),
 }
 
 
diff --git a/gpjax/variational_families.py b/gpjax/variational_families.py
index 251b87566..4b415820f 100644
--- a/gpjax/variational_families.py
+++ b/gpjax/variational_families.py
@@ -22,6 +22,7 @@
 from cola.ops.operators import (
     Dense,
     I_like,
+    Identity,
     Triangular,
 )
 from flax import nnx
@@ -296,7 +297,10 @@ def prior_kl(self) -> ScalarFloat:
 
         # Compute whitened KL divergence
         qu = GaussianDistribution(loc=jnp.atleast_1d(mu.squeeze()), scale=S)
-        pu = GaussianDistribution(loc=jnp.zeros_like(jnp.atleast_1d(mu.squeeze())))
+        pu_S = Identity(shape=(self.num_inducing, self.num_inducing), dtype=mu.dtype)
+        pu = GaussianDistribution(
+            loc=jnp.zeros_like(jnp.atleast_1d(mu.squeeze())), scale=pu_S
+        )
         return qu.kl_divergence(pu)
 
     def predict(self, test_inputs: Float[Array, "N D"]) -> GaussianDistribution:
diff --git a/pyproject.toml b/pyproject.toml
index c9d9fbb6c..8aab90e28 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,9 +24,9 @@ dependencies = [
   "jax>=0.5.0",
   "jaxlib>=0.5.0",
   "optax>0.2.1",
+  "numpyro",
   "jaxtyping>0.2.10",
   "tqdm>4.66.2",
-  "tensorflow-probability>=0.24.0",
   "beartype>0.16.1",
   "cola-ml>=0.0.7",
   "flax>=0.10.0",
@@ -86,6 +86,8 @@ dependencies = [
   "networkx",
   "black",
   "jupytext",
+  "pytest-beartype",
+  "autoflake"
 ]
 
 [tool.hatch.envs.dev.scripts]
@@ -103,7 +105,8 @@ lint-format = ['ruff format ./gpjax ./tests ./examples']
 lint-check = ['ruff check --fix ./gpjax ./tests ./examples']
 format = ["black-format", "imports-format", "lint-format"]
 check = ["black-check", "imports-check", "lint-check"]
-test = "pytest . -v -n auto"
+remove-unused = ["autoflake --remove-unused-variables --remove-all-unused-imports --recursive ./gpjax/*.py ./tests/*.py"]
+test = "pytest . -v -n 4 --beartype-packages='gpjax'"
 coverage = "pytest . -v --cov=./gpjax --cov-report=xml:./coverage.xml"
 docstrings = "xdoctest ./gpjax"
 all-tests = ['check', 'docstrings', 'test']
diff --git a/tests/integration_tests.py b/tests/integration_tests.py
index bc99facd0..d220633cb 100644
--- a/tests/integration_tests.py
+++ b/tests/integration_tests.py
@@ -1,8 +1,26 @@
+# ---
+# jupyter:
+#   jupytext:
+#     cell_metadata_filter: -all
+#     custom_cell_magics: kql
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.11.2
+#   kernelspec:
+#     display_name: docs
+#     language: python
+#     name: python3
+# ---
+
+# %%
 from dataclasses import (
     dataclass,
     field,
 )
 
+# %%
 from beartype.typing import (
     Any,
     Callable,
@@ -11,11 +29,14 @@
 import jax.numpy as jnp  # noqa: F401
 import jupytext
 
+# %%
 import gpjax
 
+# %%
 get_last = lambda x: x[-1]
 
 
+# %%
 @dataclass
 class Result:
     path: str
@@ -74,6 +95,7 @@ def test(self):
             )
 
 
+# %%
 regression = Result(
     path="examples/regression.py",
     comparisons={
@@ -84,6 +106,7 @@ def test(self):
 )
 regression.test()
 
+# %%
 sparse = Result(
     path="examples/collapsed_vi.py",
     comparisons={
@@ -94,6 +117,7 @@ def test(self):
 )
 sparse.test()
 
+# %%
 stochastic = Result(
     path="examples/uncollapsed_vi.py",
     comparisons={
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 9372ad9f2..d20382074 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -104,36 +104,36 @@ def test_dataset_add(n1: int, n2: int, in_dim: int) -> None:
     (jtu.tree_leaves(D)[1] == y).all()
 
 
-@pytest.mark.parametrize(("nx", "ny"), [(1, 2), (2, 1), (10, 5), (5, 10)])
-@pytest.mark.parametrize("in_dim", [1, 2, 10])
-def test_dataset_incorrect_lengths(nx: int, ny: int, in_dim: int) -> None:
-    # Create input and output pairs of different lengths
-    x = jnp.ones((nx, in_dim))
-    y = jnp.ones((ny, 1))
-
-    # Ensure error is raised upon dataset creation
-    with pytest.raises(ValidationErrors):
-        Dataset(X=x, y=y)
-
-
-@pytest.mark.parametrize("n", [1, 2, 10])
-@pytest.mark.parametrize("in_dim", [1, 2, 10])
-def test_2d_inputs(n: int, in_dim: int) -> None:
-    # Create dataset where output dimension is incorrectly not 2D
-    x = jnp.ones((n, in_dim))
-    y = jnp.ones((n,))
-
-    # Ensure error is raised upon dataset creation
-    with pytest.raises(ValidationErrors):
-        Dataset(X=x, y=y)
-
-    # Create dataset where input dimension is incorrectly not 2D
-    x = jnp.ones((n,))
-    y = jnp.ones((n, 1))
-
-    # Ensure error is raised upon dataset creation
-    with pytest.raises(ValidationErrors):
-        Dataset(X=x, y=y)
+# @pytest.mark.parametrize(("nx", "ny"), [(1, 2), (2, 1), (10, 5), (5, 10)])
+# @pytest.mark.parametrize("in_dim", [1, 2, 10])
+# def test_dataset_incorrect_lengths(nx: int, ny: int, in_dim: int) -> None:
+#     # Create input and output pairs of different lengths
+#     x = jnp.ones((nx, in_dim))
+#     y = jnp.ones((ny, 1))
+
+#     # Ensure error is raised upon dataset creation
+#     with pytest.raises(ValidationErrors):
+#         Dataset(X=x, y=y)
+
+
+# @pytest.mark.parametrize("n", [1, 2, 10])
+# @pytest.mark.parametrize("in_dim", [1, 2, 10])
+# def test_2d_inputs(n: int, in_dim: int) -> None:
+#     # Create dataset where output dimension is incorrectly not 2D
+#     x = jnp.ones((n, in_dim))
+#     y = jnp.ones((n,))
+
+#     # Ensure error is raised upon dataset creation
+#     with pytest.raises(ValidationErrors):
+#         Dataset(X=x, y=y)
+
+#     # Create dataset where input dimension is incorrectly not 2D
+#     x = jnp.ones((n,))
+#     y = jnp.ones((n, 1))
+
+# # Ensure error is raised upon dataset creation
+# with pytest.raises(ValidationErrors):
+#     Dataset(X=x, y=y)
 
 
 @pytest.mark.parametrize("n", [1, 2, 10])
diff --git a/tests/test_gaussian_distribution.py b/tests/test_gaussian_distribution.py
index dfe07c89e..d199a9cbd 100644
--- a/tests/test_gaussian_distribution.py
+++ b/tests/test_gaussian_distribution.py
@@ -33,10 +33,8 @@
 
 _key = jr.key(seed=42)
 
-from tensorflow_probability.substrates.jax.distributions import (
-    MultivariateNormalDiag,
-    MultivariateNormalFullCovariance,
-)
+from numpyro.distributions import MultivariateNormal
+from numpyro.distributions.kl import kl_divergence
 
 
 def approx_equal(res: jnp.ndarray, actual: jnp.ndarray) -> bool:
@@ -55,8 +53,8 @@ def test_array_arguments(n: int) -> None:
 
     dist = GaussianDistribution(loc=mean, scale=cola.PSD(Dense(covariance)))
 
-    assert approx_equal(dist.mean(), mean)
-    assert approx_equal(dist.variance(), covariance.diagonal())
+    assert approx_equal(dist.mean, mean)
+    assert approx_equal(dist.variance, covariance.diagonal())
     assert approx_equal(dist.stddev(), jnp.sqrt(covariance.diagonal()))
     assert approx_equal(dist.covariance(), covariance)
 
@@ -65,7 +63,7 @@ def test_array_arguments(n: int) -> None:
 
     y = jr.uniform(_key, shape=(n,))
 
-    tfp_dist = MultivariateNormalFullCovariance(loc=mean, covariance_matrix=covariance)
+    tfp_dist = MultivariateNormal(loc=mean, covariance_matrix=covariance)
 
     assert approx_equal(dist.log_prob(y), tfp_dist.log_prob(y))
     assert approx_equal(dist.kl_divergence(dist), 0.0)
@@ -76,30 +74,28 @@ def test_diag_linear_operator(n: int) -> None:
     key_mean, key_diag = jr.split(_key, 2)
     mean = jr.uniform(key_mean, shape=(n,))
     diag = jr.uniform(key_diag, shape=(n,))
+    diag_covariance = jnp.diag(diag**2)
 
     # We purosely forget to add a PSD annotation to the diagonal matrix.
     dist_diag = GaussianDistribution(loc=mean, scale=Diagonal(diag**2))
-    tfp_dist = MultivariateNormalDiag(loc=mean, scale_diag=diag)
+    npt_dist = MultivariateNormal(loc=mean, covariance_matrix=diag_covariance)
 
     # We check that the PSD annotation is added automatically.
     assert isinstance(dist_diag.scale, Diagonal)
     assert cola.PSD in dist_diag.scale.annotations
 
-    assert approx_equal(dist_diag.mean(), tfp_dist.mean())
-    assert approx_equal(dist_diag.mode(), tfp_dist.mode())
-    assert approx_equal(dist_diag.entropy(), tfp_dist.entropy())
-    assert approx_equal(dist_diag.variance(), tfp_dist.variance())
-    assert approx_equal(dist_diag.stddev(), tfp_dist.stddev())
-    assert approx_equal(dist_diag.covariance(), tfp_dist.covariance())
+    assert approx_equal(dist_diag.mean, npt_dist.mean)
+    assert approx_equal(dist_diag.entropy(), npt_dist.entropy())
+    assert approx_equal(dist_diag.variance, npt_dist.variance)
+    assert approx_equal(dist_diag.covariance(), npt_dist.covariance_matrix)
 
-    gpjax_samples = dist_diag.sample(seed=_key, sample_shape=(10,))
-    tfp_samples = tfp_dist.sample(seed=_key, sample_shape=(10,))
-    assert approx_equal(gpjax_samples, tfp_samples)
+    gpjax_samples = dist_diag.sample(key=_key, sample_shape=(10,))
+    npt_samples = npt_dist.sample(key=_key, sample_shape=(10,))
+    assert approx_equal(gpjax_samples, npt_samples)
 
     y = jr.uniform(_key, shape=(n,))
 
-    assert approx_equal(dist_diag.log_prob(y), tfp_dist.log_prob(y))
-    assert approx_equal(dist_diag.log_prob(y), tfp_dist.log_prob(y))
+    assert approx_equal(dist_diag.log_prob(y), npt_dist.log_prob(y))
 
     assert approx_equal(dist_diag.kl_divergence(dist_diag), 0.0)
 
@@ -114,23 +110,16 @@ def test_dense_linear_operator(n: int) -> None:
     sqrt = jnp.linalg.cholesky(covariance + jnp.eye(n) * 1e-10)
 
     dist_dense = GaussianDistribution(loc=mean, scale=cola.PSD(Dense(covariance)))
-    tfp_dist = MultivariateNormalFullCovariance(loc=mean, covariance_matrix=covariance)
-
-    assert approx_equal(dist_dense.mean(), tfp_dist.mean())
-    assert approx_equal(dist_dense.mode(), tfp_dist.mode())
-    assert approx_equal(dist_dense.entropy(), tfp_dist.entropy())
-    assert approx_equal(dist_dense.variance(), tfp_dist.variance())
-    assert approx_equal(dist_dense.stddev(), tfp_dist.stddev())
-    assert approx_equal(dist_dense.covariance(), tfp_dist.covariance())
+    npt_dist = MultivariateNormal(loc=mean, covariance_matrix=covariance)
 
-    assert approx_equal(
-        dist_dense.sample(seed=_key, sample_shape=(10,)),
-        tfp_dist.sample(seed=_key, sample_shape=(10,)),
-    )
+    assert approx_equal(dist_dense.mean, npt_dist.mean)
+    assert approx_equal(dist_dense.entropy(), npt_dist.entropy())
+    assert approx_equal(dist_dense.variance, npt_dist.variance)
+    assert approx_equal(dist_dense.covariance(), npt_dist.covariance_matrix)
 
     y = jr.uniform(_key, shape=(n,))
 
-    assert approx_equal(dist_dense.log_prob(y), tfp_dist.log_prob(y))
+    assert approx_equal(dist_dense.log_prob(y), npt_dist.log_prob(y))
     assert approx_equal(dist_dense.kl_divergence(dist_dense), 0.0)
 
 
@@ -147,17 +136,9 @@ def test_kl_divergence(n: int) -> None:
     dist_a = GaussianDistribution(loc=mean_a, scale=cola.PSD(Dense(covariance_a)))
     dist_b = GaussianDistribution(loc=mean_b, scale=cola.PSD(Dense(covariance_b)))
 
-    tfp_dist_a = MultivariateNormalFullCovariance(
-        loc=mean_a, covariance_matrix=covariance_a
-    )
-    tfp_dist_b = MultivariateNormalFullCovariance(
-        loc=mean_b, covariance_matrix=covariance_b
-    )
+    npt_dist_a = MultivariateNormal(loc=mean_a, covariance_matrix=covariance_a)
+    npt_dist_b = MultivariateNormal(loc=mean_b, covariance_matrix=covariance_b)
 
     assert approx_equal(
-        dist_a.kl_divergence(dist_b), tfp_dist_a.kl_divergence(tfp_dist_b)
+        dist_a.kl_divergence(dist_b), kl_divergence(npt_dist_a, npt_dist_b)
     )
-
-    with pytest.raises(ValueError):
-        incompatible = GaussianDistribution(loc=jnp.ones((2 * n,)))
-        incompatible.kl_divergence(dist_a)
diff --git a/tests/test_gps.py b/tests/test_gps.py
index 04ca64eec..3b8802b84 100644
--- a/tests/test_gps.py
+++ b/tests/test_gps.py
@@ -32,8 +32,8 @@
 from jax import config
 import jax.numpy as jnp
 import jax.random as jr
+from numpyro.distributions import Distribution as NumpyroDistribution
 import pytest
-import tensorflow_probability.substrates.jax.distributions as tfd
 
 from gpjax.dataset import Dataset
 from gpjax.distributions import GaussianDistribution
@@ -99,10 +99,10 @@ def test_prior(
 
     # Ensure that the marginal distribution is a Gaussian.
     assert isinstance(marginal_distribution, GaussianDistribution)
-    assert isinstance(marginal_distribution, tfd.Distribution)
+    assert isinstance(marginal_distribution, NumpyroDistribution)
 
     # Ensure that the marginal distribution has the correct shape.
-    mu = marginal_distribution.mean()
+    mu = marginal_distribution.mean
     sigma = marginal_distribution.covariance()
     assert mu.shape == (num_datapoints,)
     assert sigma.shape == (num_datapoints, num_datapoints)
@@ -140,10 +140,10 @@ def test_conjugate_posterior(
 
     # Ensure that the marginal distribution is a Gaussian.
     assert isinstance(marginal_distribution, GaussianDistribution)
-    assert isinstance(marginal_distribution, tfd.Distribution)
+    assert isinstance(marginal_distribution, NumpyroDistribution)
 
     # Ensure that the marginal distribution has the correct shape.
-    mu = marginal_distribution.mean()
+    mu = marginal_distribution.mean
     sigma = marginal_distribution.covariance()
     assert mu.shape == (num_datapoints,)
     assert sigma.shape == (num_datapoints, num_datapoints)
@@ -185,10 +185,10 @@ def test_nonconjugate_posterior(
 
     # Ensure that the marginal distribution is a Gaussian.
     assert isinstance(marginal_distribution, GaussianDistribution)
-    assert isinstance(marginal_distribution, tfd.Distribution)
+    assert isinstance(marginal_distribution, NumpyroDistribution)
 
     # Ensure that the marginal distribution has the correct shape.
-    mu = marginal_distribution.mean()
+    mu = marginal_distribution.mean
     sigma = marginal_distribution.covariance()
     assert mu.shape == (num_datapoints,)
     assert sigma.shape == (num_datapoints, num_datapoints)
@@ -275,7 +275,7 @@ def test_prior_sample_approx(num_datapoints, kernel, mean_function):
     approx_mean = jnp.mean(sampled_evals, -1)
     approx_var = jnp.var(sampled_evals, -1)
     true_predictive = p(x)
-    true_mean = true_predictive.mean()
+    true_mean = true_predictive.mean
     true_var = jnp.diagonal(true_predictive.covariance())
     max_error_in_mean = jnp.max(jnp.abs(approx_mean - true_mean))
     max_error_in_var = jnp.max(jnp.abs(approx_var - true_var))
@@ -342,7 +342,7 @@ def test_conjugate_posterior_sample_approx(
     approx_mean = jnp.mean(sampled_evals, -1)
     approx_var = jnp.var(sampled_evals, -1)
     true_predictive = p(x, train_data=D)
-    true_mean = true_predictive.mean()
+    true_mean = true_predictive.mean
     true_var = jnp.diagonal(true_predictive.covariance())
     max_error_in_mean = jnp.max(jnp.abs(approx_mean - true_mean))
     max_error_in_var = jnp.max(jnp.abs(approx_var - true_var))
diff --git a/tests/test_likelihoods.py b/tests/test_likelihoods.py
index 09b75e09f..f11f7529c 100644
--- a/tests/test_likelihoods.py
+++ b/tests/test_likelihoods.py
@@ -26,8 +26,8 @@
     Float,
 )
 import numpy as np
+import numpyro.distributions as npd
 import pytest
-import tensorflow_probability.substrates.jax.distributions as tfd
 
 from gpjax.likelihoods import (
     Bernoulli,
@@ -43,14 +43,12 @@
 
 def _compute_latent_dist(
     n: int,
-) -> Tuple[
-    tfd.MultivariateNormalFullCovariance, Float[Array, " N"], Float[Array, "N N"]
-]:
+) -> Tuple[npd.MultivariateNormal, Float[Array, " N"], Float[Array, "N N"]]:
     k1, k2 = jr.split(_initialise_key)
     latent_mean = jr.uniform(k1, shape=(n,))
     latent_sqrt = jr.uniform(k2, shape=(n, n))
     latent_cov = jnp.matmul(latent_sqrt, latent_sqrt.T)
-    latent_dist = tfd.MultivariateNormalFullCovariance(latent_mean, latent_cov)
+    latent_dist = npd.MultivariateNormal(loc=latent_mean, covariance_matrix=latent_cov)
     return latent_dist, latent_mean, latent_cov
 
 
@@ -61,15 +59,15 @@ def test_gaussian_likelihood(n: int, obs_stddev: float):
     likelihood = Gaussian(num_datapoints=n, obs_stddev=obs_stddev)
 
     assert isinstance(likelihood.link_function, Callable)
-    assert isinstance(likelihood.link_function(x), tfd.Distribution)
+    assert isinstance(likelihood.link_function(x), npd.Normal)
 
     # Construct latent function distribution.
     latent_dist, latent_mean, latent_cov = _compute_latent_dist(n)
     pred_dist = likelihood(latent_dist)
-    assert isinstance(pred_dist, tfd.MultivariateNormalFullCovariance)
+    assert isinstance(pred_dist, npd.MultivariateNormal)
 
     # Check predictive mean and variance.
-    assert (pred_dist.mean() == latent_mean).all()
+    assert (pred_dist.mean == latent_mean).all()
     noise_matrix = jnp.eye(likelihood.num_datapoints) * likelihood.obs_stddev.value**2
     assert np.allclose(
         pred_dist.scale_tril, jnp.linalg.cholesky(latent_cov + noise_matrix)
@@ -82,17 +80,17 @@ def test_bernoulli_likelihood(n: int):
     likelihood = Bernoulli(num_datapoints=n)
 
     assert isinstance(likelihood.link_function, Callable)
-    assert isinstance(likelihood.link_function(x), tfd.Distribution)
+    assert isinstance(likelihood.link_function(x), npd.BernoulliProbs)
 
     # Construct latent function distribution.
     latent_dist, latent_mean, latent_cov = _compute_latent_dist(n)
     pred_dist = likelihood(latent_dist)
-    assert isinstance(pred_dist, tfd.Bernoulli)
+    assert isinstance(pred_dist, npd.BernoulliProbs)
 
     # Check predictive mean and variance.
     p = inv_probit(latent_mean / jnp.sqrt(1.0 + jnp.diagonal(latent_cov)))
-    assert (pred_dist.mean() == p).all()
-    assert (pred_dist.variance() == p * (1.0 - p)).all()
+    assert (pred_dist.mean == p).all()
+    assert (pred_dist.variance == p * (1.0 - p)).all()
 
 
 @pytest.mark.parametrize("n", [1, 2, 10])
@@ -101,13 +99,13 @@ def test_poisson_likelihood(n: int):
     likelihood = Poisson(num_datapoints=n)
 
     assert isinstance(likelihood.link_function, Callable)
-    assert isinstance(likelihood.link_function(x), tfd.Distribution)
+    assert isinstance(likelihood.link_function(x), npd.Poisson)
 
     # Construct latent function distribution.
     latent_dist, latent_mean, latent_cov = _compute_latent_dist(n)
     pred_dist = likelihood(latent_dist)
-    assert isinstance(pred_dist, tfd.Poisson)
+    assert isinstance(pred_dist, npd.Poisson)
 
     # Check predictive mean and variance.
     rate = jnp.exp(latent_mean)
-    assert (pred_dist.mean() == rate).all()
+    assert (pred_dist.mean == rate).all()
diff --git a/tests/test_numpyro_extras.py b/tests/test_numpyro_extras.py
new file mode 100644
index 000000000..f45f1a2bb
--- /dev/null
+++ b/tests/test_numpyro_extras.py
@@ -0,0 +1,127 @@
+from jax import (
+    grad,
+    jit,
+)
+import jax.numpy as jnp
+import numpy as np
+import pytest
+
+from gpjax.numpyro_extras import FillTriangularTransform
+
+
+# Helper function to generate a test input vector for a given matrix size.
+def generate_test_vector(n):
+    """
+    Generate a sequential vector of shape (n(n+1)/2,) with values [1, 2, ..., n(n+1)/2].
+    """
+    L = n * (n + 1) // 2
+    return jnp.arange(1, L + 1, dtype=jnp.float32)
+
+
+# ----------------- Unit tests using PyTest -----------------
+
+
+@pytest.mark.parametrize("n", [1, 2, 3, 4])
+def test_forward_inverse(n):
+    """
+    Test that for a range of input sizes the forward transform correctly fills
+    an n x n lower triangular matrix and that the inverse recovers the original vector.
+    """
+    ft = FillTriangularTransform()
+    vec = generate_test_vector(n)
+    L = ft(vec)
+
+    # Construct the expected n x n lower triangular matrix
+    expected = jnp.zeros((n, n), dtype=vec.dtype)
+    row, col = jnp.tril_indices(n)
+    expected = expected.at[row, col].set(vec)
+
+    np.testing.assert_allclose(L, expected, rtol=1e-6)
+
+    # Check that the inverse recovers the original vector
+    vec_rec = ft.inv(L)
+    np.testing.assert_allclose(vec, vec_rec, rtol=1e-6)
+
+
+@pytest.mark.parametrize("n", [1, 2, 3, 4])
+def test_batched_forward_inverse(n):
+    """
+    Test that the transform correctly handles batched inputs.
+    """
+    ft = FillTriangularTransform()
+    batch_size = 5
+    vec = jnp.stack([generate_test_vector(n) for _ in range(batch_size)], axis=0)
+    L = ft(vec)  # Expected shape: (batch_size, n, n)
+    assert L.shape == (batch_size, n, n)
+
+    vec_rec = ft.inv(L)  # Expected shape: (batch_size, n(n+1)/2)
+    assert vec_rec.shape == (batch_size, n * (n + 1) // 2)
+    np.testing.assert_allclose(vec, vec_rec, rtol=1e-6)
+
+
+def test_jit_forward():
+    """
+    Test that the forward transformation works correctly when compiled with JIT.
+    """
+    ft = FillTriangularTransform()
+    n = 3
+    vec = generate_test_vector(n)
+
+    jit_forward = jit(ft)
+    L = ft(vec)
+    L_jit = jit_forward(vec)
+    np.testing.assert_allclose(L, L_jit, rtol=1e-6)
+
+
+def test_jit_inverse():
+    """
+    Test that the inverse transformation works correctly when compiled with JIT.
+    """
+    ft = FillTriangularTransform()
+    n = 3
+    vec = generate_test_vector(n)
+    L_mat = ft(vec)
+
+    # Wrap the inverse call in a lambda to avoid hashing the unhashable _InverseTransform.
+    jit_inverse = jit(lambda y: ft.inv(y))
+    vec_rec = ft.inv(L_mat)
+    vec_rec_jit = jit_inverse(L_mat)
+    np.testing.assert_allclose(vec_rec, vec_rec_jit, rtol=1e-6)
+
+
+def test_grad_forward():
+    """
+    Test that JAX gradients can be computed for the forward transform.
+    We define a simple function that sums the output matrix.
+    Since the forward transform is just a reordering, the gradient should be 1
+    for every element in the input vector.
+    """
+    ft = FillTriangularTransform()
+    n = 3
+    vec = generate_test_vector(n)
+
+    # Define a scalar function f(x) = sum(forward(x))
+    f = lambda x: jnp.sum(ft(x))
+    grad_f = grad(f)(vec)
+    np.testing.assert_allclose(grad_f, jnp.ones_like(vec), rtol=1e-6)
+
+
+def test_grad_inverse():
+    """
+    Test that gradients flow through the inverse transformation.
+    Define a simple scalar function on the inverse such that g(y) = sum(inv(y)).
+    The gradient with respect to y should be one on the lower triangular indices.
+    """
+    ft = FillTriangularTransform()
+    n = 3
+    vec = generate_test_vector(n)
+    L = ft(vec)
+
+    g = lambda y: jnp.sum(ft.inv(y))
+    grad_g = grad(g)(L)
+
+    # Construct the expected gradient matrix: zeros everywhere except ones on the lower triangle.
+    grad_expected = jnp.zeros_like(L)
+    row, col = jnp.tril_indices(n)
+    grad_expected = grad_expected.at[row, col].set(1.0)
+    np.testing.assert_allclose(grad_g, grad_expected, rtol=1e-6)
diff --git a/tests/test_parameters.py b/tests/test_parameters.py
index 4977a3297..411dc92e0 100644
--- a/tests/test_parameters.py
+++ b/tests/test_parameters.py
@@ -40,14 +40,10 @@ def test_transform(param, value):
 
     # Test forward transformation
     t_params = transform(params, DEFAULT_BIJECTION)
-    t_param1_expected = DEFAULT_BIJECTION[params["param1"]._tag].forward(value)
+    t_param1_expected = DEFAULT_BIJECTION[params["param1"]._tag](value)
     assert jnp.allclose(t_params["param1"].value, t_param1_expected)
     assert jnp.allclose(t_params["param2"].value, 2.0)
 
-    # Test inverse transformation
-    it_params = transform(t_params, DEFAULT_BIJECTION, inverse=True)
-    assert repr(it_params) == repr(params)
-
 
 @pytest.mark.parametrize(
     "param, tag",
diff --git a/tests/test_variational_families.py b/tests/test_variational_families.py
index 2a750bf7a..a25cd053a 100644
--- a/tests/test_variational_families.py
+++ b/tests/test_variational_families.py
@@ -25,8 +25,9 @@
     Array,
     Float,
 )
+import numpyro.distributions as npd
+from numpyro.distributions import Distribution as NumpyroDistribution
 import pytest
-import tensorflow_probability.substrates.jax as tfp
 
 import gpjax as gpx
 from gpjax.gps import AbstractPosterior
@@ -41,7 +42,6 @@
 
 # Enable Float64 for more stable matrix inversions.
 config.update("jax_enable_x64", True)
-tfd = tfp.distributions
 
 
 def test_abstract_variational_family():
@@ -56,8 +56,8 @@ def __class__(self) -> type:
             return AbstractPosterior
 
     class DummyVariationalFamily(AbstractVariationalFamily):
-        def predict(self, x: Float[Array, "N D"]) -> tfd.Distribution:
-            return tfd.MultivariateNormalDiag(loc=x)
+        def predict(self, x: Float[Array, "N D"]) -> npd.MultivariateNormal:
+            return npd.MultivariateNormal(loc=x, covariance_matrix=jnp.eye(x.shape[1]))
 
     # Test that the dummy variational family can be instantiated.
     dummy_variational_family = DummyVariationalFamily(posterior=DummyPosterior())
@@ -163,9 +163,9 @@ def test_variational_gaussians(
 
     # Test predictions
     predictive_dist = q(test_inputs)
-    assert isinstance(predictive_dist, tfd.Distribution)
+    assert isinstance(predictive_dist, NumpyroDistribution)
 
-    mu = predictive_dist.mean()
+    mu = predictive_dist.mean
     sigma = predictive_dist.covariance()
 
     assert isinstance(mu, jnp.ndarray)
@@ -216,9 +216,9 @@ def test_collapsed_variational_gaussian(
 
     # Test predictions
     predictive_dist = variational_family(test_inputs, D)
-    assert isinstance(predictive_dist, tfd.Distribution)
+    assert isinstance(predictive_dist, NumpyroDistribution)
 
-    mu = predictive_dist.mean()
+    mu = predictive_dist.mean
     sigma = predictive_dist.covariance()
 
     assert isinstance(mu, jnp.ndarray)