Correct rrule for perturbed (#75)

gdalle · web-flow · commit 18ee995bc449 · 2023-06-29T23:55:42.000+02:00
* Correct rrule for perturbed

* Replace sum with mean and add proper tests
diff --git a/src/perturbed/additive.jl b/src/perturbed/additive.jl
@@ -69,8 +69,11 @@ function ChainRulesCore.rrule(
     Z_samples = sample_perturbations(perturbed, θ)
     probadist = compute_probability_distribution(perturbed, θ, Z_samples; kwargs...)
     function perturbed_additive_probadist_pullback(probadist_tangent)
-        weigths_tangent = probadist_tangent.weights
-        dθ = inv(ε) * sum(wt * Z for (wt, Z) in zip(weigths_tangent, Z_samples))
+        weights_tangent = probadist_tangent.weights
+        if length(weights_tangent) != length(Z_samples)
+            throw(ArgumentError("Probadist tangent has invalid number of atoms"))
+        end
+        dθ = inv(ε) * mean(wt * Z for (wt, Z) in zip(weights_tangent, Z_samples))
         return NoTangent(), NoTangent(), dθ
     end
     return probadist, perturbed_additive_probadist_pullback
diff --git a/src/perturbed/multiplicative.jl b/src/perturbed/multiplicative.jl
@@ -71,8 +71,11 @@ function ChainRulesCore.rrule(
     Z_samples = sample_perturbations(perturbed, θ)
     probadist = compute_probability_distribution(perturbed, θ, Z_samples; kwargs...)
     function perturbed_multiplicative_probadist_pullback(probadist_tangent)
-        weigths_tangent = probadist_tangent.weights
-        dθ = inv.(ε .* θ) .* sum(wt * Z for (wt, Z) in zip(weigths_tangent, Z_samples))
+        weights_tangent = probadist_tangent.weights
+        if length(weights_tangent) != length(Z_samples)
+            throw(ArgumentError("Probadist tangent has invalid number of atoms"))
+        end
+        dθ = inv.(ε .* θ) .* mean(wt * Z for (wt, Z) in zip(weights_tangent, Z_samples))
         return NoTangent(), NoTangent(), dθ
     end
     return probadist, perturbed_multiplicative_probadist_pullback
diff --git a/src/utils/probability_distribution.jl b/src/utils/probability_distribution.jl
@@ -36,33 +36,6 @@ end
 
 Base.rand(probadist::FixedAtomsProbabilityDistribution) = rand(GLOBAL_RNG, probadist)
 
-"""
-    compress_distribution!(probadist[; atol])
-
-Remove duplicated atoms in `probadist` (up to a tolerance on equality).
-"""
-function compress_distribution!(
-    probadist::FixedAtomsProbabilityDistribution{A,W}; atol=0
-) where {A,W}
-    (; atoms, weights) = probadist
-    to_delete = Int[]
-    for i in length(probadist):-1:1
-        ai = atoms[i]
-        for j in 1:(i - 1)
-            aj = atoms[j]
-            if isapprox(ai, aj; atol=atol)
-                weights[j] += weights[i]
-                push!(to_delete, i)
-                break
-            end
-        end
-    end
-    sort!(to_delete)
-    deleteat!(atoms, to_delete)
-    deleteat!(weights, to_delete)
-    return probadist
-end
-
 """
     apply_on_atoms(post_processing, probadist)
 
@@ -121,3 +94,32 @@ The following layer types are supported:
 - [`RegularizedGeneric`](@ref)
 """
 function compute_probability_distribution end
+
+"""
+    compress_distribution!(probadist[; atol])
+
+Remove duplicated atoms in `probadist` (up to a tolerance on equality).
+
+This function can break probabilistic layers if used during training. It is only meant for analyzing outputs.
+"""
+function compress_distribution!(
+    probadist::FixedAtomsProbabilityDistribution{A,W}; atol=0
+) where {A,W}
+    (; atoms, weights) = probadist
+    to_delete = Int[]
+    for i in length(probadist):-1:1
+        ai = atoms[i]
+        for j in 1:(i - 1)
+            aj = atoms[j]
+            if isapprox(ai, aj; atol=atol)
+                weights[j] += weights[i]
+                push!(to_delete, i)
+                break
+            end
+        end
+    end
+    sort!(to_delete)
+    deleteat!(atoms, to_delete)
+    deleteat!(weights, to_delete)
+    return probadist
+end
diff --git a/test/jacobian_approx.jl b/test/jacobian_approx.jl
@@ -4,29 +4,38 @@
     using Test
     using Zygote
 
-    Random.seed!(63)
+    # Random.seed!(63)
 
     θ = [3, 5, 4, 2]
 
-    perturbed1 = PerturbedAdditive(one_hot_argmax; ε=2, nb_samples=1000, seed=0)
-    perturbed2 = PerturbedMultiplicative(one_hot_argmax; ε=0.5, nb_samples=1000, seed=0)
+    perturbed1 = PerturbedAdditive(one_hot_argmax; ε=2, nb_samples=1_000, seed=0)
+    perturbed1_big = PerturbedAdditive(one_hot_argmax; ε=2, nb_samples=10_000, seed=0)
+    perturbed2 = PerturbedMultiplicative(one_hot_argmax; ε=0.5, nb_samples=1_000, seed=0)
+    perturbed2_big = PerturbedMultiplicative(
+        one_hot_argmax; ε=0.5, nb_samples=10_000, seed=0
+    )
 
     @testset "PerturbedAdditive" begin
         # Compute jacobian with reverse mode
         jac1 = Zygote.jacobian(perturbed1, θ)[1]
+        jac1_big = Zygote.jacobian(perturbed1_big, θ)[1]
+        @show jac1 jac1_big
         # Only diagonal should be positive
         @test all(diag(jac1) .>= 0)
         @test all(jac1 - Diagonal(jac1) .<= 0)
         # Order of diagonal coefficients should follow order of θ
         @test sortperm(diag(jac1)) == sortperm(θ)
+        # No scaling with nb of samples
+        @test norm(jac1) ≈ norm(jac1_big) rtol = 1e-2
     end
 
     @testset "PerturbedMultiplicative" begin
         jac2 = Zygote.jacobian(perturbed2, θ)[1]
+        jac2_big = Zygote.jacobian(perturbed2_big, θ)[1]
         @test all(diag(jac2) .>= 0)
         @test all(jac2 - Diagonal(jac2) .<= 0)
-        @test_broken sortperm(diag(jac2)) == sortperm(θ)
-        # This is broken because the diagonal coefficient for θ₃ = 4 is often larger than the one for θ₂ = 5
-        # Maybe because θ₃ has the opportunity to *become* the argmax (and hence switch from 0 to 1), whereas θ₂ already *is* the argmax?
+        @test sortperm(diag(jac2)) != sortperm(θ)
+        # This is not equal because the diagonal coefficient for θ₃ = 4 is often larger than the one for θ₂ = 5. It happens because θ₃ has the opportunity to *become* the argmax (and hence switch from 0 to 1), whereas θ₂ already *is* the argmax.
+        @test norm(jac2) ≈ norm(jac2_big) rtol = 1e-2
     end
 end