Skip to content

Commit 3635dba

Browse files
authored
Merge pull request #312 from FluxML/ignore
Replace Zygote.ignore with ChainRulesCore.ignore_derivatives
2 parents 1d069f5 + 21cf47c commit 3635dba

File tree

9 files changed

+44
-48
lines changed

9 files changed

+44
-48
lines changed

Project.toml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
2323
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
2424
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
2525
Word2Vec = "c64b6f0f-98cd-51d1-af78-58ae84944834"
26-
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
2726

2827
[compat]
2928
CUDA = "3"
@@ -40,7 +39,6 @@ Optimisers = "0.2"
4039
Reexport = "1.1"
4140
StatsBase = "0.33"
4241
Word2Vec = "0.5"
43-
Zygote = "0.6"
4442
julia = "1.6"
4543

4644
[extras]

src/GeometricFlux.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ using Flux: glorot_uniform, leakyrelu, GRUCell, @functor
1717
using Graphs
1818
using NNlib, NNlibCUDA
1919
using Optimisers
20-
using Zygote
2120

2221
import Word2Vec: word2vec, wordvectors, get_vector
2322

src/layers/conv.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ end
4545
# For variable graph
4646
function (l::GCNConv)(fg::AbstractFeaturedGraph)
4747
nf = node_feature(fg)
48-
Ã = Zygote.ignore() do
48+
Ã = ChainRulesCore.ignore_derivatives() do
4949
GraphSignals.normalized_adjacency_matrix(fg, eltype(nf); selfloop=true)
5050
end
5151
return ConcreteFeaturedGraph(fg, nf = l(Ã, nf))
@@ -127,7 +127,7 @@ function (l::ChebConv)(fg::AbstractFeaturedGraph)
127127
GraphSignals.check_num_nodes(fg, nf)
128128
@assert size(nf, 1) == size(l.weight, 2) "Input feature size must match input channel size."
129129

130-
L̃ = Zygote.ignore() do
130+
L̃ = ChainRulesCore.ignore_derivatives() do
131131
GraphSignals.scaled_laplacian(fg, eltype(nf))
132132
end
133133
return ConcreteFeaturedGraph(fg, nf = l(L̃, nf))
@@ -331,7 +331,7 @@ function (l::GATConv)(fg::AbstractFeaturedGraph)
331331
X = node_feature(fg)
332332
GraphSignals.check_num_nodes(fg, X)
333333
sg = graph(fg)
334-
@assert Zygote.ignore(() -> GraphSignals.has_all_self_loops(sg)) "a vertex must have self loop (receive a message from itself)."
334+
@assert ChainRulesCore.ignore_derivatives(() -> GraphSignals.has_all_self_loops(sg)) "a vertex must have self loop (receive a message from itself)."
335335
el = to_namedtuple(sg)
336336
_, V, _ = propagate(l, el, nothing, X, nothing, hcat, nothing, nothing)
337337
return ConcreteFeaturedGraph(fg, nf=V)
@@ -459,7 +459,7 @@ function (l::GATv2Conv)(fg::AbstractFeaturedGraph)
459459
X = node_feature(fg)
460460
GraphSignals.check_num_nodes(fg, X)
461461
sg = graph(fg)
462-
@assert Zygote.ignore(() -> GraphSignals.has_all_self_loops(sg)) "a vertex must have self loop (receive a message from itself)."
462+
@assert ChainRulesCore.ignore_derivatives(() -> GraphSignals.has_all_self_loops(sg)) "a vertex must have self loop (receive a message from itself)."
463463
el = to_namedtuple(sg)
464464
_, V, _ = propagate(l, el, nothing, X, nothing, hcat, nothing, nothing)
465465
return ConcreteFeaturedGraph(fg, nf=V)
@@ -546,7 +546,7 @@ function (l::GatedGraphConv)(el::NamedTuple, H::AbstractArray{T}) where {T<:Real
546546
m, n = size(H)[1:2]
547547
@assert (m <= l.out_ch) "number of input features must less or equals to output features."
548548
if m < l.out_ch
549-
Hpad = Zygote.ignore() do
549+
Hpad = ChainRulesCore.ignore_derivatives() do
550550
fill!(similar(H, T, l.out_ch - m, n, size(H)[3:end]...), 0)
551551
end
552552
H = vcat(H, Hpad)

src/models.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ function summarize(ve::VariationalGraphEncoder, X::AbstractArray)
134134
end
135135

136136
function sample(μ::AbstractArray{T}, logσ::AbstractArray{T}) where {T<:Real}
137-
R = Zygote.ignore(() -> randn!(similar(logσ)))
137+
R = ChainRulesCore.ignore_derivatives(() -> randn!(similar(logσ)))
138138
return μ + exp.(logσ) .* R
139139
end
140140

test/cuda/conv.jl

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
fg_ = gc(fg)
2525
@test size(node_feature(fg_)) == (out_channel, N)
2626

27-
g = Zygote.gradient(() -> sum(node_feature(gc(fg))), Flux.params(gc))
27+
g = gradient(() -> sum(node_feature(gc(fg))), Flux.params(gc))
2828
@test length(g.grads) == 4
2929
end
3030

@@ -33,7 +33,7 @@
3333
Y = gc(X |> gpu)
3434
@test size(Y) == (out_channel, N)
3535

36-
g = Zygote.gradient(() -> sum(gc(X |> gpu)), Flux.params(gc))
36+
g = gradient(() -> sum(gc(X |> gpu)), Flux.params(gc))
3737
@test length(g.grads) == 3
3838
end
3939
end
@@ -53,7 +53,7 @@
5353
fg_ = cc(fg)
5454
@test size(node_feature(fg_)) == (out_channel, N)
5555

56-
g = Zygote.gradient(() -> sum(node_feature(cc(fg))), Flux.params(cc))
56+
g = gradient(() -> sum(node_feature(cc(fg))), Flux.params(cc))
5757
@test length(g.grads) == 4
5858
end
5959

@@ -62,7 +62,7 @@
6262
Y = cc(X |> gpu)
6363
@test size(Y) == (out_channel, N)
6464

65-
g = Zygote.gradient(() -> sum(cc(X |> gpu)), Flux.params(cc))
65+
g = gradient(() -> sum(cc(X |> gpu)), Flux.params(cc))
6666
@test length(g.grads) == 3
6767
end
6868
end
@@ -79,7 +79,7 @@
7979
fg_ = gc(fg)
8080
@test size(node_feature(fg_)) == (out_channel, N)
8181

82-
g = Zygote.gradient(() -> sum(node_feature(gc(fg))), Flux.params(gc))
82+
g = gradient(() -> sum(node_feature(gc(fg))), Flux.params(gc))
8383
@test length(g.grads) == 5
8484
end
8585

@@ -89,7 +89,7 @@
8989
Y = gc(X |> gpu)
9090
@test size(Y) == (out_channel, N, batch_size)
9191

92-
g = Zygote.gradient(() -> sum(gc(X |> gpu)), Flux.params(gc))
92+
g = gradient(() -> sum(gc(X |> gpu)), Flux.params(gc))
9393
@test length(g.grads) == 4
9494
end
9595
end
@@ -112,7 +112,7 @@
112112
fg_ = gat(fg)
113113
@test size(node_feature(fg_)) == (out_channel * heads, N)
114114

115-
g = Zygote.gradient(() -> sum(node_feature(gat(fg))), Flux.params(gat))
115+
g = gradient(() -> sum(node_feature(gat(fg))), Flux.params(gat))
116116
@test length(g.grads) == 5
117117
end
118118

@@ -122,7 +122,7 @@
122122
Y = gat(X |> gpu)
123123
@test size(Y) == (out_channel * heads, N, batch_size)
124124

125-
g = Zygote.gradient(() -> sum(gat(X |> gpu)), Flux.params(gat))
125+
g = gradient(() -> sum(gat(X |> gpu)), Flux.params(gat))
126126
@test length(g.grads) == 4
127127
end
128128
end
@@ -138,7 +138,7 @@
138138
fg_ = ggc(fg)
139139
@test size(node_feature(fg_)) == (out_channel, N)
140140

141-
g = Zygote.gradient(() -> sum(node_feature(ggc(fg))), Flux.params(ggc))
141+
g = gradient(() -> sum(node_feature(ggc(fg))), Flux.params(ggc))
142142
@test length(g.grads) == 8
143143
end
144144

@@ -148,7 +148,7 @@
148148
@test_broken Y = ggc(X |> gpu)
149149
@test_broken size(Y) == (out_channel, N, batch_size)
150150

151-
@test_broken g = Zygote.gradient(() -> sum(ggc(X |> gpu)), Flux.params(ggc))
151+
@test_broken g = gradient(() -> sum(ggc(X |> gpu)), Flux.params(ggc))
152152
@test_broken length(g.grads) == 6
153153
end
154154
end
@@ -162,7 +162,7 @@
162162
fg_ = ec(fg)
163163
@test size(node_feature(fg_)) == (out_channel, N)
164164

165-
g = Zygote.gradient(() -> sum(node_feature(ec(fg))), Flux.params(ec))
165+
g = gradient(() -> sum(node_feature(ec(fg))), Flux.params(ec))
166166
@test length(g.grads) == 4
167167
end
168168

@@ -172,7 +172,7 @@
172172
Y = ec(X |> gpu)
173173
@test size(Y) == (out_channel, N, batch_size)
174174

175-
g = Zygote.gradient(() -> sum(ec(X |> gpu)), Flux.params(ec))
175+
g = gradient(() -> sum(ec(X |> gpu)), Flux.params(ec))
176176
@test length(g.grads) == 3
177177
end
178178
end

test/cuda/msgpass.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
fg_ = l(fg)
4545
@test size(node_feature(fg_)) == (out_channel, N)
4646

47-
g = Zygote.gradient(() -> sum(node_feature(l(fg))), Flux.params(l))
47+
g = gradient(() -> sum(node_feature(l(fg))), Flux.params(l))
4848
@test length(g.grads) == 3
4949
end
5050

@@ -56,7 +56,7 @@
5656
Y = l(X |> gpu)
5757
@test size(Y) == (out_channel, N, batch_size)
5858

59-
g = Zygote.gradient(() -> sum(l(X |> gpu)), Flux.params(l))
59+
g = gradient(() -> sum(l(X |> gpu)), Flux.params(l))
6060
@test length(g.grads) == 2
6161
end
6262
end

0 commit comments

Comments
 (0)