diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl
index 20130b1d..6b94a6a9 100644
--- a/src/cuda/cudnn.jl
+++ b/src/cuda/cudnn.jl
@@ -176,9 +176,6 @@ end
 
 # Flux Interface
 
-import ..Flux: Flux
-import ..Tracker: track, back, @back, istracked, TrackedArray
-
 (BN::Flux.BatchNorm)(x::Union{CuParam{T,4},CuParam{T,5}}, cache = nothing) where T<:Union{Float32, Float64} =
   batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, training = BN.active)
 
@@ -188,10 +185,11 @@ batchnorm(g::TrackedArray, b::TrackedArray, x::TrackedArray, running_mean::CuArray{T}, running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} =
 
 batchnorm(g::TrackedArray, b::TrackedArray, x::CuArray{T}, running_mean::CuArray{T}, running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} =
-  track(_batchnorm, g, b, x, running_mean, running_var, momentum, kw...)
+  track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...)
 
 @grad function batchnorm(g, b, x, running_mean, running_var, momentum; kw...)
   y = batchnorm(data(g), data(b), data(x), running_mean, running_var, momentum; kw...)
-  deriv_tup = ∇batchnorm(data(g), data(b), data(x), Δ, running_mean, running_var, momentum,
-                         cache = cache, alpha = alpha, beta = beta, eps = eps, training = training)
-  y, Δ -> (deriv_tup, nothing, nothing, nothing)
+  y, Δ -> (nobacksies(:batchnorm,
+                      ∇batchnorm(data(g), data(b), data(x), data(Δ),
+                                 running_mean, running_var, momentum; kw...))...,
+           nothing, nothing, nothing)
 end
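
Usage sketch (not part of the patch): a minimal, hedged check of the tracked CuDNN batchnorm path, assuming a CUDA-capable machine and the Tracker-era Flux API (≈ v0.6 with CuArrays). The layer call should dispatch to the `(BN::Flux.BatchNorm)` method above, and `back!` exercises the `@grad` rule; the exact sizes and variable names here are illustrative only.

using Flux, CuArrays
using Flux.Tracker: back!, grad

bn = BatchNorm(3) |> gpu                    # move γ, β, μ, σ² onto the GPU
x  = param(gpu(rand(Float32, 8, 8, 3, 4)))  # tracked 4-D input: (W, H, C, N)

y = bn(x)       # dispatches to the CuParam BatchNorm method in this file
back!(sum(y))   # pullback runs ∇batchnorm through the @grad rule above
grad(bn.γ)      # accumulated gradient for the scale parameter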