Flux.jl/src/tracker/back.jl

# The AD generates fairly large backtraces that are unhelpful if you interrupt
# while training; this just cleans that up.
macro interrupts(ex)
  :(try $(esc(ex))
    catch e
      e isa InterruptException || rethrow()
      throw(e)
    end)
end

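# A minimal usage sketch (illustrative, not part of the original file): wrap a
# long-running call so an interrupt raised inside it is re-thrown without the
# AD's long backtrace, e.g.
#
#   @interrupts back!(loss)
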
# In-place gradients
init_grad(x) = zero(x)
zero_grad!(x) = zero(x)
zero_grad!(x::AbstractArray) = (x .= 0)
scan(c::Call) = foreach(scan, c.args)
function scan(x::Tracked)
  x.isleaf && return
  ref = x.ref += 1
  if ref == 1
    scan(x.f)
    isdefined(x, :grad) && (x.grad = zero_grad!(x.grad))
  end
  return
end

function scan(x)
  istracked(x) && scan(tracker(x))
  return
end

function back_(c::Call, Δ, once)
  Δs = c.func(Δ)
  (Δs isa Tuple && length(Δs) >= length(c.args)) ||
    error("Gradient is not a tuple of length $(length(c.args))")
  foreach((x, d) -> back(x, d, once), c.args, data.(Δs))
end

back_(::Call{Nothing}, Δ, once) = nothing
back_(::Call{Missing}, Δ, once) = error("`back!` was already used")
accum!(x, Δ) = x .+ Δ
accum!(x::AbstractArray, Δ) = (x .+= Δ)
function back(x::Tracked, Δ, once)
  x.isleaf && (x.grad = accum!(x.grad, Δ); return)
  ref = x.ref -= 1
  grad = if isdefined(x, :grad)
    x.grad = accum!(x.grad, Δ)
  elseif ref > 0
    x.grad = Δ
  else
    Δ
  end
  if ref == 0
    back_(x.f, grad, once)
    once && !x.isleaf && (x.f = Call(missing, ()))
  end
  return
end

back(::Nothing, Δ, once) = return
# Interface methods
# TODO: if an error occurs in `back` the refcounts will be broken
# and `back` will silently fail to update.
# (but only if you re-use intermediate values between passes)
# Refcounts are also probably not safe in some situations (e.g. back called
# from within a backpropagator)
function back!(x, Δ; once = true)
  istracked(x) || return
  scan(x)
  back(tracker(x), Δ, once)
  return
end

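# A minimal usage sketch (illustrative, not part of the original file): `back!`
# seeds the reverse pass from a tracked scalar and accumulates gradients into
# the leaf trackers, which can then be read back with `grad`.
#
#   W = param(rand(2, 3)); x = rand(3)
#   loss = sum(W * x)
#   back!(loss, 1)
#   grad(W)            # 2×3 array of accumulated gradients
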
function extract_grad!(x)
  x̄ = copy(grad(x))
  x̄ = nobacksies("Use `gradient(...; nest = true)` for nested derivatives", x̄)
  tracker(x).grad = zero_grad!(grad(x))
  return x̄
end

function gradient_(f, xs...)
  xs = param.(data.(xs))
  l = f(xs...)
  losscheck(l)
  @interrupts back!(l)
  extract_grad!.(xs)
end

function gradient_(f, xs::Params)
  l = f()
  losscheck(l)
  @interrupts back!(l)
  gs = Grads()
  for x in xs
    gs[tracker(x)] = extract_grad!(x)
  end
  return gs
end

# Out-of-place gradients
function back_(g::Grads, c::Call, Δ)
  Δs = c.func(Δ)
  (Δs isa Tuple && length(Δs) >= length(c.args)) ||
    error("Gradient is not a tuple of length $(length(c.args))")
  foreach((x, Δ) -> back(g, x, Δ), c.args, Δs)
end

back_(g::Grads, ::Call{Nothing}, Δ) = nothing
function back(g::Grads, x::Tracked, Δ)
  x.isleaf && (accum!(g, x, Δ); return)
  ref = x.ref -= 1
  if ref > 0 || haskey(g, x)
    accum!(g, x, Δ)
    ref == 0 && back_(g, x.f, g[x])
  else
    ref == 0 && back_(g, x.f, Δ)
  end
  return
end

back(::Grads, ::Nothing, _) = return
collectmemaybe(xs) = xs
function forward(f, ps::Params)
  y = collectmemaybe(f())
  y, function (Δ)
    g = Grads(ps)
    if istracked(y)
      scan(y)
      back(g, tracker(y), Δ)
    end
    return g
  end
end

function forward(f, args...)
  args = param.(args)
  y, back = forward(() -> f(args...), Params(args))
  y, Δ -> getindex.(Ref(back(Δ)), args)
end

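# A minimal usage sketch (illustrative, not part of the original file):
# `forward` returns the primal value together with a backpropagator that maps
# an output sensitivity Δ onto gradients with respect to `args`.
#
#   y, back = forward(x -> 3x^2 + 2x + 1, 4)
#   back(1)            # one gradient per argument; here ≈ (26,) since d/dx = 6x + 2
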
function losscheck(x)
  x isa Real || error("Function output is not scalar")
  isinf(x) && error("Loss is infinite")
  isnan(x) && error("Loss is NaN")
end

function gradient_nested(f, args...)
  y, back = forward(f, args...)
  losscheck(y)
  return back(1)
end

gradient(f, xs...; nest = false) =
  nest ? gradient_nested(f, xs...) : gradient_(f, xs...)
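
# A minimal usage sketch (illustrative, not part of the original file): by
# default `gradient` returns plain gradients; `nest = true` routes through
# `forward` so the results stay tracked and can be differentiated again.
#
#   gradient((a, b) -> sum(a .* b), [1, 2], [3, 4])   # ([3.0, 4.0], [1.0, 2.0])
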
# Jacobians and Hessians
import ..Flux
"""
J = jacobian(m,x)
Calculate the output jacobian `J = d/dx m(x)` such that each row `i` of `J` corresponds to the gradient `J[i,:] = ∇ₓ(m(x)[i])`
"""
function jacobian(m, x)
  xp = param(x)
  y = m(xp)
  k = length(y)
  n = length(x)
  J = Matrix{eltype(x)}(undef, k, n)
  for i = 1:k
    Flux.back!(y[i], once = false) # populate the gradient accumulator
    J[i, :] = xp.grad
    xp.grad .= 0                   # reset the gradient accumulator
  end
  J
end

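# A minimal usage sketch (illustrative, not part of the original file): for a
# linear map the Jacobian is the map's matrix itself.
#
#   A = [1.0 2.0; 3.0 4.0]
#   jacobian(x -> A * x, [1.0, 1.0])   # ≈ A
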
hessian(f, x) = jacobian(x -> gradient(f, x, nest=true)[1], x)
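
# A minimal usage sketch (illustrative, not part of the original file):
#
#   hessian(x -> sum(x .^ 2), [1.0, 2.0])   # ≈ [2 0; 0 2]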