From 96d1c5526361b1c9243ee9dfcbc1a1b06fb23eeb Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Fri, 22 Sep 2017 15:27:06 +0100 Subject: [PATCH 01/16] wording tweak --- docs/src/training/training.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 6a3ee3f7..1eaa8a46 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -2,14 +2,14 @@ To actually train a model we need three things: -* A *loss function*, that evaluates how well a model is doing given some input data. +* A *model loss function*, that evaluates how well a model is doing given some input data. * A collection of data points that will be provided to the loss function. * An [optimiser](optimisers.md) that will update the model parameters appropriately. With these we can call `Flux.train!`: ```julia -Flux.train!(loss, data, opt) +Flux.train!(model, data, opt) ``` There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). @@ -23,6 +23,7 @@ m = Chain( Dense(784, 32, σ), Dense(32, 10), softmax) +# Model loss function loss(x, y) = Flux.mse(m(x), y) ``` From 94e38c05b84dfc283e863f1d96ca5d48be3d3c68 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 18:33:23 +0100 Subject: [PATCH 02/16] more informative --- src/optimise/train.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 8ad437db..0a91e978 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -8,8 +8,8 @@ function train!(m, data, opt; cb = () -> ()) cb = tocb(cb) @progress for x in data l = m(x...) - isinf(l.data[]) && error("Inf") - isnan(l.data[]) && error("NaN") + isinf(l.data[]) && error("Loss is Inf") + isnan(l.data[]) && error("Loss is NaN") back!(l) opt() cb() From c51f5afb3d2d14247e332ae0cf711a904f374288 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 18:37:07 +0100 Subject: [PATCH 03/16] clarity --- docs/src/training/training.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 1eaa8a46..d4bed5fe 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -9,7 +9,7 @@ To actually train a model we need three things: With these we can call `Flux.train!`: ```julia -Flux.train!(model, data, opt) +Flux.train!(modelLoss, data, opt) ``` There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). 
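The three patches above settle the calling convention documented in `training.md` (the first argument to `Flux.train!` is the model's loss function) and make the divergence errors in `train.jl` self-explanatory ("Loss is Inf" / "Loss is NaN"). Below is a minimal end-to-end sketch of that API using only names that appear in these diffs; the dummy data and the learning rate are invented for illustration and are not part of the patches.

```julia
using Flux

# The model and loss from the training.md example touched by these patches.
m = Chain(
  Dense(784, 32, σ),
  Dense(32, 10),
  softmax)

# Model loss function: train! calls this as loss(x, y) for every data point.
loss(x, y) = Flux.mse(m(x), y)

# Dummy data: an iterable of (input, target) tuples (invented for illustration).
data = [(rand(784), rand(10)) for _ = 1:16]

# Plain gradient descent over the model's parameters (η = 0.1 is an arbitrary choice).
opt = SGD(params(m), 0.1)

# Each step evaluates the loss, checks it for Inf/NaN, backpropagates,
# applies the optimiser, and runs the optional callback.
Flux.train!(loss, data, opt)
```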
From 2ec8401d2c31c767278793adad9412918a66fa24 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 20:37:25 +0100 Subject: [PATCH 04/16] remove compiler --- src/Flux.jl | 2 - src/compiler/Compiler.jl | 14 --- src/compiler/code.jl | 77 ---------------- src/compiler/interp.jl | 39 -------- src/compiler/loops.jl | 191 --------------------------------------- src/layers/basic.jl | 3 - test/compiler.jl | 86 ------------------ test/runtests.jl | 1 - 8 files changed, 413 deletions(-) delete mode 100644 src/compiler/Compiler.jl delete mode 100644 src/compiler/code.jl delete mode 100644 src/compiler/interp.jl delete mode 100644 src/compiler/loops.jl delete mode 100644 test/compiler.jl diff --git a/src/Flux.jl b/src/Flux.jl index 7b55eacf..8c88d229 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -22,8 +22,6 @@ using .Optimise include("utils.jl") include("onehot.jl") -include("compiler/Compiler.jl") - include("layers/stateless.jl") include("layers/basic.jl") include("layers/recurrent.jl") diff --git a/src/compiler/Compiler.jl b/src/compiler/Compiler.jl deleted file mode 100644 index 7a30ef2e..00000000 --- a/src/compiler/Compiler.jl +++ /dev/null @@ -1,14 +0,0 @@ -module Compiler - -using MacroTools, DataFlow, DataFlow.Interpreter - -using DataFlow: graphm, syntax, prewalk!, postwalk!, prewalk, postwalk, - iscyclic, Constant, constant, isconstant, group, Split, - detuple, value, inputs, thread!, value, inputs, inputnode, - spliceinputs, bumpinputs, Line, Frame, applylines, graphinputs - -include("code.jl") -include("interp.jl") -include("loops.jl") - -end diff --git a/src/compiler/code.jl b/src/compiler/code.jl deleted file mode 100644 index b873547a..00000000 --- a/src/compiler/code.jl +++ /dev/null @@ -1,77 +0,0 @@ -import DataFlow: cse -using MacroTools: @q, @> - -graph(m) = nothing - -function graphdef(ex, params = []) - @capture(shortdef(ex), (args__,) -> body_) - body = @> body MacroTools.flatten liftloops graphm DataFlow.il - body = map(x -> x in params ? :(self.$x) : x, body) - return args, body -end - -function makegraph(graph, args, params = []) - graph = prewalk(graph) do v - isconstant(v) && (i = findfirst(args, value(v[1]))) ≠ 0 ? - inputnode(i) : - v - end - graph = map(graph) do x - x isa Offset ? - :(Flux.Compiler.Offset($(Expr(:quote, x.name)), $(x.n), - $(x.name in params ? :(self.$(x.name)) : x.name))) : - x - end - vertex(:($DataFlow.Frame(self)), graph) -end - -function build_type(T, params) - @esc T - :(type $T - $(params...) 
- end) -end - -function build_forward(body, args) - iscyclic(body) && return :(error("Can't run forward pass on a cyclic graph")) - applylines(syntax(cse(body))) -end - -import Lazy: groupby - -# TODO: type hints for parameters - -function process_type(ex) - @capture(ex, type T_ fs__ end) - @destruct [params = false || [], - funcs = true || []] = groupby(x->isexpr(x, :->, :function), fs) - @assert length(funcs) == 1 - pnames = namify.(params) - args, body = graphdef(funcs[1], pnames) - self = esc(:self) - quote - $(build_type(T, params)) - $(esc(:((self::$T)($(args...)) = $(build_forward(body, args))))) - $(esc(:(Flux.Compiler.graph(self::$T)))) = $(DataFlow.constructor(map(esc, makegraph(body, args, params)))) - nothing - end -end - -function process_anon(ex) - args, body = graphdef(ex) - :(Capacitor($(DataFlow.constructor(map(esc, makegraph(body, args)[1]))))) -end - -function process_def(ex) - # TODO: make a singleton net type - @capture(ex, f_(xs__) = body_) - :($(esc(f)) = @net $(esc(:(($(xs...),) -> $body))); nothing) -end - -macro net(ex) - ex = shortdef(ex) - isexpr(ex, :type) ? process_type(ex) : - @capture(ex, (__,) -> _) ? process_anon(ex) : - @capture(ex, _(__) = _) ? process_def(ex) : - error("Unsupported model expression $ex") -end diff --git a/src/compiler/interp.jl b/src/compiler/interp.jl deleted file mode 100644 index d9759260..00000000 --- a/src/compiler/interp.jl +++ /dev/null @@ -1,39 +0,0 @@ -function astuple(xs::Vertex) - isconstant(xs) && value(xs[1]) isa Tuple ? value(xs[1]) : - xs isa Vertex && value(xs) == tuple ? inputs(xs) : - nothing -end - -astuple(xs::Tuple) = xs - -astuple(xs) = nothing - -function astuples(xs) - xs = [astuple(x) for x in xs] - all(x->!(x==nothing), xs) ? xs : nothing -end - -function interp(ctx, f, xs...) - g = graph(f) - g ≠ nothing && iscyclic(g) && error("Can't interpret cyclic graph") - @icatch(ctx, g ≠ nothing ? - interpret(ctx, g, xs...) : - f(xs...)) -end - -function interpmodel(m, args...) - ctx = Context(mux(iconst, iline, ilambda, iargs, ituple, interp)) - @ithrow interp(ctx, m, args...) -end - -# Anonymous models - -struct Capacitor - graph::IVertex{Any} -end - -(m::Capacitor)(xs...) = interpmodel(m, xs...) - -graph(cap::Capacitor) = cap.graph - -Base.show(io::IO, ::Capacitor) = print(io, "Capacitor(...)") diff --git a/src/compiler/loops.jl b/src/compiler/loops.jl deleted file mode 100644 index 62fe0533..00000000 --- a/src/compiler/loops.jl +++ /dev/null @@ -1,191 +0,0 @@ -using ..Flux: stack, unstack, squeeze, unsqueeze - -# Stateful Models - -mutable struct Stateful - model - states::Vector{Any} - istate::Vector{Any} - ostate::Vector{Any} -end - -Stateful(model, ss) = Stateful(model, ss, ss, ss) - -function Base.show(io::IO, m::Stateful) - print(io, "Stateful(") - show(io, m.model) - print(io, ")") -end - -function (m::Stateful)(xs...) - m.istate = m.ostate - state, y = m.model((m.istate...,), xs...) - m.ostate = collect(state) - return y -end - -# Seq Models - -struct SeqModel - model - steps::Int -end - -seqtuple(x, n) = x -seqtuple(xs::Tuple, n) = seqtuple.(xs, n) - -seqtuple(xs::AbstractArray, n) = - ndims(xs) < 3 ? xs : - n ≠ 0 && size(xs, 2) ≠ n ? error("Expecting sequence length $n, got $(size(xs, 2))") : - (unstack(xs, 2)...) - -reseq(x) = x -reseq(x::Tuple{}) = () -reseq(xs::Tuple) = all(isa.(xs, AbstractArray) .& (ndims.(xs) .≥ 2)) ? stack(xs, 2) : reseq.(xs) - -function (m::SeqModel)(xs...) 
- xs = seqtuple(xs, m.steps) - reseq(m.model(xs...)) -end - -graph(m::SeqModel) = graph(m.model) - -# Recurrent Graphs - -struct Offset - name::Symbol - n::Int - default::Nullable{Any} -end - -Offset(name, n) = Offset(name, n, nothing) - -Base.:-(o::Offset) = Offset(o.name, -o.n, o.default) - -function liftloops(ex) - ex = DataFlow.normedges(ex) - decls = Dict() - ex = MacroTools.postwalk(ex) do ex - @capture(ex, x_{n_}) || return ex - haskey(decls, (x,n)) && return namify(decls[(x,n)]) - @gensym edge - decls[(x,n)] = :($edge = $(Offset(x,n))($x)) - edge - end - prepend!(ex.args, collect(values(decls))) - ex -end - -function hasloops(model) - g = graph(model) - g == nothing && return false - iscyclic(g) && return true - result = false - map(m -> hasloops(m) && (result = true), g) - return result -end - -function atomise(model) - postwalk(graph(model)) do v - hasloops(value(v)) || return v - spliceinputs(atomise(value(v)), inputs(v)...) - end -end - -function collect_state(v::IVertex) - state = typeof(v)[] - offset = Int[] - default = [] - prewalk!(v) do v - value(v) isa Offset || return v - if (i = findfirst(state, v[1])) == 0 - push!(state, v[1]) - push!(offset, max(0, -value(v).n)) - push!(default, get(value(v).default)) - else - offset[i] = max(offset[i], -value(v).n) - end - v - end - return state, offset, default -end - -hiddeninput(n, t) = vertex(Split(t), inputnode(n)) - -# TODO: nicer way to do this. -create_steps(v::IVertex, n) = [bumpinputs(spliceinputs(v, [hiddeninput(n, t) for n = 1:graphinputs(v)]...)) for t = 1:n] - -function getvar(n, step, steps, offset, default) - if step < 1 - hiddeninput(1, sum(offset[1:n-1]) + 1 - step) - elseif step ∉ 1:length(steps) - constant(default[n]) - else - steps[step][1,n] - end -end - -function stateout(steps, offset, default) - outs = [] - defaults = [] - for i = 1:length(offset), j = 1:offset[i] - push!(outs, getvar(i, length(steps)-j+1, steps, offset, default)) - push!(defaults, default[i]) - end - group(outs...), defaults -end - -# Input: (hidden1, hidden2, ...), (x1, x2, ...) -# Output: (hidden1, hidden2, ...), (y1, y2, ...) -# TODO: make sure there's a reasonable order for hidden states - -function unrollgraph(v::IVertex, n) - state, offset, default = collect_state(v) - v = group(group(state...), v) - steps = create_steps(v, n) - for i = 1:n - vars = inputs(steps[i][1]) - postwalk!(steps[i]) do v - value(v) isa Offset || return v - varid = findfirst(vars,v[1]) - getvar(varid, value(v).n + i, steps, offset, default) - end - end - out = group(map(x->x[2], steps)...) - state, defaults = stateout(steps, offset, default) - group(state,out), defaults -end - -unrollgraph(m, n; kws...) = unrollgraph(atomise(m), n; kws...) - -function unroll(model, n) - graph, state = unrollgraph(model, n) - SeqModel(Stateful(Capacitor(graph), state), n) -end - -function stateless(s::Stateful) - v = graph(s.model) - v = spliceinputs(v, group(constant.(s.states)...), - [inputnode(i) for i = 1:graphinputs(v)-1]...) - Capacitor(v[2]) -end - -stateless(s::SeqModel) = SeqModel(stateless(s.model), s.steps) - -function unseqin(v::IVertex) - prewalk(v) do v - # TODO: inputidx function - isa(value(v), Split) && DataFlow.isinput(v[1]) && value(v[1]).n > 1 ? v[1] : v - end -end - -unseqout(v::IVertex) = group(v[1], v[2][1]) - -unseq(graph) = unseqout(unseqin(graph)) - -function unroll1(model) - graph, state = unrollgraph(model, 1) - Stateful(Capacitor(unseq(graph)), state) -end - -flip(model) = Capacitor(map(x -> x isa Offset ? 
-x : x, atomise(model))) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index dfe73ab9..71de15fe 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -26,9 +26,6 @@ Optimise.children(c::Chain) = c.layers (s::Chain)(x) = foldl((x, m) -> m(x), x, s.layers) -Compiler.graph(s::Chain) = - foldl((v, m) -> vertex(m, v), constant(inputnode(1)), s.layers) - Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) function Base.show(io::IO, c::Chain) diff --git a/test/compiler.jl b/test/compiler.jl deleted file mode 100644 index a82550e8..00000000 --- a/test/compiler.jl +++ /dev/null @@ -1,86 +0,0 @@ -using DataFlow, MacroTools -using Flux: stack, unsqueeze -using Flux.Compiler: @net, graph -using DataFlow: Line, Frame - -@net type Affine - W - b - x -> x*W .+ b -end - -Affine(in::Integer, out::Integer; init = Flux.initn) = - Affine(init(in, out), init(1, out)) - -@net type TLP - first - second - function (x) - l1 = σ.(first(x)) - l2 = softmax(second(l1)) - end -end - -@net type Recurrent - Wxy; Wyy; by - y - function (x) - y = tanh.( x * Wxy .+ y{-1} * Wyy .+ by ) - end -end - -Recurrent(in, out; init = Flux.initn) = - Recurrent(init((in, out)), init((out, out)), init(1, out), init(1, out)) - -syntax(v::Vertex) = prettify(DataFlow.syntax(v)) -syntax(x) = syntax(graph(x)) - -@testset "Compiler" begin - -xs = randn(1, 10) -d = Affine(10, 20) - -@test d(xs) ≈ (xs*d.W + d.b) - -d1 = @net x -> x * d.W + d.b - -let - @capture(syntax(d), _Frame(_Line((+).(x_[1] * W_, b_)))) - @test isa(x, DataFlow.Input) && W isa Array && b isa Array -end - -let a1 = Affine(10, 20), a2 = Affine(20, 15) - tlp = TLP(a1, a2) - @test tlp(xs) ≈ softmax(a2(σ.(a1(xs)))) - @test Flux.Compiler.interpmodel(tlp, xs) ≈ softmax(a2(σ.(a1(xs)))) -end - -let tlp = TLP(Affine(10, 21), Affine(20, 15)) - e = try - Flux.Compiler.interpmodel(tlp, rand(1, 10)) - catch e - e - end - @test e.trace[end].func == :TLP - @test e.trace[end-1].func == Symbol("Affine") -end - -function apply(model, xs, state) - ys = similar(xs, 0) - for x in xs - state, y = model(state, x) - push!(ys, y) - end - state, ys -end - -@testset "RNN unrolling" begin - r = Recurrent(10, 5) - xs = [rand(1, 10) for _ = 1:3] - _, ys = apply(Flux.Compiler.unroll1(r).model, xs, (r.y,)) - @test ys[1] == tanh.(xs[1] * r.Wxy .+ r.y * r.Wyy .+ r.by) - ru = Flux.Compiler.unroll(r, 3) - ru(unsqueeze(stack(squeeze.(xs, 1), 1), 1))[1] == squeeze.(ys, 1) -end - -end diff --git a/test/runtests.jl b/test/runtests.jl index f237133a..2ab0e447 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,7 +2,6 @@ using Flux, Base.Test @testset "Flux" begin -include("compiler.jl") include("utils.jl") include("tracker.jl") From 4bafa2b374a987db56b9055319f8ded8143b836c Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 21:11:21 +0100 Subject: [PATCH 05/16] generic tree functions --- src/Flux.jl | 1 + src/layers/basic.jl | 5 +++-- src/layers/recurrent.jl | 9 ++++----- src/optimise/Optimise.jl | 10 +++++++--- src/optimise/params.jl | 18 ------------------ src/tree.jl | 20 ++++++++++++++++++++ 6 files changed, 35 insertions(+), 28 deletions(-) delete mode 100644 src/optimise/params.jl create mode 100644 src/tree.jl diff --git a/src/Flux.jl b/src/Flux.jl index 8c88d229..ba9a6327 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -21,6 +21,7 @@ using .Optimise include("utils.jl") include("onehot.jl") +include("tree.jl") include("layers/stateless.jl") include("layers/basic.jl") diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 71de15fe..37c1b787 100644 --- 
a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -22,7 +22,8 @@ end @forward Chain.layers Base.getindex, Base.first, Base.last, Base.endof, Base.push! @forward Chain.layers Base.start, Base.next, Base.done -Optimise.children(c::Chain) = c.layers +children(c::Chain) = c.layers +mapchildren(f, c::Chain) = Chain(f.(c.layers)...) (s::Chain)(x) = foldl((x, m) -> m(x), x, s.layers) @@ -53,7 +54,7 @@ end Dense(in::Integer, out::Integer, σ = identity; init = initn) = Dense(σ, param(init(out, in)), param(init(out))) -Optimise.children(d::Dense) = (d.W, d.b) +treelike(Dense) (a::Dense)(x) = a.σ.(a.W*x .+ a.b) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 5d44e1bf..491209a0 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -16,7 +16,7 @@ function (m::Recur)(xs...) return y end -Optimise.children(m::Recur) = (m.cell,) +treelike(Recur) Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")") @@ -24,7 +24,7 @@ _truncate(x::AbstractArray) = x _truncate(x::TrackedArray) = x.data _truncate(x::Tuple) = _truncate.(x) -truncate!(m) = foreach(truncate!, Optimise.children(m)) +truncate!(m) = foreach(truncate!, children(m)) truncate!(m::Recur) = (m.state = _truncate(m.state)) # Vanilla RNN @@ -44,7 +44,7 @@ end hidden(m::RNNCell) = m.h -Optimise.children(m::RNNCell) = (m.d, m.h) +treelike(RNNCell) function Base.show(io::IO, m::RNNCell) print(io, "RNNCell(", m.d, ")") @@ -82,8 +82,7 @@ end hidden(m::LSTMCell) = (m.h, m.c) -Optimise.children(m::LSTMCell) = - (m.forget, m.input, m.output, m.cell, m.h, m.c) +treelike(LSTMCell) Base.show(io::IO, m::LSTMCell) = print(io, "LSTMCell(", diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index 57c202eb..57956426 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -3,15 +3,19 @@ module Optimise export update!, params, train!, SGD -include("params.jl") +struct Param{T} + x::T + Δ::T +end + +Base.convert(::Type{Param}, x::AbstractArray) = Param(x, zeros(x)) + include("optimisers.jl") include("interface.jl") include("train.jl") using Flux.Tracker: TrackedArray -params(ps, p::TrackedArray) = push!(ps, p) - Base.convert(::Type{Param}, x::TrackedArray) = Param(x.data, x.grad[]) end diff --git a/src/optimise/params.jl b/src/optimise/params.jl deleted file mode 100644 index c5163dbe..00000000 --- a/src/optimise/params.jl +++ /dev/null @@ -1,18 +0,0 @@ -using DataFlow: OSet - -children(x) = () - -params(ps, m) = foreach(m -> params(ps, m), children(m)) - -function params(m) - ps = OSet() - params(ps, m) - return collect(ps) -end - -struct Param{T} - x::T - Δ::T -end - -convert(::Type{Param}, x::AbstractArray) = Param(x, zeros(x)) diff --git a/src/tree.jl b/src/tree.jl new file mode 100644 index 00000000..438685d5 --- /dev/null +++ b/src/tree.jl @@ -0,0 +1,20 @@ +children(x) = () +mapchildren(f, x) = x + +function treelike(T, fs = fieldnames(T)) + @eval begin + children(x::$T) = ($([:(x.$f) for f in fs]...),) + mapchildren(f, x::$T) = $T(f.(children(x))...) 
+ end +end + +using DataFlow: OSet + +params(ps, p::AbstractArray) = push!(ps, p) +params(ps, m) = foreach(m -> params(ps, m), children(m)) + +function params(m) + ps = OSet() + params(ps, m) + return collect(ps) +end From a60a754d68df79aa6e19f7f7b3992164d47703b0 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 21:58:34 +0100 Subject: [PATCH 06/16] beginnings of gpu support --- src/Flux.jl | 4 ++-- src/layers/basic.jl | 5 ++++- src/onehot.jl | 8 ++++++-- src/tree.jl | 13 +++++++++---- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index ba9a6327..45e3044e 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -4,11 +4,11 @@ module Flux # Zero Flux Given -using Juno +using Juno, Requires using Lazy: @forward export Chain, Dense, RNN, LSTM, - SGD, params + SGD, params, mapparams using NNlib export σ, relu, softmax diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 37c1b787..0ae5f8fa 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -56,7 +56,10 @@ Dense(in::Integer, out::Integer, σ = identity; init = initn) = treelike(Dense) -(a::Dense)(x) = a.σ.(a.W*x .+ a.b) +function (a::Dense)(x) + W, b, σ = a.W, a.b, a.σ + σ.(W*x .+ b) +end function Base.show(io::IO, l::Dense) print(io, "Dense(", size(l.W, 2), ", ", size(l.W, 1)) diff --git a/src/onehot.jl b/src/onehot.jl index 1e147397..48b7ccf5 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -9,8 +9,8 @@ Base.getindex(xs::OneHotVector, i::Integer) = i == xs.ix Base.:*(A::AbstractMatrix, b::OneHotVector) = A[:, b.ix] -struct OneHotMatrix <: AbstractMatrix{Bool} - data::Vector{OneHotVector} +struct OneHotMatrix{A<:AbstractVector{OneHotVector}} <: AbstractMatrix{Bool} + data::A end Base.size(xs::OneHotMatrix) = (Int64(length(xs.data[1])),length(xs.data)) @@ -21,6 +21,10 @@ Base.:*(A::AbstractMatrix, B::OneHotMatrix) = A[:, map(x->x.ix, B.data)] Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix([x, xs...]) +@require CuArrays begin + CuArrays.cu(xs::OneHotMatrix) = OneHotMatrix(CuArrays.cu(xs.data)) +end + onehot(l, labels) = OneHotVector(findfirst(labels, l), length(labels)) onehotbatch(ls, labels) = OneHotMatrix([onehot(l, labels) for l in ls]) diff --git a/src/tree.jl b/src/tree.jl index 438685d5..bd6b2d73 100644 --- a/src/tree.jl +++ b/src/tree.jl @@ -8,13 +8,18 @@ function treelike(T, fs = fieldnames(T)) end end -using DataFlow: OSet +# TODO: prewalk/postwalk with correct caching +# This is only correct in general for idempotent functions -params(ps, p::AbstractArray) = push!(ps, p) -params(ps, m) = foreach(m -> params(ps, m), children(m)) +mapparams(f, x::AbstractArray) = f(x) +mapparams(f, x) = mapchildren(x -> mapparams(f, x), x) + +forparams(f, x) = (mapparams(x -> (f(x); x), x); return) + +using DataFlow: OSet function params(m) ps = OSet() - params(ps, m) + forparams(p -> push!(ps, p), m) return collect(ps) end From a32ae4914c1f3d446d0f68fb264e18551f9ea4c7 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 22:51:00 +0100 Subject: [PATCH 07/16] onehotmatrix cuda support --- src/onehot.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/onehot.jl b/src/onehot.jl index 48b7ccf5..2f1eb365 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -22,7 +22,10 @@ Base.:*(A::AbstractMatrix, B::OneHotMatrix) = A[:, map(x->x.ix, B.data)] Base.hcat(x::OneHotVector, xs::OneHotVector...) 
= OneHotMatrix([x, xs...]) @require CuArrays begin + import CuArrays: CuArray, cudaconvert CuArrays.cu(xs::OneHotMatrix) = OneHotMatrix(CuArrays.cu(xs.data)) + Base.Broadcast._containertype(::Type{<:OneHotMatrix{<:CuArray}}) = CuArray + cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(cudaconvert(x.data)) end onehot(l, labels) = OneHotVector(findfirst(labels, l), length(labels)) From 7c8dba0b85d1cd8e114ce402879d252d69767876 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 23:14:58 +0100 Subject: [PATCH 08/16] gc in training loop --- src/optimise/train.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 0a91e978..4ecc7793 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -13,5 +13,6 @@ function train!(m, data, opt; cb = () -> ()) back!(l) opt() cb() + gc() end end From 8e63ac766e7d1c077f2beecfa798c081d7b0d796 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 28 Sep 2017 11:08:37 +0100 Subject: [PATCH 09/16] gpu support docs --- docs/make.jl | 4 ++-- docs/src/gpu.md | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 docs/src/gpu.md diff --git a/docs/make.jl b/docs/make.jl index e0a5a281..f87ff300 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,8 +14,8 @@ makedocs(modules=[Flux], "Training Models" => ["Optimisers" => "training/optimisers.md", "Training" => "training/training.md"], - "Data Munging" => - ["One-Hot Encoding" => "data/onehot.md"], + "One-Hot Encoding" => "data/onehot.md", + "GPU Support" => "gpu.md", "Contributing & Help" => "contributing.md"]) deploydocs( diff --git a/docs/src/gpu.md b/docs/src/gpu.md new file mode 100644 index 00000000..db21db36 --- /dev/null +++ b/docs/src/gpu.md @@ -0,0 +1,33 @@ +# GPU Support + +Support for array operations on other hardware backends, like GPUs, is provided by external packages like [CuArrays](https://github.com/JuliaGPU/CuArrays.jl) and [CLArrays](https://github.com/JuliaGPU/CLArrays.jl). Flux doesn't care what array type you use, so we can just plug these in without any other changes. + +For example, we can use `CuArrays` (with the `cu` array converter) to run our [basic example](models/basics.md) on an NVIDIA GPU. + +```julia +using CuArrays + +W = cu(rand(2, 5)) +b = cu(rand(2)) + +predict(x) = W*x .+ b +loss(x, y) = sum((predict(x) .- y).^2) + +x, y = cu(rand(5)), cu(rand(2)) # Dummy data +loss(x, y) # ~ 3 +``` + +Note that we convert both the parameters (`W`, `b`) and the data set (`x`, `y`) to cuda arrays. Taking derivatives and training works exactly as before. + +If you define a structured model, like a `Dense` layer or `Chain`, you just need to convert the internal parameters. Flux provides `mapparams`, which allows you to alter all parameters of a model at once. 
+ +```julia +d = Dense(10, 5, σ) +d = mapparams(cu, d) +d.W # Tracked CuArray +d(cu(rand(10))) # CuArray output + +m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) +m = mapparams(cu, m) +d(cu(rand(10))) +``` From d3419c943bd8c8c3d06ffa8ea5618625b3b14613 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 28 Sep 2017 11:11:11 +0100 Subject: [PATCH 10/16] example link --- docs/src/gpu.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/src/gpu.md b/docs/src/gpu.md index db21db36..4452e856 100644 --- a/docs/src/gpu.md +++ b/docs/src/gpu.md @@ -2,12 +2,12 @@ Support for array operations on other hardware backends, like GPUs, is provided by external packages like [CuArrays](https://github.com/JuliaGPU/CuArrays.jl) and [CLArrays](https://github.com/JuliaGPU/CLArrays.jl). Flux doesn't care what array type you use, so we can just plug these in without any other changes. -For example, we can use `CuArrays` (with the `cu` array converter) to run our [basic example](models/basics.md) on an NVIDIA GPU. +For example, we can use `CuArrays` (with the `cu` converter) to run our [basic example](models/basics.md) on an NVIDIA GPU. ```julia using CuArrays -W = cu(rand(2, 5)) +W = cu(rand(2, 5)) # a 2×5 CuArray b = cu(rand(2)) predict(x) = W*x .+ b @@ -31,3 +31,5 @@ m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) m = mapparams(cu, m) d(cu(rand(10))) ``` + +The [mnist example](https://github.com/FluxML/model-zoo/blob/master/mnist/mnist.jl) contains the code needed to run the model on the GPU; just uncomment the lines after `using CuArrays`. From 1b91e6b38de5669e97ddaafef510134b587ba5ed Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Mon, 2 Oct 2017 20:50:11 +0100 Subject: [PATCH 11/16] store onehotmatrix height --- src/onehot.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/onehot.jl b/src/onehot.jl index 2f1eb365..d01dc9e1 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -10,10 +10,11 @@ Base.getindex(xs::OneHotVector, i::Integer) = i == xs.ix Base.:*(A::AbstractMatrix, b::OneHotVector) = A[:, b.ix] struct OneHotMatrix{A<:AbstractVector{OneHotVector}} <: AbstractMatrix{Bool} + height::Int data::A end -Base.size(xs::OneHotMatrix) = (Int64(length(xs.data[1])),length(xs.data)) +Base.size(xs::OneHotMatrix) = (Int64(xs.height),length(xs.data)) Base.getindex(xs::OneHotMatrix, i::Int, j::Int) = xs.data[j][i] @@ -23,13 +24,13 @@ Base.hcat(x::OneHotVector, xs::OneHotVector...) 
= OneHotMatrix([x, xs...]) @require CuArrays begin import CuArrays: CuArray, cudaconvert - CuArrays.cu(xs::OneHotMatrix) = OneHotMatrix(CuArrays.cu(xs.data)) + CuArrays.cu(xs::OneHotMatrix) = OneHotMatrix(xs.height, CuArrays.cu(xs.data)) Base.Broadcast._containertype(::Type{<:OneHotMatrix{<:CuArray}}) = CuArray - cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(cudaconvert(x.data)) + cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data)) end onehot(l, labels) = OneHotVector(findfirst(labels, l), length(labels)) -onehotbatch(ls, labels) = OneHotMatrix([onehot(l, labels) for l in ls]) +onehotbatch(ls, labels) = OneHotMatrix(length(labels), [onehot(l, labels) for l in ls]) argmax(y::AbstractVector, labels = 1:length(y)) = labels[findfirst(y, maximum(y))] From 5fd1b7d9a23c8cf6c49cbef47d4e0029e99d91e1 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Mon, 2 Oct 2017 20:50:18 +0100 Subject: [PATCH 12/16] remove gc hack --- src/optimise/train.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 4ecc7793..0a91e978 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -13,6 +13,5 @@ function train!(m, data, opt; cb = () -> ()) back!(l) opt() cb() - gc() end end From c202e2bc1ac17b9ebf0fff1374f72acd487c162f Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 3 Oct 2017 19:00:42 +0100 Subject: [PATCH 13/16] clarify --- docs/src/training/training.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index d4bed5fe..94a43348 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -25,6 +25,9 @@ m = Chain( # Model loss function loss(x, y) = Flux.mse(m(x), y) + +# later +Flux.train!(loss, data, opt) ``` The loss will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `logloss` for cross entropy loss, but you can calculate it however you want. From 2b95aff15859576dc300216950731bb91574c2c8 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 3 Oct 2017 14:16:51 -0400 Subject: [PATCH 14/16] actually use init argument in LSTMCell --- src/layers/recurrent.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 491209a0..3387a5f8 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -63,9 +63,9 @@ struct LSTMCell{D1,D2,V} end function LSTMCell(in, out; init = initn) - cell = LSTMCell([Dense(in+out, out, σ, init = initn) for _ = 1:3]..., - Dense(in+out, out, tanh, init = initn), - param(initn(out)), param(initn(out))) + cell = LSTMCell([Dense(in+out, out, σ, init = init) for _ = 1:3]..., + Dense(in+out, out, tanh, init = init), + param(init(out)), param(init(out))) cell.forget.b.data .= 1 return cell end From 1abc4febe67ff04b7f3decc77a8fa207189f8026 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 4 Oct 2017 18:55:56 +0100 Subject: [PATCH 15/16] more general adaptors --- src/onehot.jl | 5 ++++- src/tracker/Tracker.jl | 9 ++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/onehot.jl b/src/onehot.jl index d01dc9e1..aea68829 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -22,9 +22,12 @@ Base.:*(A::AbstractMatrix, B::OneHotMatrix) = A[:, map(x->x.ix, B.data)] Base.hcat(x::OneHotVector, xs::OneHotVector...) 
= OneHotMatrix([x, xs...]) +import NNlib.adapt + +adapt(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data)) + @require CuArrays begin import CuArrays: CuArray, cudaconvert - CuArrays.cu(xs::OneHotMatrix) = OneHotMatrix(xs.height, CuArrays.cu(xs.data)) Base.Broadcast._containertype(::Type{<:OneHotMatrix{<:CuArray}}) = CuArray cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data)) end diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 74fcb2b8..e218c3ea 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -71,11 +71,10 @@ include("back.jl") include("lib.jl") include("numeric.jl") -using Requires +import NNlib.adapt -@require CuArrays begin - import CuArrays: cu - cu(xs::TrackedArray) = TrackedArray(xs.f, cu(xs.data), RefValue(cu(grad(xs)))) -end +adapt(T, xs::TrackedArray) = + TrackedArray(xs.f, adapt(T, xs.data), + RefValue(adapt(T, grad(xs)))) end From bfcc1ac25d11c5c3e04ddc1ea6c59b7b8efd968d Mon Sep 17 00:00:00 2001 From: pevnak Date: Mon, 25 Sep 2017 21:08:35 +0200 Subject: [PATCH 16/16] exposing optimisers --- src/optimise/interface.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/optimise/interface.jl b/src/optimise/interface.jl index cffe62fc..364f7358 100644 --- a/src/optimise/interface.jl +++ b/src/optimise/interface.jl @@ -10,3 +10,9 @@ function optimiser(ps, fs...) end SGD(ps, η = 1) = optimiser(ps, p -> descent(p, η)) +ADAM(ps, η = 0.001, β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0.0) = optimiser(ps, p -> adam(p; η = η, β1 = β1, β2 = β2, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) +Momentum(ps,ρ, decay = 0.0) = optimiser(ps, p -> momentum(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1)) +Nesterov(ps,ρ, decay = 0.0) = optimiser(ps, p -> nesterov(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1)) +RMSProp(ps, η = 0.001, ρ = 0.9, ϵ = 1e-8, decay = 0.0) = optimiser(ps, p -> rmsprop(p; η = η, ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) +ADAGrad(ps, η = 0.01, ϵ = 1e-8, decay = 0.0) = optimiser(ps, p -> adagrad(p; η = η, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) +ADADelta(ps, η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0.0) = optimiser(ps, p -> adadelta(p; ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
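The constructors added in this last patch are thin wrappers that pair a parameter collection with an update rule plus an optional inverse decay. A rough usage sketch follows, assuming the API from the earlier patches; the diff does not show whether these names are re-exported, so they are qualified with `Flux.Optimise` here, and the model, data, and hyperparameters are made up for illustration.

```julia
using Flux

m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
loss(x, y) = Flux.mse(m(x), y)
data = [(rand(10), rand(2)) for _ = 1:100]  # dummy data

# Pick one of the newly exposed optimisers; each takes the parameter list
# first, then its hyperparameters (defaults as shown in the patch).
opt = Flux.Optimise.ADAM(params(m))            # η = 0.001, β1 = 0.9, β2 = 0.999
# opt = Flux.Optimise.RMSProp(params(m), 0.01) # η = 0.01
# opt = Flux.Optimise.Momentum(params(m), 0.9) # ρ = 0.9 (no default)

Flux.train!(loss, data, opt)
```

If a later commit adds these names to the `Optimise` export list, the `Flux.Optimise` prefix can be dropped.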