Merge branch 'master' into cat-fix

Commit 2084df96ae
docs/make.jl
@@ -14,8 +14,8 @@ makedocs(modules=[Flux],
          "Training Models" =>
            ["Optimisers" => "training/optimisers.md",
             "Training" => "training/training.md"],
-         "Data Munging" =>
-           ["One-Hot Encoding" => "data/onehot.md"],
+         "One-Hot Encoding" => "data/onehot.md",
+         "GPU Support" => "gpu.md",
          "Contributing & Help" => "contributing.md"])
 
 deploydocs(
docs/src/gpu.md (new file, 35 lines)
@@ -0,0 +1,35 @@
+# GPU Support
+
+Support for array operations on other hardware backends, like GPUs, is provided by external packages like [CuArrays](https://github.com/JuliaGPU/CuArrays.jl) and [CLArrays](https://github.com/JuliaGPU/CLArrays.jl). Flux doesn't care what array type you use, so we can just plug these in without any other changes.
+
+For example, we can use `CuArrays` (with the `cu` converter) to run our [basic example](models/basics.md) on an NVIDIA GPU.
+
+```julia
+using CuArrays
+
+W = cu(rand(2, 5)) # a 2×5 CuArray
+b = cu(rand(2))
+
+predict(x) = W*x .+ b
+loss(x, y) = sum((predict(x) .- y).^2)
+
+x, y = cu(rand(5)), cu(rand(2)) # Dummy data
+loss(x, y) # ~ 3
+```
+
+Note that we convert both the parameters (`W`, `b`) and the data set (`x`, `y`) to CUDA arrays. Taking derivatives and training works exactly as before.
+
+If you define a structured model, like a `Dense` layer or `Chain`, you just need to convert the internal parameters. Flux provides `mapparams`, which allows you to alter all parameters of a model at once.
+
+```julia
+d = Dense(10, 5, σ)
+d = mapparams(cu, d)
+d.W # Tracked CuArray
+d(cu(rand(10))) # CuArray output
+
+m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
+m = mapparams(cu, m)
+m(cu(rand(10)))
+```
+
+The [mnist example](https://github.com/FluxML/model-zoo/blob/master/mnist/mnist.jl) contains the code needed to run the model on the GPU; just uncomment the lines after `using CuArrays`.
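As a quick end-to-end illustration (a hedged sketch, not part of this commit — it assumes a working CuArrays setup and combines the snippets above with the `train!` API from the training docs):

```julia
using Flux, CuArrays

m = mapparams(cu, Chain(Dense(10, 5, σ), Dense(5, 2), softmax))

loss(x, y) = Flux.mse(m(x), y)
data = [(cu(rand(10)), cu(rand(2)))]  # the data must be moved to the GPU too
opt = SGD(params(m))

Flux.train!(loss, data, opt)
```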
docs/src/training/training.md
@@ -2,14 +2,14 @@
 
 To actually train a model we need three things:
 
-* A *loss function*, that evaluates how well a model is doing given some input data.
+* A *model loss function*, that evaluates how well a model is doing given some input data.
 * A collection of data points that will be provided to the loss function.
 * An [optimiser](optimisers.md) that will update the model parameters appropriately.
 
 With these we can call `Flux.train!`:
 
 ```julia
-Flux.train!(loss, data, opt)
+Flux.train!(modelLoss, data, opt)
 ```
 
 There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo).
@@ -23,7 +23,11 @@ m = Chain(
   Dense(784, 32, σ),
   Dense(32, 10), softmax)
 
+# Model loss function
 loss(x, y) = Flux.mse(m(x), y)
 
+# later
+Flux.train!(loss, data, opt)
+
 ```
 
 The loss will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `logloss` for cross entropy loss, but you can calculate it however you want.
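To make the three ingredients concrete, here's a hedged, self-contained sketch (random data, purely illustrative):

```julia
using Flux

m = Chain(Dense(784, 32, σ), Dense(32, 10), softmax)

loss(x, y) = Flux.mse(m(x), y)               # model loss function
data = [(rand(784), rand(10)) for _ = 1:10]  # collection of (input, target) pairs
opt = SGD(params(m), 0.1)                    # optimiser over the model's parameters

Flux.train!(loss, data, opt)
```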
src/Flux.jl
@@ -4,11 +4,11 @@ module Flux
 
 # Zero Flux Given
 
-using Juno
+using Juno, Requires
 using Lazy: @forward
 
 export Chain, Dense, RNN, LSTM,
-  SGD, params
+  SGD, params, mapparams
 
 using NNlib
 export σ, relu, softmax
@@ -21,8 +21,7 @@ using .Optimise
 
 include("utils.jl")
 include("onehot.jl")
+include("tree.jl")
-include("compiler/Compiler.jl")
-
 
 include("layers/stateless.jl")
 include("layers/basic.jl")
src/compiler/Compiler.jl (deleted file, 14 lines)
@@ -1,14 +0,0 @@
-module Compiler
-
-using MacroTools, DataFlow, DataFlow.Interpreter
-
-using DataFlow: graphm, syntax, prewalk!, postwalk!, prewalk, postwalk,
-  iscyclic, Constant, constant, isconstant, group, Split,
-  detuple, value, inputs, thread!, value, inputs, inputnode,
-  spliceinputs, bumpinputs, Line, Frame, applylines, graphinputs
-
-include("code.jl")
-include("interp.jl")
-include("loops.jl")
-
-end
src/compiler/code.jl (deleted file, 77 lines)
@@ -1,77 +0,0 @@
-import DataFlow: cse
-using MacroTools: @q, @>
-
-graph(m) = nothing
-
-function graphdef(ex, params = [])
-  @capture(shortdef(ex), (args__,) -> body_)
-  body = @> body MacroTools.flatten liftloops graphm DataFlow.il
-  body = map(x -> x in params ? :(self.$x) : x, body)
-  return args, body
-end
-
-function makegraph(graph, args, params = [])
-  graph = prewalk(graph) do v
-    isconstant(v) && (i = findfirst(args, value(v[1]))) ≠ 0 ?
-      inputnode(i) :
-      v
-  end
-  graph = map(graph) do x
-    x isa Offset ?
-      :(Flux.Compiler.Offset($(Expr(:quote, x.name)), $(x.n),
-        $(x.name in params ? :(self.$(x.name)) : x.name))) :
-      x
-  end
-  vertex(:($DataFlow.Frame(self)), graph)
-end
-
-function build_type(T, params)
-  @esc T
-  :(type $T
-      $(params...)
-    end)
-end
-
-function build_forward(body, args)
-  iscyclic(body) && return :(error("Can't run forward pass on a cyclic graph"))
-  applylines(syntax(cse(body)))
-end
-
-import Lazy: groupby
-
-# TODO: type hints for parameters
-
-function process_type(ex)
-  @capture(ex, type T_ fs__ end)
-  @destruct [params = false || [],
-             funcs = true || []] = groupby(x->isexpr(x, :->, :function), fs)
-  @assert length(funcs) == 1
-  pnames = namify.(params)
-  args, body = graphdef(funcs[1], pnames)
-  self = esc(:self)
-  quote
-    $(build_type(T, params))
-    $(esc(:((self::$T)($(args...)) = $(build_forward(body, args)))))
-    $(esc(:(Flux.Compiler.graph(self::$T)))) = $(DataFlow.constructor(map(esc, makegraph(body, args, params))))
-    nothing
-  end
-end
-
-function process_anon(ex)
-  args, body = graphdef(ex)
-  :(Capacitor($(DataFlow.constructor(map(esc, makegraph(body, args)[1])))))
-end
-
-function process_def(ex)
-  # TODO: make a singleton net type
-  @capture(ex, f_(xs__) = body_)
-  :($(esc(f)) = @net $(esc(:(($(xs...),) -> $body))); nothing)
-end
-
-macro net(ex)
-  ex = shortdef(ex)
-  isexpr(ex, :type) ? process_type(ex) :
-  @capture(ex, (__,) -> _) ? process_anon(ex) :
-  @capture(ex, _(__) = _) ? process_def(ex) :
-  error("Unsupported model expression $ex")
-end
src/compiler/interp.jl (deleted file, 39 lines)
@@ -1,39 +0,0 @@
-function astuple(xs::Vertex)
-  isconstant(xs) && value(xs[1]) isa Tuple ? value(xs[1]) :
-  xs isa Vertex && value(xs) == tuple ? inputs(xs) :
-  nothing
-end
-
-astuple(xs::Tuple) = xs
-
-astuple(xs) = nothing
-
-function astuples(xs)
-  xs = [astuple(x) for x in xs]
-  all(x->!(x==nothing), xs) ? xs : nothing
-end
-
-function interp(ctx, f, xs...)
-  g = graph(f)
-  g ≠ nothing && iscyclic(g) && error("Can't interpret cyclic graph")
-  @icatch(ctx, g ≠ nothing ?
-    interpret(ctx, g, xs...) :
-    f(xs...))
-end
-
-function interpmodel(m, args...)
-  ctx = Context(mux(iconst, iline, ilambda, iargs, ituple, interp))
-  @ithrow interp(ctx, m, args...)
-end
-
-# Anonymous models
-
-struct Capacitor
-  graph::IVertex{Any}
-end
-
-(m::Capacitor)(xs...) = interpmodel(m, xs...)
-
-graph(cap::Capacitor) = cap.graph
-
-Base.show(io::IO, ::Capacitor) = print(io, "Capacitor(...)")
src/compiler/loops.jl (deleted file, 191 lines)
@@ -1,191 +0,0 @@
-using ..Flux: stack, unstack, squeeze, unsqueeze
-
-# Stateful Models
-
-mutable struct Stateful
-  model
-  states::Vector{Any}
-  istate::Vector{Any}
-  ostate::Vector{Any}
-end
-
-Stateful(model, ss) = Stateful(model, ss, ss, ss)
-
-function Base.show(io::IO, m::Stateful)
-  print(io, "Stateful(")
-  show(io, m.model)
-  print(io, ")")
-end
-
-function (m::Stateful)(xs...)
-  m.istate = m.ostate
-  state, y = m.model((m.istate...,), xs...)
-  m.ostate = collect(state)
-  return y
-end
-
-# Seq Models
-
-struct SeqModel
-  model
-  steps::Int
-end
-
-seqtuple(x, n) = x
-seqtuple(xs::Tuple, n) = seqtuple.(xs, n)
-
-seqtuple(xs::AbstractArray, n) =
-  ndims(xs) < 3 ? xs :
-  n ≠ 0 && size(xs, 2) ≠ n ? error("Expecting sequence length $n, got $(size(xs, 2))") :
-  (unstack(xs, 2)...)
-
-reseq(x) = x
-reseq(x::Tuple{}) = ()
-reseq(xs::Tuple) = all(isa.(xs, AbstractArray) .& (ndims.(xs) .≥ 2)) ? stack(xs, 2) : reseq.(xs)
-
-function (m::SeqModel)(xs...)
-  xs = seqtuple(xs, m.steps)
-  reseq(m.model(xs...))
-end
-
-graph(m::SeqModel) = graph(m.model)
-
-# Recurrent Graphs
-
-struct Offset
-  name::Symbol
-  n::Int
-  default::Nullable{Any}
-end
-
-Offset(name, n) = Offset(name, n, nothing)
-
-Base.:-(o::Offset) = Offset(o.name, -o.n, o.default)
-
-function liftloops(ex)
-  ex = DataFlow.normedges(ex)
-  decls = Dict()
-  ex = MacroTools.postwalk(ex) do ex
-    @capture(ex, x_{n_}) || return ex
-    haskey(decls, (x,n)) && return namify(decls[(x,n)])
-    @gensym edge
-    decls[(x,n)] = :($edge = $(Offset(x,n))($x))
-    edge
-  end
-  prepend!(ex.args, collect(values(decls)))
-  ex
-end
-
-function hasloops(model)
-  g = graph(model)
-  g == nothing && return false
-  iscyclic(g) && return true
-  result = false
-  map(m -> hasloops(m) && (result = true), g)
-  return result
-end
-
-function atomise(model)
-  postwalk(graph(model)) do v
-    hasloops(value(v)) || return v
-    spliceinputs(atomise(value(v)), inputs(v)...)
-  end
-end
-
-function collect_state(v::IVertex)
-  state = typeof(v)[]
-  offset = Int[]
-  default = []
-  prewalk!(v) do v
-    value(v) isa Offset || return v
-    if (i = findfirst(state, v[1])) == 0
-      push!(state, v[1])
-      push!(offset, max(0, -value(v).n))
-      push!(default, get(value(v).default))
-    else
-      offset[i] = max(offset[i], -value(v).n)
-    end
-    v
-  end
-  return state, offset, default
-end
-
-hiddeninput(n, t) = vertex(Split(t), inputnode(n))
-
-# TODO: nicer way to do this.
-create_steps(v::IVertex, n) = [bumpinputs(spliceinputs(v, [hiddeninput(n, t) for n = 1:graphinputs(v)]...)) for t = 1:n]
-
-function getvar(n, step, steps, offset, default)
-  if step < 1
-    hiddeninput(1, sum(offset[1:n-1]) + 1 - step)
-  elseif step ∉ 1:length(steps)
-    constant(default[n])
-  else
-    steps[step][1,n]
-  end
-end
-
-function stateout(steps, offset, default)
-  outs = []
-  defaults = []
-  for i = 1:length(offset), j = 1:offset[i]
-    push!(outs, getvar(i, length(steps)-j+1, steps, offset, default))
-    push!(defaults, default[i])
-  end
-  group(outs...), defaults
-end
-
-# Input: (hidden1, hidden2, ...), (x1, x2, ...)
-# Output: (hidden1, hidden2, ...), (y1, y2, ...)
-# TODO: make sure there's a reasonable order for hidden states
-
-function unrollgraph(v::IVertex, n)
-  state, offset, default = collect_state(v)
-  v = group(group(state...), v)
-  steps = create_steps(v, n)
-  for i = 1:n
-    vars = inputs(steps[i][1])
-    postwalk!(steps[i]) do v
-      value(v) isa Offset || return v
-      varid = findfirst(vars,v[1])
-      getvar(varid, value(v).n + i, steps, offset, default)
-    end
-  end
-  out = group(map(x->x[2], steps)...)
-  state, defaults = stateout(steps, offset, default)
-  group(state,out), defaults
-end
-
-unrollgraph(m, n; kws...) = unrollgraph(atomise(m), n; kws...)
-
-function unroll(model, n)
-  graph, state = unrollgraph(model, n)
-  SeqModel(Stateful(Capacitor(graph), state), n)
-end
-
-function stateless(s::Stateful)
-  v = graph(s.model)
-  v = spliceinputs(v, group(constant.(s.states)...),
-                   [inputnode(i) for i = 1:graphinputs(v)-1]...)
-  Capacitor(v[2])
-end
-
-stateless(s::SeqModel) = SeqModel(stateless(s.model), s.steps)
-
-function unseqin(v::IVertex)
-  prewalk(v) do v
-    # TODO: inputidx function
-    isa(value(v), Split) && DataFlow.isinput(v[1]) && value(v[1]).n > 1 ? v[1] : v
-  end
-end
-
-unseqout(v::IVertex) = group(v[1], v[2][1])
-
-unseq(graph) = unseqout(unseqin(graph))
-
-function unroll1(model)
-  graph, state = unrollgraph(model, 1)
-  Stateful(Capacitor(unseq(graph)), state)
-end
-
-flip(model) = Capacitor(map(x -> x isa Offset ? -x : x, atomise(model)))
src/layers/basic.jl
@@ -22,13 +22,11 @@ end
 @forward Chain.layers Base.getindex, Base.first, Base.last, Base.endof, Base.push!
 @forward Chain.layers Base.start, Base.next, Base.done
 
-Optimise.children(c::Chain) = c.layers
+children(c::Chain) = c.layers
+mapchildren(f, c::Chain) = Chain(f.(c.layers)...)
 
 (s::Chain)(x) = foldl((x, m) -> m(x), x, s.layers)
 
-Compiler.graph(s::Chain) =
-  foldl((v, m) -> vertex(m, v), constant(inputnode(1)), s.layers)
-
 Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...)
 
 function Base.show(io::IO, c::Chain)
@@ -56,9 +54,12 @@ end
 Dense(in::Integer, out::Integer, σ = identity; init = initn) =
   Dense(σ, param(init(out, in)), param(init(out)))
 
-Optimise.children(d::Dense) = (d.W, d.b)
+treelike(Dense)
 
-(a::Dense)(x) = a.σ.(a.W*x .+ a.b)
+function (a::Dense)(x)
+  W, b, σ = a.W, a.b, a.σ
+  σ.(W*x .+ b)
+end
 
 function Base.show(io::IO, l::Dense)
   print(io, "Dense(", size(l.W, 2), ", ", size(l.W, 1))
src/layers/recurrent.jl
@@ -16,7 +16,7 @@ function (m::Recur)(xs...)
   return y
 end
 
-Optimise.children(m::Recur) = (m.cell,)
+treelike(Recur)
 
 Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")")
 
@@ -24,7 +24,7 @@ _truncate(x::AbstractArray) = x
 _truncate(x::TrackedArray) = x.data
 _truncate(x::Tuple) = _truncate.(x)
 
-truncate!(m) = foreach(truncate!, Optimise.children(m))
+truncate!(m) = foreach(truncate!, children(m))
 truncate!(m::Recur) = (m.state = _truncate(m.state))
 
 # Vanilla RNN
@@ -44,7 +44,7 @@ end
 
 hidden(m::RNNCell) = m.h
 
-Optimise.children(m::RNNCell) = (m.d, m.h)
+treelike(RNNCell)
 
 function Base.show(io::IO, m::RNNCell)
   print(io, "RNNCell(", m.d, ")")
@@ -63,9 +63,9 @@ struct LSTMCell{D1,D2,V}
 end
 
 function LSTMCell(in, out; init = initn)
-  cell = LSTMCell([Dense(in+out, out, σ, init = initn) for _ = 1:3]...,
-                  Dense(in+out, out, tanh, init = initn),
-                  param(initn(out)), param(initn(out)))
+  cell = LSTMCell([Dense(in+out, out, σ, init = init) for _ = 1:3]...,
+                  Dense(in+out, out, tanh, init = init),
+                  param(init(out)), param(init(out)))
   cell.forget.b.data .= 1
   return cell
 end
@@ -82,8 +82,7 @@ end
 
 hidden(m::LSTMCell) = (m.h, m.c)
 
-Optimise.children(m::LSTMCell) =
-  (m.forget, m.input, m.output, m.cell, m.h, m.c)
+treelike(LSTMCell)
 
 Base.show(io::IO, m::LSTMCell) =
   print(io, "LSTMCell(",
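The `LSTMCell` hunk above is also a bug fix: the `init` keyword was previously shadowed by the hard-coded `initn`. A hypothetical check that the keyword now takes effect (not part of the diff; `zeros` stands in for any custom initialiser):

```julia
c = Flux.LSTMCell(10, 5, init = zeros)
c.cell.W   # zero-initialised, where `initn` would have given small random values
c.forget.b # still all ones — set explicitly after initialisation
```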
src/onehot.jl
@@ -9,11 +9,12 @@ Base.getindex(xs::OneHotVector, i::Integer) = i == xs.ix
 
 Base.:*(A::AbstractMatrix, b::OneHotVector) = A[:, b.ix]
 
-struct OneHotMatrix <: AbstractMatrix{Bool}
-  data::Vector{OneHotVector}
+struct OneHotMatrix{A<:AbstractVector{OneHotVector}} <: AbstractMatrix{Bool}
+  height::Int
+  data::A
 end
 
-Base.size(xs::OneHotMatrix) = (Int64(length(xs.data[1])),length(xs.data))
+Base.size(xs::OneHotMatrix) = (Int64(xs.height),length(xs.data))
 
 Base.getindex(xs::OneHotMatrix, i::Int, j::Int) = xs.data[j][i]
 
@@ -21,8 +22,18 @@ Base.:*(A::AbstractMatrix, B::OneHotMatrix) = A[:, map(x->x.ix, B.data)]
 
 Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix([x, xs...])
 
+import NNlib.adapt
+
+adapt(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data))
+
+@require CuArrays begin
+  import CuArrays: CuArray, cudaconvert
+  Base.Broadcast._containertype(::Type{<:OneHotMatrix{<:CuArray}}) = CuArray
+  cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
+end
+
 onehot(l, labels) = OneHotVector(findfirst(labels, l), length(labels))
-onehotbatch(ls, labels) = OneHotMatrix([onehot(l, labels) for l in ls])
+onehotbatch(ls, labels) = OneHotMatrix(length(labels), [onehot(l, labels) for l in ls])
 
 argmax(y::AbstractVector, labels = 1:length(y)) =
   labels[findfirst(y, maximum(y))]
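For reference, a hedged sketch of the updated one-hot API (the label set is made up). Storing `height` explicitly means the matrix knows its size without inspecting `data`:

```julia
v = Flux.onehot(:b, [:a, :b, :c])             # OneHotVector; v[2] == true
m = Flux.onehotbatch([:b, :a], [:a, :b, :c])  # OneHotMatrix(3, [...])
size(m)  # (3, 2)
m[2, 1]  # true — :b is the second label
```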
src/optimise/Optimise.jl
@@ -3,15 +3,19 @@ module Optimise
 export update!, params, train!,
   SGD
 
-include("params.jl")
+struct Param{T}
+  x::T
+  Δ::T
+end
+
+Base.convert(::Type{Param}, x::AbstractArray) = Param(x, zeros(x))
+
 include("optimisers.jl")
 include("interface.jl")
 include("train.jl")
 
 using Flux.Tracker: TrackedArray
 
-params(ps, p::TrackedArray) = push!(ps, p)
-
 Base.convert(::Type{Param}, x::TrackedArray) = Param(x.data, x.grad[])
 
 end
src/optimise/interface.jl
@@ -10,3 +10,9 @@ function optimiser(ps, fs...)
 end
 
 SGD(ps, η = 1) = optimiser(ps, p -> descent(p, η))
+ADAM(ps, η = 0.001, β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0.0) = optimiser(ps, p -> adam(p; η = η, β1 = β1, β2 = β2, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
+Momentum(ps, ρ, decay = 0.0) = optimiser(ps, p -> momentum(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1))
+Nesterov(ps, ρ, decay = 0.0) = optimiser(ps, p -> nesterov(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1))
+RMSProp(ps, η = 0.001, ρ = 0.9, ϵ = 1e-8, decay = 0.0) = optimiser(ps, p -> rmsprop(p; η = η, ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
+ADAGrad(ps, η = 0.01, ϵ = 1e-8, decay = 0.0) = optimiser(ps, p -> adagrad(p; η = η, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
+ADADelta(ps, η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0.0) = optimiser(ps, p -> adadelta(p; ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
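All of these constructors share one shape: they take the parameter collection and return a zero-argument closure that `train!` invokes after each backward pass. A hedged usage sketch:

```julia
m = Dense(10, 2)
ps = params(m)

opt = SGD(ps, 0.1)             # exported
# opt = Flux.Optimise.ADAM(ps) # the new optimisers aren't exported yet

# inside a training loop, after back!(l):
opt()  # applies one update step to every parameter
```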
src/optimise/params.jl (deleted file, 18 lines)
@@ -1,18 +0,0 @@
-using DataFlow: OSet
-
-children(x) = ()
-
-params(ps, m) = foreach(m -> params(ps, m), children(m))
-
-function params(m)
-  ps = OSet()
-  params(ps, m)
-  return collect(ps)
-end
-
-struct Param{T}
-  x::T
-  Δ::T
-end
-
-convert(::Type{Param}, x::AbstractArray) = Param(x, zeros(x))
src/optimise/train.jl
@@ -8,8 +8,8 @@ function train!(m, data, opt; cb = () -> ())
   cb = tocb(cb)
   @progress for x in data
     l = m(x...)
-    isinf(l.data[]) && error("Inf")
-    isnan(l.data[]) && error("NaN")
+    isinf(l.data[]) && error("Loss is Inf")
+    isnan(l.data[]) && error("Loss is NaN")
     back!(l)
     opt()
     cb()
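`train!` also takes a `cb` keyword, called once per datapoint after the optimiser step — handy for logging around the new loss checks. A minimal hedged sketch (the message is illustrative):

```julia
evalcb() = println("datapoint done")
Flux.train!(loss, data, opt, cb = evalcb)
```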
src/tracker/Tracker.jl
@@ -71,11 +71,10 @@ include("back.jl")
 include("lib.jl")
 include("numeric.jl")
 
-using Requires
+import NNlib.adapt
 
-@require CuArrays begin
-  import CuArrays: cu
-  cu(xs::TrackedArray) = TrackedArray(xs.f, cu(xs.data), RefValue(cu(grad(xs))))
-end
+adapt(T, xs::TrackedArray) =
+  TrackedArray(xs.f, adapt(T, xs.data),
+               RefValue(adapt(T, grad(xs))))
 
 end

src/tree.jl (new file, 25 lines)
@@ -0,0 +1,25 @@
+children(x) = ()
+mapchildren(f, x) = x
+
+function treelike(T, fs = fieldnames(T))
+  @eval begin
+    children(x::$T) = ($([:(x.$f) for f in fs]...),)
+    mapchildren(f, x::$T) = $T(f.(children(x))...)
+  end
+end
+
+# TODO: prewalk/postwalk with correct caching
+# This is only correct in general for idempotent functions
+
+mapparams(f, x::AbstractArray) = f(x)
+mapparams(f, x) = mapchildren(x -> mapparams(f, x), x)
+
+forparams(f, x) = (mapparams(x -> (f(x); x), x); return)
+
+using DataFlow: OSet
+
+function params(m)
+  ps = OSet()
+  forparams(p -> push!(ps, p), m)
+  return collect(ps)
+end
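`treelike` is what replaces the per-layer `Optimise.children` definitions elsewhere in this diff: given a type, it derives `children`/`mapchildren` from the fields, and `mapparams`/`params` then walk that tree. A hedged sketch with a made-up layer:

```julia
# Hypothetical custom layer (illustrative, not in the diff).
struct Affine
  W
  b
end

Flux.treelike(Affine)       # derives children/mapchildren from the W and b fields

a = Affine(rand(3, 2), rand(3))
params(a)                   # collects W and b
a2 = mapparams(x -> 2x, a)  # rebuilds an Affine with doubled parameters
```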
test/compiler.jl (deleted file, 86 lines)
@@ -1,86 +0,0 @@
-using DataFlow, MacroTools
-using Flux: stack, unsqueeze
-using Flux.Compiler: @net, graph
-using DataFlow: Line, Frame
-
-@net type Affine
-  W
-  b
-  x -> x*W .+ b
-end
-
-Affine(in::Integer, out::Integer; init = Flux.initn) =
-  Affine(init(in, out), init(1, out))
-
-@net type TLP
-  first
-  second
-  function (x)
-    l1 = σ.(first(x))
-    l2 = softmax(second(l1))
-  end
-end
-
-@net type Recurrent
-  Wxy; Wyy; by
-  y
-  function (x)
-    y = tanh.( x * Wxy .+ y{-1} * Wyy .+ by )
-  end
-end
-
-Recurrent(in, out; init = Flux.initn) =
-  Recurrent(init((in, out)), init((out, out)), init(1, out), init(1, out))
-
-syntax(v::Vertex) = prettify(DataFlow.syntax(v))
-syntax(x) = syntax(graph(x))
-
-@testset "Compiler" begin
-
-xs = randn(1, 10)
-d = Affine(10, 20)
-
-@test d(xs) ≈ (xs*d.W + d.b)
-
-d1 = @net x -> x * d.W + d.b
-
-let
-  @capture(syntax(d), _Frame(_Line((+).(x_[1] * W_, b_))))
-  @test isa(x, DataFlow.Input) && W isa Array && b isa Array
-end
-
-let a1 = Affine(10, 20), a2 = Affine(20, 15)
-  tlp = TLP(a1, a2)
-  @test tlp(xs) ≈ softmax(a2(σ.(a1(xs))))
-  @test Flux.Compiler.interpmodel(tlp, xs) ≈ softmax(a2(σ.(a1(xs))))
-end
-
-let tlp = TLP(Affine(10, 21), Affine(20, 15))
-  e = try
-    Flux.Compiler.interpmodel(tlp, rand(1, 10))
-  catch e
-    e
-  end
-  @test e.trace[end].func == :TLP
-  @test e.trace[end-1].func == Symbol("Affine")
-end
-
-function apply(model, xs, state)
-  ys = similar(xs, 0)
-  for x in xs
-    state, y = model(state, x)
-    push!(ys, y)
-  end
-  state, ys
-end
-
-@testset "RNN unrolling" begin
-  r = Recurrent(10, 5)
-  xs = [rand(1, 10) for _ = 1:3]
-  _, ys = apply(Flux.Compiler.unroll1(r).model, xs, (r.y,))
-  @test ys[1] == tanh.(xs[1] * r.Wxy .+ r.y * r.Wyy .+ r.by)
-  ru = Flux.Compiler.unroll(r, 3)
-  ru(unsqueeze(stack(squeeze.(xs, 1), 1), 1))[1] == squeeze.(ys, 1)
-end
-
-end
test/runtests.jl
@@ -2,7 +2,6 @@ using Flux, Base.Test
 
 @testset "Flux" begin
 
-include("compiler.jl")
 include("utils.jl")
 include("tracker.jl")
 