From 96d1c5526361b1c9243ee9dfcbc1a1b06fb23eeb Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Fri, 22 Sep 2017 15:27:06 +0100 Subject: [PATCH 01/16] wording tweak --- docs/src/training/training.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 6a3ee3f7..1eaa8a46 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -2,14 +2,14 @@ To actually train a model we need three things: -* A *loss function*, that evaluates how well a model is doing given some input data. +* A *model loss function*, that evaluates how well a model is doing given some input data. * A collection of data points that will be provided to the loss function. * An [optimiser](optimisers.md) that will update the model parameters appropriately. With these we can call `Flux.train!`: ```julia -Flux.train!(loss, data, opt) +Flux.train!(model, data, opt) ``` There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). @@ -23,6 +23,7 @@ m = Chain( Dense(784, 32, σ), Dense(32, 10), softmax) +# Model loss function loss(x, y) = Flux.mse(m(x), y) ``` From 94e38c05b84dfc283e863f1d96ca5d48be3d3c68 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 18:33:23 +0100 Subject: [PATCH 02/16] more informative --- src/optimise/train.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 8ad437db..0a91e978 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -8,8 +8,8 @@ function train!(m, data, opt; cb = () -> ()) cb = tocb(cb) @progress for x in data l = m(x...) - isinf(l.data[]) && error("Inf") - isnan(l.data[]) && error("NaN") + isinf(l.data[]) && error("Loss is Inf") + isnan(l.data[]) && error("Loss is NaN") back!(l) opt() cb() From c51f5afb3d2d14247e332ae0cf711a904f374288 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 18:37:07 +0100 Subject: [PATCH 03/16] clarity --- docs/src/training/training.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 1eaa8a46..d4bed5fe 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -9,7 +9,7 @@ To actually train a model we need three things: With these we can call `Flux.train!`: ```julia -Flux.train!(model, data, opt) +Flux.train!(modelLoss, data, opt) ``` There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). 
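The three patches above settle the calling convention documented in `training.md` (the first argument to `Flux.train!` is the model's loss function) and make the divergence errors in `train.jl` self-explanatory ("Loss is Inf" / "Loss is NaN"). Below is a minimal end-to-end sketch of that API using only names that appear in these diffs; the dummy data and the learning rate are invented for illustration and are not part of the patches.

```julia
using Flux

# The model and loss from the training.md example touched by these patches.
m = Chain(
  Dense(784, 32, σ),
  Dense(32, 10),
  softmax)

# Model loss function: train! calls this as loss(x, y) for every data point.
loss(x, y) = Flux.mse(m(x), y)

# Dummy data: an iterable of (input, target) tuples (invented for illustration).
data = [(rand(784), rand(10)) for _ = 1:16]

# Plain gradient descent over the model's parameters (η = 0.1 is an arbitrary choice).
opt = SGD(params(m), 0.1)

# Each step evaluates the loss, checks it for Inf/NaN, backpropagates,
# applies the optimiser, and runs the optional callback.
Flux.train!(loss, data, opt)
```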
From 2ec8401d2c31c767278793adad9412918a66fa24 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 20:37:25 +0100 Subject: [PATCH 04/16] remove compiler --- src/Flux.jl | 2 - src/compiler/Compiler.jl | 14 --- src/compiler/code.jl | 77 ---------------- src/compiler/interp.jl | 39 -------- src/compiler/loops.jl | 191 --------------------------------------- src/layers/basic.jl | 3 - test/compiler.jl | 86 ------------------ test/runtests.jl | 1 - 8 files changed, 413 deletions(-) delete mode 100644 src/compiler/Compiler.jl delete mode 100644 src/compiler/code.jl delete mode 100644 src/compiler/interp.jl delete mode 100644 src/compiler/loops.jl delete mode 100644 test/compiler.jl diff --git a/src/Flux.jl b/src/Flux.jl index 7b55eacf..8c88d229 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -22,8 +22,6 @@ using .Optimise include("utils.jl") include("onehot.jl") -include("compiler/Compiler.jl") - include("layers/stateless.jl") include("layers/basic.jl") include("layers/recurrent.jl") diff --git a/src/compiler/Compiler.jl b/src/compiler/Compiler.jl deleted file mode 100644 index 7a30ef2e..00000000 --- a/src/compiler/Compiler.jl +++ /dev/null @@ -1,14 +0,0 @@ -module Compiler - -using MacroTools, DataFlow, DataFlow.Interpreter - -using DataFlow: graphm, syntax, prewalk!, postwalk!, prewalk, postwalk, - iscyclic, Constant, constant, isconstant, group, Split, - detuple, value, inputs, thread!, value, inputs, inputnode, - spliceinputs, bumpinputs, Line, Frame, applylines, graphinputs - -include("code.jl") -include("interp.jl") -include("loops.jl") - -end diff --git a/src/compiler/code.jl b/src/compiler/code.jl deleted file mode 100644 index b873547a..00000000 --- a/src/compiler/code.jl +++ /dev/null @@ -1,77 +0,0 @@ -import DataFlow: cse -using MacroTools: @q, @> - -graph(m) = nothing - -function graphdef(ex, params = []) - @capture(shortdef(ex), (args__,) -> body_) - body = @> body MacroTools.flatten liftloops graphm DataFlow.il - body = map(x -> x in params ? :(self.$x) : x, body) - return args, body -end - -function makegraph(graph, args, params = []) - graph = prewalk(graph) do v - isconstant(v) && (i = findfirst(args, value(v[1]))) ≠ 0 ? - inputnode(i) : - v - end - graph = map(graph) do x - x isa Offset ? - :(Flux.Compiler.Offset($(Expr(:quote, x.name)), $(x.n), - $(x.name in params ? :(self.$(x.name)) : x.name))) : - x - end - vertex(:($DataFlow.Frame(self)), graph) -end - -function build_type(T, params) - @esc T - :(type $T - $(params...) 
- end) -end - -function build_forward(body, args) - iscyclic(body) && return :(error("Can't run forward pass on a cyclic graph")) - applylines(syntax(cse(body))) -end - -import Lazy: groupby - -# TODO: type hints for parameters - -function process_type(ex) - @capture(ex, type T_ fs__ end) - @destruct [params = false || [], - funcs = true || []] = groupby(x->isexpr(x, :->, :function), fs) - @assert length(funcs) == 1 - pnames = namify.(params) - args, body = graphdef(funcs[1], pnames) - self = esc(:self) - quote - $(build_type(T, params)) - $(esc(:((self::$T)($(args...)) = $(build_forward(body, args))))) - $(esc(:(Flux.Compiler.graph(self::$T)))) = $(DataFlow.constructor(map(esc, makegraph(body, args, params)))) - nothing - end -end - -function process_anon(ex) - args, body = graphdef(ex) - :(Capacitor($(DataFlow.constructor(map(esc, makegraph(body, args)[1]))))) -end - -function process_def(ex) - # TODO: make a singleton net type - @capture(ex, f_(xs__) = body_) - :($(esc(f)) = @net $(esc(:(($(xs...),) -> $body))); nothing) -end - -macro net(ex) - ex = shortdef(ex) - isexpr(ex, :type) ? process_type(ex) : - @capture(ex, (__,) -> _) ? process_anon(ex) : - @capture(ex, _(__) = _) ? process_def(ex) : - error("Unsupported model expression $ex") -end diff --git a/src/compiler/interp.jl b/src/compiler/interp.jl deleted file mode 100644 index d9759260..00000000 --- a/src/compiler/interp.jl +++ /dev/null @@ -1,39 +0,0 @@ -function astuple(xs::Vertex) - isconstant(xs) && value(xs[1]) isa Tuple ? value(xs[1]) : - xs isa Vertex && value(xs) == tuple ? inputs(xs) : - nothing -end - -astuple(xs::Tuple) = xs - -astuple(xs) = nothing - -function astuples(xs) - xs = [astuple(x) for x in xs] - all(x->!(x==nothing), xs) ? xs : nothing -end - -function interp(ctx, f, xs...) - g = graph(f) - g ≠ nothing && iscyclic(g) && error("Can't interpret cyclic graph") - @icatch(ctx, g ≠ nothing ? - interpret(ctx, g, xs...) : - f(xs...)) -end - -function interpmodel(m, args...) - ctx = Context(mux(iconst, iline, ilambda, iargs, ituple, interp)) - @ithrow interp(ctx, m, args...) -end - -# Anonymous models - -struct Capacitor - graph::IVertex{Any} -end - -(m::Capacitor)(xs...) = interpmodel(m, xs...) - -graph(cap::Capacitor) = cap.graph - -Base.show(io::IO, ::Capacitor) = print(io, "Capacitor(...)") diff --git a/src/compiler/loops.jl b/src/compiler/loops.jl deleted file mode 100644 index 62fe0533..00000000 --- a/src/compiler/loops.jl +++ /dev/null @@ -1,191 +0,0 @@ -using ..Flux: stack, unstack, squeeze, unsqueeze - -# Stateful Models - -mutable struct Stateful - model - states::Vector{Any} - istate::Vector{Any} - ostate::Vector{Any} -end - -Stateful(model, ss) = Stateful(model, ss, ss, ss) - -function Base.show(io::IO, m::Stateful) - print(io, "Stateful(") - show(io, m.model) - print(io, ")") -end - -function (m::Stateful)(xs...) - m.istate = m.ostate - state, y = m.model((m.istate...,), xs...) - m.ostate = collect(state) - return y -end - -# Seq Models - -struct SeqModel - model - steps::Int -end - -seqtuple(x, n) = x -seqtuple(xs::Tuple, n) = seqtuple.(xs, n) - -seqtuple(xs::AbstractArray, n) = - ndims(xs) < 3 ? xs : - n ≠ 0 && size(xs, 2) ≠ n ? error("Expecting sequence length $n, got $(size(xs, 2))") : - (unstack(xs, 2)...) - -reseq(x) = x -reseq(x::Tuple{}) = () -reseq(xs::Tuple) = all(isa.(xs, AbstractArray) .& (ndims.(xs) .≥ 2)) ? stack(xs, 2) : reseq.(xs) - -function (m::SeqModel)(xs...) 
- xs = seqtuple(xs, m.steps) - reseq(m.model(xs...)) -end - -graph(m::SeqModel) = graph(m.model) - -# Recurrent Graphs - -struct Offset - name::Symbol - n::Int - default::Nullable{Any} -end - -Offset(name, n) = Offset(name, n, nothing) - -Base.:-(o::Offset) = Offset(o.name, -o.n, o.default) - -function liftloops(ex) - ex = DataFlow.normedges(ex) - decls = Dict() - ex = MacroTools.postwalk(ex) do ex - @capture(ex, x_{n_}) || return ex - haskey(decls, (x,n)) && return namify(decls[(x,n)]) - @gensym edge - decls[(x,n)] = :($edge = $(Offset(x,n))($x)) - edge - end - prepend!(ex.args, collect(values(decls))) - ex -end - -function hasloops(model) - g = graph(model) - g == nothing && return false - iscyclic(g) && return true - result = false - map(m -> hasloops(m) && (result = true), g) - return result -end - -function atomise(model) - postwalk(graph(model)) do v - hasloops(value(v)) || return v - spliceinputs(atomise(value(v)), inputs(v)...) - end -end - -function collect_state(v::IVertex) - state = typeof(v)[] - offset = Int[] - default = [] - prewalk!(v) do v - value(v) isa Offset || return v - if (i = findfirst(state, v[1])) == 0 - push!(state, v[1]) - push!(offset, max(0, -value(v).n)) - push!(default, get(value(v).default)) - else - offset[i] = max(offset[i], -value(v).n) - end - v - end - return state, offset, default -end - -hiddeninput(n, t) = vertex(Split(t), inputnode(n)) - -# TODO: nicer way to do this. -create_steps(v::IVertex, n) = [bumpinputs(spliceinputs(v, [hiddeninput(n, t) for n = 1:graphinputs(v)]...)) for t = 1:n] - -function getvar(n, step, steps, offset, default) - if step < 1 - hiddeninput(1, sum(offset[1:n-1]) + 1 - step) - elseif step ∉ 1:length(steps) - constant(default[n]) - else - steps[step][1,n] - end -end - -function stateout(steps, offset, default) - outs = [] - defaults = [] - for i = 1:length(offset), j = 1:offset[i] - push!(outs, getvar(i, length(steps)-j+1, steps, offset, default)) - push!(defaults, default[i]) - end - group(outs...), defaults -end - -# Input: (hidden1, hidden2, ...), (x1, x2, ...) -# Output: (hidden1, hidden2, ...), (y1, y2, ...) -# TODO: make sure there's a reasonable order for hidden states - -function unrollgraph(v::IVertex, n) - state, offset, default = collect_state(v) - v = group(group(state...), v) - steps = create_steps(v, n) - for i = 1:n - vars = inputs(steps[i][1]) - postwalk!(steps[i]) do v - value(v) isa Offset || return v - varid = findfirst(vars,v[1]) - getvar(varid, value(v).n + i, steps, offset, default) - end - end - out = group(map(x->x[2], steps)...) - state, defaults = stateout(steps, offset, default) - group(state,out), defaults -end - -unrollgraph(m, n; kws...) = unrollgraph(atomise(m), n; kws...) - -function unroll(model, n) - graph, state = unrollgraph(model, n) - SeqModel(Stateful(Capacitor(graph), state), n) -end - -function stateless(s::Stateful) - v = graph(s.model) - v = spliceinputs(v, group(constant.(s.states)...), - [inputnode(i) for i = 1:graphinputs(v)-1]...) - Capacitor(v[2]) -end - -stateless(s::SeqModel) = SeqModel(stateless(s.model), s.steps) - -function unseqin(v::IVertex) - prewalk(v) do v - # TODO: inputidx function - isa(value(v), Split) && DataFlow.isinput(v[1]) && value(v[1]).n > 1 ? v[1] : v - end -end - -unseqout(v::IVertex) = group(v[1], v[2][1]) - -unseq(graph) = unseqout(unseqin(graph)) - -function unroll1(model) - graph, state = unrollgraph(model, 1) - Stateful(Capacitor(unseq(graph)), state) -end - -flip(model) = Capacitor(map(x -> x isa Offset ? 
-x : x, atomise(model))) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index dfe73ab9..71de15fe 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -26,9 +26,6 @@ Optimise.children(c::Chain) = c.layers (s::Chain)(x) = foldl((x, m) -> m(x), x, s.layers) -Compiler.graph(s::Chain) = - foldl((v, m) -> vertex(m, v), constant(inputnode(1)), s.layers) - Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) function Base.show(io::IO, c::Chain) diff --git a/test/compiler.jl b/test/compiler.jl deleted file mode 100644 index a82550e8..00000000 --- a/test/compiler.jl +++ /dev/null @@ -1,86 +0,0 @@ -using DataFlow, MacroTools -using Flux: stack, unsqueeze -using Flux.Compiler: @net, graph -using DataFlow: Line, Frame - -@net type Affine - W - b - x -> x*W .+ b -end - -Affine(in::Integer, out::Integer; init = Flux.initn) = - Affine(init(in, out), init(1, out)) - -@net type TLP - first - second - function (x) - l1 = σ.(first(x)) - l2 = softmax(second(l1)) - end -end - -@net type Recurrent - Wxy; Wyy; by - y - function (x) - y = tanh.( x * Wxy .+ y{-1} * Wyy .+ by ) - end -end - -Recurrent(in, out; init = Flux.initn) = - Recurrent(init((in, out)), init((out, out)), init(1, out), init(1, out)) - -syntax(v::Vertex) = prettify(DataFlow.syntax(v)) -syntax(x) = syntax(graph(x)) - -@testset "Compiler" begin - -xs = randn(1, 10) -d = Affine(10, 20) - -@test d(xs) ≈ (xs*d.W + d.b) - -d1 = @net x -> x * d.W + d.b - -let - @capture(syntax(d), _Frame(_Line((+).(x_[1] * W_, b_)))) - @test isa(x, DataFlow.Input) && W isa Array && b isa Array -end - -let a1 = Affine(10, 20), a2 = Affine(20, 15) - tlp = TLP(a1, a2) - @test tlp(xs) ≈ softmax(a2(σ.(a1(xs)))) - @test Flux.Compiler.interpmodel(tlp, xs) ≈ softmax(a2(σ.(a1(xs)))) -end - -let tlp = TLP(Affine(10, 21), Affine(20, 15)) - e = try - Flux.Compiler.interpmodel(tlp, rand(1, 10)) - catch e - e - end - @test e.trace[end].func == :TLP - @test e.trace[end-1].func == Symbol("Affine") -end - -function apply(model, xs, state) - ys = similar(xs, 0) - for x in xs - state, y = model(state, x) - push!(ys, y) - end - state, ys -end - -@testset "RNN unrolling" begin - r = Recurrent(10, 5) - xs = [rand(1, 10) for _ = 1:3] - _, ys = apply(Flux.Compiler.unroll1(r).model, xs, (r.y,)) - @test ys[1] == tanh.(xs[1] * r.Wxy .+ r.y * r.Wyy .+ r.by) - ru = Flux.Compiler.unroll(r, 3) - ru(unsqueeze(stack(squeeze.(xs, 1), 1), 1))[1] == squeeze.(ys, 1) -end - -end diff --git a/test/runtests.jl b/test/runtests.jl index f237133a..2ab0e447 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,7 +2,6 @@ using Flux, Base.Test @testset "Flux" begin -include("compiler.jl") include("utils.jl") include("tracker.jl") From 4bafa2b374a987db56b9055319f8ded8143b836c Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 21:11:21 +0100 Subject: [PATCH 05/16] generic tree functions --- src/Flux.jl | 1 + src/layers/basic.jl | 5 +++-- src/layers/recurrent.jl | 9 ++++----- src/optimise/Optimise.jl | 10 +++++++--- src/optimise/params.jl | 18 ------------------ src/tree.jl | 20 ++++++++++++++++++++ 6 files changed, 35 insertions(+), 28 deletions(-) delete mode 100644 src/optimise/params.jl create mode 100644 src/tree.jl diff --git a/src/Flux.jl b/src/Flux.jl index 8c88d229..ba9a6327 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -21,6 +21,7 @@ using .Optimise include("utils.jl") include("onehot.jl") +include("tree.jl") include("layers/stateless.jl") include("layers/basic.jl") diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 71de15fe..37c1b787 100644 --- 
a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -22,7 +22,8 @@ end @forward Chain.layers Base.getindex, Base.first, Base.last, Base.endof, Base.push! @forward Chain.layers Base.start, Base.next, Base.done -Optimise.children(c::Chain) = c.layers +children(c::Chain) = c.layers +mapchildren(f, c::Chain) = Chain(f.(c.layers)...) (s::Chain)(x) = foldl((x, m) -> m(x), x, s.layers) @@ -53,7 +54,7 @@ end Dense(in::Integer, out::Integer, σ = identity; init = initn) = Dense(σ, param(init(out, in)), param(init(out))) -Optimise.children(d::Dense) = (d.W, d.b) +treelike(Dense) (a::Dense)(x) = a.σ.(a.W*x .+ a.b) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 5d44e1bf..491209a0 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -16,7 +16,7 @@ function (m::Recur)(xs...) return y end -Optimise.children(m::Recur) = (m.cell,) +treelike(Recur) Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")") @@ -24,7 +24,7 @@ _truncate(x::AbstractArray) = x _truncate(x::TrackedArray) = x.data _truncate(x::Tuple) = _truncate.(x) -truncate!(m) = foreach(truncate!, Optimise.children(m)) +truncate!(m) = foreach(truncate!, children(m)) truncate!(m::Recur) = (m.state = _truncate(m.state)) # Vanilla RNN @@ -44,7 +44,7 @@ end hidden(m::RNNCell) = m.h -Optimise.children(m::RNNCell) = (m.d, m.h) +treelike(RNNCell) function Base.show(io::IO, m::RNNCell) print(io, "RNNCell(", m.d, ")") @@ -82,8 +82,7 @@ end hidden(m::LSTMCell) = (m.h, m.c) -Optimise.children(m::LSTMCell) = - (m.forget, m.input, m.output, m.cell, m.h, m.c) +treelike(LSTMCell) Base.show(io::IO, m::LSTMCell) = print(io, "LSTMCell(", diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index 57c202eb..57956426 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -3,15 +3,19 @@ module Optimise export update!, params, train!, SGD -include("params.jl") +struct Param{T} + x::T + Δ::T +end + +Base.convert(::Type{Param}, x::AbstractArray) = Param(x, zeros(x)) + include("optimisers.jl") include("interface.jl") include("train.jl") using Flux.Tracker: TrackedArray -params(ps, p::TrackedArray) = push!(ps, p) - Base.convert(::Type{Param}, x::TrackedArray) = Param(x.data, x.grad[]) end diff --git a/src/optimise/params.jl b/src/optimise/params.jl deleted file mode 100644 index c5163dbe..00000000 --- a/src/optimise/params.jl +++ /dev/null @@ -1,18 +0,0 @@ -using DataFlow: OSet - -children(x) = () - -params(ps, m) = foreach(m -> params(ps, m), children(m)) - -function params(m) - ps = OSet() - params(ps, m) - return collect(ps) -end - -struct Param{T} - x::T - Δ::T -end - -convert(::Type{Param}, x::AbstractArray) = Param(x, zeros(x)) diff --git a/src/tree.jl b/src/tree.jl new file mode 100644 index 00000000..438685d5 --- /dev/null +++ b/src/tree.jl @@ -0,0 +1,20 @@ +children(x) = () +mapchildren(f, x) = x + +function treelike(T, fs = fieldnames(T)) + @eval begin + children(x::$T) = ($([:(x.$f) for f in fs]...),) + mapchildren(f, x::$T) = $T(f.(children(x))...) 
+ end +end + +using DataFlow: OSet + +params(ps, p::AbstractArray) = push!(ps, p) +params(ps, m) = foreach(m -> params(ps, m), children(m)) + +function params(m) + ps = OSet() + params(ps, m) + return collect(ps) +end From a60a754d68df79aa6e19f7f7b3992164d47703b0 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 21:58:34 +0100 Subject: [PATCH 06/16] beginnings of gpu support --- src/Flux.jl | 4 ++-- src/layers/basic.jl | 5 ++++- src/onehot.jl | 8 ++++++-- src/tree.jl | 13 +++++++++---- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index ba9a6327..45e3044e 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -4,11 +4,11 @@ module Flux # Zero Flux Given -using Juno +using Juno, Requires using Lazy: @forward export Chain, Dense, RNN, LSTM, - SGD, params + SGD, params, mapparams using NNlib export σ, relu, softmax diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 37c1b787..0ae5f8fa 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -56,7 +56,10 @@ Dense(in::Integer, out::Integer, σ = identity; init = initn) = treelike(Dense) -(a::Dense)(x) = a.σ.(a.W*x .+ a.b) +function (a::Dense)(x) + W, b, σ = a.W, a.b, a.σ + σ.(W*x .+ b) +end function Base.show(io::IO, l::Dense) print(io, "Dense(", size(l.W, 2), ", ", size(l.W, 1)) diff --git a/src/onehot.jl b/src/onehot.jl index 1e147397..48b7ccf5 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -9,8 +9,8 @@ Base.getindex(xs::OneHotVector, i::Integer) = i == xs.ix Base.:*(A::AbstractMatrix, b::OneHotVector) = A[:, b.ix] -struct OneHotMatrix <: AbstractMatrix{Bool} - data::Vector{OneHotVector} +struct OneHotMatrix{A<:AbstractVector{OneHotVector}} <: AbstractMatrix{Bool} + data::A end Base.size(xs::OneHotMatrix) = (Int64(length(xs.data[1])),length(xs.data)) @@ -21,6 +21,10 @@ Base.:*(A::AbstractMatrix, B::OneHotMatrix) = A[:, map(x->x.ix, B.data)] Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix([x, xs...]) +@require CuArrays begin + CuArrays.cu(xs::OneHotMatrix) = OneHotMatrix(CuArrays.cu(xs.data)) +end + onehot(l, labels) = OneHotVector(findfirst(labels, l), length(labels)) onehotbatch(ls, labels) = OneHotMatrix([onehot(l, labels) for l in ls]) diff --git a/src/tree.jl b/src/tree.jl index 438685d5..bd6b2d73 100644 --- a/src/tree.jl +++ b/src/tree.jl @@ -8,13 +8,18 @@ function treelike(T, fs = fieldnames(T)) end end -using DataFlow: OSet +# TODO: prewalk/postwalk with correct caching +# This is only correct in general for idempotent functions -params(ps, p::AbstractArray) = push!(ps, p) -params(ps, m) = foreach(m -> params(ps, m), children(m)) +mapparams(f, x::AbstractArray) = f(x) +mapparams(f, x) = mapchildren(x -> mapparams(f, x), x) + +forparams(f, x) = (mapparams(x -> (f(x); x), x); return) + +using DataFlow: OSet function params(m) ps = OSet() - params(ps, m) + forparams(p -> push!(ps, p), m) return collect(ps) end From a32ae4914c1f3d446d0f68fb264e18551f9ea4c7 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 22:51:00 +0100 Subject: [PATCH 07/16] onehotmatrix cuda support --- src/onehot.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/onehot.jl b/src/onehot.jl index 48b7ccf5..2f1eb365 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -22,7 +22,10 @@ Base.:*(A::AbstractMatrix, B::OneHotMatrix) = A[:, map(x->x.ix, B.data)] Base.hcat(x::OneHotVector, xs::OneHotVector...) 
= OneHotMatrix([x, xs...]) @require CuArrays begin + import CuArrays: CuArray, cudaconvert CuArrays.cu(xs::OneHotMatrix) = OneHotMatrix(CuArrays.cu(xs.data)) + Base.Broadcast._containertype(::Type{<:OneHotMatrix{<:CuArray}}) = CuArray + cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(cudaconvert(x.data)) end onehot(l, labels) = OneHotVector(findfirst(labels, l), length(labels)) From 7c8dba0b85d1cd8e114ce402879d252d69767876 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 27 Sep 2017 23:14:58 +0100 Subject: [PATCH 08/16] gc in training loop --- src/optimise/train.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 0a91e978..4ecc7793 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -13,5 +13,6 @@ function train!(m, data, opt; cb = () -> ()) back!(l) opt() cb() + gc() end end From 8e63ac766e7d1c077f2beecfa798c081d7b0d796 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 28 Sep 2017 11:08:37 +0100 Subject: [PATCH 09/16] gpu support docs --- docs/make.jl | 4 ++-- docs/src/gpu.md | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 docs/src/gpu.md diff --git a/docs/make.jl b/docs/make.jl index e0a5a281..f87ff300 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,8 +14,8 @@ makedocs(modules=[Flux], "Training Models" => ["Optimisers" => "training/optimisers.md", "Training" => "training/training.md"], - "Data Munging" => - ["One-Hot Encoding" => "data/onehot.md"], + "One-Hot Encoding" => "data/onehot.md", + "GPU Support" => "gpu.md", "Contributing & Help" => "contributing.md"]) deploydocs( diff --git a/docs/src/gpu.md b/docs/src/gpu.md new file mode 100644 index 00000000..db21db36 --- /dev/null +++ b/docs/src/gpu.md @@ -0,0 +1,33 @@ +# GPU Support + +Support for array operations on other hardware backends, like GPUs, is provided by external packages like [CuArrays](https://github.com/JuliaGPU/CuArrays.jl) and [CLArrays](https://github.com/JuliaGPU/CLArrays.jl). Flux doesn't care what array type you use, so we can just plug these in without any other changes. + +For example, we can use `CuArrays` (with the `cu` array converter) to run our [basic example](models/basics.md) on an NVIDIA GPU. + +```julia +using CuArrays + +W = cu(rand(2, 5)) +b = cu(rand(2)) + +predict(x) = W*x .+ b +loss(x, y) = sum((predict(x) .- y).^2) + +x, y = cu(rand(5)), cu(rand(2)) # Dummy data +loss(x, y) # ~ 3 +``` + +Note that we convert both the parameters (`W`, `b`) and the data set (`x`, `y`) to cuda arrays. Taking derivatives and training works exactly as before. + +If you define a structured model, like a `Dense` layer or `Chain`, you just need to convert the internal parameters. Flux provides `mapparams`, which allows you to alter all parameters of a model at once. 
+ +```julia +d = Dense(10, 5, σ) +d = mapparams(cu, d) +d.W # Tracked CuArray +d(cu(rand(10))) # CuArray output + +m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) +m = mapparams(cu, m) +d(cu(rand(10))) +``` From d3419c943bd8c8c3d06ffa8ea5618625b3b14613 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 28 Sep 2017 11:11:11 +0100 Subject: [PATCH 10/16] example link --- docs/src/gpu.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/src/gpu.md b/docs/src/gpu.md index db21db36..4452e856 100644 --- a/docs/src/gpu.md +++ b/docs/src/gpu.md @@ -2,12 +2,12 @@ Support for array operations on other hardware backends, like GPUs, is provided by external packages like [CuArrays](https://github.com/JuliaGPU/CuArrays.jl) and [CLArrays](https://github.com/JuliaGPU/CLArrays.jl). Flux doesn't care what array type you use, so we can just plug these in without any other changes. -For example, we can use `CuArrays` (with the `cu` array converter) to run our [basic example](models/basics.md) on an NVIDIA GPU. +For example, we can use `CuArrays` (with the `cu` converter) to run our [basic example](models/basics.md) on an NVIDIA GPU. ```julia using CuArrays -W = cu(rand(2, 5)) +W = cu(rand(2, 5)) # a 2×5 CuArray b = cu(rand(2)) predict(x) = W*x .+ b @@ -31,3 +31,5 @@ m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) m = mapparams(cu, m) d(cu(rand(10))) ``` + +The [mnist example](https://github.com/FluxML/model-zoo/blob/master/mnist/mnist.jl) contains the code needed to run the model on the GPU; just uncomment the lines after `using CuArrays`. From 1b91e6b38de5669e97ddaafef510134b587ba5ed Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Mon, 2 Oct 2017 20:50:11 +0100 Subject: [PATCH 11/16] store onehotmatrix height --- src/onehot.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/onehot.jl b/src/onehot.jl index 2f1eb365..d01dc9e1 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -10,10 +10,11 @@ Base.getindex(xs::OneHotVector, i::Integer) = i == xs.ix Base.:*(A::AbstractMatrix, b::OneHotVector) = A[:, b.ix] struct OneHotMatrix{A<:AbstractVector{OneHotVector}} <: AbstractMatrix{Bool} + height::Int data::A end -Base.size(xs::OneHotMatrix) = (Int64(length(xs.data[1])),length(xs.data)) +Base.size(xs::OneHotMatrix) = (Int64(xs.height),length(xs.data)) Base.getindex(xs::OneHotMatrix, i::Int, j::Int) = xs.data[j][i] @@ -23,13 +24,13 @@ Base.hcat(x::OneHotVector, xs::OneHotVector...) 
= OneHotMatrix([x, xs...]) @require CuArrays begin import CuArrays: CuArray, cudaconvert - CuArrays.cu(xs::OneHotMatrix) = OneHotMatrix(CuArrays.cu(xs.data)) + CuArrays.cu(xs::OneHotMatrix) = OneHotMatrix(xs.height, CuArrays.cu(xs.data)) Base.Broadcast._containertype(::Type{<:OneHotMatrix{<:CuArray}}) = CuArray - cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(cudaconvert(x.data)) + cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data)) end onehot(l, labels) = OneHotVector(findfirst(labels, l), length(labels)) -onehotbatch(ls, labels) = OneHotMatrix([onehot(l, labels) for l in ls]) +onehotbatch(ls, labels) = OneHotMatrix(length(labels), [onehot(l, labels) for l in ls]) argmax(y::AbstractVector, labels = 1:length(y)) = labels[findfirst(y, maximum(y))] From 5fd1b7d9a23c8cf6c49cbef47d4e0029e99d91e1 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Mon, 2 Oct 2017 20:50:18 +0100 Subject: [PATCH 12/16] remove gc hack --- src/optimise/train.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 4ecc7793..0a91e978 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -13,6 +13,5 @@ function train!(m, data, opt; cb = () -> ()) back!(l) opt() cb() - gc() end end From c202e2bc1ac17b9ebf0fff1374f72acd487c162f Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 3 Oct 2017 19:00:42 +0100 Subject: [PATCH 13/16] clarify --- docs/src/training/training.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index d4bed5fe..94a43348 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -25,6 +25,9 @@ m = Chain( # Model loss function loss(x, y) = Flux.mse(m(x), y) + +# later +Flux.train!(loss, data, opt) ``` The loss will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `logloss` for cross entropy loss, but you can calculate it however you want. From 2b95aff15859576dc300216950731bb91574c2c8 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 3 Oct 2017 14:16:51 -0400 Subject: [PATCH 14/16] actually use init argument in LSTMCell --- src/layers/recurrent.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 491209a0..3387a5f8 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -63,9 +63,9 @@ struct LSTMCell{D1,D2,V} end function LSTMCell(in, out; init = initn) - cell = LSTMCell([Dense(in+out, out, σ, init = initn) for _ = 1:3]..., - Dense(in+out, out, tanh, init = initn), - param(initn(out)), param(initn(out))) + cell = LSTMCell([Dense(in+out, out, σ, init = init) for _ = 1:3]..., + Dense(in+out, out, tanh, init = init), + param(init(out)), param(init(out))) cell.forget.b.data .= 1 return cell end From 1abc4febe67ff04b7f3decc77a8fa207189f8026 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 4 Oct 2017 18:55:56 +0100 Subject: [PATCH 15/16] more general adaptors --- src/onehot.jl | 5 ++++- src/tracker/Tracker.jl | 9 ++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/onehot.jl b/src/onehot.jl index d01dc9e1..aea68829 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -22,9 +22,12 @@ Base.:*(A::AbstractMatrix, B::OneHotMatrix) = A[:, map(x->x.ix, B.data)] Base.hcat(x::OneHotVector, xs::OneHotVector...) 
= OneHotMatrix([x, xs...]) +import NNlib.adapt + +adapt(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data)) + @require CuArrays begin import CuArrays: CuArray, cudaconvert - CuArrays.cu(xs::OneHotMatrix) = OneHotMatrix(xs.height, CuArrays.cu(xs.data)) Base.Broadcast._containertype(::Type{<:OneHotMatrix{<:CuArray}}) = CuArray cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data)) end diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 74fcb2b8..e218c3ea 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -71,11 +71,10 @@ include("back.jl") include("lib.jl") include("numeric.jl") -using Requires +import NNlib.adapt -@require CuArrays begin - import CuArrays: cu - cu(xs::TrackedArray) = TrackedArray(xs.f, cu(xs.data), RefValue(cu(grad(xs)))) -end +adapt(T, xs::TrackedArray) = + TrackedArray(xs.f, adapt(T, xs.data), + RefValue(adapt(T, grad(xs)))) end From bfcc1ac25d11c5c3e04ddc1ea6c59b7b8efd968d Mon Sep 17 00:00:00 2001 From: pevnak Date: Mon, 25 Sep 2017 21:08:35 +0200 Subject: [PATCH 16/16] exposing optimisers --- src/optimise/interface.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/optimise/interface.jl b/src/optimise/interface.jl index cffe62fc..364f7358 100644 --- a/src/optimise/interface.jl +++ b/src/optimise/interface.jl @@ -10,3 +10,9 @@ function optimiser(ps, fs...) end SGD(ps, η = 1) = optimiser(ps, p -> descent(p, η)) +ADAM(ps, η = 0.001, β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0.0) = optimiser(ps, p -> adam(p; η = η, β1 = β1, β2 = β2, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) +Momentum(ps,ρ, decay = 0.0) = optimiser(ps, p -> momentum(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1)) +Nesterov(ps,ρ, decay = 0.0) = optimiser(ps, p -> nesterov(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1)) +RMSProp(ps, η = 0.001, ρ = 0.9, ϵ = 1e-8, decay = 0.0) = optimiser(ps, p -> rmsprop(p; η = η, ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) +ADAGrad(ps, η = 0.01, ϵ = 1e-8, decay = 0.0) = optimiser(ps, p -> adagrad(p; η = η, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) +ADADelta(ps, η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0.0) = optimiser(ps, p -> adadelta(p; ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
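The constructors added in this last patch are thin wrappers that pair a parameter collection with an update rule plus an optional inverse decay. A rough usage sketch follows, assuming the API from the earlier patches; the diff does not show whether these names are re-exported, so they are qualified with `Flux.Optimise` here, and the model, data, and hyperparameters are made up for illustration.

```julia
using Flux

m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
loss(x, y) = Flux.mse(m(x), y)
data = [(rand(10), rand(2)) for _ = 1:100]  # dummy data

# Pick one of the newly exposed optimisers; each takes the parameter list
# first, then its hyperparameters (defaults as shown in the patch).
opt = Flux.Optimise.ADAM(params(m))            # η = 0.001, β1 = 0.9, β2 = 0.999
# opt = Flux.Optimise.RMSProp(params(m), 0.01) # η = 0.01
# opt = Flux.Optimise.Momentum(params(m), 0.9) # ρ = 0.9 (no default)

Flux.train!(loss, data, opt)
```

If a later commit adds these names to the `Optimise` export list, the `Flux.Optimise` prefix can be dropped.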