From 2e1ed4c3fce1ccb3fad7e558e6eef00936907e89 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Mon, 23 Oct 2017 10:12:53 +0200 Subject: [PATCH 01/32] add dropout --- src/Flux.jl | 4 ++-- src/layers/basic.jl | 42 +++++++++++++++++++++++++++++++++++++++++- test/layers.jl | 23 +++++++++++++++++++++++ test/runtests.jl | 1 + 4 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 test/layers.jl diff --git a/src/Flux.jl b/src/Flux.jl index e4f170f2..daeaa9ac 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -7,9 +7,9 @@ module Flux using Juno, Requires using Lazy: @forward -export Chain, Dense, RNN, LSTM, +export Chain, Dense, RNN, LSTM, Dropout, SGD, ADAM, Momentum, Nesterov, - param, params, mapleaves + param, params, mapleaves, setmode! using NNlib export σ, relu, leakyrelu, elu, swish, softmax diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 9c8b1016..088cf1e1 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -27,7 +27,7 @@ end children(c::Chain) = c.layers mapchildren(f, c::Chain) = Chain(f.(c.layers)...) -(s::Chain)(x) = foldl((x, m) -> m(x), x, s.layers) +(c::Chain)(x) = foldl((x, m) -> m(x), x, c.layers) Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) @@ -78,3 +78,43 @@ function Base.show(io::IO, l::Dense) l.σ == identity || print(io, ", ", l.σ) print(io, ")") end + + +""" + Dropout(p; mode=:train) + +A Dropout layer. In `:train` mode sets input components `x[i]` to zero with +probability `p` and to `x[i]/(1-p)` with probability `(1-p)`. + +In `:eval` mode it doesn't alter the input: `x == Dropout(p; mode=:eval)(x)`. +Change the mode with [`setmode!`](@ref). +""" +mutable struct Dropout{F} + p::F + mode::Symbol +end +Dropout(p::F; mode=:train) where {F} = Dropout{F}(p, mode) + +function (a::Dropout)(x) + if a.mode == :eval + return x + else + if 0 < a.p < 1 + y = similar(x) + rand!(y) + q = 1 - a.p + @inbounds for i=1:length(y) + y[i] = y[i] > a.p ? 1 / q : 0 + end + return y .* x + elseif a.p == 0 + return x + elseif a.p == 1 + return zeros(x) + end + end +end + +setmode!(a, mode::Symbol) = nothing +setmode!(c::Chain, mode::Symbol) = mapchildren(x->setmode!(x, mode), c) +setmode!(a::Dropout, mode::Symbol) = a.mode = mode diff --git a/test/layers.jl b/test/layers.jl new file mode 100644 index 00000000..ead9c343 --- /dev/null +++ b/test/layers.jl @@ -0,0 +1,23 @@ +@testset "dropout" begin + x = [1.,2.,3.] 
+ @test x === Dropout(0.1, mode=:eval)(x) + @test x === Dropout(0, mode=:train)(x) + @test all(zeros(x) .== Dropout(1, mode=:train)(x)) + + x = rand(100) + m = Dropout(0.9) + y = m(x) + @test count(a->a==0, y) > 50 + setmode!(m, :eval) + y = m(x) + @test count(a->a==0, y) == 0 + + x = rand(100) + m = Chain(Dense(100,100), + Dropout(0.9)) + y = m(x) + @test count(a->a.data[] == 0, y) > 50 + setmode!(m, :eval) + y = m(x) + @test count(a->a.data[] == 0, y) == 0 +end diff --git a/test/runtests.jl b/test/runtests.jl index 2ab0e447..b7b838df 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,5 +4,6 @@ using Flux, Base.Test include("utils.jl") include("tracker.jl") +include("layers.jl") end From 86c7c9246ea60fafaae844eddde6e7ef7daa4216 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Mon, 23 Oct 2017 11:41:08 +0200 Subject: [PATCH 02/32] add == and < for tracked arrays --- src/layers/basic.jl | 7 +++++++ src/tracker/Tracker.jl | 9 ++++++++- test/layers.jl | 4 ++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 088cf1e1..0c7e1fd0 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -115,6 +115,13 @@ function (a::Dropout)(x) end end +""" + setmode!(m, mode::Symbol) + +Change the mode of model `m` to `mode`. Possible values for `mode` are +`:train` and `:eval`. +This has an affect only if `m` contains [`Dropout`](@ref) of `BatchNorm` layers. +""" setmode!(a, mode::Symbol) = nothing setmode!(c::Chain, mode::Symbol) = mapchildren(x->setmode!(x, mode), c) setmode!(a::Dropout, mode::Symbol) = a.mode = mode diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index a2a6c745..8f495f82 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -1,5 +1,5 @@ module Tracker - +import Base: <, == export TrackedArray, param, back! data(x) = x @@ -54,6 +54,13 @@ Base.similar(x::TrackedArray, dims::Union{AbstractUnitRange,Integer}...) = Base.similar(x::TrackedArray, T::Type) = similar(data(x), T) +==(x::TrackedArray, y) = data(x) == y +==(y, x::TrackedArray) = y == data(x) +==(x::TrackedScalar, y) = data(x)[] == y +==(y, x::TrackedScalar) = y == data(x)[] +<(x::TrackedScalar, y) = data(x)[] < y +<(x, y::TrackedScalar) = x < data(y)[] + Base.show(io::IO, ::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} = print(io, "TrackedArray{…,$A}") diff --git a/test/layers.jl b/test/layers.jl index ead9c343..d0a5cbe1 100644 --- a/test/layers.jl +++ b/test/layers.jl @@ -16,8 +16,8 @@ m = Chain(Dense(100,100), Dropout(0.9)) y = m(x) - @test count(a->a.data[] == 0, y) > 50 + @test count(a->a == 0, y) > 50 setmode!(m, :eval) y = m(x) - @test count(a->a.data[] == 0, y) == 0 + @test count(a->a == 0, y) == 0 end From 536ab3861dcf40a726272763efd8e489f700d667 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Mon, 23 Oct 2017 16:23:29 +0200 Subject: [PATCH 03/32] setmode! -> testmode! --- src/Flux.jl | 2 +- src/layers/basic.jl | 25 +++++++++++-------------- src/tracker/Tracker.jl | 1 + test/layers.jl | 13 ++++++++----- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index daeaa9ac..957940c3 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -9,7 +9,7 @@ using Lazy: @forward export Chain, Dense, RNN, LSTM, Dropout, SGD, ADAM, Momentum, Nesterov, - param, params, mapleaves, setmode! + param, params, mapleaves, testmode! 
using NNlib export σ, relu, leakyrelu, elu, swish, softmax diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 0c7e1fd0..841cf094 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -81,22 +81,22 @@ end """ - Dropout(p; mode=:train) + Dropout(p; testmode=false) -A Dropout layer. In `:train` mode sets input components `x[i]` to zero with +A Dropout layer. If `testmode=false` mode sets input components `x[i]` to zero with probability `p` and to `x[i]/(1-p)` with probability `(1-p)`. -In `:eval` mode it doesn't alter the input: `x == Dropout(p; mode=:eval)(x)`. -Change the mode with [`setmode!`](@ref). +In `testmode=true`it doesn't alter the input: `x == Dropout(p; mode=:eval)(x)`. +Change the mode with [`testmode!`](@ref). """ mutable struct Dropout{F} p::F - mode::Symbol + testmode::Bool end -Dropout(p::F; mode=:train) where {F} = Dropout{F}(p, mode) +Dropout(p::F; testmode::Bool=false) where {F} = Dropout{F}(p, testmode) function (a::Dropout)(x) - if a.mode == :eval + if a.testmode return x else if 0 < a.p < 1 @@ -116,12 +116,9 @@ function (a::Dropout)(x) end """ - setmode!(m, mode::Symbol) + testmode!(m, val=true) -Change the mode of model `m` to `mode`. Possible values for `mode` are -`:train` and `:eval`. -This has an affect only if `m` contains [`Dropout`](@ref) of `BatchNorm` layers. +Set model `m` in test mode if `val=true`, and in training mode otherwise. +This has an affect only if `m` contains [`Dropout`](@ref) or `BatchNorm` layers. """ -setmode!(a, mode::Symbol) = nothing -setmode!(c::Chain, mode::Symbol) = mapchildren(x->setmode!(x, mode), c) -setmode!(a::Dropout, mode::Symbol) = a.mode = mode +testmode!(m, val::Bool=true) = prefor(x -> x isa Dropout && (x.testmode = val), m) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 8f495f82..1ab92f7e 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -41,6 +41,7 @@ TrackedArray(x::AbstractArray) = TrackedArray(Call(nothing), x, zeros(x)) param(xs) = TrackedArray(AbstractFloat.(xs)) istracked(x::TrackedArray) = true data(x::TrackedArray) = x.data +# data(x::TrackedScalar) = x.data[] grad(x::TrackedArray) = x.grad # Fallthrough methods diff --git a/test/layers.jl b/test/layers.jl index d0a5cbe1..0d6c3bc6 100644 --- a/test/layers.jl +++ b/test/layers.jl @@ -1,23 +1,26 @@ @testset "dropout" begin x = [1.,2.,3.] - @test x === Dropout(0.1, mode=:eval)(x) - @test x === Dropout(0, mode=:train)(x) - @test all(zeros(x) .== Dropout(1, mode=:train)(x)) + @test x === Dropout(0.1, testmode=true)(x) + @test x === Dropout(0, testmode=false)(x) + @test all(zeros(x) .== Dropout(1, testmode=false)(x)) x = rand(100) m = Dropout(0.9) y = m(x) @test count(a->a==0, y) > 50 - setmode!(m, :eval) + testmode!(m) y = m(x) @test count(a->a==0, y) == 0 + testmode!(m, false) + y = m(x) + @test count(a->a==0, y) > 50 x = rand(100) m = Chain(Dense(100,100), Dropout(0.9)) y = m(x) @test count(a->a == 0, y) > 50 - setmode!(m, :eval) + testmode!(m) y = m(x) @test count(a->a == 0, y) == 0 end From 711ea09d99cc3cc8daf39b172c5a5be065f13d7f Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Wed, 25 Oct 2017 02:35:27 +0200 Subject: [PATCH 04/32] address comments --- src/layers/basic.jl | 2 +- src/tracker/Tracker.jl | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 841cf094..c15868ab 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -121,4 +121,4 @@ end Set model `m` in test mode if `val=true`, and in training mode otherwise. 
This has an affect only if `m` contains [`Dropout`](@ref) or `BatchNorm` layers. """ -testmode!(m, val::Bool=true) = prefor(x -> x isa Dropout && (x.testmode = val), m) +testmode!(m, val::Bool=true) = prefor(x -> :testmode ∈ fieldnames(x) && (x.testmode = val), m) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 1ab92f7e..90707ea5 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -55,12 +55,17 @@ Base.similar(x::TrackedArray, dims::Union{AbstractUnitRange,Integer}...) = Base.similar(x::TrackedArray, T::Type) = similar(data(x), T) -==(x::TrackedArray, y) = data(x) == y -==(y, x::TrackedArray) = y == data(x) -==(x::TrackedScalar, y) = data(x)[] == y -==(y, x::TrackedScalar) = y == data(x)[] -<(x::TrackedScalar, y) = data(x)[] < y -<(x, y::TrackedScalar) = x < data(y)[] +#to be merged with data in the future +unbox(x::TrackedArray) = data(x) +unbox(x::TrackedScalar) = data(x)[] + +==(x::TrackedArray, y) = unbox(x) == y +==(y, x::TrackedArray) = y == unbox(x) +==(x::TrackedArray, y::TrackedArray) = unbox(x) == unbox(x) + +<(x::TrackedScalar, y) = unbox(x) < y +<(x, y::TrackedScalar) = x < unbox(y) +<(x::TrackedScalar, y::TrackedScalar) = unbox(x) < unbox(y) Base.show(io::IO, ::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} = print(io, "TrackedArray{…,$A}") From 0df300299ffc91487318c3c8d3f483985ea92d9c Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 26 Oct 2017 11:15:14 +0100 Subject: [PATCH 05/32] clearer error message, fixes #93 --- src/tracker/Tracker.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 8e6a584a..f2e52f61 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -70,6 +70,9 @@ function Base.showarray(io::IO, X::TrackedArray, repr::Bool = true; header = tru end end +Base.setindex!(xs::TrackedArray, v, i...) = + error("Can't differentiate `setindex!`") + include("back.jl") include("lib.jl") include("numeric.jl") From cf6b930f639970a4fb2707eedee0a1d4c2287205 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 26 Oct 2017 11:46:12 +0100 Subject: [PATCH 06/32] reorganise --- docs/src/models/layers.md | 8 ++++ src/Flux.jl | 3 +- src/layers/basic.jl | 44 -------------------- src/layers/normalisation.jl | 45 +++++++++++++++++++++ test/{layers.jl => layers/normalisation.jl} | 10 +++-- test/runtests.jl | 2 +- 6 files changed, 62 insertions(+), 50 deletions(-) create mode 100644 src/layers/normalisation.jl rename test/{layers.jl => layers/normalisation.jl} (69%) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 565e3b05..46547ce3 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -30,3 +30,11 @@ leakyrelu elu swish ``` + +## Normalisation & Regularisation + +These layers don't affect the structure of the network but may improve training times or reduce overfitting. + +```@docs +Dropout +``` diff --git a/src/Flux.jl b/src/Flux.jl index 957940c3..e4b6c832 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -9,7 +9,7 @@ using Lazy: @forward export Chain, Dense, RNN, LSTM, Dropout, SGD, ADAM, Momentum, Nesterov, - param, params, mapleaves, testmode! 
+ param, params, mapleaves using NNlib export σ, relu, leakyrelu, elu, swish, softmax @@ -27,5 +27,6 @@ include("tree.jl") include("layers/stateless.jl") include("layers/basic.jl") include("layers/recurrent.jl") +include("layers/normalisation.jl") end # module diff --git a/src/layers/basic.jl b/src/layers/basic.jl index c15868ab..969a261c 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -78,47 +78,3 @@ function Base.show(io::IO, l::Dense) l.σ == identity || print(io, ", ", l.σ) print(io, ")") end - - -""" - Dropout(p; testmode=false) - -A Dropout layer. If `testmode=false` mode sets input components `x[i]` to zero with -probability `p` and to `x[i]/(1-p)` with probability `(1-p)`. - -In `testmode=true`it doesn't alter the input: `x == Dropout(p; mode=:eval)(x)`. -Change the mode with [`testmode!`](@ref). -""" -mutable struct Dropout{F} - p::F - testmode::Bool -end -Dropout(p::F; testmode::Bool=false) where {F} = Dropout{F}(p, testmode) - -function (a::Dropout)(x) - if a.testmode - return x - else - if 0 < a.p < 1 - y = similar(x) - rand!(y) - q = 1 - a.p - @inbounds for i=1:length(y) - y[i] = y[i] > a.p ? 1 / q : 0 - end - return y .* x - elseif a.p == 0 - return x - elseif a.p == 1 - return zeros(x) - end - end -end - -""" - testmode!(m, val=true) - -Set model `m` in test mode if `val=true`, and in training mode otherwise. -This has an affect only if `m` contains [`Dropout`](@ref) or `BatchNorm` layers. -""" -testmode!(m, val::Bool=true) = prefor(x -> :testmode ∈ fieldnames(x) && (x.testmode = val), m) diff --git a/src/layers/normalisation.jl b/src/layers/normalisation.jl new file mode 100644 index 00000000..08c21428 --- /dev/null +++ b/src/layers/normalisation.jl @@ -0,0 +1,45 @@ +""" + testmode!(m) + testmode!(m, false) + +Put layers like [`Dropout`](@ref) and `BatchNorm` into testing mode (or back to +training mode with `false`). +""" +function testmode!(m, val::Bool=true) + prefor(x -> _testmode!(x, val), m) + return m +end + +_testmode!(m, test) = nothing + +""" + Dropout(p) + +A Dropout layer. For each input, either sets that input to `0` (with probability +`p`) or scales it by `1/(1-p)`. This is used as a regularisation, i.e. it +reduces overfitting during training. + +Does nothing to the input once in [`testmode!`](@ref). +""" +mutable struct Dropout{F} + p::F + active::Bool +end + +function Dropout(p) + @assert 0 ≤ p ≤ 1 + Dropout{typeof(p)}(p, true) +end + +function (a::Dropout)(x) + a.active || return x + y = similar(x) + rand!(y) + q = 1 - a.p + @inbounds for i=1:length(y) + y[i] = y[i] > a.p ? 1 / q : 0 + end + return y .* x +end + +_testmode!(a::Dropout, test) = (a.active = !test) diff --git a/test/layers.jl b/test/layers/normalisation.jl similarity index 69% rename from test/layers.jl rename to test/layers/normalisation.jl index 0d6c3bc6..5a302a51 100644 --- a/test/layers.jl +++ b/test/layers/normalisation.jl @@ -1,8 +1,10 @@ -@testset "dropout" begin +using Flux: testmode! + +@testset "Dropout" begin x = [1.,2.,3.] 
- @test x === Dropout(0.1, testmode=true)(x) - @test x === Dropout(0, testmode=false)(x) - @test all(zeros(x) .== Dropout(1, testmode=false)(x)) + @test x == testmode!(Dropout(0.1))(x) + @test x == Dropout(0)(x) + @test zeros(x) == Dropout(1)(x) x = rand(100) m = Dropout(0.9) diff --git a/test/runtests.jl b/test/runtests.jl index b7b838df..efd1a462 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,6 @@ using Flux, Base.Test include("utils.jl") include("tracker.jl") -include("layers.jl") +include("layers/normalisation.jl") end From 84efbbcc8422d1521e2b67f11f5f015b1868e581 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 26 Oct 2017 12:06:29 +0100 Subject: [PATCH 07/32] tracker predicate tweaks --- src/tracker/Tracker.jl | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 90707ea5..8a481970 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -1,5 +1,5 @@ module Tracker -import Base: <, == + export TrackedArray, param, back! data(x) = x @@ -41,7 +41,6 @@ TrackedArray(x::AbstractArray) = TrackedArray(Call(nothing), x, zeros(x)) param(xs) = TrackedArray(AbstractFloat.(xs)) istracked(x::TrackedArray) = true data(x::TrackedArray) = x.data -# data(x::TrackedScalar) = x.data[] grad(x::TrackedArray) = x.grad # Fallthrough methods @@ -55,17 +54,17 @@ Base.similar(x::TrackedArray, dims::Union{AbstractUnitRange,Integer}...) = Base.similar(x::TrackedArray, T::Type) = similar(data(x), T) -#to be merged with data in the future -unbox(x::TrackedArray) = data(x) -unbox(x::TrackedScalar) = data(x)[] +value(x) = x +value(x::TrackedArray) = data(x) +value(x::TrackedScalar) = data(x)[] -==(x::TrackedArray, y) = unbox(x) == y -==(y, x::TrackedArray) = y == unbox(x) -==(x::TrackedArray, y::TrackedArray) = unbox(x) == unbox(x) +Base.:(==)(x::TrackedArray, y) = value(x) == y +Base.:(==)(y, x::TrackedArray) = y == value(x) +Base.:(==)(x::TrackedArray, y::TrackedArray) = value(x) == value(x) -<(x::TrackedScalar, y) = unbox(x) < y -<(x, y::TrackedScalar) = x < unbox(y) -<(x::TrackedScalar, y::TrackedScalar) = unbox(x) < unbox(y) +Base.isless(x::TrackedScalar, y) = isless(value(x), y) +Base.isless(x, y::TrackedScalar) = isless(x, value(y)) +Base.isless(x::TrackedScalar, y::TrackedScalar) = isless(value(x), value(y)) Base.show(io::IO, ::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} = print(io, "TrackedArray{…,$A}") From c43bda019bedb0fda14cd0e747a115dae22f8d54 Mon Sep 17 00:00:00 2001 From: Iblis Lin Date: Mon, 30 Oct 2017 16:21:02 +0800 Subject: [PATCH 08/32] TrackedArray: implement `mean` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ```julia julia> p Tracked 2×3 Array{Float64,2}: 1.0 3.0 5.0 2.0 4.0 6.0 ``` Before ```julia julia> @benchmark Flux.Tracker.back!(sum($p, 2) ./ size($p, 2), ones(2, 1)) BenchmarkTools.Trial: memory estimate: 3.44 KiB allocs estimate: 75 -------------- minimum time: 20.438 μs (0.00% GC) median time: 21.239 μs (0.00% GC) mean time: 22.354 μs (1.68% GC) maximum time: 3.811 ms (98.51% GC) -------------- samples: 10000 evals/sample: 1 ``` After ```julia julia> @benchmark Flux.Tracker.back!(mean($p, 2), ones(2, 1)) BenchmarkTools.Trial: memory estimate: 1008 bytes allocs estimate: 21 -------------- minimum time: 5.973 μs (0.00% GC) median time: 6.310 μs (0.00% GC) mean time: 6.630 μs (1.96% GC) maximum time: 680.709 μs (97.28% GC) -------------- samples: 10000 evals/sample: 6 ``` --- src/tracker/lib.jl | 5 +++++ 
test/tracker.jl | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/tracker/lib.jl b/src/tracker/lib.jl index 254be8dc..9f3adc6b 100644 --- a/src/tracker/lib.jl +++ b/src/tracker/lib.jl @@ -57,6 +57,11 @@ back(::typeof(sum), Δ, xs::TrackedArray, dim...) = back(xs, similar(xs.data) .= Base.maximum(xs::TrackedArray, args...) = maximum(xs.data, args...) Base.findfirst(xs::TrackedArray, args...) = findfirst(xs.data, args...) +Base.mean(xs::TrackedArray, region) = TrackedArray(Call(mean, xs, region)) + +back(::typeof(mean), Δ, xs::TrackedArray, region) = + back(xs, similar(xs.data) .= Δ ./ prod(size(xs.data, region...))) + # BLAS a::TrackedMatrix * b::TrackedMatrix = TrackedArray(Call(*, a, b)) diff --git a/test/tracker.jl b/test/tracker.jl index 2a20338e..52a73a07 100644 --- a/test/tracker.jl +++ b/test/tracker.jl @@ -22,6 +22,16 @@ gradtest(f, dims...) = gradtest(f, rand.(dims)...) @test gradtest(vcat, rand(5), rand(3)) @test gradtest(vcat, rand(2,3), rand(3,3)) +@testset "mean" begin + @test gradtest(mean, rand(2, 3)) + + @test gradtest(x -> mean(x, 1), rand(2, 3)) + @test gradtest(x -> mean(x, 2), rand(2, 3)) + @test gradtest(x -> mean(x, 3), rand(2, 3, 4)) + + @test gradtest(x -> mean(x, [1, 2]), rand(2, 3, 4)) +end + @test gradtest(rand(5)) do x y = x.^2 2y + x From 3d8b7250aeff4e0f600e39af6372e471df8240ac Mon Sep 17 00:00:00 2001 From: Iblis Lin Date: Tue, 31 Oct 2017 10:41:44 +0000 Subject: [PATCH 09/32] add scalar mean --- src/tracker/lib.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tracker/lib.jl b/src/tracker/lib.jl index 9f3adc6b..a90eb932 100644 --- a/src/tracker/lib.jl +++ b/src/tracker/lib.jl @@ -57,8 +57,10 @@ back(::typeof(sum), Δ, xs::TrackedArray, dim...) = back(xs, similar(xs.data) .= Base.maximum(xs::TrackedArray, args...) = maximum(xs.data, args...) Base.findfirst(xs::TrackedArray, args...) = findfirst(xs.data, args...) 
+Base.mean(xs::TrackedArray) = TrackedArray(Call(mean, xs), toarray(xs.data, mean(xs.data))) Base.mean(xs::TrackedArray, region) = TrackedArray(Call(mean, xs, region)) +back(::typeof(mean), Δ, xs::TrackedArray) = back(xs, similar(xs.data) .= Δ ./ length(xs.data)) back(::typeof(mean), Δ, xs::TrackedArray, region) = back(xs, similar(xs.data) .= Δ ./ prod(size(xs.data, region...))) From e943a39ee72037184b9e46d89c0af536c78effef Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 31 Oct 2017 16:37:33 +0000 Subject: [PATCH 10/32] combine special cases --- src/layers/recurrent.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 716bc574..599776ce 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -1,5 +1,7 @@ # TODO: broadcasting cat -combine(x, h) = vcat(x, h .* trues(1, size(x, 2))) +combine(x::AbstractMatrix, h::AbstractVector) = vcat(x, h .* trues(1, size(x, 2))) +combine(x::AbstractVector, h::AbstractVector) = vcat(x, h) +combine(x::AbstractMatrix, h::AbstractMatrix) = vcat(x, h) # Stateful recurrence From 0f8ba87dc68307050abb46f594280c1873ee4d84 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 31 Oct 2017 16:37:41 +0000 Subject: [PATCH 11/32] treelike tuples --- src/tree.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/tree.jl b/src/tree.jl index efdf9101..899fccea 100644 --- a/src/tree.jl +++ b/src/tree.jl @@ -1,6 +1,9 @@ children(x) = () mapchildren(f, x) = x +children(x::Tuple) = x +mapchildren(f, x::Tuple) = map(f, x) + function treelike(T, fs = fieldnames(T)) @eval begin children(x::$T) = ($([:(x.$f) for f in fs]...),) From e7a510da9a42495bd5599aa857b19b96b5b7a442 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 1 Nov 2017 16:01:34 +0000 Subject: [PATCH 12/32] add cmudict dataset --- .gitignore | 2 +- src/Flux.jl | 2 ++ src/data/Data.jl | 14 ++++++++++++++ src/data/cmudict.jl | 42 ++++++++++++++++++++++++++++++++++++++++++ test/data.jl | 3 +++ 5 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 src/data/Data.jl create mode 100644 src/data/cmudict.jl create mode 100644 test/data.jl diff --git a/.gitignore b/.gitignore index 785b9c4e..9d6de240 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,4 @@ docs/build/ docs/site/ docs/flux.css -demos +deps diff --git a/src/Flux.jl b/src/Flux.jl index e4b6c832..242c8b1f 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -29,4 +29,6 @@ include("layers/basic.jl") include("layers/recurrent.jl") include("layers/normalisation.jl") +include("data/Data.jl") + end # module diff --git a/src/data/Data.jl b/src/data/Data.jl new file mode 100644 index 00000000..ffea729c --- /dev/null +++ b/src/data/Data.jl @@ -0,0 +1,14 @@ +module Data + +export CMUDict, cmudict + +deps(path...) = joinpath(@__DIR__, "..", "..", "deps", path...) 
+ +function __init__() + mkpath(deps()) +end + +include("cmudict.jl") +using .CMUDict + +end diff --git a/src/data/cmudict.jl b/src/data/cmudict.jl new file mode 100644 index 00000000..88b9c6c0 --- /dev/null +++ b/src/data/cmudict.jl @@ -0,0 +1,42 @@ +module CMUDict + +export cmudict + +using ..Data: deps + +const version = "0.7b" + +function load() + isdir(deps("cmudict")) && return + mkpath(deps("cmudict")) + for x in ["", ".phones", ".symbols"] + download("http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-$version$x", + deps("cmudict", "cmudict$x")) + end +end + +function phones() + load() + Symbol.(first.(split.(split(readstring(deps("cmudict", "cmudict.phones")), + "\n", keep = false), "\t"))) +end + +function symbols() + load() + Symbol.(split(readstring(deps("CMUDict", "cmudict.symbols")), + "\n", keep = false)) +end + +function rawdict() + load() + Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in + filter(!isempty, split.(split(readstring(deps("CMUDict", "cmudict")), "\n")))) +end + +validword(s) = ismatch(r"^[\w-\.]+$", s) + +cmudict() = filter((s, ps) -> validword(s), rawdict()) + +alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.'] + +end diff --git a/test/data.jl b/test/data.jl new file mode 100644 index 00000000..1b93ab3c --- /dev/null +++ b/test/data.jl @@ -0,0 +1,3 @@ +using Flux.Data + +@test cmudict()["CATASTROPHE"] == :[K,AH0,T,AE1,S,T,R,AH0,F,IY0].args From 21ea93ffcd08c87ed5dfae5bc6645852744160fe Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 2 Nov 2017 11:44:39 +0000 Subject: [PATCH 13/32] rename treelike --- src/Flux.jl | 2 +- src/{tree.jl => treelike.jl} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/{tree.jl => treelike.jl} (100%) diff --git a/src/Flux.jl b/src/Flux.jl index 242c8b1f..ff78593f 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -22,7 +22,7 @@ using .Optimise include("utils.jl") include("onehot.jl") -include("tree.jl") +include("treelike.jl") include("layers/stateless.jl") include("layers/basic.jl") diff --git a/src/tree.jl b/src/treelike.jl similarity index 100% rename from src/tree.jl rename to src/treelike.jl From efa51f02e7a7ea28d79aabe496cdb57aedbae4fd Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 11 Oct 2017 11:54:18 +0100 Subject: [PATCH 14/32] basic batch type --- src/Flux.jl | 2 ++ src/batches/Batches.jl | 7 +++++++ src/batches/batch.jl | 8 ++++++++ 3 files changed, 17 insertions(+) create mode 100644 src/batches/Batches.jl create mode 100644 src/batches/batch.jl diff --git a/src/Flux.jl b/src/Flux.jl index ff78593f..acefff19 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -31,4 +31,6 @@ include("layers/normalisation.jl") include("data/Data.jl") +include("batches/Batches.jl") + end # module diff --git a/src/batches/Batches.jl b/src/batches/Batches.jl new file mode 100644 index 00000000..066f4d1c --- /dev/null +++ b/src/batches/Batches.jl @@ -0,0 +1,7 @@ +module Batches + +import ..Flux + +include("batch.jl") + +end diff --git a/src/batches/batch.jl b/src/batches/batch.jl new file mode 100644 index 00000000..5a2eb82e --- /dev/null +++ b/src/batches/batch.jl @@ -0,0 +1,8 @@ +struct Batch{T,A,M} + data::A + mask::M +end + +Batch{T}(data, mask) where T = Batch{T,typeof(data),typeof(mask)}(data, mask) + +Batch(xs) = Batch{typeof(first(xs))}(Flux.batch(xs),trues(length(xs))) From 97244e0a68fa8cbae17f8065160126897a674009 Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Sat, 4 Nov 2017 13:27:32 +0100 Subject: [PATCH 15/32] Allow array of optimisers to train! 
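
For example, something along these lines (an illustrative sketch; the layers, loss and data below are placeholders, not part of this change):

```julia
using Flux

# Two optimisers, one per layer; `train!` now calls each optimiser in the
# array (and each callback) after every gradient step.
d1, d2 = Dense(10, 5, relu), Dense(5, 2)
m = Chain(d1, d2)
loss(x, y) = Flux.mse(m(x), y)
data = [(rand(10), rand(2))]

opt = [SGD(params(d1), 0.1), ADAM(params(d2))]
Flux.train!(loss, data, opt)
```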
This allows an array of optimisers to be sent to `train!` --- src/optimise/train.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 2a2ec5eb..0809e86b 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -1,8 +1,8 @@ using Juno using Flux.Tracker: back! -tocb(f) = f -tocb(fs::AbstractVector) = () -> foreach(call, fs) +runall(f) = f +runall(fs::AbstractVector) = () -> foreach(call, fs) """ train!(loss, data, opt; cb = () -> ()) @@ -11,10 +11,11 @@ For each datapoint `d` in `data` computes the gradient of `loss(d...)` through backpropagation and calls the optimizer `opt` and the callback `cb` (i.e. `opt()` and `cb()`). -Multiple callbacks can be passed to `cb` as an array. +Multiple optimisers and callbacks can be passed to `opt` and `cb` as arrays. """ function train!(loss, data, opt; cb = () -> ()) - cb = tocb(cb) + cb = runall(cb) + opt = runall(opt) @progress for d in data l = loss(d...) isinf(l.data[]) && error("Loss is Inf") From d6423eefe54b8ba822ed49b8b5c0d52dbe58ae1d Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 7 Nov 2017 19:34:27 +0000 Subject: [PATCH 16/32] matrix-vector fast path --- src/tracker/Tracker.jl | 2 ++ src/tracker/lib.jl | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index d6fa6f35..5e26a051 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -38,6 +38,8 @@ TrackedArray(c::Call) = TrackedArray(c, c()) TrackedArray(x::AbstractArray) = TrackedArray(Call(nothing), x, zeros(x)) +isleaf(x::TrackedArray) = x.f == Call(nothing) + param(xs) = TrackedArray(AbstractFloat.(xs)) param(xs::Real) = param(fill(xs)) diff --git a/src/tracker/lib.jl b/src/tracker/lib.jl index a90eb932..2ee5d659 100644 --- a/src/tracker/lib.jl +++ b/src/tracker/lib.jl @@ -79,6 +79,16 @@ function back(::typeof(*), Δ, a::AbstractMatrix, b::AbstractVecOrMat) @back(b, At_mul_B(data(a), Δ)) end +# Fast path for matrix-vector +function back(::typeof(*), Δ::AbstractVector, W::TrackedMatrix, x::AbstractVector) + if isleaf(W) + W.grad .+= Δ .* data(x).' + else + back(W, A_mul_Bt(Δ, data(x))) + end + @back(x, At_mul_B(data(W), Δ)) +end + # NNlib import NNlib: softmax, ∇softmax From d4229c4815a265d2ba084dc2b5b6db264cea497d Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 7 Nov 2017 19:34:35 +0000 Subject: [PATCH 17/32] useful params method --- src/treelike.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/treelike.jl b/src/treelike.jl index 899fccea..097ccdc6 100644 --- a/src/treelike.jl +++ b/src/treelike.jl @@ -35,3 +35,5 @@ function params(m) prefor(p -> p isa TrackedArray && push!(ps, p), m) return ps end + +params(m...) 
= params(m) From fcd091e8f06fc7a8824c4ca12d38dd23a4da4f08 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 8 Nov 2017 22:00:19 +0000 Subject: [PATCH 18/32] Ac_mul_B derivatives --- src/tracker/lib.jl | 28 ++++++++++++++++++++-------- test/tracker.jl | 2 ++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/tracker/lib.jl b/src/tracker/lib.jl index 2ee5d659..aab26dfe 100644 --- a/src/tracker/lib.jl +++ b/src/tracker/lib.jl @@ -1,5 +1,3 @@ -import Base: * - toarray(xs::AbstractArray, ys::AbstractArray) = ys toarray(xs::AbstractArray, y) = similar(xs, typeof(y), ()) .= y @@ -66,19 +64,33 @@ back(::typeof(mean), Δ, xs::TrackedArray, region) = # BLAS -a::TrackedMatrix * b::TrackedMatrix = TrackedArray(Call(*, a, b)) -a::TrackedMatrix * b::AbstractMatrix = TrackedArray(Call(*, a, b)) -a::AbstractMatrix * b::TrackedMatrix = TrackedArray(Call(*, a, b)) +for f in :[*, Ac_mul_B].args + @eval begin + import Base.$f + $f(a::TrackedMatrix, b::TrackedMatrix) = TrackedArray(Call($f, a, b)) + $f(a::TrackedMatrix, b::AbstractMatrix) = TrackedArray(Call($f, a, b)) + $f(a::AbstractMatrix, b::TrackedMatrix) = TrackedArray(Call($f, a, b)) -a::TrackedMatrix * b::TrackedVector = TrackedArray(Call(*, a, b)) -a::TrackedMatrix * b::AbstractVector = TrackedArray(Call(*, a, b)) -a::AbstractMatrix * b::TrackedVector = TrackedArray(Call(*, a, b)) + $f(a::TrackedMatrix, b::TrackedVector) = TrackedArray(Call($f, a, b)) + $f(a::TrackedMatrix, b::AbstractVector) = TrackedArray(Call($f, a, b)) + $f(a::AbstractMatrix, b::TrackedVector) = TrackedArray(Call($f, a, b)) + + $f(a::TrackedVector, b::TrackedVector) = TrackedArray(Call($f, a, b)) + $f(a::TrackedVector, b::AbstractVector) = TrackedArray(Call($f, a, b)) + $f(a::AbstractVector, b::TrackedVector) = TrackedArray(Call($f, a, b)) + end +end function back(::typeof(*), Δ, a::AbstractMatrix, b::AbstractVecOrMat) @back(a, A_mul_Bt(Δ, data(b))) @back(b, At_mul_B(data(a), Δ)) end +function back(::typeof(Ac_mul_B), Δ, a::AbstractVecOrMat{<:Real}, b::AbstractVecOrMat{<:Real}) + @back(a, A_mul_Bt(Δ, data(b))') + @back(b, *(data(a), Δ)) +end + # Fast path for matrix-vector function back(::typeof(*), Δ::AbstractVector, W::TrackedMatrix, x::AbstractVector) if isleaf(W) diff --git a/test/tracker.jl b/test/tracker.jl index 52a73a07..69f37367 100644 --- a/test/tracker.jl +++ b/test/tracker.jl @@ -9,6 +9,8 @@ gradtest(f, dims...) = gradtest(f, rand.(dims)...) @test gradtest((x, W, b) -> σ.(W*x .+ b), 5, (2,5), 2) @test gradtest((x, W, b) -> σ.(W*x .+ b), (5,3), (2,5), 2) +@test gradtest((w, x) -> w'*x, randn(10, 2), randn(10)) + @test gradtest(x -> sin.(sum(x, (2, 3))), (3,4,5)) @test gradtest(x -> softmax(x).*(1:3), 3) From bdf02e42aee308125cf3a9a7a05bb3f7d24d4942 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 8 Nov 2017 22:00:31 +0000 Subject: [PATCH 19/32] test tweaks --- test/tracker.jl | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/test/tracker.jl b/test/tracker.jl index 69f37367..f2a369f8 100644 --- a/test/tracker.jl +++ b/test/tracker.jl @@ -39,18 +39,4 @@ end 2y + x end -for T in [Float32, Float64] - @test isa(param(T(1)), TrackedArray{T, 0}) - @test isa(param(rand(T, 2)), TrackedArray{T, 1}) - @test isa(param(rand(T, 2,2)), TrackedArray{T, 2}) -end - -# TODO: do we wand this behaviour ?? 
-F = typeof(AbstractFloat(1)) -for T in [Int32, Int64] - @test isa(param(T(1)), TrackedArray{F, 0}) - @test isa(param(rand(T, 2)), TrackedArray{F, 1}) - @test isa(param(rand(T, 2,2)), TrackedArray{F, 2}) -end - end #testset From e5d99d784ec23d32e679b9f5a72cacb32ac5d361 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 9 Nov 2017 14:53:26 +0000 Subject: [PATCH 20/32] fixes #79 --- src/onehot.jl | 11 +++++++++-- src/tracker/Tracker.jl | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/onehot.jl b/src/onehot.jl index 5414773c..f8061063 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -1,3 +1,5 @@ +import Base: * + struct OneHotVector <: AbstractVector{Bool} ix::UInt32 of::UInt32 @@ -7,7 +9,7 @@ Base.size(xs::OneHotVector) = (Int64(xs.of),) Base.getindex(xs::OneHotVector, i::Integer) = i == xs.ix -Base.:*(A::AbstractMatrix, b::OneHotVector) = A[:, b.ix] +A::AbstractMatrix * b::OneHotVector = A[:, b.ix] struct OneHotMatrix{A<:AbstractVector{OneHotVector}} <: AbstractMatrix{Bool} height::Int @@ -18,7 +20,7 @@ Base.size(xs::OneHotMatrix) = (Int64(xs.height),length(xs.data)) Base.getindex(xs::OneHotMatrix, i::Int, j::Int) = xs.data[j][i] -Base.:*(A::AbstractMatrix, B::OneHotMatrix) = A[:, map(x->x.ix, B.data)] +A::AbstractMatrix * B::OneHotMatrix = A[:, map(x->x.ix, B.data)] Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs...]) @@ -47,3 +49,8 @@ argmax(y::AbstractVector, labels = 1:length(y)) = argmax(y::AbstractMatrix, l...) = squeeze(mapslices(y -> argmax(y, l...), y, 1), 1) + +# Ambiguity hack + +a::TrackedMatrix * b::OneHotVector = TrackedArray(Tracker.Call(*, a, b)) +a::TrackedMatrix * b::OneHotMatrix = TrackedArray(Tracker.Call(*, a, b)) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 5e26a051..3a64fcb7 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -1,6 +1,6 @@ module Tracker -export TrackedArray, param, back! +export TrackedArray, TrackedVector, TrackedMatrix, param, back! 
data(x) = x istracked(x) = false From 2cb94981a0176f070eb2dec31c00ef125613ce3f Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Fri, 27 Oct 2017 12:05:37 +0100 Subject: [PATCH 21/32] gpu-ready log --- src/Flux.jl | 1 + src/layers/stateless.jl | 4 +-- src/numeric.jl | 80 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 src/numeric.jl diff --git a/src/Flux.jl b/src/Flux.jl index acefff19..ce3861e5 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -21,6 +21,7 @@ include("optimise/Optimise.jl") using .Optimise include("utils.jl") +include("numeric.jl") include("onehot.jl") include("treelike.jl") diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 3931c216..56d18349 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -3,12 +3,12 @@ mse(ŷ, y) = sum((ŷ .- y).^2)/length(y) crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat) = - -sum(y .* log.(ŷ)) / size(y, 2) + -sum(y .* log_fast.(ŷ)) / size(y, 2) @deprecate logloss(x, y) crossentropy(x, y) function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat) logŷ = logŷ .- maximum(logŷ, 1) - ypred = logŷ .- log.(sum(exp.(logŷ), 1)) + ypred = logŷ .- log_fast.(sum(exp.(logŷ), 1)) -sum(y .* ypred) / size(y, 2) end diff --git a/src/numeric.jl b/src/numeric.jl new file mode 100644 index 00000000..9c444043 --- /dev/null +++ b/src/numeric.jl @@ -0,0 +1,80 @@ +using Base.Math: @horner, significand_bits, exponent_raw_max, exponent_bias + +if VERSION < v"0.7.0-DEV.1430" + using Base.Math.fpinttype +else + using Base.uinttype +end + +# log_fast from +# https://github.com/musm/SLEEF.jl/blob/c9dcd2eb090d69ec40790f19798c5fef2aba2616/src/log.jl + +const MLN2 = 6.931471805599453094172321214581765680755001343602552541206800094933936219696955e-01 # log(2) + +@inline float2integer(d::Float64) = (reinterpret(Int64, d) >> significand_bits(Float64)) % Int +@inline float2integer(d::Float32) = (reinterpret(Int32, d) >> significand_bits(Float32)) % Int + +@inline function ilogb2k(d::T) where {T<:Union{Float32,Float64}} + (float2integer(d) & exponent_raw_max(T)) - exponent_bias(T) +end + +@inline function ldexp3k(x::T, e::Int) where {T<:Union{Float32,Float64}} + if VERSION < v"0.7.0-DEV.1430" + reinterpret(T, reinterpret(Unsigned, x) + (Int64(e) << significand_bits(T)) % fpinttype(T)) + else + reinterpret(T, reinterpret(Unsigned, x) + (Int64(e) << significand_bits(T)) % uinttype(T)) + end +end + +""" + log_fast(x) +Compute the natural logarithm of `x`. 
The inverse of the natural logarithm is +the natural expoenential function `exp(x)` +""" +function log_fast end + +let +global log_fast + +c8d = 0.153487338491425068243146 +c7d = 0.152519917006351951593857 +c6d = 0.181863266251982985677316 +c5d = 0.222221366518767365905163 +c4d = 0.285714294746548025383248 +c3d = 0.399999999950799600689777 +c2d = 0.6666666666667778740063 +c1d = 2.0 + +c5f = 0.2392828464508056640625f0 +c4f = 0.28518211841583251953125f0 +c3f = 0.400005877017974853515625f0 +c2f = 0.666666686534881591796875f0 +c1f = 2f0 + +global @inline log_fast_kernel(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d +global @inline log_fast_kernel(x::Float32) = @horner x c1f c2f c3f c4f c5f + +function log_fast(d::T) where {T<:Union{Float32,Float64}} + o = d < realmin(T) + o && (d *= T(Int64(1) << 32) * T(Int64(1) << 32)) + + e = ilogb2k(d * T(1.0/0.75)) + m = ldexp3k(d, -e) + o && (e -= 64) + + x = (m - 1) / (m + 1) + x2 = x * x + + t = log_fast_kernel(x2) + + x = x * t + T(MLN2) * e + + isinf(d) && (x = T(Inf)) + (d < 0 || isnan(d)) && (x = T(NaN)) + d == 0 && (x = -T(Inf)) + + return x +end +end + +log_fast(x::Union{Int32,Int64}) = log_fast(float(x)) From e0657d93ecccf1b1ac924a42909a0c79b9433df4 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 9 Nov 2017 15:03:57 +0000 Subject: [PATCH 22/32] mv numeric.jl to nnlib --- src/Flux.jl | 1 - src/layers/stateless.jl | 2 ++ src/numeric.jl | 80 ----------------------------------------- 3 files changed, 2 insertions(+), 81 deletions(-) delete mode 100644 src/numeric.jl diff --git a/src/Flux.jl b/src/Flux.jl index ce3861e5..acefff19 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -21,7 +21,6 @@ include("optimise/Optimise.jl") using .Optimise include("utils.jl") -include("numeric.jl") include("onehot.jl") include("treelike.jl") diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 56d18349..834068aa 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -1,3 +1,5 @@ +using NNlib: log_fast + # Cost functions mse(ŷ, y) = sum((ŷ .- y).^2)/length(y) diff --git a/src/numeric.jl b/src/numeric.jl deleted file mode 100644 index 9c444043..00000000 --- a/src/numeric.jl +++ /dev/null @@ -1,80 +0,0 @@ -using Base.Math: @horner, significand_bits, exponent_raw_max, exponent_bias - -if VERSION < v"0.7.0-DEV.1430" - using Base.Math.fpinttype -else - using Base.uinttype -end - -# log_fast from -# https://github.com/musm/SLEEF.jl/blob/c9dcd2eb090d69ec40790f19798c5fef2aba2616/src/log.jl - -const MLN2 = 6.931471805599453094172321214581765680755001343602552541206800094933936219696955e-01 # log(2) - -@inline float2integer(d::Float64) = (reinterpret(Int64, d) >> significand_bits(Float64)) % Int -@inline float2integer(d::Float32) = (reinterpret(Int32, d) >> significand_bits(Float32)) % Int - -@inline function ilogb2k(d::T) where {T<:Union{Float32,Float64}} - (float2integer(d) & exponent_raw_max(T)) - exponent_bias(T) -end - -@inline function ldexp3k(x::T, e::Int) where {T<:Union{Float32,Float64}} - if VERSION < v"0.7.0-DEV.1430" - reinterpret(T, reinterpret(Unsigned, x) + (Int64(e) << significand_bits(T)) % fpinttype(T)) - else - reinterpret(T, reinterpret(Unsigned, x) + (Int64(e) << significand_bits(T)) % uinttype(T)) - end -end - -""" - log_fast(x) -Compute the natural logarithm of `x`. 
The inverse of the natural logarithm is -the natural expoenential function `exp(x)` -""" -function log_fast end - -let -global log_fast - -c8d = 0.153487338491425068243146 -c7d = 0.152519917006351951593857 -c6d = 0.181863266251982985677316 -c5d = 0.222221366518767365905163 -c4d = 0.285714294746548025383248 -c3d = 0.399999999950799600689777 -c2d = 0.6666666666667778740063 -c1d = 2.0 - -c5f = 0.2392828464508056640625f0 -c4f = 0.28518211841583251953125f0 -c3f = 0.400005877017974853515625f0 -c2f = 0.666666686534881591796875f0 -c1f = 2f0 - -global @inline log_fast_kernel(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d -global @inline log_fast_kernel(x::Float32) = @horner x c1f c2f c3f c4f c5f - -function log_fast(d::T) where {T<:Union{Float32,Float64}} - o = d < realmin(T) - o && (d *= T(Int64(1) << 32) * T(Int64(1) << 32)) - - e = ilogb2k(d * T(1.0/0.75)) - m = ldexp3k(d, -e) - o && (e -= 64) - - x = (m - 1) / (m + 1) - x2 = x * x - - t = log_fast_kernel(x2) - - x = x * t + T(MLN2) * e - - isinf(d) && (x = T(Inf)) - (d < 0 || isnan(d)) && (x = T(NaN)) - d == 0 && (x = -T(Inf)) - - return x -end -end - -log_fast(x::Union{Int32,Int64}) = log_fast(float(x)) From 8991ce028ca02ed9d4c3286eba3468d2fe6e9ec1 Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Tue, 14 Nov 2017 17:32:16 +0100 Subject: [PATCH 23/32] Fix bug in rmsprop and adadelta MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `@. p.Δ = η * p.Δ / √acc` parses correctly while `@. p.Δ /= √acc*η` seems to parse like `@. p.Δ /= (√acc*η)`, hence the step size was de facto interpreted as `1/η` --- src/optimise/optimisers.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 95b31b98..1ffd8982 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -38,7 +38,7 @@ function rmsprop(p::Param; η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = 1e-8) acc = zeros(p.x) .+ ϵ function () @. acc = ρ * acc + (1 - ρ) * p.Δ ^ 2 - @. p.Δ /= √acc * η + @. p.Δ = η * p.Δ / √acc end end @@ -46,7 +46,7 @@ function adagrad(p::Param; η::Real = 0.01, ϵ::Real = 1e-8) acc = zeros(p.x) .+ ϵ function () @. acc += p.Δ ^ 2 - @. p.Δ /= √acc * η + @. p.Δ = η * p.Δ / √acc end end From 187fddc11c2f0733d5e6a1644c2167d8bde590ab Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 21 Nov 2017 12:29:02 +0100 Subject: [PATCH 24/32] doc fixes --- docs/src/models/layers.md | 1 + docs/src/training/optimisers.md | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 5d5d2ee8..f92f751a 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -36,5 +36,6 @@ swish These layers don't affect the structure of the network but may improve training times or reduce overfitting. ```@docs +Flux.testmode! 
Dropout ``` diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 3af5604b..56f511e4 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -58,8 +58,5 @@ All optimisers return a function that, when called, will update the parameters p SGD Momentum Nesterov -RMSProp ADAM -ADAGrad -ADADelta ``` From e51268caf57cb259a74a6f7f71bc4235b8891d90 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 21 Nov 2017 12:59:39 +0100 Subject: [PATCH 25/32] mention treelike --- docs/src/models/basics.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index 6fbd0792..02225279 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -151,3 +151,13 @@ m = Chain(x -> x^2, x -> x+1) m(5) # => 26 ``` + +## Layer helpers + +Flux provides a set of helpers for custom layers, which you can enable by calling + +```julia +Flux.treelike(Affine) +``` + +This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md). From 979949d01adab7bec0711771785eb02b6109788f Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 21 Nov 2017 15:25:09 +0100 Subject: [PATCH 26/32] style --- src/optimise/optimisers.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 1ffd8982..abc54090 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -38,7 +38,7 @@ function rmsprop(p::Param; η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = 1e-8) acc = zeros(p.x) .+ ϵ function () @. acc = ρ * acc + (1 - ρ) * p.Δ ^ 2 - @. p.Δ = η * p.Δ / √acc + @. p.Δ *= η / √acc end end @@ -46,7 +46,7 @@ function adagrad(p::Param; η::Real = 0.01, ϵ::Real = 1e-8) acc = zeros(p.x) .+ ϵ function () @. acc += p.Δ ^ 2 - @. p.Δ = η * p.Δ / √acc + @. p.Δ *= η / √acc end end From 11d53781b254bbb0fbe8a1c1313a3b05efc61112 Mon Sep 17 00:00:00 2001 From: skariel Date: Tue, 10 Oct 2017 23:33:37 +0300 Subject: [PATCH 27/32] adding layer normalization --- src/layers/basic.jl | 30 ++++++++++++++++++++++++++++++ src/layers/stateless.jl | 23 +++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 969a261c..03a340df 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -78,3 +78,33 @@ function Base.show(io::IO, l::Dense) l.σ == identity || print(io, ", ", l.σ) print(io, ")") end + +""" + ElementwiseLinear(in::Integer) + +Creates an element-wise linear transformation layer with learnable +vectors α and β: + + y = α .* x .+ b + +The input `x` must be a vector of length `in`, or a batch of vectors represented +as an `in × N` matrix. The out `y` will be a vector or batch of length `in`. 
+""" +struct ElementwiseLinear{T} + α::T + β::T +end + +ElementwiseLinear(in::Integer; initα = ones, initβ = zeros) = + ElementwiseLinear(param(initα(in)), param(initβ(in))) + +treelike(ElementwiseLinear) + +function (a::ElementwiseLinear)(x) + α, β = a.α, a.β + α.*x .+ β +end + +function Base.show(io::IO, l::ElementwiseLinear) + print(io, "ElementwiseLinear(", length(l.α), ")") +end diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 3931c216..8d0276e8 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -12,3 +12,26 @@ function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat) ypred = logŷ .- log.(sum(exp.(logŷ), 1)) -sum(y .* ypred) / size(y, 2) end + +""" + layernormalization(α=1.0, β=0.0) + +Creates a normalization layer based on https://arxiv.org/pdf/1607.06450.pdf + +The differences are: + +1) std here divides by N-1 (as does std in Julia) vs the paper N +2) this layer α and β are constant numbers (i.e. not learnable vectors) + +To achieve the same effect of learnable vectors α and β oe can use +the ElementwiseLinear layer +""" +function layernormalization(α=1.0, β=0.0) + function layer(y) + _mean = mean(y) + _std = sqrt.(sum((y.-_mean).^2) ./ (length(y)-1)) + _std /= α + _mean -= β*_std + return (y .- _mean) ./ _std + end +end From b06884b9123d9168104602c9855e4bc046bdecab Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Mon, 23 Oct 2017 12:53:07 +0100 Subject: [PATCH 28/32] LayerNorm tweaks --- docs/src/models/layers.md | 1 + src/Flux.jl | 2 +- src/layers/basic.jl | 19 +++++++++---------- src/layers/normalisation.jl | 22 ++++++++++++++++++++++ src/layers/stateless.jl | 24 ++++++------------------ 5 files changed, 39 insertions(+), 29 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index f92f751a..1fd87d41 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -38,4 +38,5 @@ These layers don't affect the structure of the network but may improve training ```@docs Flux.testmode! Dropout +LayerNorm ``` diff --git a/src/Flux.jl b/src/Flux.jl index acefff19..df4b1636 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -7,7 +7,7 @@ module Flux using Juno, Requires using Lazy: @forward -export Chain, Dense, RNN, LSTM, Dropout, +export Chain, Dense, RNN, LSTM, Dropout, LayerNorm, SGD, ADAM, Momentum, Nesterov, param, params, mapleaves diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 03a340df..3c47b595 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -80,31 +80,30 @@ function Base.show(io::IO, l::Dense) end """ - ElementwiseLinear(in::Integer) + Diagonal(in::Integer) Creates an element-wise linear transformation layer with learnable vectors α and β: y = α .* x .+ b -The input `x` must be a vector of length `in`, or a batch of vectors represented -as an `in × N` matrix. The out `y` will be a vector or batch of length `in`. +The input `x` must be a array where `size(x, 1) == in`. 
""" -struct ElementwiseLinear{T} +struct Diagonal{T} α::T β::T end -ElementwiseLinear(in::Integer; initα = ones, initβ = zeros) = - ElementwiseLinear(param(initα(in)), param(initβ(in))) +Diagonal(in::Integer; initα = ones, initβ = zeros) = + Diagonal(param(initα(in)), param(initβ(in))) -treelike(ElementwiseLinear) +treelike(Diagonal) -function (a::ElementwiseLinear)(x) +function (a::Diagonal)(x) α, β = a.α, a.β α.*x .+ β end -function Base.show(io::IO, l::ElementwiseLinear) - print(io, "ElementwiseLinear(", length(l.α), ")") +function Base.show(io::IO, l::Diagonal) + print(io, "Diagonal(", length(l.α), ")") end diff --git a/src/layers/normalisation.jl b/src/layers/normalisation.jl index 08c21428..d296b0a3 100644 --- a/src/layers/normalisation.jl +++ b/src/layers/normalisation.jl @@ -43,3 +43,25 @@ function (a::Dropout)(x) end _testmode!(a::Dropout, test) = (a.active = !test) + +""" + LayerNorm(h::Integer) + +A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be +used with recurrent hidden states of size `h`. Normalises the mean/stddev of +each input before applying a per-neuron gain/bias. +""" +struct LayerNorm{T} + diag::Diagonal{T} +end + +LayerNorm(h::Integer) = + LayerNorm(Diagonal(h)) + +treelike(LayerNorm) + +(a::LayerNorm)(x) = a.diag(normalise(x)) + +function Base.show(io::IO, l::LayerNorm) + print(io, "LayerNorm(", length(l.diag.α), ")") +end diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 8d0276e8..2a4b9a7c 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -14,24 +14,12 @@ function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat) end """ - layernormalization(α=1.0, β=0.0) + normalise(x::AbstractVecOrMat) -Creates a normalization layer based on https://arxiv.org/pdf/1607.06450.pdf - -The differences are: - -1) std here divides by N-1 (as does std in Julia) vs the paper N -2) this layer α and β are constant numbers (i.e. not learnable vectors) - -To achieve the same effect of learnable vectors α and β oe can use -the ElementwiseLinear layer +Normalise each column of `x` to mean 0 and standard deviation 1. """ -function layernormalization(α=1.0, β=0.0) - function layer(y) - _mean = mean(y) - _std = sqrt.(sum((y.-_mean).^2) ./ (length(y)-1)) - _std /= α - _mean -= β*_std - return (y .- _mean) ./ _std - end +function normalise(x::AbstractVecOrMat) + μ′ = mean(x, 1) + σ′ = std(x, 1, mean = μ′) + return (x .- μ′) ./ σ′ end From 351d3d4771da08e53d2a2f89547f91d5fdb47beb Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 21 Nov 2017 17:04:04 +0100 Subject: [PATCH 29/32] std derivative --- src/layers/basic.jl | 4 ++-- src/tracker/lib.jl | 6 ++++++ test/tracker.jl | 3 +++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 3c47b595..aa101c43 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -83,9 +83,9 @@ end Diagonal(in::Integer) Creates an element-wise linear transformation layer with learnable -vectors α and β: +vectors `α` and `β`: - y = α .* x .+ b + y = α .* x .+ β The input `x` must be a array where `size(x, 1) == in`. """ diff --git a/src/tracker/lib.jl b/src/tracker/lib.jl index aab26dfe..5065a40d 100644 --- a/src/tracker/lib.jl +++ b/src/tracker/lib.jl @@ -58,6 +58,12 @@ Base.findfirst(xs::TrackedArray, args...) = findfirst(xs.data, args...) 
Base.mean(xs::TrackedArray) = TrackedArray(Call(mean, xs), toarray(xs.data, mean(xs.data))) Base.mean(xs::TrackedArray, region) = TrackedArray(Call(mean, xs, region)) +# Hacks to get std working +Base.std(x::TrackedArray; mean = Base.mean(x)) = + sqrt.(sum((x .- mean).^2) ./ (length(x)-1)) +Base.std(x::TrackedArray, dim; mean = Base.mean(x, dim)) = + sqrt.(sum((x .- mean).^2, dim) ./ (size(x, dim)-1)) + back(::typeof(mean), Δ, xs::TrackedArray) = back(xs, similar(xs.data) .= Δ ./ length(xs.data)) back(::typeof(mean), Δ, xs::TrackedArray, region) = back(xs, similar(xs.data) .= Δ ./ prod(size(xs.data, region...))) diff --git a/test/tracker.jl b/test/tracker.jl index f2a369f8..81a72566 100644 --- a/test/tracker.jl +++ b/test/tracker.jl @@ -34,6 +34,9 @@ gradtest(f, dims...) = gradtest(f, rand.(dims)...) @test gradtest(x -> mean(x, [1, 2]), rand(2, 3, 4)) end +@test gradtest(x -> std(x), rand(5,5)) +@test gradtest(x -> std(x, 1), rand(5,5)) + @test gradtest(rand(5)) do x y = x.^2 2y + x From 2d33f19346b48dd76559926b62ba1dd7cd978ba7 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 29 Nov 2017 16:45:50 +0000 Subject: [PATCH 30/32] onehot unk arg --- src/onehot.jl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/onehot.jl b/src/onehot.jl index f8061063..f94fb93e 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -42,7 +42,14 @@ function onehot(l, labels) OneHotVector(i, length(labels)) end -onehotbatch(ls, labels) = OneHotMatrix(length(labels), [onehot(l, labels) for l in ls]) +function onehot(l, labels, unk) + i = findfirst(labels, l) + i > 0 || return onehot(unk, labels) + OneHotVector(i, length(labels)) +end + +onehotbatch(ls, labels, unk...) = + OneHotMatrix(length(labels), [onehot(l, labels, unk...) for l in ls]) argmax(y::AbstractVector, labels = 1:length(y)) = labels[findfirst(y, maximum(y))] From 19039f48819835bf01ea6f2f69792f53dfe7d4f8 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 30 Nov 2017 13:37:38 +0000 Subject: [PATCH 31/32] export sigmoid --- src/Flux.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Flux.jl b/src/Flux.jl index df4b1636..7671ddd2 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -12,7 +12,7 @@ export Chain, Dense, RNN, LSTM, Dropout, LayerNorm, param, params, mapleaves using NNlib -export σ, relu, leakyrelu, elu, swish, softmax +export σ, sigmoid, relu, leakyrelu, elu, swish, softmax include("tracker/Tracker.jl") using .Tracker From cab235a57863558aa060a28776f8934d5a0a0ed4 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 30 Nov 2017 13:51:31 +0000 Subject: [PATCH 32/32] gpu compat --- src/tracker/Tracker.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 3a64fcb7..74ed2d75 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -40,7 +40,7 @@ TrackedArray(x::AbstractArray) = TrackedArray(Call(nothing), x, zeros(x)) isleaf(x::TrackedArray) = x.f == Call(nothing) -param(xs) = TrackedArray(AbstractFloat.(xs)) +param(xs) = TrackedArray(map(x -> AbstractFloat(x), xs)) param(xs::Real) = param(fill(xs)) istracked(x::TrackedArray) = true
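
For reference, the new regularisation layers and `Flux.testmode!` compose with the existing layers along these lines (a minimal sketch; the layer sizes, dropout probability and random input are arbitrary placeholders):

```julia
using Flux

m = Chain(Dense(784, 128, relu),
          Dropout(0.5),     # while active, zeroes each activation with probability 0.5
                            # and rescales the survivors by 1/(1 - 0.5)
          LayerNorm(128),   # normalises the activations, then applies a learnable
                            # per-neuron gain and bias (the Diagonal layer)
          Dense(128, 10))

x = rand(784)
m(x)                      # training mode: dropout is applied stochastically

Flux.testmode!(m)         # put Dropout into test mode for evaluation
m(x)                      # deterministic: Dropout passes its input through unchanged
Flux.testmode!(m, false)  # back to training mode
```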