From 13b934c2500b8e39ac24c834079b562057dede5a Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Thu, 12 Oct 2017 10:31:38 +0200 Subject: [PATCH 1/3] improve optimizers --- src/data/cmudict.jl | 3 +- src/optimise/interface.jl | 50 +++++++++++----------- src/optimise/optimisers.jl | 85 +++++++++++++++++++++----------------- src/tracker/Tracker.jl | 2 + test/optimise.jl | 19 +++++++++ test/runtests.jl | 1 + 6 files changed, 98 insertions(+), 62 deletions(-) create mode 100644 test/optimise.jl diff --git a/src/data/cmudict.jl b/src/data/cmudict.jl index 88b9c6c0..a23c6a3d 100644 --- a/src/data/cmudict.jl +++ b/src/data/cmudict.jl @@ -33,7 +33,8 @@ function rawdict() filter(!isempty, split.(split(readstring(deps("CMUDict", "cmudict")), "\n")))) end -validword(s) = ismatch(r"^[\w-\.]+$", s) +# validword(s) = ismatch(r"^[\w-\.]+$", s) +validword(s) = ismatch(r"^\[\w-\.\]+$", s) cmudict() = filter((s, ps) -> validword(s), rawdict()) diff --git a/src/optimise/interface.jl b/src/optimise/interface.jl index 0b2a25ae..47b0f62c 100644 --- a/src/optimise/interface.jl +++ b/src/optimise/interface.jl @@ -1,5 +1,7 @@ call(f, xs...) = f(xs...) +# note for optimisers: set to zero +# p.Δ at the end of the weigths update function optimiser(ps, fs...) ps = [Param(p) for p in ps] fs = map(ps) do p @@ -10,64 +12,64 @@ function optimiser(ps, fs...) end """ - SGD(params, η = 1; decay = 0) + SGD(params, η = 0.1; decay = 0) -Classic gradient descent optimiser. For each parameter `p` and its -gradient `δp`, this runs `p -= η*δp`. +Classic gradient descent optimiser with learning rate `η`. +For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`. -Supports decayed learning rate decay if the `decay` argument is provided. +Supports inverse decaying learning rate if the `decay` argument is provided. """ -SGD(ps, η = 1; decay = 0) = - optimiser(ps, p -> invdecay(p, decay), p -> descent(p, η)) +SGD(ps, η = 0.1; decay = 0) = + optimiser(ps, p -> invdecay(p, decay), p -> descent(p,η)) """ - Momentum(params, ρ, decay = 0) + Momentum(params, η = 0.01; ρ = 0.9, decay = 0) -SGD with momentum `ρ` and optional learning rate decay. +SGD with learning rate `η`, momentum `ρ` and optional learning rate inverse decay. """ -Momentum(ps, ρ; decay = 0) = - optimiser(ps, p -> momentum(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1)) +Momentum(ps, η = 0.01; ρ = 0.9, decay = 0) = + optimiser(ps, p->invdecay(p,decay), p->momentum(p, ρ, η), p->descent(p,1)) """ - Nesterov(params, ρ, decay = 0) + Nesterov(params, η = 0.01; ρ = 0.9, decay = 0) -SGD with Nesterov momentum `ρ` and optional learning rate decay. +SGD with learning rate `η`, Nesterov momentum `ρ` and optional learning rate inverse decay. """ -Nesterov(ps, ρ; decay = 0) = - optimiser(ps, p -> nesterov(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1)) +Nesterov(ps, η = 0.01; ρ = 0.9, decay = 0) = + optimiser(ps, p->invdecay(p,decay), p->nesterov(p, ρ, η), p->descent(p,1)) """ - RMSProp(params; η = 0.001, ρ = 0.9, ϵ = 1e-8, decay = 0) + RMSProp(params, η = 0.001; ρ = 0.9, ϵ = 1e-8, decay = 0) [RMSProp](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) optimiser. Parameters other than learning rate don't need tuning. Often a good choice for recurrent networks. """ RMSProp(ps, η = 0.001; ρ = 0.9, ϵ = 1e-8, decay = 0) = - optimiser(ps, p -> rmsprop(p; η = η, ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) + optimiser(ps, p->rmsprop(p; η=η, ρ=ρ, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1)) """ - ADAM(params; η = 0.001, β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) + ADAM(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. """ ADAM(ps, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) = - optimiser(ps, p -> adam(p; η = η, β1 = β1, β2 = β2, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) + optimiser(ps, p->adam(p; η=η, β1=β1, β2=β2, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1)) """ - ADAGrad(params; η = 0.01, ϵ = 1e-8, decay = 0) + ADAGrad(params, η = 0.01; ϵ = 1e-8, decay = 0) [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. Parameters don't need tuning. """ -ADAGrad(ps; η = 0.01, ϵ = 1e-8, decay = 0) = - optimiser(ps, p -> adagrad(p; η = η, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) +ADAGrad(ps, η = 0.01; ϵ = 1e-8, decay = 0) = + optimiser(ps, p->adagrad(p; η=η, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1)) """ - ADADelta(params; η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0) + ADADelta(params; ρ = 0.9, ϵ = 1e-8, decay = 0) [ADADelta](http://arxiv.org/abs/1212.5701) optimiser. Parameters don't need tuning. """ -ADADelta(ps; η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0) = - optimiser(ps, p -> adadelta(p; ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) +ADADelta(ps; ρ = 0.9, ϵ = 1e-8, decay = 0) = + optimiser(ps, p->adadelta(p; ρ=ρ, ϵ=ϵ), p->descent(p,1)) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index abc54090..7cf271b6 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -1,74 +1,85 @@ function descent(p::Param, η::Real) function () - p.x .-= p.Δ .* η - p.Δ .= 0 + @. p.x -= η * p.Δ + @. p.Δ = 0 end end -function momentum(p::Param, ρ::Real) - mo = zeros(p.x) - () -> p.Δ .= mo .= ρ .* mo .+ p.Δ -end - -function nesterov(p::Param, ρ::Real) - mo = zeros(p.x) +function momentum(p::Param, ρ, η) + v = zeros(p.x) function () - mo .= ρ .* mo .+ p.Δ - p.Δ .= ρ .* mo .+ p.Δ + @. v = ρ * v - η * p.Δ + @. p.Δ = -v end end -function clip(p::Param, thresh::Real) - () -> clamp!(p.Δ, -thresh, thresh) -end - -function weightdecay(p::Param, γ::Real) - () -> p.Δ .+= γ .* p.x -end - -function invdecay(p::Param, γ::Real) - n = 0 +# Ref. https://arxiv.org/pdf/1212.0901.pdf +function nesterov(p::Param, ρ, η) + v = zeros(p.x) function () - p.Δ .*= 1 / (1 + γ * n) - n += 1 + d = @. ρ^2 * v - (1+ρ) * η * p.Δ + @. v = ρ*v - η*p.Δ + @. p.Δ = -d end end function rmsprop(p::Param; η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = 1e-8) - acc = zeros(p.x) .+ ϵ + acc = zeros(p.x) function () - @. acc = ρ * acc + (1 - ρ) * p.Δ ^ 2 - @. p.Δ *= η / √acc + @. acc = ρ * acc + (1 - ρ) * p.Δ^2 + @. p.Δ *= η / (√acc + ϵ) end end function adagrad(p::Param; η::Real = 0.01, ϵ::Real = 1e-8) acc = zeros(p.x) .+ ϵ function () - @. acc += p.Δ ^ 2 + @. acc += p.Δ^2 @. p.Δ *= η / √acc end end -function adadelta(p::Param; ρ::Real = 0.95, ϵ::Real = 1e-8) - acc = zeros(p.x) .+ ϵ - Δacc = zeros(p.x) .+ ϵ +function adadelta(p::Param; ρ::Real = 0.9, ϵ::Real = 1e-8) + acc = zeros(p.x) + Δacc = zeros(p.x) function () - @. acc = ρ * acc + (1 - ρ) * p.Δ ^ 2 - @. p.Δ *= √Δacc / √acc - @. Δacc = ρ * Δacc + (1 - ρ) * p.Δ ^ 2 - end + @. acc = ρ * acc + (1 - ρ) * p.Δ^2 + @. p.Δ *= √(Δacc + ϵ) / √(acc + ϵ) + @. Δacc = ρ * Δacc + (1 - ρ) * p.Δ^2 + end end function adam(p::Param; η::Real = 0.001, β1::Real = 0.9, β2::Real = 0.999, ϵ::Real = 1e-8) mt = zeros(p.x) - vt = zeros(p.x) .+ ϵ + vt = zeros(p.x) β1p, β2p = β1, β2 function () @. mt = β1 * mt + (1 - β1) * p.Δ - @. vt = β2 * vt + (1 - β2) * p.Δ ^ 2 - @. p.Δ = √(1 - β2p) / √(1 - β1p) * mt / √vt * η + @. vt = β2 * vt + (1 - β2) * p.Δ^2 + @. p.Δ = mt / (1 - β1p) / (sqrt(vt / (1 - β2p)) + ϵ) * η β1p *= β1 β2p *= β2 end end + +clip(p::Param, thresh::Real) = () -> clamp!(p.Δ, -thresh, thresh) + +function expdecay(p::Param, γ::Real) + if γ != 0 + return () -> p.Δ .+= γ .* p.x + else + return () -> nothing + end +end + +function invdecay(p::Param, γ::Real) + if γ != 0 + n = 0 + return () -> begin + p.Δ .*= 1 / (1 + γ * n) + n += 1 + end + else + return () -> nothing + end +end \ No newline at end of file diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 3a64fcb7..57bdc447 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -58,6 +58,7 @@ Base.similar(x::TrackedArray, dims::Union{AbstractUnitRange,Integer}...) = Base.similar(x::TrackedArray, T::Type) = similar(data(x), T) +# TODO decide if keeping both data and value. The problem is TrackedScalar value(x) = x value(x::TrackedArray) = data(x) value(x::TrackedScalar) = data(x)[] @@ -69,6 +70,7 @@ Base.:(==)(x::TrackedArray, y::TrackedArray) = value(x) == value(x) Base.isless(x::TrackedScalar, y) = isless(value(x), y) Base.isless(x, y::TrackedScalar) = isless(x, value(y)) Base.isless(x::TrackedScalar, y::TrackedScalar) = isless(value(x), value(y)) +Base.isapprox(x::TrackedScalar, y; kws...) = isapprox(x.data[], y; kws...) Base.show(io::IO, ::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} = print(io, "TrackedArray{…,$A}") diff --git a/test/optimise.jl b/test/optimise.jl new file mode 100644 index 00000000..85fd53f9 --- /dev/null +++ b/test/optimise.jl @@ -0,0 +1,19 @@ +using Flux.Optimise +using Flux.Tracker + +@testset "Optimise" begin + loss(x) = sum(x.^2) + η = 0.1 + # RMSProp gets stuck + for OPT in [SGD, Nesterov, Momentum, ADAM, ADAGrad, ADADelta] + x = param(randn(10)) + opt = OPT == ADADelta ? OPT([x]) : OPT([x], η) + for t=1:10000 + l = loss(x) + back!(l) + opt() + l.data[] < 1e-10 && break + end + @test loss(x) ≈ 0. atol=1e-7 + end +end diff --git a/test/runtests.jl b/test/runtests.jl index efd1a462..bdd1f2d0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,5 +5,6 @@ using Flux, Base.Test include("utils.jl") include("tracker.jl") include("layers/normalisation.jl") +include("optimise.jl") end From 951c21366a54ab60899f2e9955c05bd8ebaedf5b Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Fri, 8 Dec 2017 16:42:30 +0000 Subject: [PATCH 2/3] fix regex --- src/data/cmudict.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/data/cmudict.jl b/src/data/cmudict.jl index a23c6a3d..9ec567b4 100644 --- a/src/data/cmudict.jl +++ b/src/data/cmudict.jl @@ -33,8 +33,7 @@ function rawdict() filter(!isempty, split.(split(readstring(deps("CMUDict", "cmudict")), "\n")))) end -# validword(s) = ismatch(r"^[\w-\.]+$", s) -validword(s) = ismatch(r"^\[\w-\.\]+$", s) +validword(s) = ismatch(r"^[\w\-\.]+$", s) cmudict() = filter((s, ps) -> validword(s), rawdict()) From 69cc5642b48b685bbbf109af310384f8eae917e4 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Fri, 8 Dec 2017 17:10:29 +0000 Subject: [PATCH 3/3] regression testing --- test/optimise.jl | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/test/optimise.jl b/test/optimise.jl index 85fd53f9..65bb65be 100644 --- a/test/optimise.jl +++ b/test/optimise.jl @@ -2,18 +2,16 @@ using Flux.Optimise using Flux.Tracker @testset "Optimise" begin - loss(x) = sum(x.^2) - η = 0.1 - # RMSProp gets stuck - for OPT in [SGD, Nesterov, Momentum, ADAM, ADAGrad, ADADelta] - x = param(randn(10)) - opt = OPT == ADADelta ? OPT([x]) : OPT([x], η) - for t=1:10000 - l = loss(x) - back!(l) - opt() - l.data[] < 1e-10 && break - end - @test loss(x) ≈ 0. atol=1e-7 + w = randn(10, 10) + for Opt in [SGD, Nesterov, Momentum, ADAM, RMSProp, ps -> ADAGrad(ps, 0.1), ADADelta] + w′ = param(randn(10, 10)) + loss(x) = Flux.mse(w*x, w′*x) + opt = Opt([w′]) + for t=1:10^5 + l = loss(rand(10)) + back!(l) + opt() end + @test Flux.mse(w, w′) < 0.01 + end end