improve optimizers

This commit is contained in:
CarloLucibello 2017-10-12 10:31:38 +02:00
parent dc1f08a709
commit 13b934c250
6 changed files with 98 additions and 62 deletions

View File

@ -33,7 +33,8 @@ function rawdict()
filter(!isempty, split.(split(readstring(deps("CMUDict", "cmudict")), "\n")))) filter(!isempty, split.(split(readstring(deps("CMUDict", "cmudict")), "\n"))))
end end
validword(s) = ismatch(r"^[\w-\.]+$", s) # validword(s) = ismatch(r"^[\w-\.]+$", s)
# A word is valid when it consists only of word characters, hyphens and dots.
# The brackets must stay unescaped so they form a character class; the hyphen is
# escaped so it is a literal and cannot be read as a range after `\w`.
validword(s) = ismatch(r"^[\w\-\.]+$", s)
cmudict() = filter((s, ps) -> validword(s), rawdict()) cmudict() = filter((s, ps) -> validword(s), rawdict())

View File

@ -1,5 +1,7 @@
call(f, xs...) = f(xs...) call(f, xs...) = f(xs...)
# Note for optimisers: set p.Δ to zero
# at the end of the weights update.
function optimiser(ps, fs...) function optimiser(ps, fs...)
ps = [Param(p) for p in ps] ps = [Param(p) for p in ps]
fs = map(ps) do p fs = map(ps) do p
@ -10,64 +12,64 @@ function optimiser(ps, fs...)
end end
""" """
SGD(params, η = 1; decay = 0) SGD(params, η = 0.1; decay = 0)
Classic gradient descent optimiser. For each parameter `p` and its Classic gradient descent optimiser with learning rate `η`.
gradient `δp`, this runs `p -= η*δp`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`.
Supports decayed learning rate decay if the `decay` argument is provided. Supports inverse decaying learning rate if the `decay` argument is provided.
""" """
SGD(ps, η = 1; decay = 0) = SGD(ps, η = 0.1; decay = 0) =
optimiser(ps, p -> invdecay(p, decay), p -> descent(p, η)) optimiser(ps, p -> invdecay(p, decay), p -> descent(p,η))
""" """
Momentum(params, ρ, decay = 0) Momentum(params, η = 0.01; ρ = 0.9, decay = 0)
SGD with momentum `ρ` and optional learning rate decay. SGD with learning rate `η`, momentum `ρ` and optional learning rate inverse decay.
""" """
Momentum(ps, ρ; decay = 0) = Momentum(ps, η = 0.01; ρ = 0.9, decay = 0) =
optimiser(ps, p -> momentum(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1)) optimiser(ps, p->invdecay(p,decay), p->momentum(p, ρ, η), p->descent(p,1))
""" """
Nesterov(params, ρ, decay = 0) Nesterov(params, η = 0.01; ρ = 0.9, decay = 0)
SGD with Nesterov momentum `ρ` and optional learning rate decay. SGD with learning rate `η`, Nesterov momentum `ρ` and optional learning rate inverse decay.
""" """
Nesterov(ps, ρ; decay = 0) = Nesterov(ps, η = 0.01; ρ = 0.9, decay = 0) =
optimiser(ps, p -> nesterov(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1)) optimiser(ps, p->invdecay(p,decay), p->nesterov(p, ρ, η), p->descent(p,1))
""" """
RMSProp(params; η = 0.001, ρ = 0.9, ϵ = 1e-8, decay = 0) RMSProp(params, η = 0.001; ρ = 0.9, ϵ = 1e-8, decay = 0)
[RMSProp](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) [RMSProp](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
optimiser. Parameters other than learning rate don't need tuning. Often a good optimiser. Parameters other than learning rate don't need tuning. Often a good
choice for recurrent networks. choice for recurrent networks.
""" """
RMSProp(ps, η = 0.001; ρ = 0.9, ϵ = 1e-8, decay = 0) = RMSProp(ps, η = 0.001; ρ = 0.9, ϵ = 1e-8, decay = 0) =
optimiser(ps, p -> rmsprop(p; η = η, ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) optimiser(ps, p->rmsprop(p; η=η, ρ=ρ, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
""" """
ADAM(params; η = 0.001, β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) ADAM(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0)
[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.
""" """
ADAM(ps, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) = ADAM(ps, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) =
optimiser(ps, p -> adam(p; η = η, β1 = β1, β2 = β2, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) optimiser(ps, p->adam(p; η=η, β1=β1, β2=β2, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
""" """
ADAGrad(params; η = 0.01, ϵ = 1e-8, decay = 0) ADAGrad(params, η = 0.01; ϵ = 1e-8, decay = 0)
[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser.
Parameters don't need tuning. Parameters don't need tuning.
""" """
ADAGrad(ps; η = 0.01, ϵ = 1e-8, decay = 0) = ADAGrad(ps, η = 0.01; ϵ = 1e-8, decay = 0) =
optimiser(ps, p -> adagrad(p; η = η, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) optimiser(ps, p->adagrad(p; η=η, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
""" """
ADADelta(params; η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0) ADADelta(params; ρ = 0.9, ϵ = 1e-8, decay = 0)
[ADADelta](http://arxiv.org/abs/1212.5701) optimiser. Parameters don't need [ADADelta](http://arxiv.org/abs/1212.5701) optimiser. Parameters don't need
tuning. tuning.
""" """
ADADelta(ps; η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0) = ADADelta(ps; ρ = 0.9, ϵ = 1e-8, decay = 0) =
optimiser(ps, p -> adadelta(p; ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) optimiser(ps, p->adadelta(p; ρ=ρ, ϵ=ϵ), p->descent(p,1))

View File

@ -1,74 +1,85 @@
function descent(p::Param, η::Real) function descent(p::Param, η::Real)
function () function ()
p.x .-= p.Δ .* η @. p.x -= η * p.Δ
p.Δ .= 0 @. p.Δ = 0
end end
end end
function momentum(p::Param, ρ::Real) function momentum(p::Param, ρ, η)
mo = zeros(p.x) v = zeros(p.x)
() -> p.Δ .= mo .= ρ .* mo .+ p.Δ
end
function nesterov(p::Param, ρ::Real)
mo = zeros(p.x)
function () function ()
mo .= ρ .* mo .+ p.Δ @. v = ρ * v - η * p.Δ
p.Δ .= ρ .* mo .+ p.Δ @. p.Δ = -v
end end
end end
function clip(p::Param, thresh::Real) # Ref. https://arxiv.org/pdf/1212.0901.pdf
() -> clamp!(p.Δ, -thresh, thresh) function nesterov(p::Param, ρ, η)
end v = zeros(p.x)
function weightdecay(p::Param, γ::Real)
() -> p.Δ .+= γ .* p.x
end
function invdecay(p::Param, γ::Real)
n = 0
function () function ()
p.Δ .*= 1 / (1 + γ * n) d = @. ρ^2 * v - (1+ρ) * η * p.Δ
n += 1 @. v = ρ*v - η*p.Δ
@. p.Δ = -d
end end
end end
# RMSProp step for parameter `p`.
#
# Keeps a running average `acc` of the squared gradient (decay rate `ρ`) and
# rescales the gradient `p.Δ` in place by `η / (sqrt(acc) + ϵ)`.
# The square root matters: dividing by the mean square itself (rather than the
# root mean square) makes the step blow up as the gradient shrinks.
# `ϵ` is added outside the square root to avoid division by zero.
# Returns a zero-argument closure performing one update.
function rmsprop(p::Param; η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = 1e-8)
    acc = zeros(p.x)
    return function ()
        @. acc = ρ * acc + (1 - ρ) * p.Δ^2
        @. p.Δ *= η / (sqrt(acc) + ϵ)
    end
end
# ADAGrad step for parameter `p`.
#
# Accumulates the sum of squared gradients in `acc` (seeded with `ϵ` so the
# first division is well defined) and rescales `p.Δ` in place by
# `η / sqrt(acc)`. The square root of the accumulator is required by the
# ADAGrad rule; dividing by the raw sum of squares shrinks steps far too fast.
# Returns a zero-argument closure performing one update.
function adagrad(p::Param; η::Real = 0.01, ϵ::Real = 1e-8)
    acc = zeros(p.x) .+ ϵ
    return function ()
        @. acc += p.Δ^2
        @. p.Δ *= η / sqrt(acc)
    end
end
# ADADelta step for parameter `p`.
#
# `acc` is the running average of squared gradients and `Δacc` the running
# average of squared updates, both with decay rate `ρ`. The gradient `p.Δ` is
# rescaled in place by the ratio of the two RMS values,
# `sqrt(Δacc + ϵ) / sqrt(acc + ϵ)`; without the square roots the ratio is of
# mean squares and the step size is wrong.
# `Δacc` is refreshed from the already-rescaled `p.Δ`, so statement order matters.
# Returns a zero-argument closure performing one update.
function adadelta(p::Param; ρ::Real = 0.9, ϵ::Real = 1e-8)
    acc = zeros(p.x)
    Δacc = zeros(p.x)
    return function ()
        @. acc = ρ * acc + (1 - ρ) * p.Δ^2
        @. p.Δ *= sqrt(Δacc + ϵ) / sqrt(acc + ϵ)
        @. Δacc = ρ * Δacc + (1 - ρ) * p.Δ^2
    end
end
function adam(p::Param; η::Real = 0.001, β1::Real = 0.9, β2::Real = 0.999, ϵ::Real = 1e-8) function adam(p::Param; η::Real = 0.001, β1::Real = 0.9, β2::Real = 0.999, ϵ::Real = 1e-8)
mt = zeros(p.x) mt = zeros(p.x)
vt = zeros(p.x) .+ ϵ vt = zeros(p.x)
β1p, β2p = β1, β2 β1p, β2p = β1, β2
function () function ()
@. mt = β1 * mt + (1 - β1) * p.Δ @. mt = β1 * mt + (1 - β1) * p.Δ
@. vt = β2 * vt + (1 - β2) * p.Δ ^ 2 @. vt = β2 * vt + (1 - β2) * p.Δ^2
@. p.Δ = (1 - β2p) / (1 - β1p) * mt / vt * η @. p.Δ = mt / (1 - β1p) / (sqrt(vt / (1 - β2p)) + ϵ) * η
β1p *= β1 β1p *= β1
β2p *= β2 β2p *= β2
end end
end end
# Build the step that clamps every entry of the gradient `p.Δ`, in place,
# to the interval [-thresh, thresh].
function clip(p::Param, thresh::Real)
    function ()
        clamp!(p.Δ, -thresh, thresh)
    end
end
# Build the step that adds `γ .* p.x` to the gradient `p.Δ` in place.
# A zero `γ` yields a closure that does nothing and returns `nothing`.
# NOTE(review): the visible update adds γ·p.x to the gradient, i.e. it looks
# like plain weight decay despite the name — confirm the intended naming.
function expdecay(p::Param, γ::Real)
    γ == 0 && return () -> nothing
    function ()
        p.Δ .+= γ .* p.x
    end
end
# Build the step that scales the gradient `p.Δ` in place by `1 / (1 + γ * n)`,
# where `n` counts the calls made so far (0 on the first call, so the first
# step leaves the gradient unchanged). A zero `γ` yields a no-op closure.
function invdecay(p::Param, γ::Real)
    γ == 0 && return () -> nothing
    n = 0
    function ()
        p.Δ .*= 1 / (1 + γ * n)
        n += 1
    end
end

View File

@ -58,6 +58,7 @@ Base.similar(x::TrackedArray, dims::Union{AbstractUnitRange,Integer}...) =
Base.similar(x::TrackedArray, T::Type) = similar(data(x), T) Base.similar(x::TrackedArray, T::Type) = similar(data(x), T)
# TODO decide if keeping both data and value. The problem is TrackedScalar
value(x) = x value(x) = x
value(x::TrackedArray) = data(x) value(x::TrackedArray) = data(x)
value(x::TrackedScalar) = data(x)[] value(x::TrackedScalar) = data(x)[]
@ -69,6 +70,7 @@ Base.:(==)(x::TrackedArray, y::TrackedArray) = value(x) == value(x)
Base.isless(x::TrackedScalar, y) = isless(value(x), y) Base.isless(x::TrackedScalar, y) = isless(value(x), y)
Base.isless(x, y::TrackedScalar) = isless(x, value(y)) Base.isless(x, y::TrackedScalar) = isless(x, value(y))
Base.isless(x::TrackedScalar, y::TrackedScalar) = isless(value(x), value(y)) Base.isless(x::TrackedScalar, y::TrackedScalar) = isless(value(x), value(y))
# Approximate comparison with a tracked scalar on the left-hand side: reads the
# stored value via `x.data[]` and forwards `y` and all keyword arguments
# (e.g. `atol`) to `isapprox`.
Base.isapprox(x::TrackedScalar, y; kws...) = isapprox(x.data[], y; kws...)
Base.show(io::IO, ::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} = Base.show(io::IO, ::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} =
print(io, "TrackedArray{…,$A}") print(io, "TrackedArray{…,$A}")

19
test/optimise.jl Normal file
View File

@ -0,0 +1,19 @@
using Flux.Optimise
using Flux.Tracker
# Each optimiser must drive a simple quadratic loss to (approximately) zero
# from a random starting point.
@testset "Optimise" begin
    loss(x) = sum(x.^2)
    η = 0.1
    # RMSProp is left out of the list below: it gets stuck on this problem.
    for OPT in [SGD, Nesterov, Momentum, ADAM, ADAGrad, ADADelta]
        x = param(randn(10))
        # ADADelta takes no learning rate; every other optimiser gets η.
        opt = OPT == ADADelta ? OPT([x]) : OPT([x], η)
        for t = 1:10000
            l = loss(x)
            back!(l)
            opt()
            # Stop early once the loss is already negligible.
            l.data[] < 1e-10 && break
        end
        # The comparison operator is required: `@test a b atol=...` is not a
        # valid test expression.
        @test loss(x) ≈ 0.0 atol=1e-7
    end
end

View File

@ -5,5 +5,6 @@ using Flux, Base.Test
include("utils.jl") include("utils.jl")
include("tracker.jl") include("tracker.jl")
include("layers/normalisation.jl") include("layers/normalisation.jl")
include("optimise.jl")
end end