diff --git a/src/optimise/interface.jl b/src/optimise/interface.jl
index 42b05dc8..29068983 100644
--- a/src/optimise/interface.jl
+++ b/src/optimise/interface.jl
@@ -56,6 +56,15 @@ RMSProp(ps, η = 0.001; ρ = 0.9, ϵ = 1e-8, decay = 0) =
 ADAM(ps, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) =
   optimiser(ps, p->adam(p; η=η, β1=β1, β2=β2, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
 
+"""
+    AdaMax(params, η = 0.002; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0)
+
+[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on
+the ∞-norm.
+"""
+AdaMax(ps, η = 0.002; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) =
+  optimiser(ps, p->adamax(p; η=η, β1=β1, β2=β2, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
+
 """
     ADAGrad(params, η = 0.01; ϵ = 1e-8, decay = 0)
 
diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index c09e6131..569e69aa 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -62,6 +62,18 @@ function adam(p::Param; η::Real = 0.001, β1::Real = 0.9, β2::Real = 0.999, ϵ
   end
 end
 
+function adamax(p::Param; η::Real = 0.002, β1::Real = 0.9, β2::Real = 0.999, ϵ::Real = 1e-8)
+  mt = zeros(p.x)
+  ut = zeros(p.x)
+  β1p = β1
+  function ()
+    @. mt = β1 * mt + (1 - β1) * p.Δ
+    @. ut = max(β2 * ut, abs(p.Δ))
+    @. p.Δ = (η/(1 - β1p)) * mt/(ut + ϵ)
+    β1p *= β1
+  end
+end
+
 function amsgrad(p::Param; η::Real = 0.001, β1::Real = 0.9, β2::Real = 0.999, ϵ::Real = 1e-8)
   mt = zeros(p.x)
   vt = zeros(p.x) .+ ϵ