optimiser docs

commit 7426faf37d (parent b26f77489e)
@@ -52,3 +52,15 @@ opt()
 ```
 
 An optimiser takes a parameter list and returns a function that does the same thing as `update` above. We can pass either `opt` or `update` to our [training loop](training.md), which will then run the optimiser after every mini-batch of data.
+
+## Optimiser Reference
+
+```@docs
+SGD
+Momentum
+Nesterov
+RMSProp
+ADAM
+ADAGrad
+ADADelta
+```
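To make the added docs paragraph concrete, here is a minimal sketch of the pattern it describes. The model, loss, and data names are hypothetical stand-ins; only `Chain`, `Dense`, `params`, and `SGD` come from the API touched by this commit.

```julia
using Flux

# hypothetical model and loss, standing in for the `m` and `loss`
# defined earlier in the optimisers page
m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
loss(x, y) = sum((m(x) .- y) .^ 2)

# SGD takes the parameter list and returns a zero-argument function;
# calling it applies one update to every parameter, just like `update`
opt = SGD(params(m), 0.1)

# after computing gradients for a mini-batch, run the optimiser by hand ...
#   Flux.back!(loss(x, y)); opt()
# ... or hand `opt` to the training loop, which calls it after every mini-batch:
#   Flux.train!(loss, data, opt)
```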
@@ -8,7 +8,8 @@ using Juno, Requires
 using Lazy: @forward
 
 export Chain, Dense, RNN, LSTM,
-  SGD, param, params, mapleaves
+  SGD, ADAM, Momentum, Nesterov, RMSProp, ADAGrad, ADADelta,
+  param, params, mapleaves
 
 using NNlib
 export σ, relu, softmax
@@ -9,10 +9,65 @@ function optimiser(ps, fs...)
   () -> foreach(call, fs)
 end
 
-SGD(ps, η = 1) = optimiser(ps, p -> descent(p, η))
-ADAM(ps, η = 0.001, β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0.0) = optimiser(ps, p -> adam(p; η = η, β1 = β1, β2 = β2, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
-Momentum(ps,ρ, decay = 0.0) = optimiser(ps, p -> momentum(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1))
-Nesterov(ps,ρ, decay = 0.0) = optimiser(ps, p -> nesterov(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1))
-RMSProp(ps, η = 0.001, ρ = 0.9, ϵ = 1e-8, decay = 0.0) = optimiser(ps, p -> rmsprop(p; η = η, ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
-ADAGrad(ps, η = 0.01, ϵ = 1e-8, decay = 0.0) = optimiser(ps, p -> adagrad(p; η = η, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
-ADADelta(ps, η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0.0) = optimiser(ps, p -> adadelta(p; ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
+"""
+    SGD(params, η = 1; decay = 0)
+
+Classic gradient descent optimiser. For each parameter `p` and its
+gradient `δp`, this runs `p -= η*δp`.
+
+Supports learning rate decay if the `decay` argument is provided.
+"""
+SGD(ps, η = 1; decay = 0) =
+  optimiser(ps, p -> invdecay(p, decay), p -> descent(p, η))
+
+"""
+    Momentum(params, ρ; decay = 0)
+
+SGD with momentum `ρ` and optional learning rate decay.
+"""
+Momentum(ps, ρ; decay = 0) =
+  optimiser(ps, p -> momentum(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1))
+
+"""
+    Nesterov(params, ρ; decay = 0)
+
+SGD with Nesterov momentum `ρ` and optional learning rate decay.
+"""
+Nesterov(ps, ρ; decay = 0) =
+  optimiser(ps, p -> nesterov(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1))
+
+"""
+    RMSProp(params; η = 0.001, ρ = 0.9, ϵ = 1e-8, decay = 0)
+
+[RMSProp](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
+optimiser. Parameters other than learning rate don't need tuning. Often a good
+choice for recurrent networks.
+"""
+RMSProp(ps; η = 0.001, ρ = 0.9, ϵ = 1e-8, decay = 0) =
+  optimiser(ps, p -> rmsprop(p; η = η, ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
+
+"""
+    ADAM(params; η = 0.001, β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0)
+
+[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.
+"""
+ADAM(ps; η = 0.001, β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) =
+  optimiser(ps, p -> adam(p; η = η, β1 = β1, β2 = β2, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
+
+"""
+    ADAGrad(params; η = 0.01, ϵ = 1e-8, decay = 0)
+
+[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser.
+Parameters don't need tuning.
+"""
+ADAGrad(ps; η = 0.01, ϵ = 1e-8, decay = 0) =
+  optimiser(ps, p -> adagrad(p; η = η, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
+
+"""
+    ADADelta(params; η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0)
+
+[ADADelta](http://arxiv.org/abs/1212.5701) optimiser. Parameters don't need
+tuning.
+"""
+ADADelta(ps; η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0) =
+  optimiser(ps, p -> adadelta(p; ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
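As a plain-array reading of the rule documented for `SGD` above (`p -= η*δp`), with made-up numbers:

```julia
# one gradient-descent step on ordinary arrays, mirroring the docstring
η  = 0.1
p  = [1.0, 2.0, 3.0]   # a parameter
δp = [0.3, 0.1, 0.2]   # its gradient for the current mini-batch

p .-= η .* δp          # p -= η*δp
@assert p ≈ [0.97, 1.99, 2.98]
```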
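With the rewritten constructors, `decay` is now a keyword argument on every optimiser, and the adaptive methods take their remaining hyperparameters as keywords too. A hedged usage sketch; the parameter arrays here are invented stand-ins for a model's `params`.

```julia
using Flux

# invented parameter list, standing in for params(model)
W, b = param(randn(2, 10)), param(zeros(2))
ps = [W, b]

# decay is a keyword; positional arguments keep their old meaning
opt1 = SGD(ps, 0.1, decay = 0.01)
opt2 = Momentum(ps, 0.9, decay = 0.01)
opt3 = ADAM(ps, η = 0.001, β1 = 0.9)

# each returns a zero-argument function; calling it applies one update step
opt1()
```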