Merge branch 'master' into sf/weighted_crossentropy
This commit is contained in:
commit
128725cefd
@ -37,6 +37,7 @@ These layers don't affect the structure of the network but may improve training
|
||||
|
||||
```@docs
|
||||
Flux.testmode!
|
||||
BatchNorm
|
||||
Dropout
|
||||
LayerNorm
|
||||
```
|
||||
|
@ -7,8 +7,9 @@ module Flux
|
||||
using Juno, Requires
|
||||
using Lazy: @forward
|
||||
|
||||
export Chain, Dense, RNN, LSTM, Dropout, LayerNorm,
|
||||
SGD, ADAM, Momentum, Nesterov,
|
||||
export Chain, Dense, RNN, LSTM,
|
||||
Dropout, LayerNorm, BatchNorm,
|
||||
SGD, ADAM, Momentum, Nesterov, AMSGrad,
|
||||
param, params, mapleaves
|
||||
|
||||
using NNlib
|
||||
|
@ -33,7 +33,7 @@ function rawdict()
|
||||
filter(!isempty, split.(split(readstring(deps("CMUDict", "cmudict")), "\n"))))
|
||||
end
|
||||
|
||||
validword(s) = ismatch(r"^[\w-\.]+$", s)
|
||||
validword(s) = ismatch(r"^[\w\-\.]+$", s)
|
||||
|
||||
cmudict() = filter((s, ps) -> validword(s), rawdict())
|
||||
|
||||
|
@ -2,8 +2,8 @@
|
||||
testmode!(m)
|
||||
testmode!(m, false)
|
||||
|
||||
Put layers like [`Dropout`](@ref) and `BatchNorm` into testing mode (or back to
|
||||
training mode with `false`).
|
||||
Put layers like [`Dropout`](@ref) and [`BatchNorm`](@ref) into testing mode
|
||||
(or back to training mode with `false`).
|
||||
"""
|
||||
function testmode!(m, val::Bool=true)
|
||||
prefor(x -> _testmode!(x, val), m)
|
||||
@ -45,6 +45,7 @@ end
|
||||
_testmode!(a::Dropout, test) = (a.active = !test)
|
||||
|
||||
"""
|
||||
|
||||
LayerNorm(h::Integer)
|
||||
|
||||
A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be
|
||||
@ -65,3 +66,77 @@ treelike(LayerNorm)
|
||||
function Base.show(io::IO, l::LayerNorm)
|
||||
print(io, "LayerNorm(", length(l.diag.α), ")")
|
||||
end
|
||||
|
||||
"""
|
||||
BatchNorm(dims...; λ = identity,
|
||||
initβ = zeros, initγ = ones, ϵ = 1e-8, momentum = .1)
|
||||
|
||||
Batch normalization layer, for use after a [`Dense`](@ref) layer.
|
||||
|
||||
See [Batch Normalization: Accelerating Deep Network Training by Reducing
|
||||
Internal Covariate Shift](https://arxiv.org/pdf/1502.03167.pdf)
|
||||
|
||||
In the example of MNIST,
|
||||
in order to normalize the input of the next layer,
|
||||
put the `BatchNorm` layer before the activation function.
|
||||
|
||||
```julia
|
||||
m = Chain(
|
||||
Dense(28^2, 64),
|
||||
BatchNorm(64, λ = relu),
|
||||
Dense(64, 10),
|
||||
BatchNorm(10),
|
||||
softmax)
|
||||
```
|
||||
"""
|
||||
mutable struct BatchNorm{F,V,N}
|
||||
λ::F # activation function
|
||||
β::V # bias
|
||||
γ::V # scale
|
||||
μ # moving mean
|
||||
σ # moving std
|
||||
ϵ::N
|
||||
momentum::N
|
||||
active::Bool
|
||||
end
|
||||
|
||||
BatchNorm(dims::Integer...; λ = identity,
|
||||
initβ = zeros, initγ = ones, ϵ = 1e-8, momentum = .1) =
|
||||
BatchNorm(λ, param(initβ(dims)), param(initγ(dims)), 0., 1., ϵ, momentum, true)
|
||||
|
||||
function (BN::BatchNorm)(x)
|
||||
λ, γ, β = BN.λ, BN.γ, BN.β
|
||||
|
||||
if !BN.active
|
||||
μ = BN.μ
|
||||
σ = BN.σ
|
||||
else
|
||||
T = eltype(x)
|
||||
|
||||
ϵ = T(BN.ϵ)
|
||||
m = size(x, 2) # batch size
|
||||
μ = mean(x, 2)
|
||||
σ = sqrt.(sum((x .- μ).^2, 2) ./ m .+ ϵ)
|
||||
|
||||
# update moving mean/std
|
||||
mtm = T(BN.momentum)
|
||||
BN.μ = (1 - mtm) .* BN.μ .+ mtm .* μ.data
|
||||
BN.σ = (1 - mtm) .* BN.σ .+ mtm .* σ.data .* m ./ (m - 1)
|
||||
end
|
||||
|
||||
λ.(γ .* ((x .- μ) ./ σ) .+ β)
|
||||
end
|
||||
|
||||
children(BN::BatchNorm) =
|
||||
(BN.λ, BN.β, BN.γ, BN.μ, BN.σ, BN.momentum, BN.ϵ, BN.active)
|
||||
|
||||
# Rebuild the layer with `f` applied to the `β` and `γ` parameters
# (e.g. `mapchildren(cu, BN)`); all other fields are carried over unchanged.
# The positional constructor follows the struct's field order, in which `ϵ`
# comes before `momentum`, so they must be passed in that order here.
mapchildren(f, BN::BatchNorm) =
  BatchNorm(BN.λ, f(BN.β), f(BN.γ), BN.μ, BN.σ, BN.ϵ, BN.momentum, BN.active)
|
||||
|
||||
_testmode!(BN::BatchNorm, test) = (BN.active = !test)
|
||||
|
||||
function Base.show(io::IO, l::BatchNorm)
|
||||
print(io, "BatchNorm($(join(size(l.β), ", "))")
|
||||
(l.λ == identity) || print(io, ", λ = $(l.λ)")
|
||||
print(io, ")")
|
||||
end
|
||||
|
@ -1,7 +1,7 @@
|
||||
module Optimise
|
||||
|
||||
export update!, params, train!,
|
||||
SGD, ADAM, Momentum, Nesterov, RMSProp, ADAGrad, ADADelta
|
||||
SGD, ADAM, Momentum, Nesterov, RMSProp, ADAGrad, ADADelta, AMSGrad
|
||||
|
||||
struct Param{T}
|
||||
x::T
|
||||
|
@ -1,5 +1,7 @@
|
||||
call(f, xs...) = f(xs...)
|
||||
|
||||
# note for optimisers: set to zero
|
||||
# p.Δ at the end of the weights update
|
||||
function optimiser(ps, fs...)
|
||||
ps = [Param(p) for p in ps]
|
||||
fs = map(ps) do p
|
||||
@ -10,64 +12,73 @@ function optimiser(ps, fs...)
|
||||
end
|
||||
|
||||
"""
|
||||
SGD(params, η = 1; decay = 0)
|
||||
SGD(params, η = 0.1; decay = 0)
|
||||
|
||||
Classic gradient descent optimiser. For each parameter `p` and its
|
||||
gradient `δp`, this runs `p -= η*δp`.
|
||||
Classic gradient descent optimiser with learning rate `η`.
|
||||
For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`.
|
||||
|
||||
Supports decayed learning rate decay if the `decay` argument is provided.
|
||||
Supports inverse decaying learning rate if the `decay` argument is provided.
|
||||
"""
|
||||
SGD(ps, η = 1; decay = 0) =
|
||||
optimiser(ps, p -> invdecay(p, decay), p -> descent(p, η))
|
||||
SGD(ps, η = 0.1; decay = 0) =
|
||||
optimiser(ps, p -> invdecay(p, decay), p -> descent(p,η))
|
||||
|
||||
"""
|
||||
Momentum(params, ρ, decay = 0)
|
||||
Momentum(params, η = 0.01; ρ = 0.9, decay = 0)
|
||||
|
||||
SGD with momentum `ρ` and optional learning rate decay.
|
||||
SGD with learning rate `η`, momentum `ρ` and optional learning rate inverse decay.
|
||||
"""
|
||||
Momentum(ps, ρ; decay = 0) =
|
||||
optimiser(ps, p -> momentum(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1))
|
||||
Momentum(ps, η = 0.01; ρ = 0.9, decay = 0) =
|
||||
optimiser(ps, p->invdecay(p,decay), p->momentum(p, ρ, η), p->descent(p,1))
|
||||
|
||||
"""
|
||||
Nesterov(params, ρ, decay = 0)
|
||||
Nesterov(params, η = 0.01; ρ = 0.9, decay = 0)
|
||||
|
||||
SGD with Nesterov momentum `ρ` and optional learning rate decay.
|
||||
SGD with learning rate `η`, Nesterov momentum `ρ` and optional learning rate inverse decay.
|
||||
"""
|
||||
Nesterov(ps, ρ; decay = 0) =
|
||||
optimiser(ps, p -> nesterov(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1))
|
||||
Nesterov(ps, η = 0.01; ρ = 0.9, decay = 0) =
|
||||
optimiser(ps, p->invdecay(p,decay), p->nesterov(p, ρ, η), p->descent(p,1))
|
||||
|
||||
"""
|
||||
RMSProp(params; η = 0.001, ρ = 0.9, ϵ = 1e-8, decay = 0)
|
||||
RMSProp(params, η = 0.001; ρ = 0.9, ϵ = 1e-8, decay = 0)
|
||||
|
||||
[RMSProp](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
|
||||
optimiser. Parameters other than learning rate don't need tuning. Often a good
|
||||
choice for recurrent networks.
|
||||
"""
|
||||
RMSProp(ps, η = 0.001; ρ = 0.9, ϵ = 1e-8, decay = 0) =
|
||||
optimiser(ps, p -> rmsprop(p; η = η, ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
|
||||
optimiser(ps, p->rmsprop(p; η=η, ρ=ρ, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
|
||||
|
||||
"""
|
||||
ADAM(params; η = 0.001, β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0)
|
||||
ADAM(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0)
|
||||
|
||||
[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.
|
||||
"""
|
||||
ADAM(ps, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) =
|
||||
optimiser(ps, p -> adam(p; η = η, β1 = β1, β2 = β2, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
|
||||
optimiser(ps, p->adam(p; η=η, β1=β1, β2=β2, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
|
||||
|
||||
"""
|
||||
ADAGrad(params; η = 0.01, ϵ = 1e-8, decay = 0)
|
||||
ADAGrad(params, η = 0.01; ϵ = 1e-8, decay = 0)
|
||||
|
||||
[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser.
|
||||
Parameters don't need tuning.
|
||||
"""
|
||||
ADAGrad(ps; η = 0.01, ϵ = 1e-8, decay = 0) =
|
||||
optimiser(ps, p -> adagrad(p; η = η, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
|
||||
ADAGrad(ps, η = 0.01; ϵ = 1e-8, decay = 0) =
|
||||
optimiser(ps, p->adagrad(p; η=η, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
|
||||
|
||||
"""
|
||||
ADADelta(params; η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0)
|
||||
ADADelta(params; ρ = 0.9, ϵ = 1e-8, decay = 0)
|
||||
|
||||
[ADADelta](http://arxiv.org/abs/1212.5701) optimiser. Parameters don't need
|
||||
tuning.
|
||||
"""
|
||||
ADADelta(ps; η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0) =
|
||||
optimiser(ps, p -> adadelta(p; ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
|
||||
# Build an ADADelta optimiser over the parameters `ps`.
# `decay` is honoured through an `invdecay` step placed after the adaptive
# update, as in the other optimiser constructors; with the default `decay = 0`
# that step leaves the gradient unchanged.
ADADelta(ps; ρ = 0.9, ϵ = 1e-8, decay = 0) =
  optimiser(ps, p->adadelta(p; ρ=ρ, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
|
||||
|
||||
"""
|
||||
AMSGrad(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0)
|
||||
|
||||
[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need
|
||||
tuning.
|
||||
"""
|
||||
AMSGrad(ps, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) =
|
||||
optimiser(ps, p -> amsgrad(p; η = η, β1 = β1, β2 = β2, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
|
||||
|
@ -1,74 +1,97 @@
|
||||
function descent(p::Param, η::Real)
|
||||
function ()
|
||||
p.x .-= p.Δ .* η
|
||||
p.Δ .= 0
|
||||
@. p.x -= η * p.Δ
|
||||
@. p.Δ = 0
|
||||
end
|
||||
end
|
||||
|
||||
function momentum(p::Param, ρ::Real)
|
||||
mo = zeros(p.x)
|
||||
() -> p.Δ .= mo .= ρ .* mo .+ p.Δ
|
||||
end
|
||||
|
||||
function nesterov(p::Param, ρ::Real)
|
||||
mo = zeros(p.x)
|
||||
function momentum(p::Param, ρ, η)
|
||||
v = zeros(p.x)
|
||||
function ()
|
||||
mo .= ρ .* mo .+ p.Δ
|
||||
p.Δ .= ρ .* mo .+ p.Δ
|
||||
@. v = ρ * v - η * p.Δ
|
||||
@. p.Δ = -v
|
||||
end
|
||||
end
|
||||
|
||||
function clip(p::Param, thresh::Real)
|
||||
() -> clamp!(p.Δ, -thresh, thresh)
|
||||
end
|
||||
|
||||
function weightdecay(p::Param, γ::Real)
|
||||
() -> p.Δ .+= γ .* p.x
|
||||
end
|
||||
|
||||
function invdecay(p::Param, γ::Real)
|
||||
n = 0
|
||||
# Ref. https://arxiv.org/pdf/1212.0901.pdf
|
||||
function nesterov(p::Param, ρ, η)
|
||||
v = zeros(p.x)
|
||||
function ()
|
||||
p.Δ .*= 1 / (1 + γ * n)
|
||||
n += 1
|
||||
d = @. ρ^2 * v - (1+ρ) * η * p.Δ
|
||||
@. v = ρ*v - η*p.Δ
|
||||
@. p.Δ = -d
|
||||
end
|
||||
end
|
||||
|
||||
function rmsprop(p::Param; η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = 1e-8)
|
||||
acc = zeros(p.x) .+ ϵ
|
||||
acc = zeros(p.x)
|
||||
function ()
|
||||
@. acc = ρ * acc + (1 - ρ) * p.Δ ^ 2
|
||||
@. p.Δ *= η / √acc
|
||||
@. acc = ρ * acc + (1 - ρ) * p.Δ^2
|
||||
@. p.Δ *= η / (√acc + ϵ)
|
||||
end
|
||||
end
|
||||
|
||||
function adagrad(p::Param; η::Real = 0.01, ϵ::Real = 1e-8)
|
||||
acc = zeros(p.x) .+ ϵ
|
||||
function ()
|
||||
@. acc += p.Δ ^ 2
|
||||
@. acc += p.Δ^2
|
||||
@. p.Δ *= η / √acc
|
||||
end
|
||||
end
|
||||
|
||||
function adadelta(p::Param; ρ::Real = 0.95, ϵ::Real = 1e-8)
|
||||
acc = zeros(p.x) .+ ϵ
|
||||
Δacc = zeros(p.x) .+ ϵ
|
||||
function adadelta(p::Param; ρ::Real = 0.9, ϵ::Real = 1e-8)
|
||||
acc = zeros(p.x)
|
||||
Δacc = zeros(p.x)
|
||||
function ()
|
||||
@. acc = ρ * acc + (1 - ρ) * p.Δ ^ 2
|
||||
@. p.Δ *= √Δacc / √acc
|
||||
@. Δacc = ρ * Δacc + (1 - ρ) * p.Δ ^ 2
|
||||
end
|
||||
@. acc = ρ * acc + (1 - ρ) * p.Δ^2
|
||||
@. p.Δ *= √(Δacc + ϵ) / √(acc + ϵ)
|
||||
@. Δacc = ρ * Δacc + (1 - ρ) * p.Δ^2
|
||||
end
|
||||
end
|
||||
|
||||
function adam(p::Param; η::Real = 0.001, β1::Real = 0.9, β2::Real = 0.999, ϵ::Real = 1e-8)
|
||||
mt = zeros(p.x)
|
||||
vt = zeros(p.x) .+ ϵ
|
||||
vt = zeros(p.x)
|
||||
β1p, β2p = β1, β2
|
||||
function ()
|
||||
@. mt = β1 * mt + (1 - β1) * p.Δ
|
||||
@. vt = β2 * vt + (1 - β2) * p.Δ ^ 2
|
||||
@. p.Δ = √(1 - β2p) / √(1 - β1p) * mt / √vt * η
|
||||
@. vt = β2 * vt + (1 - β2) * p.Δ^2
|
||||
@. p.Δ = mt / (1 - β1p) / (√(vt / (1 - β2p)) + ϵ) * η
|
||||
β1p *= β1
|
||||
β2p *= β2
|
||||
end
|
||||
end
|
||||
|
||||
# AMSGrad step for a single parameter `p`.
# Returns a closure that, on each call, overwrites the gradient `p.Δ` with the
# AMSGrad update direction scaled by the learning rate `η`.
#   m    – running average of the gradient (rate `β1`)
#   v    – running average of the squared gradient (rate `β2`), seeded with `ϵ`
#   vmax – element-wise running maximum of `v`, seeded with `ϵ`
function amsgrad(p::Param; η::Real = 0.001, β1::Real = 0.9, β2::Real = 0.999, ϵ::Real = 1e-8)
  m = zeros(p.x)
  v = zeros(p.x) .+ ϵ
  vmax = zeros(p.x) .+ ϵ
  return function ()
    m .= β1 .* m .+ (1 - β1) .* p.Δ
    v .= β2 .* v .+ (1 - β2) .* p.Δ .^ 2
    # Keep the largest second-moment estimate seen so far for each element.
    vmax .= max.(vmax, v)
    p.Δ .= η .* m ./ sqrt.(vmax)
  end
end
|
||||
|
||||
clip(p::Param, thresh::Real) = () -> clamp!(p.Δ, -thresh, thresh)
|
||||
|
||||
# Returns a closure that adds `γ .* p.x` to the gradient `p.Δ` on each call.
# When `γ` is zero the returned closure does nothing.
function expdecay(p::Param, γ::Real)
  γ == 0 && return () -> nothing
  return function ()
    p.Δ .+= γ .* p.x
  end
end
|
||||
|
||||
# Returns a closure that scales the gradient `p.Δ` by `1 / (1 + γ * k)`, where
# `k` counts how many times the closure has already been called (starting at 0).
# When `γ` is zero the returned closure does nothing.
function invdecay(p::Param, γ::Real)
  γ == 0 && return () -> nothing
  calls = 0
  return function ()
    p.Δ .*= 1 / (1 + γ * calls)
    calls += 1
  end
end
|
||||
|
@ -58,6 +58,7 @@ Base.similar(x::TrackedArray, dims::Union{AbstractUnitRange,Integer}...) =
|
||||
|
||||
Base.similar(x::TrackedArray, T::Type) = similar(data(x), T)
|
||||
|
||||
# TODO decide if keeping both data and value. The problem is TrackedScalar
|
||||
value(x) = x
|
||||
value(x::TrackedArray) = data(x)
|
||||
value(x::TrackedScalar) = data(x)[]
|
||||
@ -69,6 +70,7 @@ Base.:(==)(x::TrackedArray, y::TrackedArray) = value(x) == value(x)
|
||||
Base.isless(x::TrackedScalar, y) = isless(value(x), y)
|
||||
Base.isless(x, y::TrackedScalar) = isless(x, value(y))
|
||||
Base.isless(x::TrackedScalar, y::TrackedScalar) = isless(value(x), value(y))
|
||||
Base.isapprox(x::TrackedScalar, y; kws...) = isapprox(x.data[], y; kws...)
|
||||
|
||||
Base.show(io::IO, ::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} =
|
||||
print(io, "TrackedArray{…,$A}")
|
||||
|
@ -58,6 +58,15 @@ Base.findfirst(xs::TrackedArray, args...) = findfirst(xs.data, args...)
|
||||
Base.mean(xs::TrackedArray) = TrackedArray(Call(mean, xs), toarray(xs.data, mean(xs.data)))
|
||||
Base.mean(xs::TrackedArray, region) = TrackedArray(Call(mean, xs, region))
|
||||
|
||||
LinAlg.dot(xs::TrackedVector, ys::TrackedVector) = TrackedArray(Call(dot, xs, ys), toarray(xs.data, dot(data(xs), data(ys))))
|
||||
# Tracked `dot` when only `ys` is tracked. `xs` is only known to be an
# `AbstractVector` and need not have a `data` field, so the result container is
# derived from the tracked operand `ys` instead.
LinAlg.dot(xs::AbstractVector, ys::TrackedVector) = TrackedArray(Call(dot, xs, ys), toarray(ys.data, dot(data(xs), data(ys))))
|
||||
LinAlg.dot(xs::TrackedVector, ys::AbstractVector) = TrackedArray(Call(dot, xs, ys), toarray(xs.data, dot(data(xs), data(ys))))
|
||||
|
||||
# Backward pass for `dot`: each argument receives the incoming gradient `Δ`
# multiplied element-wise by the other argument. The operands are passed
# through `data(...)`, matching the `Ac_mul_B` / `A_mul_Bc` backward methods,
# so the gradient is computed from the underlying values.
function back(::typeof(dot), Δ, xs, ys)
  @back(xs, Δ.*data(ys))
  @back(ys, Δ.*data(xs))
end
|
||||
|
||||
# Hacks to get std working
|
||||
Base.std(x::TrackedArray; mean = Base.mean(x)) =
|
||||
sqrt.(sum((x .- mean).^2) ./ (length(x)-1))
|
||||
@ -70,7 +79,7 @@ back(::typeof(mean), Δ, xs::TrackedArray, region) =
|
||||
|
||||
# BLAS
|
||||
|
||||
for f in :[*, Ac_mul_B].args
|
||||
for f in :[*, Ac_mul_B, A_mul_Bc].args
|
||||
@eval begin
|
||||
import Base.$f
|
||||
$f(a::TrackedMatrix, b::TrackedMatrix) = TrackedArray(Call($f, a, b))
|
||||
@ -94,7 +103,12 @@ end
|
||||
|
||||
function back(::typeof(Ac_mul_B), Δ, a::AbstractVecOrMat{<:Real}, b::AbstractVecOrMat{<:Real})
|
||||
@back(a, A_mul_Bt(Δ, data(b))')
|
||||
@back(b, *(data(a), Δ))
|
||||
@back(b, data(a)*Δ)
|
||||
end
|
||||
|
||||
function back(::typeof(A_mul_Bc), Δ, a::AbstractVecOrMat{<:Real}, b::AbstractVecOrMat{<:Real})
|
||||
@back(a, Δ * data(b))
|
||||
@back(b, At_mul_B(data(a), Δ)')
|
||||
end
|
||||
|
||||
# Fast path for matrix-vector
|
||||
|
@ -26,3 +26,55 @@ using Flux: testmode!
|
||||
y = m(x)
|
||||
@test count(a->a == 0, y) == 0
|
||||
end
|
||||
|
||||
@testset "BatchNorm" begin
|
||||
let m = BatchNorm(2), x = param([1 2; 3 4; 5 6]')
|
||||
|
||||
@test m.β.data == [0, 0] # initβ(2)
|
||||
@test m.γ.data == [1, 1] # initγ(2)
|
||||
# initial m.σ is 1
|
||||
# initial m.μ is 0
|
||||
@test m.active
|
||||
|
||||
# @test m(x).data ≈ [-1 -1; 0 0; 1 1]'
|
||||
m(x)
|
||||
|
||||
# julia> x
|
||||
# 2×3 Array{Float64,2}:
|
||||
# 1.0 3.0 5.0
|
||||
# 2.0 4.0 6.0
|
||||
#
|
||||
# μ of batch will be
|
||||
# (1. + 3. + 5.) / 3 = 3
|
||||
# (2. + 4. + 6.) / 3 = 4
|
||||
#
|
||||
# ∴ update rule with momentum:
|
||||
# .1 * 3 + 0 = .3
|
||||
# .1 * 4 + 0 = .4
|
||||
@test m.μ ≈ reshape([0.3, 0.4], 2, 1)
|
||||
|
||||
# julia> .1 .* std(x, 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.]
|
||||
# 2×1 Array{Float64,2}:
|
||||
# 1.14495
|
||||
# 1.14495
|
||||
@test m.σ ≈ .1 .* std(x.data, 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.]
|
||||
|
||||
testmode!(m)
|
||||
@test !m.active
|
||||
|
||||
x′ = m(x).data
|
||||
@test x′[1] ≈ (1 - 0.3) / 1.1449489742783179
|
||||
end
|
||||
|
||||
# with activation function
|
||||
let m = BatchNorm(2, λ = σ), x = param([1 2; 3 4; 5 6]')
|
||||
@test m.active
|
||||
m(x)
|
||||
|
||||
testmode!(m)
|
||||
@test !m.active
|
||||
|
||||
x′ = m(x).data
|
||||
@test x′[1] ≈ σ((1 - 0.3) / 1.1449489742783179)
|
||||
end
|
||||
end
|
||||
|
17
test/optimise.jl
Normal file
17
test/optimise.jl
Normal file
@ -0,0 +1,17 @@
|
||||
using Flux.Optimise
|
||||
using Flux.Tracker
|
||||
|
||||
@testset "Optimise" begin
|
||||
w = randn(10, 10)
|
||||
for Opt in [SGD, Nesterov, Momentum, ADAM, RMSProp, ps -> ADAGrad(ps, 0.1), ADADelta, AMSGrad]
|
||||
w′ = param(randn(10, 10))
|
||||
loss(x) = Flux.mse(w*x, w′*x)
|
||||
opt = Opt([w′])
|
||||
for t=1:10^5
|
||||
l = loss(rand(10))
|
||||
back!(l)
|
||||
opt()
|
||||
end
|
||||
@test Flux.mse(w, w′) < 0.01
|
||||
end
|
||||
end
|
@ -6,5 +6,6 @@ include("utils.jl")
|
||||
include("tracker.jl")
|
||||
include("layers/normalisation.jl")
|
||||
include("layers/stateless.jl")
|
||||
include("optimise.jl")
|
||||
|
||||
end
|
||||
|
@ -10,6 +10,7 @@ gradtest(f, dims...) = gradtest(f, rand.(dims)...)
|
||||
@test gradtest((x, W, b) -> σ.(W*x .+ b), (5,3), (2,5), 2)
|
||||
|
||||
@test gradtest((w, x) -> w'*x, randn(10, 2), randn(10))
|
||||
@test gradtest((w, x) -> w*x', randn(5,5), randn(5,5))
|
||||
|
||||
@test gradtest(x -> sin.(sum(x, (2, 3))), (3,4,5))
|
||||
|
||||
@ -37,6 +38,8 @@ end
|
||||
@test gradtest(x -> std(x), rand(5,5))
|
||||
@test gradtest(x -> std(x, 1), rand(5,5))
|
||||
|
||||
@test gradtest((x, y) -> x .* y, rand(5), rand(5))
|
||||
|
||||
@test gradtest(rand(5)) do x
|
||||
y = x.^2
|
||||
2y + x
|
||||
|
Loading…
Reference in New Issue
Block a user