Merge branch 'master' into gru

Mike J Innes 2017-12-12 16:54:00 +00:00
commit 403cc26327
4 changed files with 132 additions and 3 deletions

View File

@@ -37,6 +37,7 @@ These layers don't affect the structure of the network but may improve training
```@docs
Flux.testmode!
+BatchNorm
Dropout
LayerNorm
```

View File

@@ -7,7 +7,8 @@ module Flux
using Juno, Requires
using Lazy: @forward

-export Chain, Dense, RNN, LSTM, GRU, Dropout, LayerNorm,
+export Chain, Dense, RNN, LSTM, GRU,
+  Dropout, LayerNorm, BatchNorm,
  SGD, ADAM, Momentum, Nesterov, AMSGrad,
  param, params, mapleaves

View File

@@ -2,8 +2,8 @@
    testmode!(m)
    testmode!(m, false)

-Put layers like [`Dropout`](@ref) and `BatchNorm` into testing mode (or back to
-training mode with `false`).
+Put layers like [`Dropout`](@ref) and [`BatchNorm`](@ref) into testing mode
+(or back to training mode with `false`).
"""
function testmode!(m, val::Bool=true)
  prefor(x -> _testmode!(x, val), m)
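
For illustration, a minimal usage sketch of `testmode!` as documented above (not part of this diff; it assumes only the `Chain`, `Dense`, and `Dropout` layers Flux already exports, and the model and sizes are made up):

```julia
using Flux
using Flux: testmode!

# A model containing a layer that behaves differently in training and testing.
m = Chain(Dense(10, 5, relu), Dropout(0.5), Dense(5, 2), softmax)

testmode!(m)         # recurses through the model: Dropout becomes a no-op,
                     # and a BatchNorm layer would use its stored statistics
y = m(rand(10))      # deterministic evaluation
testmode!(m, false)  # back to training mode
```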
@@ -45,6 +45,7 @@ end
_testmode!(a::Dropout, test) = (a.active = !test)

"""
    LayerNorm(h::Integer)

A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be
@@ -65,3 +66,77 @@ treelike(LayerNorm)
function Base.show(io::IO, l::LayerNorm)
  print(io, "LayerNorm(", length(l.diag.α), ")")
end
"""
BatchNorm(dims...; λ = identity,
initβ = zeros, initγ = ones, ϵ = 1e-8, momentum = .1)
Batch Normalization Layer for [`Dense`](@ref) layer.
See [Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift](https://arxiv.org/pdf/1502.03167.pdf)
In the example of MNIST,
in order to normalize the input of other layer,
put the `BatchNorm` layer before activation function.
```julia
m = Chain(
Dense(28^2, 64),
BatchNorm(64, λ = relu),
Dense(64, 10),
BatchNorm(10),
softmax)
```
"""
mutable struct BatchNorm{F,V,N}
  λ::F  # activation function
  β::V  # bias
  γ::V  # scale
  μ     # moving mean
  σ     # moving std
  ϵ::N
  momentum::N
  active::Bool
end

BatchNorm(dims::Integer...; λ = identity,
          initβ = zeros, initγ = ones, ϵ = 1e-8, momentum = .1) =
  BatchNorm(λ, param(initβ(dims)), param(initγ(dims)), 0., 1., ϵ, momentum, true)

function (BN::BatchNorm)(x)
  λ, γ, β = BN.λ, BN.γ, BN.β

  if !BN.active
    μ = BN.μ
    σ = BN.σ
  else
    T = eltype(x)

    ϵ = T(BN.ϵ)
    m = size(x, 2)  # batch size
    μ = mean(x, 2)
    σ = sqrt.(sum((x .- μ).^2, 2) ./ m .+ ϵ)

    # update the moving mean/std
    mtm = T(BN.momentum)
    BN.μ = (1 - mtm) .* BN.μ .+ mtm .* μ.data
    BN.σ = (1 - mtm) .* BN.σ .+ mtm .* σ.data .* m ./ (m - 1)
  end

  λ.(γ .* ((x .- μ) ./ σ) .+ β)
end

children(BN::BatchNorm) =
  (BN.λ, BN.β, BN.γ, BN.μ, BN.σ, BN.momentum, BN.ϵ, BN.active)

mapchildren(f, BN::BatchNorm) =  # e.g. mapchildren(cu, BN)
  # arguments follow the struct's field order: ϵ before momentum
  BatchNorm(BN.λ, f(BN.β), f(BN.γ), BN.μ, BN.σ, BN.ϵ, BN.momentum, BN.active)

_testmode!(BN::BatchNorm, test) = (BN.active = !test)

function Base.show(io::IO, l::BatchNorm)
  print(io, "BatchNorm($(join(size(l.β), ", "))")
  (l.λ == identity) || print(io, ", λ = $(l.λ)")
  print(io, ")")
end

View File

@@ -26,3 +26,55 @@ using Flux: testmode!
  y = m(x)
  @test count(a->a == 0, y) == 0
end
@testset "BatchNorm" begin
let m = BatchNorm(2), x = param([1 2; 3 4; 5 6]')
@test m.β.data == [0, 0] # initβ(2)
@test m.γ.data == [1, 1] # initγ(2)
# initial m.σ is 1
# initial m.μ is 0
@test m.active
# @test m(x).data ≈ [-1 -1; 0 0; 1 1]'
m(x)
# julia> x
# 2×3 Array{Float64,2}:
# 1.0 3.0 5.0
# 2.0 4.0 6.0
#
# μ of batch will be
# (1. + 3. + 5.) / 3 = 3
# (2. + 4. + 6.) / 3 = 4
#
# ∴ update rule with momentum:
# .1 * 3 + 0 = .3
# .1 * 4 + 0 = .4
@test m.μ reshape([0.3, 0.4], 2, 1)
# julia> .1 .* std(x, 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.]
# 2×1 Array{Float64,2}:
# 1.14495
# 1.14495
@test m.σ .1 .* std(x.data, 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.]
testmode!(m)
@test !m.active
x = m(x).data
@test x[1] (1 - 0.3) / 1.1449489742783179
end
# with activation function
let m = BatchNorm(2, λ = σ), x = param([1 2; 3 4; 5 6]')
@test m.active
m(x)
testmode!(m)
@test !m.active
x = m(x).data
@test x[1] σ((1 - 0.3) / 1.1449489742783179)
end
end
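
As a plain-Julia sanity check on the arithmetic in the comments above, the expected moving statistics after one training batch follow directly from the update rule in the forward pass (a standalone sketch, not part of the test suite, using the same `std(x, 2, corrected=false)` form as the test):

```julia
x   = [1.0 3.0 5.0;
       2.0 4.0 6.0]  # the 2×3 batch used in the tests
mtm = 0.1            # BatchNorm's default momentum
n   = size(x, 2)     # batch size, 3

# One application of the moving-statistics update, starting from the
# initial values μ = 0 and σ = 1.
moving_mean = (1 - mtm) .* 0 .+ mtm .* mean(x, 2)
moving_std  = (1 - mtm) .* 1 .+ mtm .* std(x, 2, corrected=false) .* n ./ (n - 1)

# moving_mean ≈ [0.3, 0.4] and moving_std ≈ [1.14495, 1.14495],
# the values asserted above.
```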