From 11d53781b254bbb0fbe8a1c1313a3b05efc61112 Mon Sep 17 00:00:00 2001
From: skariel
Date: Tue, 10 Oct 2017 23:33:37 +0300
Subject: [PATCH 1/3] adding layer normalization

---
 src/layers/basic.jl     | 30 ++++++++++++++++++++++++++++++
 src/layers/stateless.jl | 23 +++++++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/src/layers/basic.jl b/src/layers/basic.jl
index 969a261c..03a340df 100644
--- a/src/layers/basic.jl
+++ b/src/layers/basic.jl
@@ -78,3 +78,33 @@ function Base.show(io::IO, l::Dense)
   l.σ == identity || print(io, ", ", l.σ)
   print(io, ")")
 end
+
+"""
+    ElementwiseLinear(in::Integer)
+
+Creates an element-wise linear transformation layer with learnable
+vectors α and β:
+
+    y = α .* x .+ b
+
+The input `x` must be a vector of length `in`, or a batch of vectors represented
+as an `in × N` matrix. The output `y` will be a vector or batch of length `in`.
+"""
+struct ElementwiseLinear{T}
+  α::T
+  β::T
+end
+
+ElementwiseLinear(in::Integer; initα = ones, initβ = zeros) =
+  ElementwiseLinear(param(initα(in)), param(initβ(in)))
+
+treelike(ElementwiseLinear)
+
+function (a::ElementwiseLinear)(x)
+  α, β = a.α, a.β
+  α.*x .+ β
+end
+
+function Base.show(io::IO, l::ElementwiseLinear)
+  print(io, "ElementwiseLinear(", length(l.α), ")")
+end
diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 3931c216..8d0276e8 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -12,3 +12,26 @@ function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat)
   ypred = logŷ .- log.(sum(exp.(logŷ), 1))
   -sum(y .* ypred) / size(y, 2)
 end
+
+"""
+    layernormalization(α=1.0, β=0.0)
+
+Creates a normalization layer based on https://arxiv.org/pdf/1607.06450.pdf
+
+The differences are:
+
+1) std here divides by N-1 (as does std in Julia) vs the paper's N
+2) this layer's α and β are constant numbers (i.e. not learnable vectors)
+
+To achieve the same effect of learnable vectors α and β one can use
+the ElementwiseLinear layer
+"""
+function layernormalization(α=1.0, β=0.0)
+  function layer(y)
+    _mean = mean(y)
+    _std = sqrt.(sum((y.-_mean).^2) ./ (length(y)-1))
+    _std /= α
+    _mean -= β*_std
+    return (y .- _mean) ./ _std
+  end
+end

From b06884b9123d9168104602c9855e4bc046bdecab Mon Sep 17 00:00:00 2001
From: Mike J Innes
Date: Mon, 23 Oct 2017 12:53:07 +0100
Subject: [PATCH 2/3] LayerNorm tweaks

---
 docs/src/models/layers.md   |  1 +
 src/Flux.jl                 |  2 +-
 src/layers/basic.jl         | 19 +++++++++----------
 src/layers/normalisation.jl | 22 ++++++++++++++++++++++
 src/layers/stateless.jl     | 24 ++++++------------------
 5 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md
index f92f751a..1fd87d41 100644
--- a/docs/src/models/layers.md
+++ b/docs/src/models/layers.md
@@ -38,4 +38,5 @@ These layers don't affect the structure of the network but may improve training
 ```@docs
 Flux.testmode!
 Dropout
+LayerNorm
 ```

diff --git a/src/Flux.jl b/src/Flux.jl
index acefff19..df4b1636 100644
--- a/src/Flux.jl
+++ b/src/Flux.jl
@@ -7,7 +7,7 @@ module Flux
 using Juno, Requires
 using Lazy: @forward
 
-export Chain, Dense, RNN, LSTM, Dropout,
+export Chain, Dense, RNN, LSTM, Dropout, LayerNorm,
   SGD, ADAM, Momentum, Nesterov,
   param, params, mapleaves
 
diff --git a/src/layers/basic.jl b/src/layers/basic.jl
index 03a340df..3c47b595 100644
--- a/src/layers/basic.jl
+++ b/src/layers/basic.jl
@@ -80,31 +80,30 @@ function Base.show(io::IO, l::Dense)
 end
 
 """
-    ElementwiseLinear(in::Integer)
+    Diagonal(in::Integer)
 
 Creates an element-wise linear transformation layer with learnable
 vectors α and β:
 
     y = α .* x .+ b
 
-The input `x` must be a vector of length `in`, or a batch of vectors represented
-as an `in × N` matrix. The output `y` will be a vector or batch of length `in`.
+The input `x` must be an array where `size(x, 1) == in`.
 """
-struct ElementwiseLinear{T}
+struct Diagonal{T}
   α::T
   β::T
 end
 
-ElementwiseLinear(in::Integer; initα = ones, initβ = zeros) =
-  ElementwiseLinear(param(initα(in)), param(initβ(in)))
+Diagonal(in::Integer; initα = ones, initβ = zeros) =
+  Diagonal(param(initα(in)), param(initβ(in)))
 
-treelike(ElementwiseLinear)
+treelike(Diagonal)
 
-function (a::ElementwiseLinear)(x)
+function (a::Diagonal)(x)
   α, β = a.α, a.β
   α.*x .+ β
 end
 
-function Base.show(io::IO, l::ElementwiseLinear)
-  print(io, "ElementwiseLinear(", length(l.α), ")")
+function Base.show(io::IO, l::Diagonal)
+  print(io, "Diagonal(", length(l.α), ")")
 end
diff --git a/src/layers/normalisation.jl b/src/layers/normalisation.jl
index 08c21428..d296b0a3 100644
--- a/src/layers/normalisation.jl
+++ b/src/layers/normalisation.jl
@@ -43,3 +43,25 @@ function (a::Dropout)(x)
 end
 
 _testmode!(a::Dropout, test) = (a.active = !test)
+
+"""
+    LayerNorm(h::Integer)
+
+A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be
+used with recurrent hidden states of size `h`. Normalises the mean/stddev of
+each input before applying a per-neuron gain/bias.
+"""
+struct LayerNorm{T}
+  diag::Diagonal{T}
+end
+
+LayerNorm(h::Integer) =
+  LayerNorm(Diagonal(h))
+
+treelike(LayerNorm)
+
+(a::LayerNorm)(x) = a.diag(normalise(x))
+
+function Base.show(io::IO, l::LayerNorm)
+  print(io, "LayerNorm(", length(l.diag.α), ")")
+end
diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 8d0276e8..2a4b9a7c 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -14,24 +14,12 @@ function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat)
 end
 
 """
-    layernormalization(α=1.0, β=0.0)
+    normalise(x::AbstractVecOrMat)
 
-Creates a normalization layer based on https://arxiv.org/pdf/1607.06450.pdf
-
-The differences are:
-
-1) std here divides by N-1 (as does std in Julia) vs the paper's N
-2) this layer's α and β are constant numbers (i.e. not learnable vectors)
-
-To achieve the same effect of learnable vectors α and β one can use
-the ElementwiseLinear layer
+Normalise each column of `x` to mean 0 and standard deviation 1.
""" -function layernormalization(α=1.0, β=0.0) - function layer(y) - _mean = mean(y) - _std = sqrt.(sum((y.-_mean).^2) ./ (length(y)-1)) - _std /= α - _mean -= β*_std - return (y .- _mean) ./ _std - end +function normalise(x::AbstractVecOrMat) + μ′ = mean(x, 1) + σ′ = std(x, 1, mean = μ′) + return (x .- μ′) ./ σ′ end From 351d3d4771da08e53d2a2f89547f91d5fdb47beb Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 21 Nov 2017 17:04:04 +0100 Subject: [PATCH 3/3] std derivative --- src/layers/basic.jl | 4 ++-- src/tracker/lib.jl | 6 ++++++ test/tracker.jl | 3 +++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 3c47b595..aa101c43 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -83,9 +83,9 @@ end Diagonal(in::Integer) Creates an element-wise linear transformation layer with learnable -vectors α and β: +vectors `α` and `β`: - y = α .* x .+ b + y = α .* x .+ β The input `x` must be a array where `size(x, 1) == in`. """ diff --git a/src/tracker/lib.jl b/src/tracker/lib.jl index aab26dfe..5065a40d 100644 --- a/src/tracker/lib.jl +++ b/src/tracker/lib.jl @@ -58,6 +58,12 @@ Base.findfirst(xs::TrackedArray, args...) = findfirst(xs.data, args...) Base.mean(xs::TrackedArray) = TrackedArray(Call(mean, xs), toarray(xs.data, mean(xs.data))) Base.mean(xs::TrackedArray, region) = TrackedArray(Call(mean, xs, region)) +# Hacks to get std working +Base.std(x::TrackedArray; mean = Base.mean(x)) = + sqrt.(sum((x .- mean).^2) ./ (length(x)-1)) +Base.std(x::TrackedArray, dim; mean = Base.mean(x, dim)) = + sqrt.(sum((x .- mean).^2, dim) ./ (size(x, dim)-1)) + back(::typeof(mean), Δ, xs::TrackedArray) = back(xs, similar(xs.data) .= Δ ./ length(xs.data)) back(::typeof(mean), Δ, xs::TrackedArray, region) = back(xs, similar(xs.data) .= Δ ./ prod(size(xs.data, region...))) diff --git a/test/tracker.jl b/test/tracker.jl index f2a369f8..81a72566 100644 --- a/test/tracker.jl +++ b/test/tracker.jl @@ -34,6 +34,9 @@ gradtest(f, dims...) = gradtest(f, rand.(dims)...) @test gradtest(x -> mean(x, [1, 2]), rand(2, 3, 4)) end +@test gradtest(x -> std(x), rand(5,5)) +@test gradtest(x -> std(x, 1), rand(5,5)) + @test gradtest(rand(5)) do x y = x.^2 2y + x