diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md
index f92f751a..1fd87d41 100644
--- a/docs/src/models/layers.md
+++ b/docs/src/models/layers.md
@@ -38,4 +38,5 @@ These layers don't affect the structure of the network but may improve training
 ```@docs
 Flux.testmode!
 Dropout
+LayerNorm
 ```
diff --git a/src/Flux.jl b/src/Flux.jl
index acefff19..df4b1636 100644
--- a/src/Flux.jl
+++ b/src/Flux.jl
@@ -7,7 +7,7 @@ module Flux
 using Juno, Requires
 using Lazy: @forward
 
-export Chain, Dense, RNN, LSTM, Dropout,
+export Chain, Dense, RNN, LSTM, Dropout, LayerNorm,
   SGD, ADAM, Momentum, Nesterov,
   param, params, mapleaves
 
diff --git a/src/layers/basic.jl b/src/layers/basic.jl
index 969a261c..aa101c43 100644
--- a/src/layers/basic.jl
+++ b/src/layers/basic.jl
@@ -78,3 +78,32 @@ function Base.show(io::IO, l::Dense)
   l.σ == identity || print(io, ", ", l.σ)
   print(io, ")")
 end
+
+"""
+    Diagonal(in::Integer)
+
+Creates an element-wise linear transformation layer with learnable
+vectors `α` and `β`:
+
+    y = α .* x .+ β
+
+The input `x` must be an array where `size(x, 1) == in`.
+"""
+struct Diagonal{T}
+  α::T
+  β::T
+end
+
+Diagonal(in::Integer; initα = ones, initβ = zeros) =
+  Diagonal(param(initα(in)), param(initβ(in)))
+
+treelike(Diagonal)
+
+function (a::Diagonal)(x)
+  α, β = a.α, a.β
+  α.*x .+ β
+end
+
+function Base.show(io::IO, l::Diagonal)
+  print(io, "Diagonal(", length(l.α), ")")
+end
diff --git a/src/layers/normalisation.jl b/src/layers/normalisation.jl
index 08c21428..d296b0a3 100644
--- a/src/layers/normalisation.jl
+++ b/src/layers/normalisation.jl
@@ -43,3 +43,25 @@ function (a::Dropout)(x)
 end
 
 _testmode!(a::Dropout, test) = (a.active = !test)
+
+"""
+    LayerNorm(h::Integer)
+
+A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be
+used with recurrent hidden states of size `h`. Normalises each input to mean 0
+and standard deviation 1 before applying a per-neuron gain/bias.
+"""
+struct LayerNorm{T}
+  diag::Diagonal{T}
+end
+
+LayerNorm(h::Integer) =
+  LayerNorm(Diagonal(h))
+
+treelike(LayerNorm)
+
+(a::LayerNorm)(x) = a.diag(normalise(x))
+
+function Base.show(io::IO, l::LayerNorm)
+  print(io, "LayerNorm(", length(l.diag.α), ")")
+end
diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 3931c216..2a4b9a7c 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -12,3 +12,14 @@ function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat)
   ypred = logŷ .- log.(sum(exp.(logŷ), 1))
   -sum(y .* ypred) / size(y, 2)
 end
+
+"""
+    normalise(x::AbstractVecOrMat)
+
+Normalise each column of `x` to mean 0 and standard deviation 1.
+"""
+function normalise(x::AbstractVecOrMat)
+  μ′ = mean(x, 1)
+  σ′ = std(x, 1, mean = μ′)
+  return (x .- μ′) ./ σ′
+end
diff --git a/src/tracker/lib.jl b/src/tracker/lib.jl
index aab26dfe..5065a40d 100644
--- a/src/tracker/lib.jl
+++ b/src/tracker/lib.jl
@@ -58,6 +58,12 @@ Base.findfirst(xs::TrackedArray, args...) = findfirst(xs.data, args...)
 Base.mean(xs::TrackedArray) = TrackedArray(Call(mean, xs), toarray(xs.data, mean(xs.data)))
 Base.mean(xs::TrackedArray, region) = TrackedArray(Call(mean, xs, region))
 
+# Hacks to get std working
+Base.std(x::TrackedArray; mean = Base.mean(x)) =
+  sqrt.(sum((x .- mean).^2) ./ (length(x)-1))
+Base.std(x::TrackedArray, dim; mean = Base.mean(x, dim)) =
+  sqrt.(sum((x .- mean).^2, dim) ./ (size(x, dim)-1))
+
 back(::typeof(mean), Δ, xs::TrackedArray) = back(xs, similar(xs.data) .= Δ ./ length(xs.data))
 back(::typeof(mean), Δ, xs::TrackedArray, region) =
   back(xs, similar(xs.data) .= Δ ./ prod(size(xs.data, region...)))
diff --git a/test/tracker.jl b/test/tracker.jl
index f2a369f8..81a72566 100644
--- a/test/tracker.jl
+++ b/test/tracker.jl
@@ -34,6 +34,9 @@ gradtest(f, dims...) = gradtest(f, rand.(dims)...)
   @test gradtest(x -> mean(x, [1, 2]), rand(2, 3, 4))
 end
 
+@test gradtest(x -> std(x), rand(5,5))
+@test gradtest(x -> std(x, 1), rand(5,5))
+
 @test gradtest(rand(5)) do x
   y = x.^2
   2y + x
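
For context, a minimal usage sketch of the new layer (illustrative only, not part of the diff). It assumes the `Chain`, `Dense`, and `relu` names Flux already exports plus the `LayerNorm` export added above; variable names like `m` and `x` are just examples.

```julia
using Flux

# A LayerNorm placed after a Dense layer: each column of the activations is
# normalised to mean 0 / std 1, then scaled and shifted by the learnable
# per-neuron gain `α` and bias `β` of the inner Diagonal.
m = Chain(Dense(10, 5, relu), LayerNorm(5))

x = rand(10, 3)  # 10 features, 3 samples
y = m(x)         # size(y) == (5, 3)

# By construction, LayerNorm(h) is just Diagonal(h) applied to normalise(x),
# i.e. (a::LayerNorm)(x) = a.diag(normalise(x)) as defined above.
```

Since `LayerNorm` only composes `Diagonal` with `normalise`, the only new gradient machinery the diff needs is the tracked `std` in `src/tracker/lib.jl`, which the added `test/tracker.jl` cases exercise.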