diff --git a/src/layers/basic.jl b/src/layers/basic.jl
index 969a261c..03a340df 100644
--- a/src/layers/basic.jl
+++ b/src/layers/basic.jl
@@ -78,3 +78,33 @@ function Base.show(io::IO, l::Dense)
   l.σ == identity || print(io, ", ", l.σ)
   print(io, ")")
 end
+
+"""
+    ElementwiseLinear(in::Integer)
+
+Creates an element-wise linear transformation layer with learnable
+vectors α and β:
+
+    y = α .* x .+ β
+
+The input `x` must be a vector of length `in`, or a batch of vectors represented
+as an `in × N` matrix. The output `y` will be a vector or batch of the same size.
+"""
+struct ElementwiseLinear{T}
+  α::T
+  β::T
+end
+
+ElementwiseLinear(in::Integer; initα = ones, initβ = zeros) =
+  ElementwiseLinear(param(initα(in)), param(initβ(in)))
+
+treelike(ElementwiseLinear)
+
+function (a::ElementwiseLinear)(x)
+  α, β = a.α, a.β
+  α.*x .+ β
+end
+
+function Base.show(io::IO, l::ElementwiseLinear)
+  print(io, "ElementwiseLinear(", length(l.α), ")")
+end
diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 3931c216..8d0276e8 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -12,3 +12,26 @@ function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat)
   ypred = logŷ .- log.(sum(exp.(logŷ), 1))
   -sum(y .* ypred) / size(y, 2)
 end
+
+"""
+    layernormalization(α=1.0, β=0.0)
+
+Creates a normalization layer based on https://arxiv.org/pdf/1607.06450.pdf
+
+The differences are:
+
+1) the std here divides by N-1 (as `std` does in Julia), whereas the paper divides by N
+2) here α and β are constant scalars (i.e. not learnable vectors)
+
+To achieve the same effect with learnable vectors α and β one can use
+the ElementwiseLinear layer.
+"""
+function layernormalization(α=1.0, β=0.0)
+  function layer(y)
+    _mean = mean(y)
+    _std = sqrt.(sum((y.-_mean).^2) ./ (length(y)-1))
+    _std /= α
+    _mean -= β*_std
+    return (y .- _mean) ./ _std
+  end
+end
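
A rough usage sketch for the new layer (not part of the patch; it assumes the patch is merged so `ElementwiseLinear` is reachable from Flux, either exported or as `Flux.ElementwiseLinear`):

    using Flux

    el = ElementwiseLinear(5)   # learnable α and β, each a vector of length 5
    x  = rand(5, 3)             # batch of 3 vectors
    y  = el(x)                  # 5×3 result of α .* x .+ β, broadcast over the batch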
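
And, as the `layernormalization` docstring suggests, the paper's learnable gain and bias can be recovered by composing the two additions; again a sketch under the same assumptions, relying on `Chain` accepting any callable:

    ln = Chain(layernormalization(), ElementwiseLinear(5))
    h  = randn(5, 4)            # 5 features, 4 samples
    ln(h)                       # normalize, then apply learnable per-element α and β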