Add layer normalization
This commit is contained in:
parent
979949d01a
commit
11d53781b2
@ -78,3 +78,33 @@ function Base.show(io::IO, l::Dense)
|
|||||||
l.σ == identity || print(io, ", ", l.σ)
|
l.σ == identity || print(io, ", ", l.σ)
|
||||||
print(io, ")")
|
print(io, ")")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
ElementwiseLinear(in::Integer)
|
||||||
|
|
||||||
|
Creates an element-wise linear transformation layer with learnable
|
||||||
|
vectors α and β:
|
||||||
|
|
||||||
|
y = α .* x .+ b
|
||||||
|
|
||||||
|
The input `x` must be a vector of length `in`, or a batch of vectors represented
|
||||||
|
as an `in × N` matrix. The out `y` will be a vector or batch of length `in`.
|
||||||
|
"""
|
||||||
|
struct ElementwiseLinear{T}
|
||||||
|
α::T
|
||||||
|
β::T
|
||||||
|
end
|
||||||
|
|
||||||
|
# Convenience constructor: build learnable scale/shift vectors of length `in`.
# `initα`/`initβ` select the initializers (defaults: all-ones scale, all-zeros shift);
# both results are wrapped in `param` to mark them as trainable.
function ElementwiseLinear(in::Integer; initα = ones, initβ = zeros)
  scale = param(initα(in))
  shift = param(initβ(in))
  return ElementwiseLinear(scale, shift)
end
|
||||||
|
|
||||||
|
# NOTE(review): `treelike` is assumed to register ElementwiseLinear's fields
# (α, β) with Flux so they are traversed as trainable parameters — confirm
# against Flux's `treelike` definition.
treelike(ElementwiseLinear)
|
||||||
|
|
||||||
|
# Forward pass: element-wise scale by α and shift by β.
# Broadcasting lets `x` be a length-`in` vector or an `in × N` batch matrix.
(l::ElementwiseLinear)(x) = l.α .* x .+ l.β
|
||||||
|
|
||||||
|
# Compact one-line display, e.g. `ElementwiseLinear(10)` where 10 == length(α).
function Base.show(io::IO, layer::ElementwiseLinear)
  print(io, "ElementwiseLinear(")
  print(io, length(layer.α))
  print(io, ")")
end
|
||||||
|
@ -12,3 +12,26 @@ function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat)
|
|||||||
ypred = logŷ .- log.(sum(exp.(logŷ), 1))
|
ypred = logŷ .- log.(sum(exp.(logŷ), 1))
|
||||||
-sum(y .* ypred) / size(y, 2)
|
-sum(y .* ypred) / size(y, 2)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
layernormalization(α=1.0, β=0.0)
|
||||||
|
|
||||||
|
Creates a normalization layer based on https://arxiv.org/pdf/1607.06450.pdf
|
||||||
|
|
||||||
|
The differences are:
|
||||||
|
|
||||||
|
1) std here divides by N-1 (as does std in Julia) vs the paper N
|
||||||
|
2) this layer α and β are constant numbers (i.e. not learnable vectors)
|
||||||
|
|
||||||
|
To achieve the same effect of learnable vectors α and β oe can use
|
||||||
|
the ElementwiseLinear layer
|
||||||
|
"""
|
||||||
|
function layernormalization(α=1.0, β=0.0)
|
||||||
|
function layer(y)
|
||||||
|
_mean = mean(y)
|
||||||
|
_std = sqrt.(sum((y.-_mean).^2) ./ (length(y)-1))
|
||||||
|
_std /= α
|
||||||
|
_mean -= β*_std
|
||||||
|
return (y .- _mean) ./ _std
|
||||||
|
end
|
||||||
|
end
|
||||||
|
Loading…
Reference in New Issue
Block a user