diff --git a/src/layers/basic.jl b/src/layers/basic.jl
index 969a261c..03a340df 100644
--- a/src/layers/basic.jl
+++ b/src/layers/basic.jl
@@ -78,3 +78,33 @@ function Base.show(io::IO, l::Dense)
   l.σ == identity || print(io, ", ", l.σ)
   print(io, ")")
 end
+
+"""
+    ElementwiseLinear(in::Integer)
+
+Creates an element-wise linear transformation layer with learnable
+vectors α and β:
+
+    y = α .* x .+ β
+
+The input `x` must be a vector of length `in`, or a batch of vectors represented
+as an `in × N` matrix. The output `y` will be a vector or batch of the same size.
+"""
+struct ElementwiseLinear{T}
+  α::T
+  β::T
+end
+
+ElementwiseLinear(in::Integer; initα = ones, initβ = zeros) =
+  ElementwiseLinear(param(initα(in)), param(initβ(in)))
+
+treelike(ElementwiseLinear)
+
+function (a::ElementwiseLinear)(x)
+  α, β = a.α, a.β
+  α.*x .+ β
+end
+
+function Base.show(io::IO, l::ElementwiseLinear)
+  print(io, "ElementwiseLinear(", length(l.α), ")")
+end
diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 3931c216..8d0276e8 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -12,3 +12,26 @@ function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat)
   ypred = logŷ .- log.(sum(exp.(logŷ), 1))
   -sum(y .* ypred) / size(y, 2)
 end
+
+"""
+    layernormalization(α=1.0, β=0.0)
+
+Creates a normalization layer based on https://arxiv.org/pdf/1607.06450.pdf
+
+The differences are:
+
+1) the std here divides by N-1 (as `std` does in Julia), whereas the paper divides by N
+2) here α and β are constant scalars (i.e. not learnable vectors)
+
+To achieve the same effect with learnable vectors α and β one can use
+the ElementwiseLinear layer.
+"""
+function layernormalization(α=1.0, β=0.0)
+  function layer(y)
+    _mean = mean(y)
+    _std = sqrt.(sum((y.-_mean).^2) ./ (length(y)-1))
+    _std /= α
+    _mean -= β*_std
+    return (y .- _mean) ./ _std
+  end
+end
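
A rough usage sketch for the new layer (not part of the patch; it assumes the patch is merged so `ElementwiseLinear` is reachable from Flux, either exported or as `Flux.ElementwiseLinear`):

    using Flux

    el = ElementwiseLinear(5)   # learnable α and β, each a vector of length 5
    x  = rand(5, 3)             # batch of 3 vectors
    y  = el(x)                  # 5×3 result of α .* x .+ β, broadcast over the batch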
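
And, as the `layernormalization` docstring suggests, the paper's learnable gain and bias can be recovered by composing the two additions; again a sketch under the same assumptions, relying on `Chain` accepting any callable:

    ln = Chain(layernormalization(), ElementwiseLinear(5))
    h  = randn(5, 4)            # 5 features, 4 samples
    ln(h)                       # normalize, then apply learnable per-element α and β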