# Flux.jl/src/layers/basic.jl

"""
Chain(layers...)
2017-08-19 19:52:29 +00:00
2017-09-08 21:52:41 +00:00
Chain multiple layers / functions together, so that they are called in sequence
on a given input.
2017-10-18 14:44:06 +00:00
```julia
m = Chain(x -> x^2, x -> x+1)
m(5) == 26
2017-09-08 21:52:41 +00:00
2017-10-18 14:44:06 +00:00
m = Chain(Dense(10, 5), Dense(5, 2))
x = rand(10)
m(x) == m[2](m[1](x))
```
2017-09-08 21:52:41 +00:00
`Chain` also supports indexing and slicing, e.g. `m[2]` or `m[1:end-1]`.
2017-09-10 00:02:48 +00:00
`m[1:3](x)` will calculate the output of the first three layers.
2017-09-08 21:52:41 +00:00
"""
struct Chain{T<:Tuple}
  layers::T
  Chain(xs...) = new{typeof(xs)}(xs)
end

@forward Chain.layers Base.getindex, Base.length, Base.first, Base.last,
  Base.iterate, Base.lastindex

children(c::Chain) = c.layers
mapchildren(f, c::Chain) = Chain(f.(c.layers)...)
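# The `children`/`mapchildren` hooks above are what let Flux's generic utilities
# traverse a `Chain`'s sub-layers. A rough usage sketch (assuming the standard
# Flux API of this version, e.g. `Flux.params` and `gpu`):
#
#   m = Chain(Dense(10, 5), Dense(5, 2))
#   Flux.params(m)       # collects W and b from both Dense layers
#   mapchildren(gpu, m)  # rebuilds the Chain with every layer moved to the GPU
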
applychain(::Tuple{}, x) = x
applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x))
(c::Chain)(x) = applychain(c.layers, x)
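# `applychain` above peels off one layer at a time, so a call unrolls into nested
# applications. An illustrative expansion (not actual code):
#
#   c = Chain(f, g, h)
#   c(x)  ->  applychain((f, g, h), x)
#         ->  applychain((g, h), f(x))
#         ->  applychain((h,), g(f(x)))
#         ->  h(g(f(x)))
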
Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...)

function Base.show(io::IO, c::Chain)
  print(io, "Chain(")
  join(io, c.layers, ", ")
  print(io, ")")
end

activations(c::Chain, x) = accumulate((x, m) -> m(x), c.layers, init = x)
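# `activations` above collects the output of every layer in turn, which is useful
# for inspecting intermediate representations. A minimal sketch (illustrative sizes):
#
#   m = Chain(Dense(10, 5, relu), Dense(5, 2))
#   x = rand(10)
#   as = activations(m, x)  # one entry per layer; as[end] == m(x)
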
"""
Dense(in::Integer, out::Integer, σ = identity)
Creates a traditional `Dense` layer with parameters `W` and `b`.
2017-08-19 19:52:29 +00:00
2017-09-08 21:52:41 +00:00
y = σ.(W * x .+ b)
2017-09-09 23:58:32 +00:00
The input `x` must be a vector of length `in`, or a batch of vectors represented
2017-10-18 11:48:58 +00:00
as an `in × N` matrix. The out `y` will be a vector or batch of length `out`.
2017-10-18 11:47:45 +00:00
2017-10-18 14:44:06 +00:00
```julia
julia> d = Dense(5, 2)
Dense(5, 2)
2017-10-18 11:47:45 +00:00
2017-10-18 14:44:06 +00:00
julia> d(rand(5))
Tracked 2-element Array{Float64,1}:
0.00257447
-0.00449443
```
2017-09-08 21:52:41 +00:00
"""
struct Dense{F,S,T}
  W::S
  b::T
  σ::F
end

Dense(W, b) = Dense(W, b, identity)

function Dense(in::Integer, out::Integer, σ = identity;
               initW = glorot_uniform, initb = zeros)
  return Dense(param(initW(out, in)), param(initb(out)), σ)
end
@treelike Dense

function (a::Dense)(x::AbstractArray)
  W, b, σ = a.W, a.b, a.σ
  σ.(W*x .+ b)
end
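
# Since the forward pass is just `σ.(W*x .+ b)`, a batch can be passed as a matrix
# with one sample per column. Illustrative sketch (sizes are arbitrary):
#
#   d = Dense(10, 5, relu)
#   xs = rand(10, 64)  # 64 samples of length 10
#   d(xs)              # 5×64 output, one column per sample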

function Base.show(io::IO, l::Dense)
  print(io, "Dense(", size(l.W, 2), ", ", size(l.W, 1))
  l.σ == identity || print(io, ", ", l.σ)
  print(io, ")")
end

# Try to avoid hitting generic matmul in some simple cases
# Base's matmul is so slow that it's worth the extra conversion to hit BLAS
(a::Dense{<:Any,W})(x::AbstractArray{T}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
  invoke(a, Tuple{AbstractArray}, x)

(a::Dense{<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
  a(T.(x))
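
# Concretely: if `a.W` is a Float32 matrix and `x` has some other Real eltype
# (e.g. Int or Float64), the second method converts `x` with `T.(x)` so that the
# multiplication hits the BLAS path instead of generic matmul.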
"""
2017-10-23 11:53:07 +00:00
Diagonal(in::Integer)
2017-10-10 20:33:37 +00:00
Creates an element-wise linear transformation layer with learnable
2017-11-21 16:04:04 +00:00
vectors `α` and `β`:
2017-10-10 20:33:37 +00:00
2017-11-21 16:04:04 +00:00
y = α .* x .+ β
2017-10-10 20:33:37 +00:00
2017-10-23 11:53:07 +00:00
The input `x` must be a array where `size(x, 1) == in`.
2017-10-10 20:33:37 +00:00
"""
struct Diagonal{T}
  α::T
  β::T
end

Diagonal(in::Integer; initα = ones, initβ = zeros) =
  Diagonal(param(initα(in)), param(initβ(in)))
@treelike Diagonal

function (a::Diagonal)(x)
  α, β = a.α, a.β
  α.*x .+ β
end
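
# A minimal usage sketch (illustrative values):
#
#   d = Diagonal(5)
#   x = rand(5)
#   d(x)  # == d.α .* x .+ d.β, an elementwise scale-and-shift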

function Base.show(io::IO, l::Diagonal)
  print(io, "Diagonal(", length(l.α), ")")
end
"""
MaxOut(over)
MaxOut is a neural network layer, which has a number of internal layers,
which all have the same input, and the max out returns the elementwise maximium
of the internal layers' outputs.
Maxout over linear dense layers satisfies the univeral approximation theorem.
Reference:
Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville, and Yoshua Bengio.
2013. Maxout networks.
In Proceedings of the 30th International Conference on International Conference on Machine Learning - Volume 28 (ICML'13),
Sanjoy Dasgupta and David McAllester (Eds.), Vol. 28. JMLR.org III-1319-III-1327.
https://arxiv.org/pdf/1302.4389.pdf
"""
struct MaxOut{FS<:Tuple}
  over::FS
end
"""
MaxOut(f, n_alts, args...; kwargs...)
Constructs a MaxOut layer over `n_alts` instances of the layer given by `f`.
All other arguements (`args` & `kwargs`) are passed to the constructor `f`.
For example the followeExample usage
will construct a MaxOut layer over 4 dense linear layers,
each identical in structure (784 inputs, 128 outputs).
```julia
insize = 784
outsie = 128
MaxOut(Dense, 4, insize, outsize)
```
"""
function MaxOut(f, n_alts, args...; kwargs...)
  over = Tuple(f(args...; kwargs...) for _ in 1:n_alts)
  return MaxOut(over)
end
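
# A rough usage sketch (illustrative sizes):
#
#   mo = MaxOut(Dense, 4, 10, 5)  # maxout over four Dense(10, 5) layers
#   x = rand(10)
#   mo(x)                         # elementwise max of the four layers' outputs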
function (mo::MaxOut)(input::AbstractArray)
  mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over)
end