diff --git a/src/data/fashion-mnist.jl b/src/data/fashion-mnist.jl index da78b605..5eaa1b29 100644 --- a/src/data/fashion-mnist.jl +++ b/src/data/fashion-mnist.jl @@ -33,9 +33,10 @@ const TESTLABELS = joinpath(dir, "t10k-labels-idx1-ubyte") Load the Fashion-MNIST images. -Each image is a 28×28 array of `Gray` colour values (see Colors.jl). +Each image is a 28×28 array of `Gray` colour values +(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)). -Returns the 60,000 training images by default; pass `:test` to retreive the +Return the 60,000 training images by default; pass `:test` to retrieve the 10,000 test images. """ function images(set = :train) @@ -49,10 +50,10 @@ end labels() labels(:test) -Load the labels corresponding to each of the images returned from `images()`. +Load the labels corresponding to each of the images returned from [`images()`](@ref). Each label is a number from 0-9. -Returns the 60,000 training labels by default; pass `:test` to retreive the +Return the 60,000 training labels by default; pass `:test` to retrieve the 10,000 test labels. """ function labels(set = :train) diff --git a/src/data/iris.jl b/src/data/iris.jl index f74e0709..76609677 100644 --- a/src/data/iris.jl +++ b/src/data/iris.jl @@ -2,13 +2,12 @@ Fisher's classic iris dataset. Measurements from 3 different species of iris: setosa, versicolor and -virginica. There are 50 examples of each species. +virginica. There are 50 examples of each species. -There are 4 measurements for each example: sepal length, sepal width, petal -length and petal width. The measurements are in centimeters. +There are 4 measurements for each example: sepal length, sepal width, +petal length and petal width. The measurements are in centimeters. The module retrieves the data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/iris). - """ module Iris @@ -33,9 +32,7 @@ end Get the labels of the iris dataset, a 150 element array of strings listing the species of each example. -```jldoctest -julia> using Flux - +```jldoctest; setup = :(Flux.Data.Iris.load()) julia> labels = Flux.Data.Iris.labels(); julia> summary(labels) @@ -54,13 +51,11 @@ end """ features() -Get the features of the iris dataset. This is a 4x150 matrix of Float64 -elements. It has a row for each feature (sepal length, sepal width, +Get the features of the iris dataset. This is a 4x150 matrix of Float64 +elements. It has a row for each feature (sepal length, sepal width, petal length, petal width) and a column for each example. -```jldoctest -julia> using Flux - +```jldoctest; setup = :(Flux.Data.Iris.load()) julia> features = Flux.Data.Iris.features(); julia> summary(features) diff --git a/src/data/mnist.jl b/src/data/mnist.jl index b9c0540a..909814e0 100644 --- a/src/data/mnist.jl +++ b/src/data/mnist.jl @@ -83,9 +83,10 @@ getfeatures(io::IO, index::Integer) = vec(getimage(io, index)) Load the MNIST images. -Each image is a 28×28 array of `Gray` colour values (see Colors.jl). +Each image is a 28×28 array of `Gray` colour values +(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)). -Returns the 60,000 training images by default; pass `:test` to retreive the +Return the 60,000 training images by default; pass `:test` to retrieve the 10,000 test images. """ function images(set = :train) @@ -99,10 +100,10 @@ end labels() labels(:test) -Load the labels corresponding to each of the images returned from `images()`. +Load the labels corresponding to each of the images returned from [`images()`](@ref). 
Each label is a number from 0-9. -Returns the 60,000 training labels by default; pass `:test` to retreive the +Return the 60,000 training labels by default; pass `:test` to retrieve the 10,000 test labels. """ function labels(set = :train) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 96d67b45..4b0b4726 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -4,17 +4,23 @@ Chain multiple layers / functions together, so that they are called in sequence on a given input. -```julia -m = Chain(x -> x^2, x -> x+1) -m(5) == 26 - -m = Chain(Dense(10, 5), Dense(5, 2)) -x = rand(10) -m(x) == m[2](m[1](x)) -``` - `Chain` also supports indexing and slicing, e.g. `m[2]` or `m[1:end-1]`. `m[1:3](x)` will calculate the output of the first three layers. + +# Examples +```jldoctest +julia> m = Chain(x -> x^2, x -> x+1); + +julia> m(5) == 26 +true + +julia> m = Chain(Dense(10, 5), Dense(5, 2)); + +julia> x = rand(10); + +julia> m(x) == m[2](m[1](x)) +true +``` """ struct Chain{T<:Tuple} layers::T @@ -60,6 +66,7 @@ outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers))( # only slightly changed to better handle interaction with Zygote @dsweber2 """ activations(c::Chain, input) + Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) @@ -78,14 +85,15 @@ extraChain(::Tuple{}, x) = () """ Dense(in::Integer, out::Integer, σ = identity) -Creates a traditional `Dense` layer with parameters `W` and `b`. +Create a traditional `Dense` layer with parameters `W` and `b`. y = σ.(W * x .+ b) The input `x` must be a vector of length `in`, or a batch of vectors represented as an `in × N` matrix. The out `y` will be a vector or batch of length `out`. -```julia +# Examples +```jldoctest; setup = :(using Random; Random.seed!(0)) julia> d = Dense(5, 2) Dense(5, 2) @@ -145,7 +153,7 @@ outdims(l::Dense, isize) = (size(l.W)[1],) """ Diagonal(in::Integer) -Creates an element-wise linear transformation layer with learnable +Create an element-wise linear transformation layer with learnable vectors `α` and `β`: y = α .* x .+ β @@ -176,8 +184,8 @@ outdims(l::Diagonal, isize) = (length(l.α),) """ Maxout(over) -`Maxout` is a neural network layer, which has a number of internal layers, -which all have the same input, and the maxout returns the elementwise maximium +`Maxout` is a neural network layer which has a number of internal layers +which all receive the same input. The layer returns the elementwise maximium of the internal layers' outputs. Maxout over linear dense layers satisfies the univeral approximation theorem. @@ -196,17 +204,18 @@ end """ Maxout(f, n_alts) -Constructs a Maxout layer over `n_alts` instances of the layer given by `f`. -The function takes no arguement and should return some callable layer. -Conventionally this is a linear dense layer. +Construct a Maxout layer over `n_alts` instances of the layer given by `f`. +The function takes no arguments and should return some callable layer. +Conventionally, this is a linear dense layer. -For example the following example which -will construct a `Maxout` layer over 4 internal dense linear layers, -each identical in structure (784 inputs, 128 outputs). 
+# Examples + +This constructs a `Maxout` layer over 4 internal dense linear layers, each +identical in structure (784 inputs, 128 outputs): ```julia - insize = 784 - outsize = 128 - Maxout(()->Dense(insize, outsize), 4) +insize = 784 +outsize = 128 +Maxout(()->Dense(insize, outsize), 4) ``` """ function Maxout(f, n_alts) @@ -223,16 +232,18 @@ end outdims(l::Maxout, isize) = outdims(first(l.over), isize) """ - SkipConnection(layers, connection) + SkipConnection(layer, connection) -Creates a Skip Connection, of a layer or `Chain` of consecutive layers -plus a shortcut connection. The connection function will combine the result of the layers -with the original input, to give the final output. +Create a skip connection which consists of a layer or `Chain` of consecutive +layers and a shortcut connection linking the block's input to the output +through a user-supplied 2-argument callable. The first argument to the callable +will be propagated through the given `layer` while the second is the unchanged, +"skipped" input. -The simplest 'ResNet'-type connection is just `SkipConnection(layer, +)`, +The simplest "ResNet"-type connection is just `SkipConnection(layer, +)`, and requires the output of the layers to be the same shape as the input. Here is a more complicated example: -``` +```julia m = Conv((3,3), 4=>7, pad=(1,1)) x = ones(5,5,4,10); size(m(x)) == (5, 5, 7, 10) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 742091a6..60666aa2 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -8,25 +8,26 @@ _convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+ expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) """ - Conv(size, in=>out) - Conv(size, in=>out, relu) + Conv(size, in => out, σ = identity; init = glorot_uniform, + stride = 1, pad = 0, dilation = 1) Standard convolutional layer. `size` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. -Example: Applying Conv layer to a 1-channel input using a 2x2 window size, - giving us a 16-channel output. Output is activated with ReLU. - - size = (2,2) - in = 1 - out = 16 - Conv((2, 2), 1=>16, relu) - Data should be stored in WHCN order (width, height, # channels, batch size). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `pad`, `stride` and `dilation`. +# Examples + +Apply a `Conv` layer to a 1-channel input using a 2×2 window size, giving us a +16-channel output. Output is activated with ReLU. +```julia +size = (2,2) +in = 1 +out = 16 +Conv(size, in => out, relu) +``` """ struct Conv{N,M,F,A,V} σ::F @@ -76,8 +77,8 @@ end """ outdims(l::Conv, isize::Tuple) -Calculate the output dimensions given the input dimensions, `isize`. -Batch size and channel size are ignored as per `NNlib.jl`. +Calculate the output dimensions given the input dimensions `isize`. +Batch size and channel size are ignored as per [NNlib.jl](https://github.com/FluxML/NNlib.jl). ```julia m = Conv((3, 3), 3 => 16) @@ -89,17 +90,15 @@ outdims(l::Conv, isize) = output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ - ConvTranspose(size, in=>out) - ConvTranspose(size, in=>out, relu) + ConvTranspose(size, in => out, σ = identity; init = glorot_uniform, + stride = 1, pad = 0, dilation = 1) Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`. 
`in` and `out` specify the number of input and output channels respectively. -Data should be stored in WHCN order (width, height, # channels, # batches). +Data should be stored in WHCN order (width, height, # channels, batch size). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. - -Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct ConvTranspose{N,M,F,A,V} σ::F @@ -165,18 +164,16 @@ end outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad) """ - DepthwiseConv(size, in=>out) - DepthwiseConv(size, in=>out, relu) + DepthwiseConv(size, in => out, σ = identity; init = glorot_uniform, + stride = 1, pad = 0, dilation = 1) Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. Note that `out` must be an integer multiple of `in`. -Data should be stored in WHCN order (width, height, # channels, # batches). +Data should be stored in WHCN order (width, height, # channels, batch size). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. - -Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct DepthwiseConv{N,M,F,A,V} σ::F @@ -233,25 +230,26 @@ outdims(l::DepthwiseConv, isize) = output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ - CrossCor(size, in=>out) - CrossCor(size, in=>out, relu) + CrossCor(size, in => out, σ = identity; init = glorot_uniform, + stride = 1, pad = 0, dilation = 1) Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. -Example: Applying CrossCor layer to a 1-channel input using a 2x2 window size, - giving us a 16-channel output. Output is activated with ReLU. - - size = (2,2) - in = 1 - out = 16 - CrossCor((2, 2), 1=>16, relu) - -Data should be stored in WHCN order (width, height, # channels, # batches). +Data should be stored in WHCN order (width, height, # channels, batch size). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `pad`, `stride` and `dilation`. +# Examples + +Apply a `CrossCor` layer to a 1-channel input using a 2×2 window size, giving us a +16-channel output. Output is activated with ReLU. +```julia +size = (2,2) +in = 1 +out = 16 +CrossCor((2, 2), 1=>16, relu) +``` """ struct CrossCor{N,M,F,A,V} σ::F @@ -357,11 +355,9 @@ function Base.show(io::IO, g::GlobalMeanPool) end """ - MaxPool(k) + MaxPool(k; pad = 0, stride = k) -Max pooling layer. `k` stands for the size of the window for each dimension of the input. - -Takes the keyword arguments `pad` and `stride`. +Max pooling layer. `k` is the size of the window for each dimension of the input. """ struct MaxPool{N,M} k::NTuple{N,Int} @@ -388,11 +384,9 @@ end outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) """ - MeanPool(k) + MeanPool(k; pad = 0, stride = k) -Mean pooling layer. `k` stands for the size of the window for each dimension of the input. - -Takes the keyword arguments `pad` and `stride`. +Mean pooling layer. `k` is the size of the window for each dimension of the input. 
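+
+# Examples
+A minimal usage sketch (the 2×2 window and 28×28 input size are only illustrative):
+```julia
+m = MeanPool((2, 2))             # 2×2 window; stride defaults to the window size
+x = rand(Float32, 28, 28, 1, 1)  # a single 1-channel image in WHCN order
+size(m(x)) == (14, 14, 1, 1)
+```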
""" struct MeanPool{N,M} k::NTuple{N,Int} diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 3828748f..76d312bf 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -10,14 +10,14 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(s _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0) """ - dropout(p, dims = :) + dropout(x, p; dims = :) -Dropout function. For each input, either sets that input to `0` (with probability -`p`) or scales it by `1/(1-p)`. The `dims` argument is to specify the unbroadcasted -dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is -used as a regularisation, i.e. it reduces overfitting during training. - -See also [`Dropout`](@ref). +The dropout function. For each input, either sets that input to `0` (with probability +`p`) or scales it by `1 / (1 - p)`. `dims` specifies the unbroadcasted dimensions, +e.g. `dims=1` applies dropout along columns and `dims=2` along rows. +This is used as a regularisation, i.e. it reduces overfitting during training. + +See also the [`Dropout`](@ref) layer. """ dropout(x, p; dims = :) = x @@ -32,7 +32,7 @@ end A Dropout layer. In the forward pass, applies the [`dropout`](@ref) function on the input. -Does nothing to the input once [`testmode!`](@ref) is true. +Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. """ mutable struct Dropout{F,D} p::F @@ -64,9 +64,9 @@ end """ AlphaDropout(p) - -A dropout layer. It is used in Self-Normalizing Neural Networks. -(https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf) + +A dropout layer. It is used in +[Self-Normalizing Neural Networks](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf). The AlphaDropout layer ensures that mean and variance of activations remains the same as before. Does nothing to the input once [`testmode!`](@ref) is true. @@ -100,8 +100,8 @@ testmode!(m::AlphaDropout, mode = true) = LayerNorm(h::Integer) A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be -used with recurrent hidden states of size `h`. Normalises the mean/stddev of -each input before applying a per-neuron gain/bias. +used with recurrent hidden states of size `h`. Normalises the mean and standard +deviation of each input before applying a per-neuron gain/bias. """ struct LayerNorm{T} diag::Diagonal{T} @@ -139,7 +139,7 @@ Use [`testmode!`](@ref) during inference. See [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/pdf/1502.03167.pdf). -Example: +# Examples ```julia m = Chain( Dense(28^2, 64), @@ -234,7 +234,7 @@ Use [`testmode!`](@ref) during inference. See [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022). -Example: +# Examples ```julia m = Chain( Dense(28^2, 64), @@ -316,28 +316,27 @@ function Base.show(io::IO, l::InstanceNorm) end """ -Group Normalization. -This layer can outperform Batch-Normalization and Instance-Normalization. + GroupNorm(chs::Integer, G::Integer, λ = identity; + initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), + ϵ = 1f-5, momentum = 0.1f0) - GroupNorm(chs::Integer, G::Integer, λ = identity; - initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), - ϵ = 1f-5, momentum = 0.1f0) +[Group Normalization](https://arxiv.org/pdf/1803.08494.pdf) layer. +This layer can outperform Batch Normalization and Instance Normalization. 
-``chs`` is the number of channels, the channel dimension of your input. -For an array of N dimensions, the (N-1)th index is the channel dimension. +`chs` is the number of channels, the channel dimension of your input. +For an array of N dimensions, the `N-1`th index is the channel dimension. -``G`` is the number of groups along which the statistics would be computed. +`G` is the number of groups along which the statistics are computed. The number of channels must be an integer multiple of the number of groups. Use [`testmode!`](@ref) during inference. -Example: -``` +# Examples +```julia m = Chain(Conv((3,3), 1=>32, leakyrelu;pad = 1), - GroupNorm(32,16)) # 32 channels, 16 groups (G = 16), thus 2 channels per group used + GroupNorm(32,16)) + # 32 channels, 16 groups (G = 16), thus 2 channels per group used ``` - -Link : https://arxiv.org/pdf/1803.08494.pdf """ mutable struct GroupNorm{F,V,W,N,T} G::T # number of groups diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 647dda25..05466b31 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -12,7 +12,7 @@ in the background. `cell` should be a model of the form: h, y = cell(h, x...) -For example, here's a recurrent network that keeps a running total of its inputs. +For example, here's a recurrent network that keeps a running total of its inputs: ```julia accum(h, x) = (h+x, x) @@ -135,8 +135,8 @@ Base.show(io::IO, l::LSTMCell) = """ LSTM(in::Integer, out::Integer) -Long Short Term Memory recurrent layer. Behaves like an RNN but generally -exhibits a longer memory span over sequences. +[Long Short Term Memory](https://www.researchgate.net/publication/13853244_Long_Short-term_Memory) +recurrent layer. Behaves like an RNN but generally exhibits a longer memory span over sequences. See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) for a good overview of the internals. @@ -176,8 +176,8 @@ Base.show(io::IO, l::GRUCell) = """ GRU(in::Integer, out::Integer) -Gated Recurrent Unit layer. Behaves like an RNN but generally -exhibits a longer memory span over sequences. +[Gated Recurrent Unit](https://arxiv.org/abs/1406.1078) layer. Behaves like an +RNN but generally exhibits a longer memory span over sequences. See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) for a good overview of the internals. diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index b598fdd4..b566c683 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -73,7 +73,7 @@ computed as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`. `weight` can be `Nothing`, a `Number` or an `AbstractVector`. `weight=nothing` acts like `weight=1` but is faster. -See also [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref). +See also: [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref) # Examples ```jldoctest @@ -86,13 +86,13 @@ crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _cros """ logitcrossentropy(ŷ, y; weight = 1) -Return the crossentropy computed after a [`logsoftmax`](@ref) operation; +Return the crossentropy computed after a [`Flux.logsoftmax`](@ref) operation; computed as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`. `logitcrossentropy(ŷ, y)` is mathematically equivalent to -[`crossentropy(softmax(log(ŷ)), y)`](@ref) but it is more numerically stable. +[`Flux.crossentropy(softmax(log(ŷ)), y)`](@ref) but it is more numerically stable. 
-See also [`crossentropy`](@ref), [`binarycrossentropy`](@ref). +See also: [`Flux.crossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref) # Examples ```jldoctest @@ -107,9 +107,20 @@ end """ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) -Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerical stability. +Return ``-y*\\log(ŷ + ϵ) - (1-y)*\\log(1-ŷ + ϵ)``. The `ϵ` term provides numerical stability. Typically, the prediction `ŷ` is given by the output of a [`sigmoid`](@ref) activation. + +See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref) + +# Examples +```jldoctest +julia> Flux.binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0]) +3-element Array{Float64,1}: + 1.424397097347566 + 0.35231664672364077 + 0.8616703662235441 +``` """ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) @@ -119,10 +130,19 @@ CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 """ logitbinarycrossentropy(ŷ, y) -`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to `binarycrossentropy(σ(ŷ), y)` -but it is more numerically stable. +`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to +[`Flux.binarycrossentropy(σ(log(ŷ)), y)`](@ref) but it is more numerically stable. -See also [`binarycrossentropy`](@ref), [`sigmoid`](@ref), [`logsigmoid`](@ref). +See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref) + +# Examples +```jldoctest +julia> Flux.logitbinarycrossentropy.([-1.1491, 0.8619, 0.3127], [1, 1, 0]) +3-element Array{Float64,1}: + 1.4243970973475661 + 0.35231664672364094 + 0.8616703662235443 +``` """ logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ) @@ -132,26 +152,27 @@ CuArrays.@cufunc logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ) """ normalise(x; dims=1) -Normalises `x` to mean 0 and standard deviation 1, across the dimensions given by `dims`. Defaults to normalising over columns. +Normalise `x` to mean 0 and standard deviation 1 across the dimensions given by `dims`. +Defaults to normalising over columns. -```julia-repl +```jldoctest julia> a = reshape(collect(1:9), 3, 3) 3×3 Array{Int64,2}: - 1 4 7 - 2 5 8 - 3 6 9 + 1 4 7 + 2 5 8 + 3 6 9 -julia> normalise(a) +julia> Flux.normalise(a) 3×3 Array{Float64,2}: - -1.22474 -1.22474 -1.22474 + -1.22474 -1.22474 -1.22474 0.0 0.0 0.0 1.22474 1.22474 1.22474 -julia> normalise(a, dims=2) +julia> Flux.normalise(a, dims=2) 3×3 Array{Float64,2}: - -1.22474 0.0 1.22474 - -1.22474 0.0 1.22474 - -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 ``` """ function normalise(x::AbstractArray; dims=1) @@ -191,7 +212,7 @@ Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 o Returns `sum((max.(0, 1 .- ŷ .* y))) / size(y, 2)` [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss) -See also [`squared_hinge`](@ref). +See also: [`squared_hinge`](@ref) """ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) * 1 // size(y, 2) @@ -201,7 +222,7 @@ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) * 1 // size(y, 2) Computes squared hinge loss given the prediction `ŷ` and true labels `y` (conatining 1 or -1). Returns `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)` -See also [`hinge`](@ref). 
+See also: [`hinge`](@ref) """ squared_hinge(ŷ, y) = sum((max.(0, 1 .- ŷ .* y)).^2) * 1 // size(y, 2) diff --git a/src/onehot.jl b/src/onehot.jl index b480d9c0..7a046dc1 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -45,22 +45,20 @@ cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.d """ onehot(l, labels[, unk]) -Create an [`OneHotVector`](@ref) wtih `l`-th element be `true` based on possible `labels` set. -If `unk` is given, it retruns `onehot(unk, labels)` if the input label `l` is not find in `labels`; otherwise -it will error. - -## Examples +Create a [`OneHotVector`](@ref) with its `l`-th element `true` based on +possible `labels` set. +If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found +in `labels`; otherwise it will error. +# Examples ```jldoctest -julia> using Flux: onehot - -julia> onehot(:b, [:a, :b, :c]) +julia> Flux.onehot(:b, [:a, :b, :c]) 3-element Flux.OneHotVector: 0 1 0 -julia> onehot(:c, [:a, :b, :c]) +julia> Flux.onehot(:c, [:a, :b, :c]) 3-element Flux.OneHotVector: 0 0 @@ -85,12 +83,9 @@ end Create an [`OneHotMatrix`](@ref) with a batch of labels based on possible `labels` set, returns the `onehot(unk, labels)` if given labels `ls` is not found in set `labels`. -## Examples - +# Examples ```jldoctest -julia> using Flux: onehotbatch - -julia> onehotbatch([:b, :a, :b], [:a, :b, :c]) +julia> Flux.onehotbatch([:b, :a, :b], [:a, :b, :c]) 3×3 Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}: 0 1 0 1 0 1 @@ -107,13 +102,12 @@ Base.argmax(xs::OneHotVector) = xs.ix Inverse operations of [`onehot`](@ref). +# Examples ```jldoctest -julia> using Flux: onecold - -julia> onecold([true, false, false], [:a, :b, :c]) +julia> Flux.onecold([true, false, false], [:a, :b, :c]) :a -julia> onecold([0.3, 0.2, 0.5], [:a, :b, :c]) +julia> Flux.onecold([0.3, 0.2, 0.5], [:a, :b, :c]) :c ``` """ diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 7db5bff5..4f121edf 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -6,19 +6,20 @@ const ϵ = 1e-8 # TODO: should use weak refs """ - Descent(η) + Descent(η = 0.1) Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` -## Parameters - - Learning Rate (η): The amount by which the gradients are discounted before updating the weights. Defaults to `0.1`. +# Parameters + - Learning rate (`η`): Amount by which the gradients are discounted before updating + the weights. -## Example -```julia-repl -opt = Descent() # uses default η (0.1) +# Examples +```julia +opt = Descent() -opt = Descent(0.3) # use provided η +opt = Descent(0.3) ps = params(model) @@ -40,17 +41,19 @@ function apply!(o::Descent, x, Δ) end """ - Momentum(η, ρ) + Momentum(η = 0.01, ρ = 0.9) -Gradient descent with learning rate `η` and momentum `ρ`. +Gradient descent optimizer with learning rate `η` and momentum `ρ`. -## Parameters - - Learning Rate (`η`): Amount by which gradients are discounted before updating the weights. Defaults to `0.01`. - - Momentum (`ρ`): Parameter that accelerates descent in the relevant direction and dampens oscillations. Defaults to `0.9`. +# Parameters + - Learning rate (`η`): Amount by which gradients are discounted before updating the + weights. + - Momentum (`ρ`): Controls the acceleration of gradient descent in the relevant direction + and therefore the dampening of oscillations. 
-## Examples +# Examples ```julia -opt = Momentum() # uses defaults of η = 0.01 and ρ = 0.9 +opt = Momentum() opt = Momentum(0.01, 0.99) ``` @@ -71,17 +74,18 @@ function apply!(o::Momentum, x, Δ) end """ - Nesterov(η, ρ) + Nesterov(η = 0.001, ρ = 0.9) -Gradient descent with learning rate `η` and Nesterov momentum `ρ`. +Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`. -## Parameters - - Learning Rate (η): Amount by which the gradients are dicsounted berfore updating the weights. Defaults to `0.001`. - - Nesterov Momentum (ρ): Parameters controlling the amount of nesterov momentum to be applied. Defaults to `0.9`. +# Parameters + - Learning rate (`η`): Amount by which the gradients are discounted before updating the + weights. + - Nesterov momentum (`ρ`): The amount of Nesterov momentum to be applied. -## Examples +# Examples ```julia -opt = Nesterov() # uses defaults η = 0.001 and ρ = 0.9 +opt = Nesterov() opt = Nesterov(0.003, 0.95) ``` @@ -103,23 +107,23 @@ function apply!(o::Nesterov, x, Δ) end """ - RMSProp(η, ρ) + RMSProp(η = 0.001, ρ = 0.9) -Implements the RMSProp algortihm. Often a good choice for recurrent networks. Parameters other than learning rate generally don't need tuning. +Optimizer using the +[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) +algorithm. Often a good choice for recurrent networks. Parameters other than learning rate +generally don't need tuning. -## Parameters - - Learning Rate (η): Defaults to `0.001`. - - Rho (ρ): Defaults to `0.9`. +# Parameters + - Learning rate (`η`) + - Momentum (`ρ`) -## Examples +# Examples ```julia -opt = RMSProp() # uses default η = 0.001 and ρ = 0.9 +opt = RMSProp() opt = RMSProp(0.002, 0.95) ``` - -## References -[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) """ mutable struct RMSProp eta::Float64 @@ -137,23 +141,21 @@ function apply!(o::RMSProp, x, Δ) end """ - ADAM(η, β::Tuple) + ADAM(η = 0.001, β::Tuple = (0.9, 0.999)) -Implements the ADAM optimiser. +[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. -## Paramters - - Learning Rate (`η`): Defaults to `0.001`. - - Beta (`β::Tuple`): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. - -## Examples +# Parameters + - Learning rate (`η`) + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. +# Examples ```julia -opt = ADAM() # uses the default η = 0.001 and β = (0.9, 0.999) +opt = ADAM() opt = ADAM(0.001, (0.9, 0.8)) ``` -## References -[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. """ mutable struct ADAM eta::Float64 @@ -174,24 +176,21 @@ function apply!(o::ADAM, x, Δ) end """ - RADAM(η, β::Tuple) + RADAM(η = 0.001, β::Tuple = (0.9, 0.999)) -Implements the rectified ADAM optimizer. +[Rectified ADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimizer. -## Parameters - - Learning Rate (η): Defaults to `0.001` - - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. - -## Examples +# Parameters + - Learning rate (`η`) + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. +# Examples ```julia -opt = RADAM() # uses the default η = 0.001 and β = (0.9, 0.999) +opt = RADAM() opt = RADAM(0.001, (0.9, 0.8)) ``` - -## References -[RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM). 
""" mutable struct RADAM eta::Float64 @@ -219,22 +218,21 @@ function apply!(o::RADAM, x, Δ) end """ - AdaMax(η, β::Tuple) + AdaMax(η = 0.001, β::Tuple = (0.9, 0.999)) -Variant of ADAM based on ∞-norm. +[AdaMax](https://arxiv.org/abs/1412.6980v9) is a variant of ADAM based on the ∞-norm. -## Parameters - - Learning Rate (η): Defaults to `0.001` - - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. +# Parameters + - Learning rate (`η`) + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. -## Examples +# Examples ```julia -opt = AdaMax() # uses default η and β +opt = AdaMax() opt = AdaMax(0.001, (0.9, 0.995)) ``` -## References -[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. """ mutable struct AdaMax eta::Float64 @@ -255,23 +253,21 @@ function apply!(o::AdaMax, x, Δ) end """ - ADAGrad(η) + ADAGrad(η = 0.1) -Implements AdaGrad. It has parameter specific learning rates based on how frequently it is updated. +[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has +parameter specific learning rates based on how frequently it is updated. +Parameters don't need tuning. -## Parameters - - Learning Rate (η): Defaults to `0.1` +# Parameters + - Learning rate (`η`) -## Examples +# Examples ```julia -opt = ADAGrad() # uses default η = 0.1 +opt = ADAGrad() opt = ADAGrad(0.001) ``` - -## References -[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. -Parameters don't need tuning. """ mutable struct ADAGrad eta::Float64 @@ -288,21 +284,21 @@ function apply!(o::ADAGrad, x, Δ) end """ - ADADelta(ρ) + ADADelta(ρ = 0.9) -Version of ADAGrad that adapts learning rate based on a window of past gradient updates. Parameters don't need tuning. +[ADADelta](https://arxiv.org/abs/1212.5701) is a version of ADAGrad adapting its learning +rate based on a window of past gradient updates. +Parameters don't need tuning. -## Parameters - - Rho (ρ): Factor by which gradient is decayed at each time step. Defaults to `0.9`. +# Parameters + - Rho (`ρ`): Factor by which gradient is decayed at each time step. -## Examples +# Examples ```julia -opt = ADADelta() # uses default ρ = 0.9 +opt = ADADelta() + opt = ADADelta(0.89) ``` - -## References -[ADADelta](https://arxiv.org/abs/1212.5701) optimiser. """ mutable struct ADADelta rho::Float64 @@ -321,22 +317,22 @@ function apply!(o::ADADelta, x, Δ) end """ - AMSGrad(η, β::Tuple) + AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999)) -Implements AMSGrad version of the ADAM optimiser. Parameters don't need tuning. +The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM +optimiser. Parameters don't need tuning. -## Parameters - - Learning Rate (η): Defaults to `0.001`. - - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. +# Parameters + - Learning Rate (`η`) + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. -## Examples +# Examples ```julia -opt = AMSGrad() # uses default η and β +opt = AMSGrad() + opt = AMSGrad(0.001, (0.89, 0.995)) ``` - -## References -[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. """ mutable struct AMSGrad eta::Float64 @@ -356,22 +352,22 @@ function apply!(o::AMSGrad, x, Δ) end """ - NADAM(η, β::Tuple) + NADAM(η = 0.001, β::Tuple = (0.9, 0.999)) -Nesterov variant of ADAM. Parameters don't need tuning. 
+[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) is a Nesterov variant of ADAM. +Parameters don't need tuning. -## Parameters - - Learning Rate (η): Defaults to `0.001`. - - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. +# Parameters + - Learning rate (`η`) + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. -## Examples +# Examples ```julia -opt = NADAM() # uses default η and β +opt = NADAM() + opt = NADAM(0.002, (0.89, 0.995)) ``` - -## References -[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. """ mutable struct NADAM eta::Float64 @@ -392,23 +388,23 @@ function apply!(o::NADAM, x, Δ) end """ - ADAMW(η, β::Tuple, decay) + ADAMW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0) -Variant of ADAM defined by fixing weight decay regularization. +[ADAMW](https://arxiv.org/abs/1711.05101) is a variant of ADAM fixing (as in repairing) its +weight decay regularization. -## Parameters - - Learning Rate (η): Defaults to `0.001`. - - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to (0.9, 0.999). - - decay: Decay applied to weights during optimisation. Defaults to 0. +# Parameters + - Learning rate (`η`) + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. + - `decay`: Decay applied to weights during optimisation. -## Examples +# Examples ```julia -opt = ADAMW() # uses default η, β and decay +opt = ADAMW() + opt = ADAMW(0.001, (0.89, 0.995), 0.1) ``` - -## References -[ADAMW](https://arxiv.org/abs/1711.05101) """ ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) = Optimiser(ADAM(η, β), WeightDecay(decay)) @@ -441,14 +437,13 @@ function apply!(o::Optimiser, x, Δ) end """ - InvDecay(γ) + InvDecay(γ = 0.001) -Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified. +Apply inverse time decay to an optimiser, so that the effective step size at +iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. +The wrapped optimiser's step size is not modified. -## Parameters - - gamma (γ): Defaults to `0.001` - -## Example +# Examples ```julia Optimiser(InvDecay(..), Opt(..)) ``` @@ -469,20 +464,23 @@ function apply!(o::InvDecay, x, Δ) end """ - ExpDecay(eta, decay, decay_step, clip) + ExpDecay(eta = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4) -Discount the learning rate `eta` by a multiplicative factor `decay` every `decay_step` till a minimum of `clip`. +Discount the learning rate `eta` by the factor `decay` every `decay_step` steps till +a minimum of `clip`. -## Parameters - - Learning Rate (eta): Defaults to `0.001`. - - decay: Factor by which the learning rate is discounted. Defaults to `0.1`. - - decay_step: Schedules decay operations by setting number of steps between two decay operations. Defaults to `1000`. - - clip: Minimum value of learning rate. Defaults to `1e-4`. +# Parameters + - Learning rate (`eta`) + - `decay`: Factor by which the learning rate is discounted. + - `decay_step`: Schedule decay operations by setting number of steps between two decay + operations. + - `clip`: Minimum value of learning rate. 
-## Example +# Examples To apply exponential decay to an optimiser: ```julia Optimiser(ExpDecay(..), Opt(..)) + opt = Optimiser(ExpDecay(), ADAM()) ``` """ @@ -507,12 +505,12 @@ function apply!(o::ExpDecay, x, Δ) end """ - WeightDecay(wd) + WeightDecay(wd = 0) -Decays the weight by `wd` +Decay weights by `wd`. -## Parameters - - weight decay (wd): 0 +# Parameters + - Weight decay (`wd`) """ mutable struct WeightDecay wd::Real diff --git a/src/optimise/train.jl b/src/optimise/train.jl index e12ab27b..9c3c29bd 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -43,9 +43,8 @@ struct StopException <: Exception end Call `Flux.stop()` in a callback to indicate when a callback condition is met. This would trigger the train loop to stop and exit. +# Examples ```julia -# Example callback: - cb = function () accuracy() > 0.9 && Flux.stop() end @@ -65,12 +64,12 @@ In case datapoints `d` are of numeric array type, assumes no splatting is needed and computes the gradient of `loss(d)`. Takes a callback as keyword argument `cb`. For example, this will print "training" -every 10 seconds: +every 10 seconds (using [`throttle`](@ref)): train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10)) -The callback can call `Flux.stop()` to interrupt the training loop. +The callback can call [`Flux.stop()`](@ref) to interrupt the training loop. Multiple optimisers and callbacks can be passed to `opt` and `cb` as arrays. """ @@ -106,11 +105,12 @@ end Run `body` `N` times. Mainly useful for quickly doing multiple epochs of training in a REPL. -```julia -julia> @epochs 2 println("hello") -INFO: Epoch 1 +# Examples +```jldoctest +julia> Flux.@epochs 2 println("hello") +[ Info: Epoch 1 hello -INFO: Epoch 2 +[ Info: Epoch 2 hello ``` """ diff --git a/src/utils.jl b/src/utils.jl index 25be1063..40f0ae9c 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -125,8 +125,9 @@ unstack(xs, dim) = [copy(selectdim(xs, dim, i)) for i in 1:size(xs, dim)] Split `xs` into `n` parts. -```julia -julia> chunk(1:10, 3) +# Examples +```jldoctest +julia> Flux.chunk(1:10, 3) 3-element Array{Array{Int64,1},1}: [1, 2, 3, 4] [5, 6, 7, 8] @@ -142,11 +143,12 @@ batchindex(xs, i) = (reverse(Base.tail(reverse(axes(xs))))..., i) Count the number of times that each element of `xs` appears. -```julia -julia> frequencies(['a','b','b']) +# Examples +```jldoctest +julia> Flux.frequencies(['a','b','b']) Dict{Char,Int64} with 2 entries: - 'b' => 2 'a' => 1 + 'b' => 2 ``` """ function frequencies(xs) @@ -166,8 +168,9 @@ squeezebatch(x) = reshape(x, head(size(x))) Batch the arrays in `xs` into a single array. -```julia -julia> batch([[1,2,3],[4,5,6]]) +# Examples +```jldoctest +julia> Flux.batch([[1,2,3],[4,5,6]]) 3×2 Array{Int64,2}: 1 4 2 5 @@ -211,8 +214,9 @@ Base.rpad(v::AbstractVector, n::Integer, p) = [v; fill(p, max(n - length(v), 0)) Take a list of `N` sequences, and turn them into a single sequence where each item is a batch of `N`. Short sequences will be padded by `pad`. -```julia -julia> batchseq([[1, 2, 3], [4, 5]], 0) +# Examples +```jldoctest +julia> Flux.batchseq([[1, 2, 3], [4, 5]], 0) 3-element Array{Array{Int64,1},1}: [1, 4] [2, 5] @@ -269,11 +273,15 @@ end # Other """ -Returns a function that when invoked, will only be triggered at most once -during `timeout` seconds. Normally, the throttled function will run -as much as it can, without ever going more than once per `wait` duration; -but if you'd like to disable the execution on the leading edge, pass -`leading=false`. 
To enable execution on the trailing edge, ditto. + throttle(f, timeout; leading=true, trailing=false) + +Return a function that when invoked, will only be triggered at most once +during `timeout` seconds. + +Normally, the throttled function will run as much as it can, without ever +going more than once per `wait` duration; but if you'd like to disable the +execution on the leading edge, pass `leading=false`. To enable execution on +the trailing edge, pass `trailing=true`. """ function throttle(f, timeout; leading=true, trailing=false) cooldown = true