From fc123d6279677de37ede997bb7decff68ac791a2 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 20 Oct 2019 13:35:41 +0200 Subject: [PATCH 001/113] Add SamePad for conv layers --- src/Flux.jl | 2 +- src/layers/conv.jl | 38 ++++++++++++++++++++++++++++++++++---- test/layers/conv.jl | 17 +++++++++++++++++ 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index 95bdcd32..c6fda5dc 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -9,7 +9,7 @@ using MacroTools: @forward using Zygote: Params, @adjoint, gradient, pullback export gradient -export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, +export Chain, Dense, Maxout, RNN, LSTM, GRU, SamePad, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, SkipConnection, params, fmap, cpu, gpu, f32, f64 diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 519f129f..4a7f916c 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -2,6 +2,28 @@ using NNlib: conv, ∇conv_data, depthwiseconv expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) + +""" + SamePad + +Padding for convolutional layers will be calculated so that outputshape == inputshape when stride = 1. + +For stride > 1 the output shape depends on the type of convolution layer. +""" +struct SamePad end + +calc_padding(pad, k::NTuple{N,T}, dilation, stride) where {T,N}= expand(Val(2*(N-2)), pad) +function calc_padding(::SamePad, k, dilation, stride) + #Formula from Relationship 14 in http://deeplearning.net/software/theano_versions/dev/tutorial/conv_arithmetic.html + + # Effective kernel size, including dilation + k_eff = @. k + (k - 1) * (dilation - 1) + # How much total padding needs to be applied? + pad_amt = @. k_eff - 1 + # In case amount of padding is odd we need to apply different amounts to each side. + return Tuple(mapfoldl(i -> [ceil(Int, i/2), i ÷ 2], vcat, pad_amt)) +end + """ Conv(size, in=>out) Conv(size, in=>out, relu) @@ -22,6 +44,8 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. Takes the keyword arguments `pad`, `stride` and `dilation`. + +Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride """ struct Conv{N,M,F,A,V} σ::F @@ -35,8 +59,8 @@ end function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) - pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) + pad = calc_padding(pad, size(w)[1:N-2], dilation, stride) return Conv(σ, w, b, stride, pad, dilation) end @@ -79,6 +103,8 @@ Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. Takes the keyword arguments `pad`, `stride` and `dilation`. + +Use `pad=SamePad()` to apply padding so that outputsize == stride * inputsize - stride + 1 """ struct ConvTranspose{N,M,F,A,V} σ::F @@ -92,8 +118,8 @@ end function ConvTranspose(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) - pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) + pad = calc_padding(pad, size(w)[1:N-2], dilation, stride) return ConvTranspose(σ, w, b, stride, pad, dilation) end @@ -149,6 +175,8 @@ Data should be stored in WHCN order. 
In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. Takes the keyword arguments `pad`, `stride` and `dilation`. + +Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride """ struct DepthwiseConv{N,M,F,A,V} σ::F @@ -162,8 +190,8 @@ end function DepthwiseConv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) - pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) + pad = calc_padding(pad, size(w)[1:N-2], dilation, stride) return DepthwiseConv(σ, w, b, stride, pad, dilation) end @@ -221,6 +249,8 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. Takes the keyword arguments `pad`, `stride` and `dilation`. + +Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride """ struct CrossCor{N,M,F,A,V} σ::F @@ -234,8 +264,8 @@ end function CrossCor(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) - pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) + pad = calc_padding(pad, size(w)[1:N-2], dilation, stride) return CrossCor(σ, w, b, stride, pad, dilation) end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index aa3925f1..d65d9fee 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -102,3 +102,20 @@ end true end end + +@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, CrossCor), k in ( (1,), (2,), (3,), (4,5), (6,7,8)) + data = ones(Float32, (k .+ 3)..., 1,1) + l = ltype(k, 1=>1, pad=SamePad()) + @test size(l(data)) == size(data) + + l = ltype(k, 1=>1, pad=SamePad(), dilation = k .÷ 2) + @test size(l(data)) == size(data) + + stride = 3 + l = ltype(k, 1=>1, pad=SamePad(), stride = stride) + if ltype == ConvTranspose + @test size(l(data))[1:end-2] == stride .* size(data)[1:end-2] .- stride .- 1 + else + @test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ stride) + end +end From 411ce5dbd873c455163a2a310336eb252745adf5 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 20 Oct 2019 13:43:39 +0200 Subject: [PATCH 002/113] Add SamePad for pooling layers --- src/layers/conv.jl | 9 ++++++--- test/layers/conv.jl | 9 ++++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 4a7f916c..2b465d65 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -308,6 +308,8 @@ end Max pooling layer. `k` stands for the size of the window for each dimension of the input. Takes the keyword arguments `pad` and `stride`. + +Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride """ struct MaxPool{N,M} k::NTuple{N,Int} @@ -317,8 +319,7 @@ end function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N stride = expand(Val(N), stride) - pad = expand(Val(2*N), pad) - + pad = calc_padding(pad, k, 1, stride) return MaxPool(k, pad, stride) end @@ -337,6 +338,8 @@ end Mean pooling layer. `k` stands for the size of the window for each dimension of the input. Takes the keyword arguments `pad` and `stride`. 
+ +Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride """ struct MeanPool{N,M} k::NTuple{N,Int} @@ -346,7 +349,7 @@ end function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N stride = expand(Val(N), stride) - pad = expand(Val(2*N), pad) + pad = calc_padding(pad, k, 1, stride) return MeanPool(k, pad, stride) end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index d65d9fee..75098660 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -114,8 +114,15 @@ end stride = 3 l = ltype(k, 1=>1, pad=SamePad(), stride = stride) if ltype == ConvTranspose - @test size(l(data))[1:end-2] == stride .* size(data)[1:end-2] .- stride .- 1 + @test size(l(data))[1:end-2] == stride .* size(data)[1:end-2] .- stride .+ 1 else @test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ stride) end end + +@testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), k in ( (1,), (2,), (3,), (4,5), (6,7,8)) + data = ones(Float32, (k .+ 3)..., 1,1) + + l = ltype(k, pad=SamePad()) + @test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ k) +end From 530d4edb679eb684c3a03503ef66dd15c762166a Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 20 Oct 2019 15:15:30 +0200 Subject: [PATCH 003/113] Fix for reading comprehension error (dim is not always 2 * (N-2)) Fix for ambiguous method sig --- src/layers/conv.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 2b465d65..58a913da 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -12,8 +12,8 @@ For stride > 1 the output shape depends on the type of convolution layer. """ struct SamePad end -calc_padding(pad, k::NTuple{N,T}, dilation, stride) where {T,N}= expand(Val(2*(N-2)), pad) -function calc_padding(::SamePad, k, dilation, stride) +calc_padding(pad, k::NTuple{N,T}, dilation, stride) where {T,N}= expand(Val(2*N), pad) +function calc_padding(::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T} #Formula from Relationship 14 in http://deeplearning.net/software/theano_versions/dev/tutorial/conv_arithmetic.html # Effective kernel size, including dilation From 9803826a368fa3f527e9c2682876f168e11f75fc Mon Sep 17 00:00:00 2001 From: Chris Rackauckas Date: Mon, 20 Jan 2020 13:53:28 -0500 Subject: [PATCH 004/113] test restructure on the GPU Requires https://github.com/FluxML/Zygote.jl/pull/474 --- test/cuda/cuda.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 1576d88f..911eef93 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -58,6 +58,13 @@ end @test y[3,:] isa CuArray end +@testset "restructure gpu" begin + dudt = Dense(1,1) |> gpu + p,re = Flux.destructure(dudt) + foo(x) = sum(re(p)(x)) + @test gradient(foo, cu(rand(1)))[1] isa CuArray +end + if CuArrays.has_cudnn() @info "Testing Flux/CUDNN" include("cudnn.jl") From 7ac647a7ac83ad688863082c6f37a72279200e36 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Wed, 5 Feb 2020 22:29:15 +0530 Subject: [PATCH 005/113] Added loss functions --- src/layers/stateless.jl | 117 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 159a8385..1324f62c 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -2,9 +2,91 @@ using CuArrays using NNlib: logsoftmax, logσ # Cost functions +""" + mae(ŷ, y) +L1 loss function. 
Computes the mean of absolute error between prediction and true values +""" +mae(ŷ, y) = sum(abs.(ŷ, y)) * 1 // length(y) + +""" + mse(ŷ, y) +L2 loss function. Computes the mean of the squared errors between prediction and true values +""" mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) + +""" + mean_squared_logarithmic_error(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) + +L2 loss function. Returns the mean of the squared logarithmic errors of prediction ŷ, and true values y. The ϵ1 and ϵ2 terms provide numerical stability. +(Computes mean of squared(log(predicted values)-log(true value)). This error penalizes an under-predicted estimate greater than an over-predicted estimate. + + ```julia + julia> y_=[14726,327378,74734] + 3-element Array{Int64,1}: + 14726 + 327378 + 74734 + + julia> y = [12466.1,16353.95,16367.98] + 3-element Array{Float64,1}: + 12466.1 + 16353.95 + 16367.98 + + julia> mean_squared_logarithmic_error(y,y_) + 3.771271382334686 + ``` +Alias: + msle(ŷ,y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) + +""" +mean_squared_logarithmic_error(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) +#Alias +msle(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) + + + +""" + huber_loss(ŷ, y,delta=1.0) + +Computes the mean of the Huber loss between prediction ŷ and true values y. By default, delta is set to 1.0. +[Huber Loss](https://en.wikipedia.org/wiki/Huber_loss). + + ```julia + julia> y = [1.2636,1.25,1.73] + 3-element Array{Float64,1}: + 1.2636 + 1.25 + 1.73 + + julia> y_= [-1.376,0,3.37] + 3-element Array{Float64,1}: + -1.376 + 0.0 + 3.37 + + julia> huber_loss(y,y_) + 0.7131999999999998 + ``` + +""" +function huber_loss(ŷ, y,delta=1.0) + abs_error = abs.(ŷ.-y) + hub_loss =0 + for i in 1:length(y) + if (abs_error[i]<=delta) + hub_loss+=abs_error[i]^2*0.5 + else + hub_loss+=delta*(abs_error[i]-0.5*delta) + end + + return hub_loss*1//length(y) + end +end + + function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing) return -sum(y .* log.(ŷ)) * 1 // size(y, 2) end @@ -17,8 +99,32 @@ function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Abstr return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2) end +""" + crossentropy(ŷ, y, weight) + +Computes crossentropy loss over the prediction ŷ and true labels y(expected `onehot` encoded). 'weight' parameter allows to set the class weights while calculating loss. +It can be a number or a vector of class weights. By default, weight is set to nothing. + + ```julia + julia> ŷ = [0.33 .11 .98;0.11 0.34 0.11] + 2×3 Array{Float64,2}: + 0.33 0.11 0.98 + 0.11 0.34 0.11 + + julia> y = [1 0 0;0 1 0] + 2×3 Array{Int64,2}: + 1 0 0 + 0 1 0 + + julia> crossentropy(ŷ,y) + 0.7291574286311803 + ``` + +Note: If only two classes are there, better use binarycrossentropy(ŷ, y) function. +""" crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight) + function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2) end @@ -106,7 +212,16 @@ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ hinge(ŷ, y) -Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). + +L1 loss function. Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss). 
""" hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) + +""" + squared_hinge(ŷ, y) + +L2 loss function. Computes squared hinge loss over the prediction ŷ and true labels y(conatining 1 or -1) +""" +squared_hinge(ŷ, y) = sum((max.(0,1.-ŷ.*y)).^2) *1//size(y,2) + From 643086c8db3220cd08a8a4c12a760121f45c5a46 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Wed, 5 Feb 2020 22:40:07 +0530 Subject: [PATCH 006/113] Updated squared_hinge --- src/layers/stateless.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 1324f62c..8670a0b6 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -223,5 +223,5 @@ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) L2 loss function. Computes squared hinge loss over the prediction ŷ and true labels y(conatining 1 or -1) """ -squared_hinge(ŷ, y) = sum((max.(0,1.-ŷ.*y)).^2) *1//size(y,2) +squared_hinge(ŷ, y) = sum((max.(0,1 .-ŷ .* y)).^2) *1//size(y,2) From 44a977b7a4e8aaa1e84446d946b5ca95d43a09b3 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Wed, 5 Feb 2020 23:20:06 +0530 Subject: [PATCH 007/113] Added tests for new loss functions --- test/layers/stateless.jl | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 7cb8ed2e..fe553db0 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -12,6 +12,20 @@ const ϵ = 1e-7 @testset "mse" begin @test mse(ŷ, y) ≈ (.1^2 + .9^2)/2 end + + @testset "mae" begin + @test Flux.mae(ŷ, y) ≈ 1/2 + end + + @testset "huber_loss" begin + @test Flux.huber_loss(ŷ, y) ≈ 0.0012499999999999994 + end + + y = [123,456,789] + y1 = [345,332,789] + @testset "msle" begin + @test Flux.msle(y1, y) ≈ 0.38813985859136585 + end # Now onehot y's y = onehotbatch([1, 1, 0, 0], 0:1) @@ -64,18 +78,23 @@ const ϵ = 1e-7 @test Flux.hinge(y, 0.5 .* y) ≈ 0.125 end + @testset "squared_hinge" begin + @test Flux.squared_hinge(y, y1) ≈ 0 + @test Flux.squared_hinge(y, 0.5 .* y) ≈ 0.0625 + end + y = [0.1 0.2 0.3] y1 = [0.4 0.5 0.6] @testset "poisson" begin @test Flux.poisson(y, y1) ≈ 1.0160455586700767 @test Flux.poisson(y, y) ≈ 0.5044459776946685 end - + @testset "no spurious promotions" begin for T in (Float32, Float64) y = rand(T, 2) ŷ = rand(T, 2) - for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson) + for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson,Flux.mae,Flux.huber_loss,Flux.msle,Flux.squared_hinge) fwd, back = Flux.pullback(f, ŷ, y) @test fwd isa T @test eltype(back(one(T))[1]) == T From b5184553d44619e3fb4a32d1d35e42eea9699346 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Wed, 5 Feb 2020 23:32:55 +0530 Subject: [PATCH 008/113] Error correction in mae --- src/layers/stateless.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 8670a0b6..c0ac6ecb 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -6,7 +6,7 @@ using NNlib: logsoftmax, logσ mae(ŷ, y) L1 loss function. 
Computes the mean of absolute error between prediction and true values """ -mae(ŷ, y) = sum(abs.(ŷ, y)) * 1 // length(y) +mae(ŷ, y) = sum(abs.(ŷ.- y)) * 1 // length(y) """ From 7710bb0b4bbe90693dbc6110de9a1e7112ed2c79 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Thu, 6 Feb 2020 01:06:41 +0530 Subject: [PATCH 009/113] Removed spurious promotions --- src/layers/stateless.jl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index c0ac6ecb..e3bdfe00 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -6,7 +6,7 @@ using NNlib: logsoftmax, logσ mae(ŷ, y) L1 loss function. Computes the mean of absolute error between prediction and true values """ -mae(ŷ, y) = sum(abs.(ŷ.- y)) * 1 // length(y) +mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y) """ @@ -42,9 +42,9 @@ Alias: msle(ŷ,y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) """ -mean_squared_logarithmic_error(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) +mean_squared_logarithmic_error(ŷ, y;ϵ1=eps.(ŷ),ϵ2=eps.(eltype(ŷ).(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) #Alias -msle(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) +msle(ŷ, y;ϵ1=eps.(ŷ),ϵ2=eps.(eltype(ŷ).(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) @@ -74,12 +74,14 @@ Computes the mean of the Huber loss between prediction ŷ and true values y. By """ function huber_loss(ŷ, y,delta=1.0) abs_error = abs.(ŷ.-y) - hub_loss =0 + type_ = eltype(ŷ) + delta = type_(delta) + hub_loss =type_(0) for i in 1:length(y) if (abs_error[i]<=delta) - hub_loss+=abs_error[i]^2*0.5 + hub_loss+=abs_error[i]^2*type_(0.5) else - hub_loss+=delta*(abs_error[i]-0.5*delta) + hub_loss+=delta*(abs_error[i]-type_(0.5*delta)) end return hub_loss*1//length(y) From 659ba074d1e83075b23b4a3a9d5b09cb17551e4a Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Thu, 6 Feb 2020 01:21:51 +0530 Subject: [PATCH 010/113] Updated test for msle --- test/layers/stateless.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index fe553db0..d038bcda 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -21,8 +21,8 @@ const ϵ = 1e-7 @test Flux.huber_loss(ŷ, y) ≈ 0.0012499999999999994 end - y = [123,456,789] - y1 = [345,332,789] + y = [123.0,456.0,789.0] + y1 = [345.0,332.0,789.0] @testset "msle" begin @test Flux.msle(y1, y) ≈ 0.38813985859136585 end From 197a1a70c09deba9f4d5ae1bf74bc12a86314288 Mon Sep 17 00:00:00 2001 From: pranjaldatta Date: Fri, 7 Feb 2020 03:47:19 +0530 Subject: [PATCH 011/113] added BostonHousing dataset and testing --- src/data/Data.jl | 3 + src/data/housing.jl | 136 ++++++++++++++++++++++++++++++++++++++++++++ test/data.jl | 8 ++- 3 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 src/data/housing.jl diff --git a/src/data/Data.jl b/src/data/Data.jl index d7cd0303..88af9549 100644 --- a/src/data/Data.jl +++ b/src/data/Data.jl @@ -42,4 +42,7 @@ using .Sentiment include("iris.jl") export Iris +include("housing.jl") +export Housing + end diff --git a/src/data/housing.jl b/src/data/housing.jl new file mode 100644 index 00000000..0d167dc0 --- /dev/null +++ b/src/data/housing.jl @@ -0,0 +1,136 @@ +""" +1. Title: Boston Housing Data + +2. 
Sources: + (a) Origin: This dataset was taken from the StatLib library which is + maintained at Carnegie Mellon University. + (b) Creator: Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the + demand for clean air', J. Environ. Economics & Management, + vol.5, 81-102, 1978. + (c) Date: July 7, 1993 + +3. Number of Instances: 506 + +4. Number of Attributes: 13 continuous attributes (including "class" + attribute "MEDV"), 1 binary-valued attribute. + +5. Attribute Information: + + 1. CRIM per capita crime rate by town + 2. ZN proportion of residential land zoned for lots over + 25,000 sq.ft. + 3. INDUS proportion of non-retail business acres per town + 4. CHAS Charles River dummy variable (= 1 if tract bounds + river; 0 otherwise) + 5. NOX nitric oxides concentration (parts per 10 million) + 6. RM average number of rooms per dwelling + 7. AGE proportion of owner-occupied units built prior to 1940 + 8. DIS weighted distances to five Boston employment centres + 9. RAD index of accessibility to radial highways + 10. TAX full-value property-tax rate per 10,000 dollars + 11. PTRATIO pupil-teacher ratio by town + 12. B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks + by town + 13. LSTAT % lower status of the population + 14. MEDV Median value of owner-occupied homes in 1000's of dollars + + Downloaded From: https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data + +""" +module Housing + +using DelimitedFiles +using ..Data: deps, download_and_verify + +#Uncomment if package exists +#const cache_prefix = "https://cache.julialang.org/" +const cache_prefix = "" + +function load() + isfile(deps("housing.data")) && return + + @info "Downloading the Boston housing Dataset" + download_and_verify("$(cache_prefix)https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data", + deps("housing.data"), + "baadf72995725d76efe787b664e1f083388c79ba21ef9a7990d87f774184735a") + + #@info "Download complete. Working on the files" + path = deps() + isfile(deps("housing.data")) && touch(joinpath(path, "tempfile.data")) + open(joinpath(path, "tempfile.data"), "a") do fout + open(deps("housing.data"), "r") do fin + for line in eachline(fin) + line = replace(lstrip(line), r" +" => s",") + println(fout, line) + end + end + end + mv(joinpath(path, "tempfile.data"), deps("housing.data"), force=true) +end + +""" +Gets the targets for the Boston housing dataset, a 506 element array listing the targets for each example + +```jldoctest +julia> using Flux + +julia> target = Flux.Data.Housing.targets() + +julia> summary(target) +506×1 Array{Float64,2} + +julia> target[1] +24.0 + +""" +function targets() + load() + housing = readdlm(deps("housing.data"), ',') + reshape(Vector{Float64}(housing[1:end,end]), (506, 1)) +end + + +""" +Gets the names of the features provided in the dataset + +""" +function feature_names() + ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"] +end + + +""" +Gets the features of the Boston Housing Dataset. This is a 506x13 Matrix of Float64 datatypes. +The values are in the order ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"]. +It has 506 examples. 
+ +```jldoctest +julia> using Flux + +julia> features = Flux.Data.Housing.features() + +julia> summary(features) +506×13 Array{Float64,2} + +julia> features[1, :] +13-element Array{Float64,1}: +0.00632 +18.0 +2.31 +0.0 +0.538 + ⋮ +296.0 +15.3 +396.9 +4.98 + +""" +function features() + load() + housing = readdlm(deps("housing.data"), ',') + Matrix{Float64}(housing[1:end, 1:13]) +end + + +end \ No newline at end of file diff --git a/test/data.jl b/test/data.jl index 6b777873..aa913806 100644 --- a/test/data.jl +++ b/test/data.jl @@ -16,7 +16,13 @@ using Test @test Data.Sentiment.train() isa Vector{Data.Tree{Any}} @test Iris.features() isa Matrix -@test size(Iris.features()) == (4,150) +@test size(Iris.features()) == (4,150) @test Iris.labels() isa Vector{String} @test size(Iris.labels()) == (150,) + +@test Housing.features() isa Matrix +@test size(Housing.features()) == (506, 13) + +@test Housing.targets() isa Array{Float64} +@test size(Housing.targets()) == (506, 1) \ No newline at end of file From 37d58e16dd234b16fb59bb2dd9cf1a37f54fcbde Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sat, 8 Feb 2020 16:33:18 +0530 Subject: [PATCH 012/113] common questions answered in docs --- docs/src/models/basics.md | 18 ++++++++++++++++++ docs/src/training/training.md | 17 +++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index d83fc462..76f93684 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -219,3 +219,21 @@ Flux.@functor Affine ``` This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md). + +By default all the fields in the `Affine` type are collected as its parameters, however, in some cases it may be desired to hold other metadata in our "layers" that may not be needed for training, and are hence supposed to be ignored while the parameters are collected. With Flux, it is possible to mark the fields of our layers that are trainable in two ways. + +The first way of achieving this is through overloading the `trainable` function. + +```julia +Flux.trainable(a::Affine) = (a.W, a.b,) +``` + +To add other fields is simply to add them to the tuple. + +Another way of achieving this is through the `@functor` macro. Here, wee can mark the fields we are interested in like so: + +```julia +Flux.@functor Affine (W,) +``` + +However, doing this requires the `struct` to have a corresponding constructor that accepts those parameters. diff --git a/docs/src/training/training.md b/docs/src/training/training.md index b42db7c9..7680a776 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -41,6 +41,23 @@ The model to be trained must have a set of tracked parameters that are used to c Such an object contains a reference to the model's parameters, not a copy, such that after their training, the model behaves according to their updated values. +When it is desired to not include all the model parameters (for e.g. transfer learning), we can simply not pass in those layers into our call to `params`. + +Consider the simple multi-layer model where we want to omit optimising the second layer. This setup would look something like so: + +```julia +m = Chain( + Dense(784, 64, σ), + Dense(64, 32), + Dense(32, 10), softmax) + +ps = Flux.params(m[1], m[3:end]) +``` + +`ps` now holds a reference to only the parameters of the layers passed to it. 
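As a rough sketch of how this restricted parameter set is then used (the loss function, dummy data and optimiser below are placeholder assumptions for illustration, not part of this patch), passing `ps` to `train!` updates only the layers collected above and leaves the second `Dense` layer untouched:

```julia
using Flux

# `m` and `ps` are the Chain and restricted Params defined just above.
loss(x, y) = Flux.mse(m(x), y)                             # any differentiable loss works here
data = [(rand(Float32, 784, 10), rand(Float32, 10, 10))]   # one dummy batch of 10 samples
Flux.train!(loss, ps, data, Descent(0.1))                  # only the parameters in `ps` are updated
```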
+ +Handling all the parameters on a layer by layer basis is explained in the [Layer Helpers](../models/basics.md) section. + ## Datasets The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point: From ee6d950696e4f163ded8098624f4c8a79d978694 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 12 Feb 2020 11:25:50 +0530 Subject: [PATCH 013/113] Update docs/src/models/basics.md Co-Authored-By: Carlo Lucibello --- docs/src/models/basics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index 76f93684..8982fdfb 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -228,7 +228,7 @@ The first way of achieving this is through overloading the `trainable` function. Flux.trainable(a::Affine) = (a.W, a.b,) ``` -To add other fields is simply to add them to the tuple. +Only the fields returned by `trainable` will be collected as trainable parameters of the layer when calling `Flux.params`. Another way of achieving this is through the `@functor` macro. Here, wee can mark the fields we are interested in like so: From d5ed9a447858745a4551d646d54788ed94074c23 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 12 Feb 2020 11:26:11 +0530 Subject: [PATCH 014/113] Update docs/src/models/basics.md Co-Authored-By: Carlo Lucibello --- docs/src/models/basics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index 8982fdfb..3f43f29d 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -231,7 +231,7 @@ Flux.trainable(a::Affine) = (a.W, a.b,) Only the fields returned by `trainable` will be collected as trainable parameters of the layer when calling `Flux.params`. Another way of achieving this is through the `@functor` macro. Here, wee can mark the fields we are interested in like so: - +Another way of achieving this is through the `@functor` macro. Here, we can mark the fields we are interested in by grouping them in the second argument: ```julia Flux.@functor Affine (W,) ``` From 7c12af065a2d8fb20359321e34f3c0731ae5559f Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Thu, 20 Feb 2020 23:27:36 -0600 Subject: [PATCH 015/113] Added testmode! functionality back to normalization layers. --- src/Flux.jl | 2 +- src/layers/normalise.jl | 72 ++++++++++++++++++++++++++++++------ test/layers/normalisation.jl | 31 +++++++++------- 3 files changed, 79 insertions(+), 26 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index 9969b323..5f9878f3 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -11,7 +11,7 @@ export gradient export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, - SkipConnection, params, fmap, cpu, gpu, f32, f64 + SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode! include("optimise/Optimise.jl") using .Optimise diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index b421d3e7..ee6b6fdd 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -2,6 +2,23 @@ istraining() = false @adjoint istraining() = true, _ -> nothing +_isactive(m) = isnothing(m.active) ? istraining() : m.active +# @adjoint _isactive(m) = _isactive(m), Δ -> nothing + +""" + testmode!(m, mode = :auto) + +Set a layer or model's test mode (see below). 
+Using `:auto` mode will treat any gradient computation as training. + +Possible values include: +- `false` for training +- `true` for testing +- `:auto` or `nothing` for Flux to detect the mode automatically +""" +testmode!(m, mode) = nothing +testmode!(m::Chain, mode = :auto) = map(x -> testmode!(x, mode), m.layers) + _dropout_shape(s, ::Colon) = size(s) _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...) @@ -22,18 +39,27 @@ A Dropout layer. For each input, either sets that input to `0` (with probability `p`) or scales it by `1/(1-p)`. The `dims` argument is to specified the unbroadcasted dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is used as a regularisation, i.e. it reduces overfitting during training. see also [`dropout`](@ref). + +Does nothing to the input once [`testmode!`](@ref) is false. """ mutable struct Dropout{F,D} p::F dims::D + active::Union{Bool, Nothing} end function Dropout(p; dims = :) @assert 0 ≤ p ≤ 1 - Dropout{typeof(p),typeof(dims)}(p, dims) + Dropout{typeof(p),typeof(dims)}(p, dims, nothing) end -(a::Dropout)(x) = dropout(x, a.p; dims = a.dims) +function (a::Dropout)(x) + _isactive(a) || return x + return dropout(x, a.p; dims = a.dims) +end + +testmode!(m::Dropout, mode = :auto) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) function Base.show(io::IO, d::Dropout) print(io, "Dropout(", d.p) @@ -46,17 +72,20 @@ end A dropout layer. It is used in Self-Normalizing Neural Networks. (https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf) The AlphaDropout layer ensures that mean and variance of activations remains the same as before. + +Does nothing to the input once [`testmode!`](@ref) is false. """ mutable struct AlphaDropout{F} p::F - function AlphaDropout(p) + active::Union{Bool, Nothing} + function AlphaDropout(p, active = nothing) @assert 0 ≤ p ≤ 1 - new{typeof(p)}(p) + new{typeof(p)}(p, active) end end function (a::AlphaDropout)(x) - istraining() || return x + _isactive(a) || return x λ = eltype(x)(1.0507009873554804934193349852946) α = eltype(x)(1.6732632423543772848170429916717) α1 = eltype(x)(-λ*α) @@ -68,6 +97,9 @@ function (a::AlphaDropout)(x) return x end +testmode!(m::AlphaDropout, mode = :auto) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + """ LayerNorm(h::Integer) @@ -106,6 +138,8 @@ it's the usual channel dimension.) shifts them to have a new mean and variance (corresponding to the learnable, per-channel `bias` and `scale` parameters). +Use [`testmode!`](@ref) during inference. + See [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/pdf/1502.03167.pdf). @@ -127,12 +161,13 @@ mutable struct BatchNorm{F,V,W,N} σ²::W # moving std ϵ::N momentum::N + active::Union{Bool, Nothing} end BatchNorm(chs::Integer, λ = identity; initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = BatchNorm(λ, initβ(chs), initγ(chs), - zeros(chs), ones(chs), ϵ, momentum) + zeros(chs), ones(chs), ϵ, momentum, nothing) trainable(bn::BatchNorm) = (bn.β, bn.γ) @@ -145,7 +180,7 @@ function (BN::BatchNorm)(x) m = div(prod(size(x)), channels) γ = reshape(BN.γ, affine_shape...) β = reshape(BN.β, affine_shape...) - if !istraining() + if !_isactive(BN) μ = reshape(BN.μ, affine_shape...) σ² = reshape(BN.σ², affine_shape...) 
ϵ = BN.ϵ @@ -170,6 +205,9 @@ end @functor BatchNorm +testmode!(m::BatchNorm, mode = :auto) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + function Base.show(io::IO, l::BatchNorm) print(io, "BatchNorm($(join(size(l.β), ", "))") (l.λ == identity) || print(io, ", λ = $(l.λ)") @@ -193,6 +231,8 @@ it's the usual channel dimension.) shifts them to have a new mean and variance (corresponding to the learnable, per-channel `bias` and `scale` parameters). +Use [`testmode!`](@ref) during inference. + See [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022). Example: @@ -215,12 +255,13 @@ mutable struct InstanceNorm{F,V,W,N} σ²::W # moving std ϵ::N momentum::N + active::Union{Bool, Nothing} end InstanceNorm(chs::Integer, λ = identity; initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = InstanceNorm(λ, initβ(chs), initγ(chs), - zeros(chs), ones(chs), ϵ, momentum) + zeros(chs), ones(chs), ϵ, momentum, nothing) trainable(in::InstanceNorm) = (in.β, in.γ) @@ -237,7 +278,7 @@ function (in::InstanceNorm)(x) m = div(prod(size(x)), c*bs) γ, β = expand_inst(in.γ, affine_shape), expand_inst(in.β, affine_shape) - if !istraining() + if !_isactive(in) μ = expand_inst(in.μ, affine_shape) σ² = expand_inst(in.σ², affine_shape) ϵ = in.ϵ @@ -263,6 +304,9 @@ end @functor InstanceNorm +testmode!(m::InstanceNorm, mode = :auto) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + function Base.show(io::IO, l::InstanceNorm) print(io, "InstanceNorm($(join(size(l.β), ", "))") (l.λ == identity) || print(io, ", λ = $(l.λ)") @@ -283,6 +327,8 @@ For an array of N dimensions, the (N-1)th index is the channel dimension. ``G`` is the number of groups along which the statistics would be computed. The number of channels must be an integer multiple of the number of groups. +Use [`testmode!`](@ref) during inference. + Example: ``` m = Chain(Conv((3,3), 1=>32, leakyrelu;pad = 1), @@ -300,12 +346,13 @@ mutable struct GroupNorm{F,V,W,N,T} σ²::W # moving std ϵ::N momentum::N + active::Union{Bool, Nothing} end GroupNorm(chs::Integer, G::Integer, λ = identity; initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = GroupNorm(G, λ, initβ(chs), initγ(chs), - zeros(G,1), ones(G,1), ϵ, momentum) + zeros(G,1), ones(G,1), ϵ, momentum, nothing) trainable(gn::GroupNorm) = (gn.β, gn.γ) @@ -329,7 +376,7 @@ function(gn::GroupNorm)(x) β = reshape(gn.β, affine_shape...) y = reshape(x,((size(x))[1:end-2]...,channels_per_group,groups,batches)) - if !istraining() + if !_isactive(gn) og_shape = size(x) μ = reshape(gn.μ, μ_affine_shape...) # Shape : (1,1,...C/G,G,1) σ² = reshape(gn.σ², μ_affine_shape...) # Shape : (1,1,...C/G,G,1) @@ -360,6 +407,9 @@ end @functor GroupNorm +testmode!(m::GroupNorm, mode = :auto) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + function Base.show(io::IO, l::GroupNorm) print(io, "GroupNorm($(join(size(l.β), ", "))") (l.λ == identity) || print(io, ", λ = $(l.λ)") diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 4399a256..594fb586 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -1,30 +1,33 @@ using Flux, Test, Statistics using Zygote: pullback -trainmode(f, x...) = pullback(f, x...)[1] -trainmode(f) = (x...) -> trainmode(f, x...) +evalwgrad(f, x...) = pullback(f, x...)[1] +trainmode(f) = (testmode!(f, false); f) @testset "Dropout" begin x = [1.,2.,3.] 
@test x == Dropout(0.1)(x) - @test x == trainmode(Dropout(0), x) - @test zero(x) == trainmode(Dropout(1), x) + @test x == evalwgrad(Dropout(0), x) + @test zero(x) == evalwgrad(Dropout(1), x) x = rand(100) m = Dropout(0.9) - y = trainmode(m, x) + y = evalwgrad(m, x) @test count(a->a==0, y) > 50 - y = m(x) + testmode!(m, true) + y = evalwgrad(m, x) # should override istraining @test count(a->a==0, y) == 0 - y = trainmode(m, x) + testmode!(m, false) + y = evalwgrad(m, x) @test count(a->a==0, y) > 50 x = rand(Float32, 100) m = Chain(Dense(100,100), Dropout(0.9)) - y = trainmode(m, x) + y = evalwgrad(m, x) @test count(a->a == 0, y) > 50 - y = m(x) + testmode!(m, true) + y = evalwgrad(m, x) # should override istraining @test count(a->a == 0, y) == 0 x = rand(100, 50) @@ -49,7 +52,7 @@ end # initial m.σ is 1 # initial m.μ is 0 - y = trainmode(m, x) + y = evalwgrad(m, x) @test isapprox(y, [-1.22474 0 1.22474; -1.22474 0 1.22474], atol = 1.0e-5) # julia> x # 2×3 Array{Float64,2}: @@ -117,7 +120,7 @@ end x = Float64.(x) @test m.β == [0, 0] # initβ(2) @test m.γ == [1, 1] # initγ(2) - y = trainmode(m, x) + y = evalwgrad(m, x) #julia> x #[:, :, 1] = @@ -172,7 +175,7 @@ end # check that μ, σ², and the output are the correct size for higher rank tensors let m = InstanceNorm(2), sizes = (5, 5, 3, 4, 2, 6), x = reshape(Float32.(collect(1:prod(sizes))), sizes) - y = trainmode(m, x) + y = evalwgrad(m, x) @test size(m.μ) == (sizes[end - 1], ) @test size(m.σ²) == (sizes[end - 1], ) @test size(y) == sizes @@ -204,7 +207,7 @@ if VERSION >= v"1.1" @test m.β == [0, 0, 0, 0] # initβ(32) @test m.γ == [1, 1, 1, 1] # initγ(32) - y = trainmode(m, x) + y = evalwgrad(m, x) #julia> x #[:, :, 1] = @@ -273,7 +276,7 @@ if VERSION >= v"1.1" # check that μ, σ², and the output are the correct size for higher rank tensors let m = GroupNorm(4,2), sizes = (5, 5, 3, 4, 4, 6), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - y = trainmode(m, x) + y = evalwgrad(m, x) @test size(m.μ) == (m.G,1) @test size(m.σ²) == (m.G,1) @test size(y) == sizes From 924b8f49ec9a438d35159e4e8ad5fbd75f0654ba Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Fri, 21 Feb 2020 15:10:28 -0600 Subject: [PATCH 016/113] Updated to place function definitions in the appropriate places. --- src/functor.jl | 13 +++++++++++++ src/layers/basic.jl | 2 ++ src/layers/normalise.jl | 25 +++++-------------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index a36b5765..4edfbd98 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -39,6 +39,19 @@ end trainable(m) = functor(m)[1] +""" + testmode!(m, mode = true) + +Set a layer or model's test mode (see below). +Using `:auto` mode will treat any gradient computation as training. + +Possible values include: +- `false` for training +- `true` for testing +- `:auto` or `nothing` for Flux to detect the mode automatically +""" +testmode!(m, mode) = nothing + params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x) function params!(p::Params, x, seen = IdSet()) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 2a465208..6788f761 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -33,6 +33,8 @@ applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) 
+testmode!(m::Chain, mode = true) = map(x -> testmode!(x, mode), m.layers) + function Base.show(io::IO, c::Chain) print(io, "Chain(") join(io, c.layers, ", ") diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index ee6b6fdd..7b438bc2 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -3,21 +3,6 @@ istraining() = false @adjoint istraining() = true, _ -> nothing _isactive(m) = isnothing(m.active) ? istraining() : m.active -# @adjoint _isactive(m) = _isactive(m), Δ -> nothing - -""" - testmode!(m, mode = :auto) - -Set a layer or model's test mode (see below). -Using `:auto` mode will treat any gradient computation as training. - -Possible values include: -- `false` for training -- `true` for testing -- `:auto` or `nothing` for Flux to detect the mode automatically -""" -testmode!(m, mode) = nothing -testmode!(m::Chain, mode = :auto) = map(x -> testmode!(x, mode), m.layers) _dropout_shape(s, ::Colon) = size(s) _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...) @@ -58,7 +43,7 @@ function (a::Dropout)(x) return dropout(x, a.p; dims = a.dims) end -testmode!(m::Dropout, mode = :auto) = +testmode!(m::Dropout, mode = true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) function Base.show(io::IO, d::Dropout) @@ -97,7 +82,7 @@ function (a::AlphaDropout)(x) return x end -testmode!(m::AlphaDropout, mode = :auto) = +testmode!(m::AlphaDropout, mode = true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) """ @@ -205,7 +190,7 @@ end @functor BatchNorm -testmode!(m::BatchNorm, mode = :auto) = +testmode!(m::BatchNorm, mode = true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) function Base.show(io::IO, l::BatchNorm) @@ -304,7 +289,7 @@ end @functor InstanceNorm -testmode!(m::InstanceNorm, mode = :auto) = +testmode!(m::InstanceNorm, mode = true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) function Base.show(io::IO, l::InstanceNorm) @@ -407,7 +392,7 @@ end @functor GroupNorm -testmode!(m::GroupNorm, mode = :auto) = +testmode!(m::GroupNorm, mode = true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) function Base.show(io::IO, l::GroupNorm) From 4ed7d984db6167b5b4254588434566418037b375 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 25 Feb 2020 14:09:03 +0100 Subject: [PATCH 017/113] Adapt to CuArrays ArrayStyle changes. --- src/onehot.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/onehot.jl b/src/onehot.jl index 7a3123ec..b480d9c0 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -37,9 +37,9 @@ import Adapt: adapt, adapt_structure adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data)) -import .CuArrays: CuArray, cudaconvert +import .CuArrays: CuArray, CuArrayStyle, cudaconvert import Base.Broadcast: BroadcastStyle, ArrayStyle -BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}() +BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = CuArrayStyle{2}() cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data)) """ From ba5259a269f93b0dcf65dfca43b29b219bf81415 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Tue, 25 Feb 2020 13:53:49 -0600 Subject: [PATCH 018/113] Added docs on testmode! 
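A minimal usage sketch of the `testmode!` API these docs describe (the layer sizes here are arbitrary examples; behaviour follows the `testmode!` methods added to `Chain` and the normalisation layers above):

```julia
using Flux

m = Chain(Dense(10, 5, relu), Dropout(0.4), Dense(5, 2))

testmode!(m)          # force inference behaviour: Dropout becomes a no-op
y = m(rand(Float32, 10))

testmode!(m, false)   # explicitly mark the model as being trained
testmode!(m, :auto)   # let Flux detect the mode from the gradient context (the default)
```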
--- docs/src/models/layers.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 5f2ab3ce..763fbf8c 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -66,6 +66,14 @@ LayerNorm GroupNorm ``` +### Testmode + +Many normalisation layers behave differently under training and inference (testing). By default, Flux will automatically determine when a layer evaluation is part of training or inference. Still, depending on your use case, it may be helpful to manually specify when these layers should be treated as being trained or not. For this, Flux provides `testmode!`. When called on a model (e.g. a layer or chain of layers), this function will place the model into the mode specified. + +```@docs +testmode! +``` + ## Cost Functions ```@docs mse From 569021a9f1f9910f7f2e9ac6869bb149b9da7023 Mon Sep 17 00:00:00 2001 From: pranjaldatta Date: Wed, 26 Feb 2020 15:05:23 +0530 Subject: [PATCH 019/113] added newlines at end of file --- src/data/housing.jl | 2 +- test/data.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/housing.jl b/src/data/housing.jl index 0d167dc0..61391304 100644 --- a/src/data/housing.jl +++ b/src/data/housing.jl @@ -133,4 +133,4 @@ function features() end -end \ No newline at end of file +end diff --git a/test/data.jl b/test/data.jl index aa913806..6c012a93 100644 --- a/test/data.jl +++ b/test/data.jl @@ -25,4 +25,4 @@ using Test @test size(Housing.features()) == (506, 13) @test Housing.targets() isa Array{Float64} -@test size(Housing.targets()) == (506, 1) \ No newline at end of file +@test size(Housing.targets()) == (506, 1) From 759fe9df2fb0a4665052383fae1b0fd8978a2f52 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Wed, 26 Feb 2020 20:27:39 +0100 Subject: [PATCH 020/113] update docs and export update! --- docs/src/training/optimisers.md | 3 ++- src/optimise/Optimise.jl | 2 +- src/optimise/train.jl | 17 +++++++++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 5e8b95de..37288b5d 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -21,7 +21,7 @@ grads = gradient(() -> loss(x, y), θ) We want to update each parameter, using the gradient, in order to improve (reduce) the loss. Here's one way to do that: ```julia -using Flux: update! +using Flux.Optimise: update! η = 0.1 # Learning Rate for p in (W, b) @@ -46,6 +46,7 @@ An optimiser `update!` accepts a parameter and a gradient, and updates the param All optimisers return an object that, when passed to `train!`, will update the parameters passed to it. ```@docs +Flux.Optimise.update! 
Descent Momentum Nesterov diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index 68c18a6f..28a1849d 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -1,6 +1,6 @@ module Optimise -export train!, +export train!, update!, SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM, InvDecay, ExpDecay, WeightDecay, stop, Optimiser diff --git a/src/optimise/train.jl b/src/optimise/train.jl index ae0f334c..59404a42 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -1,9 +1,22 @@ using Juno import Zygote: Params, gradient + +""" + update!(opt, p, g) + update!(opt, ps::Params, gs) + +Perform an update step of the parameters `ps` (or the single parameter `p`) +according to optimizer `opt` and the gradients `gs` (the gradient `g`). + +As a result, the parameters are mutated and the optimizer's internal state may change. + + update!(x, x̄) + +Update the array `x` according to `x .-= x̄`. +""" function update!(x::AbstractArray, x̄) - x .+= x̄ - return x + x .-= x̄ end function update!(opt, x, x̄) From 980ce72914abb21224a7b21e9f8c60bfbcbcfa48 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Thu, 27 Feb 2020 02:00:28 +0530 Subject: [PATCH 021/113] Added tversky and dice loss --- src/layers/stateless.jl | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index e3bdfe00..74236700 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -74,14 +74,14 @@ Computes the mean of the Huber loss between prediction ŷ and true values y. By """ function huber_loss(ŷ, y,delta=1.0) abs_error = abs.(ŷ.-y) - type_ = eltype(ŷ) - delta = type_(delta) - hub_loss =type_(0) + dtype= eltype(ŷ) + delta = dtype(delta) + hub_loss = dtype(0) for i in 1:length(y) if (abs_error[i]<=delta) - hub_loss+=abs_error[i]^2*type_(0.5) + hub_loss+=abs_error[i]^2*dtype(0.5) else - hub_loss+=delta*(abs_error[i]-type_(0.5*delta)) + hub_loss+=delta*(abs_error[i]- dtype(0.5*delta)) end return hub_loss*1//length(y) @@ -226,4 +226,29 @@ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) L2 loss function. Computes squared hinge loss over the prediction ŷ and true labels y(conatining 1 or -1) """ squared_hinge(ŷ, y) = sum((max.(0,1 .-ŷ .* y)).^2) *1//size(y,2) - + +""" + dice_coeff_loss(y_pred,y_true,smooth = 1) + +Loss function used in Image Segmentation. Calculates loss based on dice coefficient. Similar to F1_score + Dice_Coefficient(A,B) = 2*sum(|A*B|+smooth)/(sum(A^2)+sum(B^2)+ smooth) + Dice_loss = 1-Dice_Coefficient + +Ref: [V-Net: Fully Convolutional Neural Networks forVolumetric Medical Image Segmentation](https://arxiv.org/pdf/1606.04797v1.pdf) +""" +function dice_coeff_loss(y_pred,y_true,smooth=eltype(y_pred)(1.0)) + intersection = sum(y_true.*y_pred) + return 1 - (2*intersection + smooth)/(sum(y_true.^2) + sum(y_pred.^2)+smooth) +end + +""" + tversky_loss(y_pred,y_true,beta = 0.7) + +Used with imbalanced data to give more weightage to False negatives. 
Larger β weigh recall higher than precision (by placing more emphasis on false negatives) + tversky_loss(ŷ,y,beta) = 1 - sum(|y.*ŷ| + 1) / (sum(y.*ŷ + beta*(1 .- y).*ŷ + (1 .- beta)*y.*(1 .- ŷ))+ 1) +Ref: [Tversky loss function for image segmentation using 3D fully convolutional deep networks](https://arxiv.org/pdf/1706.05721.pdf) +""" +function tversky_loss(y_pred,y_true,beta = eltype(y_pred)(0.7)) + intersection = sum(y_true.*y_pred) + return 1 - (intersection+1)/(sum(y_true.*y_pred + beta*(1 .- y_true).* y_pred + (1-beta).*y_true.*(1 .- y_pred))+1) +end From 3d8965230fc45f687d943f614dacd154f6212f11 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Thu, 27 Feb 2020 02:29:39 +0530 Subject: [PATCH 022/113] Added tests for dice and Tversky loss --- test/layers/stateless.jl | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index d038bcda..b7d15634 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -89,12 +89,25 @@ const ϵ = 1e-7 @test Flux.poisson(y, y1) ≈ 1.0160455586700767 @test Flux.poisson(y, y) ≈ 0.5044459776946685 end + + y = [1.0 0.5 0.3 2.4] + y1 = [0 1.4 0.5 1.2] + @testset "dice_coeff_loss" begin + @test Flux.dice_coeff_loss(y, y1) ≈ 0.2799999999999999 + @test Flux.dice_coeff_loss(y,y) ≈ 0.0 + end + + @testset "tversky_loss" begin + @test Flux.tversky_loss(y,y1) ≈ 0.028747433264887046 + @test Flux.tversky_loss(y,y1,0.8) ≈ 0.050200803212851364 + @test Flux.tversky_loss(y,y) ≈ -0.5576923076923075 + end @testset "no spurious promotions" begin for T in (Float32, Float64) y = rand(T, 2) ŷ = rand(T, 2) - for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson,Flux.mae,Flux.huber_loss,Flux.msle,Flux.squared_hinge) + for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson,Flux.mae,Flux.huber_loss,Flux.msle,Flux.squared_hinge,Flux.dice_coeff_loss,Flux.tversky_loss) fwd, back = Flux.pullback(f, ŷ, y) @test fwd isa T @test eltype(back(one(T))[1]) == T From a121742f9c766b954f56a46e631333853e97d5ad Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 27 Feb 2020 13:56:05 +0530 Subject: [PATCH 023/113] pkg up --- Manifest.toml | 68 ++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 12986ccd..55f3e229 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -40,15 +40,15 @@ version = "2.1.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "1fce616fa0806c67c133eb1d2f68f0f1a7504665" +git-tree-sha1 = "5660775f2a3214420add960e1ff2baf46d5297cd" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "5.0.1" +version = "5.1.0" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "6e11d5c2c91fc623952e94c4fb73f9c4db74795a" +git-tree-sha1 = "e0c2805c9a7d338823c0d8f574242e284410fa61" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.7.0" +version = "2.9.1" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -74,6 +74,12 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" version = "0.2.0" +[[CompilerSupportLibraries_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "b57c5d019367c90f234a7bc7e24ff0a84971da5d" +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "0.2.0+1" + 
[[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b" @@ -87,9 +93,9 @@ version = "1.1.0" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "f784254f428fb8fd7ac15982e5862a38a44523d3" +git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.7" +version = "0.17.10" [[Dates]] deps = ["Printf"] @@ -107,9 +113,9 @@ version = "1.0.2" [[DiffRules]] deps = ["NaNMath", "Random", "SpecialFunctions"] -git-tree-sha1 = "10dca52cf6d4a62d82528262921daf63b99704a2" +git-tree-sha1 = "eb0c34204c8410888844ada5359ac8b96292cfd1" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "1.0.0" +version = "1.0.1" [[Distributed]] deps = ["Random", "Serialization", "Sockets"] @@ -123,15 +129,15 @@ version = "1.2.0" [[FFTW_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "05674f209a6e3387dd103a945b0113eeb64b1a58" +git-tree-sha1 = "ddb57f4cf125243b4aa4908c94d73a805f3cbf2c" uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a" -version = "3.3.9+3" +version = "3.3.9+4" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "fec413d4fc547992eb62a5c544cedb6d7853c1f5" +git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.4" +version = "0.8.5" [[FixedPointNumbers]] git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" @@ -140,9 +146,9 @@ version = "0.6.1" [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "840700059391d36e2498d89c2e82c08f261f2a2a" +git-tree-sha1 = "88b082d492be6b63f967b6c96b352e25ced1a34c" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.8" +version = "0.10.9" [[GPUArrays]] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] @@ -152,9 +158,9 @@ version = "2.0.1" [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] -git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306" +git-tree-sha1 = "1a4355e4b5b50be2311ebb644f34f3306dbd0410" uuid = "7869d1d1-7146-5819-86e3-90919afe41df" -version = "0.3.0" +version = "0.3.1" [[IntelOpenMP_jll]] deps = ["Libdl", "Pkg"] @@ -192,10 +198,10 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" [[MKL_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "61069ae718b8ab1e325bbfb4e5268902e7ea08e3" +deps = ["IntelOpenMP_jll", "Libdl", "Pkg"] +git-tree-sha1 = "720629cc8cbd12c146ca01b661fd1a6cf66e2ff4" uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7" -version = "2019.0.117+0" +version = "2019.0.117+2" [[MacroTools]] deps = ["DataStructures", "Markdown", "Random"] @@ -234,10 +240,10 @@ uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" version = "0.3.3" [[OpenSpecFun_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" +deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] +git-tree-sha1 = "d110040968b9afe95c6bd9c6233570b0fe8abd22" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.3+1" +version = "0.5.3+2" [[OrderedCollections]] deps = ["Random", "Serialization", "Test"] @@ -273,9 +279,9 @@ version = "0.2.0" [[Requires]] deps = ["UUIDs"] -git-tree-sha1 = "999513b7dea8ac17359ed50ae8ea089e4464e35e" +git-tree-sha1 = 
"d37400976e98018ee840e0ca4f9d20baa231dc6b" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.0.0" +version = "1.0.1" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -298,9 +304,9 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[SpecialFunctions]] deps = ["OpenSpecFun_jll"] -git-tree-sha1 = "268052ee908b2c086cc0011f528694f02f3e2408" +git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "0.9.0" +version = "0.10.0" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] @@ -349,15 +355,17 @@ version = "0.9.0" [[Zlib_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "5618a43055eb09377edca21d19d0e99bce24a9c3" +git-tree-sha1 = "fd36a6739e256527287c5444960d0266712cd49e" uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.11+7" +version = "1.2.11+8" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "74382bcc4c1e8075e14554da67d75565f8fb7827" +git-tree-sha1 = "ab2683e7670925ed73b7f076b26847683e38db8c" +repo-rev = "master" +repo-url = "https://github.com/FluxML/Zygote.jl.git" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.4.5" +version = "0.4.7" [[ZygoteRules]] deps = ["MacroTools"] From 9dce6232143cda235fe235016c332f8fe1fd939a Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Thu, 27 Feb 2020 16:26:17 +0530 Subject: [PATCH 024/113] Updated Msle loss --- src/layers/stateless.jl | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 74236700..b4e97660 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -17,33 +17,28 @@ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) """ - mean_squared_logarithmic_error(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) + msle(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) -L2 loss function. Returns the mean of the squared logarithmic errors of prediction ŷ, and true values y. The ϵ1 and ϵ2 terms provide numerical stability. +Mean Squared Logarithmic Error,an L2 loss function. Returns the mean of the squared logarithmic errors of prediction ŷ, and true values y. The ϵ1 and ϵ2 terms provide numerical stability. (Computes mean of squared(log(predicted values)-log(true value)). This error penalizes an under-predicted estimate greater than an over-predicted estimate. 
```julia - julia> y_=[14726,327378,74734] + julia> y=[14726,327378,74734] 3-element Array{Int64,1}: 14726 327378 74734 - julia> y = [12466.1,16353.95,16367.98] + julia> ŷ = [12466.1,16353.95,16367.98] 3-element Array{Float64,1}: 12466.1 16353.95 16367.98 - julia> mean_squared_logarithmic_error(y,y_) + julia> msle(ŷ,y) 3.771271382334686 ``` -Alias: - msle(ŷ,y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) - """ -mean_squared_logarithmic_error(ŷ, y;ϵ1=eps.(ŷ),ϵ2=eps.(eltype(ŷ).(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) -#Alias msle(ŷ, y;ϵ1=eps.(ŷ),ϵ2=eps.(eltype(ŷ).(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) From 35f6998be7572bb557948d3cee65797be22c9019 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 27 Feb 2020 22:19:06 +0530 Subject: [PATCH 025/113] pkg up --- Manifest.toml | 60 +++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 55f3e229..693f7ca2 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -8,15 +8,15 @@ version = "0.5.0" [[AbstractTrees]] deps = ["Markdown"] -git-tree-sha1 = "8201f932428d25a2e2903300764515754847d87d" +git-tree-sha1 = "86d092c2599f1f7bb01668bf8eb3412f98d61e47" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -version = "0.3.0" +version = "0.3.2" [[Adapt]] deps = ["LinearAlgebra"] -git-tree-sha1 = "82dab828020b872fa9efd3abec1152b075bc7cbf" +git-tree-sha1 = "c88cfc7f9c1f9f8633cddf0b56e86302b70f64c5" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "1.0.0" +version = "1.0.1" [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -34,21 +34,21 @@ version = "0.2.0" [[CUDAapi]] deps = ["Libdl", "Logging"] -git-tree-sha1 = "56a813440ac98a1aa64672ab460a1512552211a7" +git-tree-sha1 = "d7ceadd8f821177d05b897c0517e94633db535fe" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" -version = "2.1.0" +version = "3.1.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "5660775f2a3214420add960e1ff2baf46d5297cd" +git-tree-sha1 = "01e90fa34e25776bc7c8661183d4519149ebfe59" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "5.1.0" +version = "6.0.0" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "e0c2805c9a7d338823c0d8f574242e284410fa61" +git-tree-sha1 = "f86269ff60ebe082a2806ecbce51f3cadc68afe9" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.9.1" +version = "2.10.2" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -58,15 +58,15 @@ version = "0.6.0" [[ColorTypes]] deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "7b62b728a5f3dd6ee3b23910303ccf27e82fad5e" +git-tree-sha1 = "b9de8dc6106e09c79f3f776c27c62360d30e5eb8" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.8.1" +version = "0.9.1" [[Colors]] deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"] -git-tree-sha1 = "c9c1845d6bf22e34738bee65c357a69f416ed5d1" +git-tree-sha1 = "177d8b959d3c103a6d57574c38ee79c81059c31b" uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" -version = "0.9.6" +version = "0.11.2" [[CommonSubexpressions]] deps = ["Test"] @@ -82,9 +82,9 @@ version = "0.2.0+1" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b" 
+git-tree-sha1 = "7c20c5a45bb245cf248f454d26966ea70255b271" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.6.0" +version = "1.7.2" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -140,9 +140,9 @@ uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" version = "0.8.5" [[FixedPointNumbers]] -git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" +git-tree-sha1 = "4aaea64dd0c30ad79037084f8ca2b94348e65eaa" uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" -version = "0.6.1" +version = "0.7.1" [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] @@ -173,10 +173,10 @@ deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[Juno]] -deps = ["Base64", "Logging", "Media", "Profile", "Test"] -git-tree-sha1 = "30d94657a422d09cb97b6f86f04f750fa9c50df8" +deps = ["Base64", "Logging", "Media", "Profile"] +git-tree-sha1 = "4f2249fb58cfb140eeb89428e31791e2f8959d8c" uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" -version = "0.7.2" +version = "0.8.0" [[LLVM]] deps = ["CEnum", "Libdl", "Printf", "Unicode"] @@ -205,9 +205,9 @@ version = "2019.0.117+2" [[MacroTools]] deps = ["DataStructures", "Markdown", "Random"] -git-tree-sha1 = "e2fc7a55bb2224e203bbd8b59f72b91323233458" +git-tree-sha1 = "07ee65e03e28ca88bc9a338a3726ae0c3efaa94b" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.3" +version = "0.5.4" [[Markdown]] deps = ["Base64"] @@ -230,9 +230,9 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[NNlib]] deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"] -git-tree-sha1 = "135c0de4794d5e214b06f1fb4787af4a72896e61" +git-tree-sha1 = "755c0bab3912ff782167e1b4b774b833f8a0e550" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.6.2" +version = "0.6.4" [[NaNMath]] git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f" @@ -320,9 +320,9 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[StatsBase]] deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "c53e809e63fe5cf5de13632090bc3520649c9950" +git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.32.0" +version = "0.32.1" [[Test]] deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] @@ -349,9 +349,9 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [[ZipFile]] deps = ["Libdl", "Printf", "Zlib_jll"] -git-tree-sha1 = "5de8320a46812da1a8ca98b16a8a4546d44efa62" +git-tree-sha1 = "8748302cfdec02c4ae9c97b112cf10003f7f767f" uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" -version = "0.9.0" +version = "0.9.1" [[Zlib_jll]] deps = ["Libdl", "Pkg"] @@ -361,7 +361,7 @@ version = "1.2.11+8" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "ab2683e7670925ed73b7f076b26847683e38db8c" +git-tree-sha1 = "3c65158c0aa0808cdfff8bca2a36430b038aad00" repo-rev = "master" repo-url = "https://github.com/FluxML/Zygote.jl.git" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" From 8afed013458cf3f064f6be8f0e9427f49e1bade3 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Thu, 27 Feb 2020 23:23:53 +0530 Subject: [PATCH 026/113] Apply suggestions from code review Co-Authored-By: David Lung --- src/layers/stateless.jl | 2 +- 
test/layers/stateless.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index b4e97660..f05f19fc 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -79,8 +79,8 @@ function huber_loss(ŷ, y,delta=1.0) hub_loss+=delta*(abs_error[i]- dtype(0.5*delta)) end - return hub_loss*1//length(y) end + hub_loss*1//length(y) end diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index b7d15634..c09d1aae 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -18,7 +18,7 @@ const ϵ = 1e-7 end @testset "huber_loss" begin - @test Flux.huber_loss(ŷ, y) ≈ 0.0012499999999999994 + @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002 end y = [123.0,456.0,789.0] From 425fcdbe6964d581b4d5f6eda1615e883a83b5bd Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 29 Feb 2020 11:14:48 +0100 Subject: [PATCH 027/113] NNlib docs + misc docs improvements --- docs/make.jl | 3 +- docs/src/gpu.md | 4 +- docs/src/models/layers.md | 30 ++++------- docs/src/models/nnlib.md | 37 +++++++++++++ docs/src/models/regularisation.md | 4 +- src/layers/normalise.jl | 16 ++++-- src/layers/stateless.jl | 87 ++++++++++++++++++------------- 7 files changed, 115 insertions(+), 66 deletions(-) create mode 100644 docs/src/models/nnlib.md diff --git a/docs/make.jl b/docs/make.jl index b950e959..fe3544fc 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -13,7 +13,8 @@ makedocs(modules=[Flux, NNlib], ["Basics" => "models/basics.md", "Recurrence" => "models/recurrence.md", "Regularisation" => "models/regularisation.md", - "Model Reference" => "models/layers.md"], + "Model Reference" => "models/layers.md", + "NNlib" => "models/nnlib.md"], "Training Models" => ["Optimisers" => "training/optimisers.md", "Training" => "training/training.md"], diff --git a/docs/src/gpu.md b/docs/src/gpu.md index bb13fdd1..19d0c8c6 100644 --- a/docs/src/gpu.md +++ b/docs/src/gpu.md @@ -30,7 +30,7 @@ If you define a structured model, like a `Dense` layer or `Chain`, you just need ```julia d = Dense(10, 5, σ) d = fmap(cu, d) -d.W # Tracked CuArray +d.W # CuArray d(cu(rand(10))) # CuArray output m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) @@ -53,7 +53,7 @@ julia> x = rand(10) |> gpu 0.511655 julia> m(x) -Tracked 5-element CuArray{Float32,1}: +5-element CuArray{Float32,1}: -0.30535 ⋮ -0.618002 diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 5f2ab3ce..41e98f32 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -40,19 +40,6 @@ Maxout SkipConnection ``` -## Activation Functions - -Non-linearities that go between layers of your model. Most of these functions are defined in [NNlib](https://github.com/FluxML/NNlib.jl) but are available by default in Flux. - -Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on. 
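As a concrete aside to the docstring line above (not part of the diff itself): a minimal sketch of the broadcasting convention it describes, assuming only the activations Flux re-exports from NNlib.

```julia
using Flux

xs = randn(Float32, 5)

relu.(xs)    # scalar activations are applied elementwise via broadcasting
σ.(xs)       # likewise for the logistic sigmoid

softmax(xs)  # softmax is the exception: it acts on a whole vector (or each column of a matrix)
```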
- -```@docs -σ -relu -leakyrelu -elu -swish -``` ## Normalisation & Regularisation @@ -61,6 +48,7 @@ These layers don't affect the structure of the network but may improve training ```@docs BatchNorm Dropout +Flux.dropout AlphaDropout LayerNorm GroupNorm @@ -68,12 +56,12 @@ GroupNorm ## Cost Functions ```@docs -mse -crossentropy -logitcrossentropy -binarycrossentropy -logitbinarycrossentropy -kldivergence -poisson -hinge +Flux.mse +Flux.crossentropy +Flux.logitcrossentropy +Flux.binarycrossentropy +Flux.logitbinarycrossentropy +Flux.kldivergence +Flux.poisson +Flux.hinge ``` diff --git a/docs/src/models/nnlib.md b/docs/src/models/nnlib.md new file mode 100644 index 00000000..f5732574 --- /dev/null +++ b/docs/src/models/nnlib.md @@ -0,0 +1,37 @@ +## NNlib +Flux re-exports all of the functions exported by the [NNlib](https://github.com/FluxML/NNlib.jl) package. + +## Activation Functions +Non-linearities that go between layers of your model. Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on. + +```@docs +NNlib.elu +NNlib.gelu +NNlib.leakyrelu +NNlib.logcosh +NNlib.logsigmoid +NNlib.sigmoid +NNlib.relu +NNlib.selu +NNlib.softplus +NNlib.softsign +NNlib.swish +``` + +## Softmax +```@docs +NNlib.softmax +NNlib.logsoftmax +``` + +## Pooling +```@docs +NNlib.maxpool +NNlib.meanpool +``` + +## Convolution +```@docs +NNlib.conv +NNlib.depthwiseconv +``` \ No newline at end of file diff --git a/docs/src/models/regularisation.md b/docs/src/models/regularisation.md index e1d88d77..02aa3da8 100644 --- a/docs/src/models/regularisation.md +++ b/docs/src/models/regularisation.md @@ -31,7 +31,7 @@ julia> params(m) param([0.0, 0.0, 0.0, 0.0, 0.0]) julia> sum(norm, params(m)) -26.01749952921026 (tracked) +26.01749952921026 ``` Here's a larger example with a multi-layer perceptron. @@ -52,7 +52,7 @@ One can also easily add per-layer regularisation via the `activations` function: ```julia julia> using Flux: activations -julia> c = Chain(Dense(10,5,σ),Dense(5,2),softmax) +julia> c = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) Chain(Dense(10, 5, σ), Dense(5, 2), softmax) julia> activations(c, rand(10)) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index b421d3e7..2268fdc0 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -7,6 +7,16 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(s _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0) +""" + dropout(p, dims = :) + +Dropout function. For each input, either sets that input to `0` (with probability +`p`) or scales it by `1/(1-p)`. The `dims` argument is to specify the unbroadcasted +dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is +used as a regularisation, i.e. it reduces overfitting during training. + +See also [`Dropout`](@ref). +""" dropout(x, p; dims = :) = x @adjoint function dropout(x, p; dims = :) @@ -18,10 +28,7 @@ end """ Dropout(p, dims = :) -A Dropout layer. For each input, either sets that input to `0` (with probability -`p`) or scales it by `1/(1-p)`. The `dims` argument is to specified the unbroadcasted - dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is - used as a regularisation, i.e. it reduces overfitting during training. see also [`dropout`](@ref). +A Dropout layer. In the forward pass, applies the [`dropout`](@ref) function on the input. 
""" mutable struct Dropout{F,D} p::F @@ -43,6 +50,7 @@ end """ AlphaDropout(p) + A dropout layer. It is used in Self-Normalizing Neural Networks. (https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf) The AlphaDropout layer ensures that mean and variance of activations remains the same as before. diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 159a8385..5de5842b 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -1,10 +1,12 @@ -using CuArrays -using NNlib: logsoftmax, logσ - # Cost functions +""" + mse(ŷ, y) +Return the mean squared error `sum((ŷ .- y).^2) / length(y)`. +""" mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) + function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing) return -sum(y .* log.(ŷ)) * 1 // size(y, 2) end @@ -17,10 +19,26 @@ function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Abstr return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2) end +""" + crossentropy(ŷ, y; weight=1) + +Return the crossentropy computed as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`. + +See also [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref). +""" crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight) -function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) - return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2) +""" + logitcrossentropy(ŷ, y; weight=1) + +Return the crossentropy computed after a [softmax](@ref) operation: + + -sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2) + +See also [`crossentropy`](@ref), [`binarycrossentropy`](@ref). +""" +function logitcrossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) + return -sum(y .* logsoftmax(ŷ) .* weight) * 1 // size(y, 2) end """ @@ -28,11 +46,7 @@ end Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerical stability. - julia> binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0.]) - 3-element Array{Float64,1}: - 1.4244 - 0.352317 - 0.86167 +Typically, the prediction `ŷ` is given by the output of a [`sigmoid`](@ref) activation. """ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) @@ -40,44 +54,42 @@ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) """ - logitbinarycrossentropy(logŷ, y) + logitbinarycrossentropy(ŷ, y) -`logitbinarycrossentropy(logŷ, y)` is mathematically equivalent to `binarycrossentropy(σ(logŷ), y)` +`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to `binarycrossentropy(σ(ŷ), y)` but it is more numerically stable. - julia> logitbinarycrossentropy.([-1.1491, 0.8619, 0.3127], [1, 1, 0.]) - 3-element Array{Float64,1}: - 1.4244 - 0.352317 - 0.86167 +See also [`binarycrossentropy`](@ref), [`sigmoid`](@ref), [`logsigmoid`](@ref). """ -logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) +logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ) # Re-definition to fix interaction with CuArrays. -CuArrays.@cufunc logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) +CuArrays.@cufunc logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ) """ - normalise(x::AbstractArray; dims=1) + normalise(x; dims=1) Normalises `x` to mean 0 and standard deviation 1, across the dimensions given by `dims`. Defaults to normalising over columns. 
- julia> a = reshape(collect(1:9), 3, 3) - 3×3 Array{Int64,2}: - 1 4 7 - 2 5 8 - 3 6 9 +```julia-repl +julia> a = reshape(collect(1:9), 3, 3) +3×3 Array{Int64,2}: + 1 4 7 + 2 5 8 + 3 6 9 - julia> normalise(a) - 3×3 Array{Float64,2}: - -1.22474 -1.22474 -1.22474 - 0.0 0.0 0.0 - 1.22474 1.22474 1.22474 +julia> normalise(a) +3×3 Array{Float64,2}: + -1.22474 -1.22474 -1.22474 + 0.0 0.0 0.0 + 1.22474 1.22474 1.22474 - julia> normalise(a, dims=2) - 3×3 Array{Float64,2}: - -1.22474 0.0 1.22474 - -1.22474 0.0 1.22474 - -1.22474 0.0 1.22474 +julia> normalise(a, dims=2) +3×3 Array{Float64,2}: + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 +``` """ function normalise(x::AbstractArray; dims=1) μ′ = mean(x, dims = dims) @@ -87,6 +99,7 @@ end """ kldivergence(ŷ, y) + KLDivergence is a measure of how much one probability distribution is different from the other. It is always non-negative and zero only when both the distributions are equal everywhere. [KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence). @@ -99,6 +112,7 @@ end """ poisson(ŷ, y) + Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. [Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). """ @@ -106,7 +120,8 @@ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ hinge(ŷ, y) -Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). + +Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1). [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss). """ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) From 169ed6eb25e3867f23c80af972830f6e8a1361b6 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sat, 29 Feb 2020 13:43:03 +0100 Subject: [PATCH 028/113] add ecosystem --- docs/src/ecosystem.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 docs/src/ecosystem.md diff --git a/docs/src/ecosystem.md b/docs/src/ecosystem.md new file mode 100644 index 00000000..e315244d --- /dev/null +++ b/docs/src/ecosystem.md @@ -0,0 +1,18 @@ +# The Julia Ecosystem + +One of the main strengths of Julia lies in an ecosystem of packages +globally providing a rich and consistent user experience. + +This is a non-exhaustive list of Julia packages, nicely complementing `Flux` in typical +machine learning and deep learning workflows: + +- [ArgParse.jl](https://github.com/carlobaldassi/ArgParse.jl): package for parsing command-line arguments to Julia programs. +- [Augmentor.jl](https://github.com/Evizero/Augmentor.jl): a fast image augmentation library in Julia for machine learning. 
+- [BSON.jl](https://github.com/JuliaIO/BSON.jl): package for working with the Binary JSON serialisation format +- [DataFrames.jl](https://github.com/joshday/OnlineStats.jl): in-memory tabular data in Julia +- [DrWatson.jl](https://github.com/JuliaDynamics/DrWatson.jl): a scientific project assistant software +- [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl): utility package for accessing common machine learning datasets +- [OnlineStats.jl](https://github.com/joshday/OnlineStats.jl): single-pass algorithms for statistics +- [Parameters.jl](https://github.com/mauro3/Parameters.jl): types with default field values, keyword constructors and (un-)pack macros +- [ProgressMeters.jl](https://github.com/timholy/ProgressMeter.jl): progress meters for long-running computations +- [TensorBoardLogger.jl](https://github.com/PhilipVinc/TensorBoardLogger.jl): easy peasy logging to [tensorboard](https://www.tensorflow.org/tensorboard) in Julia From 4109f2e0d76bf88448b08bc45c6c4630ca25c1e7 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sat, 29 Feb 2020 13:45:17 +0100 Subject: [PATCH 029/113] cleanup --- docs/make.jl | 1 + docs/src/models/nnlib.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index fe3544fc..7f73808a 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -21,6 +21,7 @@ makedocs(modules=[Flux, NNlib], "One-Hot Encoding" => "data/onehot.md", "GPU Support" => "gpu.md", "Saving & Loading" => "saving.md", + "The Julia Ecosystem" => "ecosystem.md", "Performance Tips" => "performance.md", "Community" => "community.md"], format = Documenter.HTML(assets = ["assets/flux.css"], diff --git a/docs/src/models/nnlib.md b/docs/src/models/nnlib.md index f5732574..9e570cb3 100644 --- a/docs/src/models/nnlib.md +++ b/docs/src/models/nnlib.md @@ -1,4 +1,4 @@ -## NNlib +# NNlib Flux re-exports all of the functions exported by the [NNlib](https://github.com/FluxML/NNlib.jl) package. ## Activation Functions From 4f693e02cb210535aa19d16b4f04adf840b018c8 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sat, 29 Feb 2020 13:50:23 +0100 Subject: [PATCH 030/113] add model zoo reference --- docs/src/ecosystem.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/src/ecosystem.md b/docs/src/ecosystem.md index e315244d..0672ffe6 100644 --- a/docs/src/ecosystem.md +++ b/docs/src/ecosystem.md @@ -16,3 +16,6 @@ machine learning and deep learning workflows: - [Parameters.jl](https://github.com/mauro3/Parameters.jl): types with default field values, keyword constructors and (un-)pack macros - [ProgressMeters.jl](https://github.com/timholy/ProgressMeter.jl): progress meters for long-running computations - [TensorBoardLogger.jl](https://github.com/PhilipVinc/TensorBoardLogger.jl): easy peasy logging to [tensorboard](https://www.tensorflow.org/tensorboard) in Julia + + +This tight integration among Julia pakages is shown in some of the examples in the [model-zoo](https://github.com/FluxML/model-zoo) repository. \ No newline at end of file From b6c79b38b4bf54aba0ee096b38afd1180ad1ee55 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Wed, 26 Feb 2020 13:48:27 +0100 Subject: [PATCH 031/113] add DataLoader special case train! 
for the unsupervised data iterator --- Manifest.toml | 2 +- Project.toml | 5 +- docs/make.jl | 4 +- docs/src/data/dataloader.md | 6 +++ docs/src/training/training.md | 19 +++++-- src/Flux.jl | 1 + src/data/Data.jl | 10 ++++ src/data/dataloader.jl | 88 +++++++++++++++++++++++++++++++++ src/optimise/train.jl | 19 ++++--- test/data.jl | 93 ++++++++++++++++++++++++++++------- test/runtests.jl | 59 ++++++++++++++-------- 11 files changed, 253 insertions(+), 53 deletions(-) create mode 100644 docs/src/data/dataloader.md create mode 100644 src/data/dataloader.jl diff --git a/Manifest.toml b/Manifest.toml index 693f7ca2..788e5354 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -252,7 +252,7 @@ uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.1.0" [[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] diff --git a/Project.toml b/Project.toml index 71282a10..bd105730 100644 --- a/Project.toml +++ b/Project.toml @@ -40,7 +40,10 @@ julia = "1" [extras] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + [targets] -test = ["Test", "Documenter"] +test = ["Test", "Documenter", "IterTools", "LinearAlgebra"] diff --git a/docs/make.jl b/docs/make.jl index fe3544fc..0d597500 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,10 +15,12 @@ makedocs(modules=[Flux, NNlib], "Regularisation" => "models/regularisation.md", "Model Reference" => "models/layers.md", "NNlib" => "models/nnlib.md"], + "Handling Data" => + ["One-Hot Encoding" => "data/onehot.md", + "DataLoader" => "data/dataloader.md"], "Training Models" => ["Optimisers" => "training/optimisers.md", "Training" => "training/training.md"], - "One-Hot Encoding" => "data/onehot.md", "GPU Support" => "gpu.md", "Saving & Loading" => "saving.md", "Performance Tips" => "performance.md", diff --git a/docs/src/data/dataloader.md b/docs/src/data/dataloader.md new file mode 100644 index 00000000..70a883c9 --- /dev/null +++ b/docs/src/data/dataloader.md @@ -0,0 +1,6 @@ +# DataLoader +Flux provides the `DataLoader` type in the `Flux.Data` module to handle iteration over mini-batches of data. + +```@docs +Flux.Data.DataLoader +``` \ No newline at end of file diff --git a/docs/src/training/training.md b/docs/src/training/training.md index b42db7c9..64b2b5e8 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -7,10 +7,10 @@ To actually train a model we need four things: * A collection of data points that will be provided to the objective function. * An [optimiser](optimisers.md) that will update the model parameters appropriately. -With these we can call `Flux.train!`: +With these we can call `train!`: -```julia -Flux.train!(objective, params, data, opt) +```@docs +Flux.Optimise.train! ``` There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). @@ -56,7 +56,8 @@ data = [(x, y)] ```julia data = [(x, y), (x, y), (x, y)] # Or equivalently -data = Iterators.repeated((x, y), 3) +using IterTools: ncycle +data = ncycle([(x, y)], 3) ``` It's common to load the `x`s and `y`s separately. 
In this case you can use `zip`: @@ -67,6 +68,14 @@ ys = [rand( 10), rand( 10), rand( 10)] data = zip(xs, ys) ``` +Training data can be conveniently partitioned for mini-batch training using the [`Flux.Data.DataLoader`](@ref) type: + +```julia +X = rand(28, 28, 60000) +Y = rand(0:9, 60000) +data = DataLoader(X, Y, batchsize=128) +``` + Note that, by default, `train!` only loops over the data once (a single "epoch"). A convenient way to run multiple epochs from the REPL is provided by `@epochs`. @@ -120,7 +129,7 @@ An example follows that works similar to the default `Flux.train` but with no ca You don't need callbacks if you just code the calls to your functions directly into the loop. E.g. in the places marked with comments. -``` +```julia function my_custom_train!(loss, ps, data, opt) ps = Params(ps) for d in data diff --git a/src/Flux.jl b/src/Flux.jl index 9969b323..c99e41a1 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -7,6 +7,7 @@ using Zygote, MacroTools, Juno, Reexport, Statistics, Random using MacroTools: @forward @reexport using NNlib using Zygote: Params, @adjoint, gradient, pullback, @nograd + export gradient export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, diff --git a/src/data/Data.jl b/src/data/Data.jl index 88af9549..940b7ea7 100644 --- a/src/data/Data.jl +++ b/src/data/Data.jl @@ -3,6 +3,9 @@ module Data import ..Flux import SHA +using Random: shuffle! +using Base: @propagate_inbounds + export CMUDict, cmudict deps(path...) = joinpath(@__DIR__, "..", "..", "deps", path...) @@ -26,6 +29,9 @@ function __init__() mkpath(deps()) end +include("dataloader.jl") +export DataLoader + include("mnist.jl") export MNIST @@ -42,7 +48,11 @@ using .Sentiment include("iris.jl") export Iris +<<<<<<< HEAD include("housing.jl") export Housing end +======= +end #module +>>>>>>> af20a785... add DataLoader diff --git a/src/data/dataloader.jl b/src/data/dataloader.jl new file mode 100644 index 00000000..baf32a83 --- /dev/null +++ b/src/data/dataloader.jl @@ -0,0 +1,88 @@ +# Adapted from Knet's src/data.jl (author: Deniz Yuret) + +struct DataLoader + data + batchsize::Int + nobs::Int + partial::Bool + imax::Int + indices::Vector{Int} + shuffle::Bool +end + +""" + DataLoader(data...; batchsize=1, shuffle=false, partial=true) + +An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations +(except possibly the last one). + +Takes as input one or more data tensors, e.g. X in unsupervised learning, X and Y in +supervised learning. The last dimension in each tensor is considered to be the observation +dimension. + +If `shuffle=true`, shuffles the observations each time iterations are re-started. +If `partial=false`, drops the last mini-batch if it is smaller than the batchsize. + +Example usage: + + Xtrain = rand(10, 100) + dtrain = DataLoader(Xtrain, batchsize=2) + # iterate over 50 mini-batches + for x in dtrain: + @assert size(x) == (10, 2) + ... + end + + Xtrain = rand(10, 100) + Ytrain = rand(100) + dtrain = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true) + for epoch in 1:100 + for (x, y) in dtrain: + @assert size(x) == (10, 2) + @assert size(y) == (2,) + ... 
+ end + end + + # train for 10 epochs + using IterTools: ncycle + Flux.train!(loss, ps, ncycle(dtrain, 10), opt) +""" +function DataLoader(data...; batchsize=1, shuffle=false, partial=true) + length(data) > 0 || throw(ArgumentError("Need at least one data input")) + batchsize > 0 || throw(ArgumentError("Need positive batchsize")) + + nx = size(data[1])[end] + for i=2:length(data) + nx != size(data[i])[end] && throw(DimensionMismatch("All data should contain same number of observations")) + end + if nx < batchsize + @warn "Number of data points less than batchsize, decreasing the batchsize to $nx" + batchsize = nx + end + imax = partial ? nx : nx - batchsize + 1 + ids = 1:min(nx, batchsize) + DataLoader(data, batchsize, nx, partial, imax, [1:nx;], shuffle) +end + +getdata(x::AbstractArray, ids) = x[(Base.Colon() for _=1:ndims(x)-1)..., ids] + +@propagate_inbounds function Base.iterate(d::DataLoader, i=0) # returns data in d.indices[i+1:i+batchsize] + i >= d.imax && return nothing + if d.shuffle && i == 0 + shuffle!(d.indices) + end + nexti = min(i + d.batchsize, d.nobs) + ids = d.indices[i+1:nexti] + if length(d.data) == 1 + batch = getdata(d.data[1], ids) + else + batch = ((getdata(x, ids) for x in d.data)...,) + end + return (batch, nexti) +end + +function Base.length(d::DataLoader) + n = d.nobs / d.batchsize + d.partial ? ceil(Int,n) : floor(Int,n) +end \ No newline at end of file diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 59404a42..34a98394 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -61,13 +61,14 @@ end For each datapoint `d` in `data` computes the gradient of `loss(d...)` through backpropagation and calls the optimizer `opt`. +In case datapoints `d` are of array type, assumes no splatting is needed +and computes the gradient of `loss(d)`. + Takes a callback as keyword argument `cb`. For example, this will print "training" every 10 seconds: -```julia -Flux.train!(loss, params, data, opt, - cb = throttle(() -> println("training"), 10)) -``` + train!(loss, params, data, opt, + cb = throttle(() -> println("training"), 10)) The callback can call `Flux.stop()` to interrupt the training loop. @@ -78,8 +79,14 @@ function train!(loss, ps, data, opt; cb = () -> ()) cb = runall(cb) @progress for d in data try - gs = gradient(ps) do - loss(d...) + if d isa AbstractArray + gs = gradient(ps) do + loss(d) + end + else + gs = gradient(ps) do + loss(d...) 
+ end end update!(opt, ps, gs) cb() diff --git a/test/data.jl b/test/data.jl index 6c012a93..1a090174 100644 --- a/test/data.jl +++ b/test/data.jl @@ -1,28 +1,85 @@ -using Flux.Data -using Test +@testset "DataLoader" begin + X = reshape([1:10;], (2, 5)) + Y = [1:5;] -@test cmudict()["CATASTROPHE"] == :[K,AH0,T,AE1,S,T,R,AH0,F,IY0].args + d = DataLoader(X, batchsize=2) + batches = collect(d) + @test length(batches) == 3 + @test batches[1] == X[:,1:2] + @test batches[2] == X[:,3:4] + @test batches[3] == X[:,5:5] -@test length(CMUDict.phones()) == 39 + d = DataLoader(X, batchsize=2, partial=false) + batches = collect(d) + @test length(batches) == 2 + @test batches[1] == X[:,1:2] + @test batches[2] == X[:,3:4] -@test length(CMUDict.symbols()) == 84 + d = DataLoader(X, Y, batchsize=2) + batches = collect(d) + @test length(batches) == 3 + @test length(batches[1]) == 2 + @test length(batches[2]) == 2 + @test length(batches[3]) == 2 + @test batches[1][1] == X[:,1:2] + @test batches[1][2] == Y[1:2] + @test batches[2][1] == X[:,3:4] + @test batches[2][2] == Y[3:4] + @test batches[3][1] == X[:,5:5] + @test batches[3][2] == Y[5:5] -@test MNIST.images()[1] isa Matrix -@test MNIST.labels() isa Vector{Int64} + # test interaction with `train!` + θ = ones(2) + X = zeros(2, 10) + loss(x) = sum((x .- θ).^2) + d = DataLoader(X) + Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1)) + @test norm(θ) < 1e-4 -@test FashionMNIST.images()[1] isa Matrix -@test FashionMNIST.labels() isa Vector{Int64} + # test interaction with `train!` + θ = zeros(2) + X = ones(2, 10) + Y = fill(2, 10) + loss(x, y) = sum((y - x'*θ).^2) + d = DataLoader(X, Y) + Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1)) + @test norm(θ .- 1) < 1e-10 +end -@test Data.Sentiment.train() isa Vector{Data.Tree{Any}} +@testset "CMUDict" begin + @test cmudict()["CATASTROPHE"] == :[K,AH0,T,AE1,S,T,R,AH0,F,IY0].args -@test Iris.features() isa Matrix -@test size(Iris.features()) == (4,150) + @test length(CMUDict.phones()) == 39 -@test Iris.labels() isa Vector{String} -@test size(Iris.labels()) == (150,) + @test length(CMUDict.symbols()) == 84 +end -@test Housing.features() isa Matrix -@test size(Housing.features()) == (506, 13) +@testset "MNIST" begin + @test MNIST.images()[1] isa Matrix + @test MNIST.labels() isa Vector{Int64} +end -@test Housing.targets() isa Array{Float64} -@test size(Housing.targets()) == (506, 1) +@testset "FashionMNIST" begin + @test FashionMNIST.images()[1] isa Matrix + @test FashionMNIST.labels() isa Vector{Int64} +end + +@testset "Sentiment" begin + @test Data.Sentiment.train() isa Vector{Data.Tree{Any}} +end + +@testset "Iris" begin + @test Iris.features() isa Matrix + @test size(Iris.features()) == (4,150) + + @test Iris.labels() isa Vector{String} + @test size(Iris.labels()) == (150,) +end + +@testest "Housing" begin + @test Housing.features() isa Matrix + @test size(Housing.features()) == (506, 13) + + @test Housing.targets() isa Array{Float64} + @test size(Housing.targets()) == (506, 1) +end diff --git a/test/runtests.jl b/test/runtests.jl index 1505e96a..81182f0d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,32 +1,49 @@ -using Flux, Test, Random, Statistics, Documenter -using Random +using Flux +using Flux.Data +using Test +using Random, Statistics, LinearAlgebra +using Documenter +using IterTools: ncycle Random.seed!(0) @testset "Flux" begin -@info "Testing Basics" + @testset "Utils" begin + include("utils.jl") + end -include("utils.jl") -include("onehot.jl") -include("optimise.jl") -include("data.jl") + 
@testset "Onehot" begin + include("onehot.jl") + end -@info "Testing Layers" + @testset "Optimise" begin + include("optimise.jl") + end -include("layers/basic.jl") -include("layers/normalisation.jl") -include("layers/stateless.jl") -include("layers/conv.jl") + @testset "Data" begin + include("data.jl") + end -if Flux.use_cuda[] - include("cuda/cuda.jl") -else - @warn "CUDA unavailable, not testing GPU support" -end + @testset "Layers" begin + include("layers/basic.jl") + include("layers/normalisation.jl") + include("layers/stateless.jl") + include("layers/conv.jl") + end -if VERSION >= v"1.2" - doctest(Flux) -end + @testset "CUDA" begin + if Flux.use_cuda[] + include("cuda/cuda.jl") + else + @warn "CUDA unavailable, not testing GPU support" + end + end -end + @testset "Docs" begin + if VERSION >= v"1.2" + doctest(Flux) + end + end + +end # testset Flux From 487002878ed530303cf9527e7cca0ea57b34d5b2 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Thu, 27 Feb 2020 20:49:05 +0100 Subject: [PATCH 032/113] restrict train! special casing --- src/optimise/train.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 34a98394..54b7f53a 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -79,7 +79,7 @@ function train!(loss, ps, data, opt; cb = () -> ()) cb = runall(cb) @progress for d in data try - if d isa AbstractArray + if d isa AbstractArray{<:Number} gs = gradient(ps) do loss(d) end From 97141e8c98fc94feadbe287f45a32b58bd3d515c Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Thu, 27 Feb 2020 20:49:55 +0100 Subject: [PATCH 033/113] improve docstring --- src/optimise/train.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 54b7f53a..79ebcc06 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -61,7 +61,7 @@ end For each datapoint `d` in `data` computes the gradient of `loss(d...)` through backpropagation and calls the optimizer `opt`. -In case datapoints `d` are of array type, assumes no splatting is needed +In case datapoints `d` are of numeric array type, assumes no splatting is needed and computes the gradient of `loss(d)`. Takes a callback as keyword argument `cb`. For example, this will print "training" From a72258ea2a428ce4b12e711395856091f17f9fcc Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sat, 29 Feb 2020 18:55:49 +0100 Subject: [PATCH 034/113] fix rebase --- src/data/Data.jl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/data/Data.jl b/src/data/Data.jl index 940b7ea7..16a025a7 100644 --- a/src/data/Data.jl +++ b/src/data/Data.jl @@ -48,11 +48,7 @@ using .Sentiment include("iris.jl") export Iris -<<<<<<< HEAD include("housing.jl") export Housing end -======= -end #module ->>>>>>> af20a785... 
add DataLoader From a1efc434c21d2e4026e5d4f8764854451bac88c5 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sat, 29 Feb 2020 19:40:44 +0100 Subject: [PATCH 035/113] fix typo --- test/data.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data.jl b/test/data.jl index 1a090174..c7a8fdfd 100644 --- a/test/data.jl +++ b/test/data.jl @@ -76,7 +76,7 @@ end @test size(Iris.labels()) == (150,) end -@testest "Housing" begin +@testset "Housing" begin @test Housing.features() isa Matrix @test size(Housing.features()) == (506, 13) From 5cbd2cecf29cf58a4e4bd97e637515c299a522d8 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 29 Feb 2020 16:09:59 -0600 Subject: [PATCH 036/113] Changed testmode! to return model --- src/functor.jl | 2 +- src/layers/basic.jl | 2 +- src/layers/normalise.jl | 10 +++++----- test/layers/normalisation.jl | 16 ++++++++-------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index 4edfbd98..ee384b98 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -50,7 +50,7 @@ Possible values include: - `true` for testing - `:auto` or `nothing` for Flux to detect the mode automatically """ -testmode!(m, mode) = nothing +testmode!(m, mode = true) = m params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 6788f761..10d1f07b 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -33,7 +33,7 @@ applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) -testmode!(m::Chain, mode = true) = map(x -> testmode!(x, mode), m.layers) +testmode!(m::Chain, mode = true) = (map(x -> testmode!(x, mode), m.layers); m) function Base.show(io::IO, c::Chain) print(io, "Chain(") diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7b438bc2..36c6d2bd 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -44,7 +44,7 @@ function (a::Dropout)(x) end testmode!(m::Dropout, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) function Base.show(io::IO, d::Dropout) print(io, "Dropout(", d.p) @@ -83,7 +83,7 @@ function (a::AlphaDropout)(x) end testmode!(m::AlphaDropout, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) """ LayerNorm(h::Integer) @@ -191,7 +191,7 @@ end @functor BatchNorm testmode!(m::BatchNorm, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) function Base.show(io::IO, l::BatchNorm) print(io, "BatchNorm($(join(size(l.β), ", "))") @@ -290,7 +290,7 @@ end @functor InstanceNorm testmode!(m::InstanceNorm, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) function Base.show(io::IO, l::InstanceNorm) print(io, "InstanceNorm($(join(size(l.β), ", "))") @@ -393,7 +393,7 @@ end @functor GroupNorm testmode!(m::GroupNorm, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + (m.active = (isnothing(mode) || mode == :auto) ? 
nothing : !mode; m) function Base.show(io::IO, l::GroupNorm) print(io, "GroupNorm($(join(size(l.β), ", "))") diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 594fb586..79bd9c77 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -85,19 +85,19 @@ end @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7) end - let m = trainmode(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1) + let m = testmode!(BatchNorm(2), false), x = reshape(Float32.(1:6), 3, 2, 1) y = reshape(permutedims(x, [2, 1, 3]), 2, :) y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3]) @test m(x) == y end - let m = trainmode(BatchNorm(2)), x = reshape(Float32.(1:12), 2, 3, 2, 1) + let m = testmode!(BatchNorm(2), false), x = reshape(Float32.(1:12), 2, 3, 2, 1) y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :) y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4]) @test m(x) == y end - let m = trainmode(BatchNorm(2)), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1) + let m = testmode!(BatchNorm(2), false), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1) y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :) y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5]) @test m(x) == y @@ -165,7 +165,7 @@ end @test isapprox(y, sigmoid.((x .- expand_inst(m.μ, affine_shape)) ./ sqrt.(expand_inst(m.σ², affine_shape) .+ m.ϵ)), atol = 1.0e-7) end - let m = trainmode(InstanceNorm(2)), sizes = (2, 4, 1, 2, 3), + let m = testmode!(InstanceNorm(2), false), sizes = (2, 4, 1, 2, 3), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) y = reshape(m(y), sizes...) @@ -182,7 +182,7 @@ end end # show that instance norm is equal to batch norm when channel and batch dims are squashed - let m_inorm = trainmode(InstanceNorm(2)), m_bnorm = trainmode(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6), + let m_inorm = testmode!(InstanceNorm(2), false), m_bnorm = testmode!(BatchNorm(12), false), sizes = (5, 5, 3, 4, 2, 6), x = reshape(Float32.(collect(1:prod(sizes))), sizes) @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:end - 2]..., :, 1))), sizes) end @@ -266,7 +266,7 @@ if VERSION >= v"1.1" @test isapprox(y, out, atol = 1.0e-7) end - let m = trainmode(GroupNorm(2,2)), sizes = (2, 4, 1, 2, 3), + let m = testmode!(GroupNorm(2,2), false), sizes = (2, 4, 1, 2, 3), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) y = reshape(m(y), sizes...) 
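The hunks above and below both lean on `testmode!` now returning the layer it was called on, so a layer can be forced into a given mode inline at construction time. A minimal sketch of that pattern (shapes and sizes below are arbitrary, chosen only for illustration):

```julia
using Flux

x = rand(Float32, 2, 5)             # 2 features, batch of 5

# `testmode!` returns its argument, so the call can be chained:
m = testmode!(BatchNorm(2), false)  # `false` = "not test mode", i.e. use per-batch statistics
y = m(x)

testmode!(m)                        # back to inference behaviour (running statistics)
```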
@@ -283,13 +283,13 @@ if VERSION >= v"1.1" end # show that group norm is the same as instance norm when the group size is the same as the number of channels - let IN = trainmode(InstanceNorm(4)), GN = trainmode(GroupNorm(4,4)), sizes = (2,2,3,4,5), + let IN = testmode!(InstanceNorm(4), false), GN = testmode!(GroupNorm(4,4), false), sizes = (2,2,3,4,5), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test IN(x) ≈ GN(x) end # show that group norm is the same as batch norm for a group of size 1 and batch of size 1 - let BN = trainmode(BatchNorm(4)), GN = trainmode(GroupNorm(4,4)), sizes = (2,2,3,4,1), + let BN = testmode!(BatchNorm(4), false), GN = testmode!(GroupNorm(4,4), false), sizes = (2,2,3,4,1), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test BN(x) ≈ GN(x) end From 568ecb1c979a6b05e379d13c2ed2d6ed45f2a71b Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 29 Feb 2020 16:25:18 -0600 Subject: [PATCH 037/113] Removed trainmode from tests --- test/layers/normalisation.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 79bd9c77..f9d4849a 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -2,7 +2,6 @@ using Flux, Test, Statistics using Zygote: pullback evalwgrad(f, x...) = pullback(f, x...)[1] -trainmode(f) = (testmode!(f, false); f) @testset "Dropout" begin x = [1.,2.,3.] From 08dabce57e41a23f060ed019f84b32b962afeac6 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Sun, 1 Mar 2020 12:00:11 +0530 Subject: [PATCH 038/113] Updated loss function docs --- src/layers/stateless.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 9b6db037..592e2fa1 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -29,7 +29,7 @@ msle(ŷ, y;ϵ1=eps.(ŷ),ϵ2=eps.(eltype(ŷ).(y))) = sum((log.(ŷ+ϵ1).-log.( huber_loss(ŷ, y,delta=1.0) Computes the mean of the Huber loss. By default, delta is set to 1.0. - | 0.5*|(ŷ-y)|, for |ŷ-y| Date: Sun, 1 Mar 2020 15:07:12 +0100 Subject: [PATCH 039/113] fix a few typos in docstrings --- docs/src/training/optimisers.md | 8 ++++---- src/optimise/optimisers.jl | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 37288b5d..1ee526b3 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -62,7 +62,7 @@ ADAMW ## Optimiser Interface -Flux's optimsers are built around a `struct` that holds all the optimiser parameters along with a definition of how to apply the update rule associated with it. We do this via the `apply!` function which takes the optimiser as the first argument followed by the parameter and its corresponding gradient. +Flux's optimisers are built around a `struct` that holds all the optimiser parameters along with a definition of how to apply the update rule associated with it. We do this via the `apply!` function which takes the optimiser as the first argument followed by the parameter and its corresponding gradient. In this manner Flux also allows one to create custom optimisers to be used seamlessly. Let's work this with a simple example. @@ -100,15 +100,15 @@ Flux internally calls on this function via the `update!` function. It shares the ## Composing Optimisers -Flux defines a special kind of optimiser called simply as `Optimiser` which takes in a arbitrary optimisers as input. 
Its behaviour is similar to the usual optimisers, but differs in that it acts by calling the optimisers listed in it sequentially. Each optimiser produces a modified gradient +Flux defines a special kind of optimiser simply called `Optimiser` which takes in arbitrary optimisers as input. Its behaviour is similar to the usual optimisers, but differs in that it acts by calling the optimisers listed in it sequentially. Each optimiser produces a modified gradient that will be fed into the next, and the resultant update will be applied to the parameter as usual. A classic use case is where adding decays is desirable. Flux defines some basic decays including `ExpDecay`, `InvDecay` etc. ```julia opt = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), Descent()) ``` -Here we apply exponential decay to the `Descent` optimser. The defaults of `ExpDecay` say that its learning rate will be decayed every 1000 steps. -It is then applied like any optimser. +Here we apply exponential decay to the `Descent` optimiser. The defaults of `ExpDecay` say that its learning rate will be decayed every 1000 steps. +It is then applied like any optimiser. ```julia w = randn(10, 10) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index cf4496f4..212b876e 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -77,7 +77,7 @@ Gradient descent with learning rate `η` and Nesterov momentum `ρ`. ## Parameters - Learning Rate (η): Amount by which the gradients are dicsounted berfore updating the weights. Defaults to `0.001`. - - Nesterov Momentum (ρ): Paramters controlling the amount of nesterov momentum to be applied. Defaults to `0.9`. + - Nesterov Momentum (ρ): Parameters controlling the amount of nesterov momentum to be applied. Defaults to `0.9`. ## Examples ```julia @@ -105,7 +105,7 @@ end """ RMSProp(η, ρ) -Implements the RMSProp algortihm. Often a good choice for recurrent networks. Paramters other than learning rate generally don't need tuning. +Implements the RMSProp algortihm. Often a good choice for recurrent networks. Parameters other than learning rate generally don't need tuning. ## Parameters - Learning Rate (η): Defaults to `0.001`. From 32e0aa9fcb2812b1aca279d5466a2d3c8a6264f4 Mon Sep 17 00:00:00 2001 From: Martijn Visser Date: Sun, 1 Mar 2020 15:15:39 +0100 Subject: [PATCH 040/113] docstring ensure signature code formatting by using a four space indent instead of two --- src/data/dataloader.jl | 2 +- src/data/iris.jl | 2 -- src/optimise/optimisers.jl | 8 ++++---- src/optimise/train.jl | 4 ++-- src/utils.jl | 2 +- 5 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/data/dataloader.jl b/src/data/dataloader.jl index baf32a83..8868a9b0 100644 --- a/src/data/dataloader.jl +++ b/src/data/dataloader.jl @@ -11,7 +11,7 @@ struct DataLoader end """ - DataLoader(data...; batchsize=1, shuffle=false, partial=true) + DataLoader(data...; batchsize=1, shuffle=false, partial=true) An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations (except possibly the last one). diff --git a/src/data/iris.jl b/src/data/iris.jl index d78606d8..f74e0709 100644 --- a/src/data/iris.jl +++ b/src/data/iris.jl @@ -28,7 +28,6 @@ function load() end """ - labels() Get the labels of the iris dataset, a 150 element array of strings listing the @@ -53,7 +52,6 @@ function labels() end """ - features() Get the features of the iris dataset. 
This is a 4x150 matrix of Float64 diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index cf4496f4..75ba8618 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -6,7 +6,7 @@ const ϵ = 1e-8 # TODO: should use weak refs """ - Descent(η) + Descent(η) Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` @@ -441,7 +441,7 @@ function apply!(o::Optimiser, x, Δ) end """ - InvDecay(γ) + InvDecay(γ) Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified. ``` @@ -470,7 +470,7 @@ function apply!(o::InvDecay, x, Δ) end """ - ExpDecay(eta, decay, decay_step, clip) + ExpDecay(eta, decay, decay_step, clip) Discount the learning rate `eta` by a multiplicative factor `decay` every `decay_step` till a minimum of `clip`. @@ -509,7 +509,7 @@ function apply!(o::ExpDecay, x, Δ) end """ - WeightDecay(wd) + WeightDecay(wd) Decays the weight by `wd` diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 79ebcc06..e12ab27b 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -3,8 +3,8 @@ import Zygote: Params, gradient """ - update!(opt, p, g) - update!(opt, ps::Params, gs) + update!(opt, p, g) + update!(opt, ps::Params, gs) Perform an update step of the parameters `ps` (or the single parameter `p`) according to optimizer `opt` and the gradients `gs` (the gradient `g`). diff --git a/src/utils.jl b/src/utils.jl index 2dba21c7..f483c5d9 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -60,7 +60,7 @@ head(x::Tuple) = reverse(Base.tail(reverse(x))) squeezebatch(x) = reshape(x, head(size(x))) """ - batch(xs) + batch(xs) Batch the arrays in `xs` into a single array. From f4365dab94e6cc2f46e7604f5ba1de311617db28 Mon Sep 17 00:00:00 2001 From: Martijn Visser Date: Sun, 1 Mar 2020 15:19:22 +0100 Subject: [PATCH 041/113] fix docstring example indentation as well --- src/optimise/optimisers.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 75ba8618..c8e00126 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -451,7 +451,7 @@ Applies inverse time decay to an optimiser, i.e., the effective step size at ite ## Example ```julia - Optimiser(InvDecay(..), Opt(..)) +Optimiser(InvDecay(..), Opt(..)) ``` """ mutable struct InvDecay @@ -483,9 +483,8 @@ Discount the learning rate `eta` by a multiplicative factor `decay` every `decay ## Example To apply exponential decay to an optimiser: ```julia - Optimiser(ExpDecay(..), Opt(..)) - - opt = Optimiser(ExpDecay(), ADAM()) +Optimiser(ExpDecay(..), Opt(..)) +opt = Optimiser(ExpDecay(), ADAM()) ``` """ mutable struct ExpDecay From d67a2e40b3039830c68253e973d292257e00537a Mon Sep 17 00:00:00 2001 From: Martijn Visser Date: Sun, 1 Mar 2020 15:20:40 +0100 Subject: [PATCH 042/113] remove stray code block start from docstring --- src/optimise/optimisers.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index c8e00126..f853ac23 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -444,7 +444,6 @@ end InvDecay(γ) Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified. 
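To make the schedule in the line above concrete, here is a small sketch of wrapping a base optimiser together with `InvDecay` in an `Optimiser` chain; the explicit imports and the numbers are illustrative, not prescriptive.

```julia
using Flux
using Flux.Optimise: Optimiser, InvDecay, Descent

W = rand(3, 3)
g = ones(3, 3)                                 # stand-in gradient

# Effective step size at iteration n is 0.1 / (1 + 0.001 * n):
opt = Optimiser(InvDecay(0.001), Descent(0.1))

for n in 1:5
    Flux.Optimise.update!(opt, W, copy(g))     # pass a copy: the chain scales the gradient in place
end
```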
-``` ## Parameters - gamma (γ): Defaults to `0.001` From c001d0f3c5cf8613cac2be67821cc6d0561280a4 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 1 Mar 2020 12:30:41 -0600 Subject: [PATCH 043/113] Added trainmode! and updated docs with warning --- docs/src/models/layers.md | 1 + src/Flux.jl | 2 +- src/functor.jl | 21 ++++++++++++++++++++- test/layers/normalisation.jl | 16 ++++++++-------- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 763fbf8c..100cee4d 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -72,6 +72,7 @@ Many normalisation layers behave differently under training and inference (testi ```@docs testmode! +trainmode! ``` ## Cost Functions diff --git a/src/Flux.jl b/src/Flux.jl index 5f9878f3..163fcdf2 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -11,7 +11,7 @@ export gradient export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, - SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode! + SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode! include("optimise/Optimise.jl") using .Optimise diff --git a/src/functor.jl b/src/functor.jl index ee384b98..fce730b1 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -40,11 +40,14 @@ end trainable(m) = functor(m)[1] """ - testmode!(m, mode = true) + testmode!(m, mode = true) Set a layer or model's test mode (see below). Using `:auto` mode will treat any gradient computation as training. +_Note_: if you manually set a model into test mode, you need to manually place +it back into train mode. + Possible values include: - `false` for training - `true` for testing @@ -52,6 +55,22 @@ Possible values include: """ testmode!(m, mode = true) = m +""" + trainmode!(m, mode = true) + +Set a layer of model's train mode (see below). +Symmetric to [`testmode`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)). + +_Note_: if you manually set a model into train mode, you need to manually place +it into test mode. + +Possible values include: +- `true` for training +- `false` for testing +- `:auto` or `nothing` for Flux to detect the mode automatically +""" +trainmode!(m, mode = true) = mode isa Bool ? 
testmode!(m, !mode) : testmode!(m, mode) + params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x) function params!(p::Params, x, seen = IdSet()) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index f9d4849a..ed2879b0 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -84,19 +84,19 @@ end @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7) end - let m = testmode!(BatchNorm(2), false), x = reshape(Float32.(1:6), 3, 2, 1) + let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1) y = reshape(permutedims(x, [2, 1, 3]), 2, :) y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3]) @test m(x) == y end - let m = testmode!(BatchNorm(2), false), x = reshape(Float32.(1:12), 2, 3, 2, 1) + let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:12), 2, 3, 2, 1) y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :) y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4]) @test m(x) == y end - let m = testmode!(BatchNorm(2), false), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1) + let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1) y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :) y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5]) @test m(x) == y @@ -164,7 +164,7 @@ end @test isapprox(y, sigmoid.((x .- expand_inst(m.μ, affine_shape)) ./ sqrt.(expand_inst(m.σ², affine_shape) .+ m.ϵ)), atol = 1.0e-7) end - let m = testmode!(InstanceNorm(2), false), sizes = (2, 4, 1, 2, 3), + let m = trainmode!(InstanceNorm(2)), sizes = (2, 4, 1, 2, 3), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) y = reshape(m(y), sizes...) @@ -181,7 +181,7 @@ end end # show that instance norm is equal to batch norm when channel and batch dims are squashed - let m_inorm = testmode!(InstanceNorm(2), false), m_bnorm = testmode!(BatchNorm(12), false), sizes = (5, 5, 3, 4, 2, 6), + let m_inorm = trainmode!(InstanceNorm(2)), m_bnorm = trainmode!(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6), x = reshape(Float32.(collect(1:prod(sizes))), sizes) @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:end - 2]..., :, 1))), sizes) end @@ -265,7 +265,7 @@ if VERSION >= v"1.1" @test isapprox(y, out, atol = 1.0e-7) end - let m = testmode!(GroupNorm(2,2), false), sizes = (2, 4, 1, 2, 3), + let m = trainmode!(GroupNorm(2,2)), sizes = (2, 4, 1, 2, 3), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) y = reshape(m(y), sizes...) 
@@ -282,13 +282,13 @@ if VERSION >= v"1.1" end # show that group norm is the same as instance norm when the group size is the same as the number of channels - let IN = testmode!(InstanceNorm(4), false), GN = testmode!(GroupNorm(4,4), false), sizes = (2,2,3,4,5), + let IN = trainmode!(InstanceNorm(4)), GN = trainmode!(GroupNorm(4,4)), sizes = (2,2,3,4,5), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test IN(x) ≈ GN(x) end # show that group norm is the same as batch norm for a group of size 1 and batch of size 1 - let BN = testmode!(BatchNorm(4), false), GN = testmode!(GroupNorm(4,4), false), sizes = (2,2,3,4,1), + let BN = trainmode!(BatchNorm(4)), GN = trainmode!(GroupNorm(4,4)), sizes = (2,2,3,4,1), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test BN(x) ≈ GN(x) end From 35e460b044d47433999c5719111ff1b14138fef2 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 1 Mar 2020 12:44:36 -0600 Subject: [PATCH 044/113] Fixed broken @ref in docstring --- src/functor.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/functor.jl b/src/functor.jl index fce730b1..ba8c9212 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -59,7 +59,7 @@ testmode!(m, mode = true) = m trainmode!(m, mode = true) Set a layer of model's train mode (see below). -Symmetric to [`testmode`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)). +Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)). _Note_: if you manually set a model into train mode, you need to manually place it into test mode. From 23f791e32b6176500d0a48af1afe90b4f8a7958c Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 1 Mar 2020 12:49:30 -0600 Subject: [PATCH 045/113] Add "during X phase" phrasing to testmode!/trainmode! docstring. --- src/functor.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index ba8c9212..0d7c55f1 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -46,7 +46,7 @@ Set a layer or model's test mode (see below). Using `:auto` mode will treat any gradient computation as training. _Note_: if you manually set a model into test mode, you need to manually place -it back into train mode. +it back into train mode during training phase. Possible values include: - `false` for training @@ -62,7 +62,7 @@ Set a layer of model's train mode (see below). Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)). _Note_: if you manually set a model into train mode, you need to manually place -it into test mode. +it into test mode during testing phase. 
Possible values include: - `true` for training From 88cad1c5e7fb1d16702bff72444a3b91c7bb9469 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 1 Mar 2020 12:50:49 -0600 Subject: [PATCH 046/113] Bump minor version to v0.10.3 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index bd105730..f88d2451 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.10.2" +version = "0.10.3" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" From e49d9c4537714441730f4023b12b168916246137 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 1 Mar 2020 13:11:07 -0600 Subject: [PATCH 047/113] Debump version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f88d2451..bd105730 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.10.3" +version = "0.10.2" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" From 9b2f4919ee8d6a4c9d8254b4786d7e1d6e64ceec Mon Sep 17 00:00:00 2001 From: Ian Date: Sun, 1 Mar 2020 19:33:16 -0500 Subject: [PATCH 048/113] includ cuda/cuda.jl during precompile, even if cuda isn't detected --- src/Flux.jl | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index 78670e65..8fb0f406 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -39,24 +39,14 @@ include("data/Data.jl") include("deprecations.jl") +include(joinpath(@__DIR__, "cuda/cuda.jl")) + + function __init__() - precompiling = ccall(:jl_generating_output, Cint, ()) != 0 - - # we don't want to include the CUDA module when precompiling, - # or we could end up replacing it at run time (triggering a warning) - precompiling && return - - if !CuArrays.functional() - # nothing to do here, and either CuArrays or one of its dependencies will have warned - else - use_cuda[] = true - - # FIXME: this functionality should be conditional at run time by checking `use_cuda` - # (or even better, get moved to CuArrays.jl as much as possible) - if CuArrays.has_cudnn() - include(joinpath(@__DIR__, "cuda/cuda.jl")) - else - @warn "CuArrays.jl did not find libcudnn. Some functionality will not be available." + use_cuda[] = CuArrays.functional() # Can be overridden after load with Flux.use_cuda[] = false + if CuArrays.functional() + if !CuArrays.has_cudnn() + @warn "CuArrays.jl found cuda, but did not find libcudnn. Some functionality will not be available." end end end From 7555e488c626e45b9e0b4d91e56ecb6775384b06 Mon Sep 17 00:00:00 2001 From: Ian Date: Sun, 1 Mar 2020 19:40:03 -0500 Subject: [PATCH 049/113] tweaks --- src/Flux.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index 8fb0f406..5afa1fc0 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -39,11 +39,10 @@ include("data/Data.jl") include("deprecations.jl") -include(joinpath(@__DIR__, "cuda/cuda.jl")) - +include("cuda/cuda.jl") function __init__() - use_cuda[] = CuArrays.functional() # Can be overridden after load with Flux.use_cuda[] = false + use_cuda[] = CuArrays.functional() # Can be overridden after load with `Flux.use_cuda[] = false` if CuArrays.functional() if !CuArrays.has_cudnn() @warn "CuArrays.jl found cuda, but did not find libcudnn. Some functionality will not be available." 
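A short sketch of how the toggles introduced in the patches above might be exercised from user code (the model shape and dropout probability are arbitrary):

```julia
using Flux

# Opt out of the GPU code path after loading, as the new __init__ allows:
Flux.use_cuda[] = false

# Pin a model's stochastic layers to a mode instead of relying on :auto detection:
m = Chain(Dense(10, 5, relu), Dropout(0.5), Dense(5, 2))
testmode!(m)          # Dropout now acts as the identity
trainmode!(m)         # equivalent to testmode!(m, false)
testmode!(m, :auto)   # hand control back to automatic detection
```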
From 27949693f334b4d803526efa585cc957320e957c Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Mon, 2 Mar 2020 12:40:19 +0530 Subject: [PATCH 050/113] refactor --- docs/make.jl | 3 +- docs/src/models/advanced.md | 61 +++++++++++++++++++++++++++++++++++ docs/src/models/basics.md | 18 +---------- docs/src/training/training.md | 15 +-------- 4 files changed, 65 insertions(+), 32 deletions(-) create mode 100644 docs/src/models/advanced.md diff --git a/docs/make.jl b/docs/make.jl index b950e959..365cdfc0 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -13,7 +13,8 @@ makedocs(modules=[Flux, NNlib], ["Basics" => "models/basics.md", "Recurrence" => "models/recurrence.md", "Regularisation" => "models/regularisation.md", - "Model Reference" => "models/layers.md"], + "Model Reference" => "models/layers.md", + "Advanced Model Building" => "models/advanced.md"], "Training Models" => ["Optimisers" => "training/optimisers.md", "Training" => "training/training.md"], diff --git a/docs/src/models/advanced.md b/docs/src/models/advanced.md new file mode 100644 index 00000000..4a023709 --- /dev/null +++ b/docs/src/models/advanced.md @@ -0,0 +1,61 @@ +# Advanced Model Building and Customisation + +Here we will try and describe usage of some more advanced features that Flux provides to give more control over model building. + +## Customising Parameter Collection for a Model + +Taking reference from our example `Affine` layer from the [basics](basics.md#Building-Layers-1). + +By default all the fields in the `Affine` type are collected as its parameters, however, in some cases it may be desired to hold other metadata in our "layers" that may not be needed for training, and are hence supposed to be ignored while the parameters are collected. With Flux, it is possible to mark the fields of our layers that are trainable in two ways. + +The first way of achieving this is through overloading the `trainable` function. + +```julia-repl +julia> @functor Affine + +julia> a = Affine(rand(3,3), rand(3)) +Affine{Array{Float64,2},Array{Float64,1}}([0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]) + +julia> Flux.params(a) # default behavior +Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]]) + +julia> Flux.trainable(a::Affine) = (a.W, a.b,) + +julia> Flux.params(a) +Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297]]) +``` + +Only the fields returned by `trainable` will be collected as trainable parameters of the layer when calling `Flux.params`. + +Another way of achieving this is through the `@functor` macro directly. Here, we can mark the fields we are interested in by grouping them in the second argument: + +```julia +Flux.@functor Affine (W,) +``` + +However, doing this requires the `struct` to have a corresponding constructor that accepts those parameters. + +## Freezing Layer Parameters + +When it is desired to not include all the model parameters (for e.g. transfer learning), we can simply not pass in those layers into our call to `params`. + +Consider the simple multi-layer model where we want to omit optimising the first two `Dense` layers. This setup would look something like so: + +```julia +m = Chain( + Dense(784, 64, σ), + Dense(64, 32), + Dense(32, 10), softmax) + +ps = Flux.params(m[3:end]) +``` + +`ps` now holds a reference to only the parameters of the layers passed to it. 
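A self-contained sketch of one training step with such a restricted parameter set (the input, target, loss and optimiser below are made up for illustration):

```julia
using Flux

m = Chain(Dense(784, 64, σ), Dense(64, 32), Dense(32, 10), softmax)
ps = Flux.params(m[3:end])            # only the last Dense layer's weights and bias

x = rand(Float32, 784, 1)             # dummy input
y = Flux.onehotbatch([3], 0:9)        # dummy target
gs = gradient(() -> Flux.crossentropy(m(x), y), ps)

Flux.Optimise.update!(Descent(0.1), ps, gs)   # the first two layers stay untouched
```

Only the arrays collected in `ps` receive gradients and updates, so the earlier layers keep their current values.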
+ +During training, now the gradients would only be applied to the last `Dense` layer (and the `softmax` layer, but that is stateless so doesn't have any parameters), so only that would have its parameters changed. + +`Flux.params` also takes multiple inputs to make it easy to collect parameters from heterogenous models with a single call. A simple demonstration would be if we wanted to omit optimising the second `Dense` layer in the previous example. It would look something like this: + +```julia +Flux.params(m[1], m[3:end]) +``` diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index 3f43f29d..a5e3ca9a 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -220,20 +220,4 @@ Flux.@functor Affine This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md). -By default all the fields in the `Affine` type are collected as its parameters, however, in some cases it may be desired to hold other metadata in our "layers" that may not be needed for training, and are hence supposed to be ignored while the parameters are collected. With Flux, it is possible to mark the fields of our layers that are trainable in two ways. - -The first way of achieving this is through overloading the `trainable` function. - -```julia -Flux.trainable(a::Affine) = (a.W, a.b,) -``` - -Only the fields returned by `trainable` will be collected as trainable parameters of the layer when calling `Flux.params`. - -Another way of achieving this is through the `@functor` macro. Here, wee can mark the fields we are interested in like so: -Another way of achieving this is through the `@functor` macro. Here, we can mark the fields we are interested in by grouping them in the second argument: -```julia -Flux.@functor Affine (W,) -``` - -However, doing this requires the `struct` to have a corresponding constructor that accepts those parameters. +For some more helpful tricks, including parameter freezing, please checkout the [advanced usage guide](advacned.md). diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 7680a776..153d0278 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -43,20 +43,7 @@ Such an object contains a reference to the model's parameters, not a copy, such When it is desired to not include all the model parameters (for e.g. transfer learning), we can simply not pass in those layers into our call to `params`. -Consider the simple multi-layer model where we want to omit optimising the second layer. This setup would look something like so: - -```julia -m = Chain( - Dense(784, 64, σ), - Dense(64, 32), - Dense(32, 10), softmax) - -ps = Flux.params(m[1], m[3:end]) -``` - -`ps` now holds a reference to only the parameters of the layers passed to it. - -Handling all the parameters on a layer by layer basis is explained in the [Layer Helpers](../models/basics.md) section. +Handling all the parameters on a layer by layer basis is explained in the [Layer Helpers](../models/basics.md) section. Also, for freezing model parameters, see the [Advanced Usage Guide](../models/advanced.md). 
## Datasets From bb5350591f5f9d84fbc1397da7aea7bd6d54c3a6 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Mon, 2 Mar 2020 12:42:33 +0530 Subject: [PATCH 051/113] cleanup --- docs/src/training/training.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 153d0278..3775f5ba 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -41,8 +41,6 @@ The model to be trained must have a set of tracked parameters that are used to c Such an object contains a reference to the model's parameters, not a copy, such that after their training, the model behaves according to their updated values. -When it is desired to not include all the model parameters (for e.g. transfer learning), we can simply not pass in those layers into our call to `params`. - Handling all the parameters on a layer by layer basis is explained in the [Layer Helpers](../models/basics.md) section. Also, for freezing model parameters, see the [Advanced Usage Guide](../models/advanced.md). ## Datasets From f9e31a020c6cf65425dc0b0415a241dd946bfd31 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Mon, 2 Mar 2020 13:25:23 +0530 Subject: [PATCH 052/113] Updated huber_loss with other minute changes --- src/layers/stateless.jl | 47 +++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 592e2fa1..01b26a8a 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -16,38 +16,31 @@ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) """ - msle(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) + msle(ŷ, y; ϵ1=eps.(Float64.(ŷ))) -Mean Squared Logarithmic Error. Returns the mean of the squared logarithmic errors `sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 / length(y)`.
-The ϵ1 and ϵ2 terms provide numerical stability. This error penalizes an under-predicted estimate greater than an over-predicted estimate. +Mean Squared Logarithmic Error. Returns the mean of the squared logarithmic errors `sum((log.(ŷ+ϵ1) .- log.(y+ϵ2)).^2) * 1 / length(y)`.
+The `ϵ` term provides numerical stability. This error penalizes an under-predicted estimate greater than an over-predicted estimate. """ -msle(ŷ, y;ϵ1=eps.(ŷ),ϵ2=eps.(eltype(ŷ).(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) +msle(ŷ, y; ϵ=eps.(ŷ)) = sum((log.(ŷ+ϵ).-log.(y+ϵ)).^2) * 1 // length(y) """ - huber_loss(ŷ, y,delta=1.0) + huber_loss(ŷ, y; delta=1.0) + +Computes the mean of the Huber loss given the prediction `ŷ` and true values `y`. By default, delta is set to 1.0. -Computes the mean of the Huber loss. By default, delta is set to 1.0. | 0.5*|(ŷ-y)|, for |ŷ-y|<=delta Hubber loss = | | delta*(|ŷ-y| - 0.5*delta), otherwise [`Huber Loss`](https://en.wikipedia.org/wiki/Huber_loss). """ -function huber_loss(ŷ, y,delta=1.0) - abs_error = abs.(ŷ.-y) - dtype= eltype(ŷ) - delta = dtype(delta) - hub_loss = dtype(0) - for i in 1:length(y) - if (abs_error[i]<=delta) - hub_loss+=abs_error[i]^2*dtype(0.5) - else - hub_loss+=delta*(abs_error[i]- dtype(0.5*delta)) - end - end - hub_loss*1//length(y) +function huber_loss(ŷ, y; delta = eltype(ŷ)(1)) + abs_error = abs.(ŷ.-y) + temp = abs_error. Date: Mon, 2 Mar 2020 13:33:44 +0530 Subject: [PATCH 053/113] Added Loss functions to docs --- docs/src/models/layers.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 41e98f32..5522fe73 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -56,7 +56,10 @@ GroupNorm ## Cost Functions ```@docs +Flux.mae Flux.mse +Flux.msle +Flux.huber_loss Flux.crossentropy Flux.logitcrossentropy Flux.binarycrossentropy @@ -64,4 +67,7 @@ Flux.logitbinarycrossentropy Flux.kldivergence Flux.poisson Flux.hinge +Flux.squared_hinge +Flux.dice_coeff_loss +Flux.tversky_loss ``` From 5565250c28d87eefd694e42ff67f68c2ffec8a35 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Mon, 2 Mar 2020 13:46:33 +0530 Subject: [PATCH 054/113] Updated test for tversky --- test/layers/stateless.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index c09d1aae..702288b6 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -99,7 +99,7 @@ const ϵ = 1e-7 @testset "tversky_loss" begin @test Flux.tversky_loss(y,y1) ≈ 0.028747433264887046 - @test Flux.tversky_loss(y,y1,0.8) ≈ 0.050200803212851364 + @test Flux.tversky_loss(y,y1,beta = 0.8) ≈ 0.050200803212851364 @test Flux.tversky_loss(y,y) ≈ -0.5576923076923075 end From 224ec728acd1bf7fa17e77813c28940ff0a2c7f3 Mon Sep 17 00:00:00 2001 From: Johnny Chen Date: Mon, 2 Mar 2020 19:23:36 +0800 Subject: [PATCH 055/113] fix travis for documentation build --- .travis.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index e02f470f..2e1dab6b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,9 +9,8 @@ julia: - 1.3 - nightly -matrix: - allow_failures: - - julia: nightly +notifications: + email: false jobs: include: @@ -24,6 +23,5 @@ jobs: - julia --project=docs/ docs/make.jl after_success: skip -## uncomment the following lines to override the default test script -script: - - julia --color=yes -e 'using Pkg; Pkg.activate(); Pkg.instantiate(); Pkg.test()' + allow_failures: + - julia: nightly From f30267e037c69b682848a117d8c55a4a57b651d2 Mon Sep 17 00:00:00 2001 From: Johnny Chen Date: Mon, 2 Mar 2020 20:14:43 +0800 Subject: [PATCH 056/113] bring back test on custom Manifest.toml --- .travis.yml | 4 ++++ 1 file changed, 4 
insertions(+) diff --git a/.travis.yml b/.travis.yml index 2e1dab6b..90cf039b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,3 +25,7 @@ jobs: allow_failures: - julia: nightly + +## uncomment the following lines to override the default test script +script: + - julia --color=yes -e 'using Pkg; Pkg.activate(); Pkg.instantiate(); Pkg.test()' From e51070bf799b90b40e690a0f5dc4ab728cac76bb Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Mon, 2 Mar 2020 10:52:27 +0100 Subject: [PATCH 057/113] update documenter --- Manifest.toml | 10 ++--- Project.toml | 1 - docs/Manifest.toml | 89 ---------------------------------------- docs/Project.toml | 3 ++ docs/make.jl | 10 ++--- docs/src/models/nnlib.md | 13 ++++++ 6 files changed, 23 insertions(+), 103 deletions(-) delete mode 100644 docs/Manifest.toml diff --git a/Manifest.toml b/Manifest.toml index 788e5354..dac05aec 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -230,9 +230,9 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[NNlib]] deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"] -git-tree-sha1 = "755c0bab3912ff782167e1b4b774b833f8a0e550" +git-tree-sha1 = "21a3c22bc197b6ae2f8d4d75631876e2b6506dbe" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.6.4" +version = "0.6.5" [[NaNMath]] git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f" @@ -361,11 +361,9 @@ version = "1.2.11+8" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "3c65158c0aa0808cdfff8bca2a36430b038aad00" -repo-rev = "master" -repo-url = "https://github.com/FluxML/Zygote.jl.git" +git-tree-sha1 = "f8329b595c465caf3ca87c4f744e6041a4983e43" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.4.7" +version = "0.4.8" [[ZygoteRules]] deps = ["MacroTools"] diff --git a/Project.toml b/Project.toml index bd105730..a27d766b 100644 --- a/Project.toml +++ b/Project.toml @@ -44,6 +44,5 @@ IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - [targets] test = ["Test", "Documenter", "IterTools", "LinearAlgebra"] diff --git a/docs/Manifest.toml b/docs/Manifest.toml deleted file mode 100644 index bf9d220a..00000000 --- a/docs/Manifest.toml +++ /dev/null @@ -1,89 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[DocStringExtensions]] -deps = ["LibGit2", "Markdown", "Pkg", "Test"] -git-tree-sha1 = "0513f1a8991e9d83255e0140aace0d0fc4486600" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.0" - -[[Documenter]] -deps = ["Base64", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "c61d6eedbc3c4323c08b64af12d29c8ee0fcbb5f" -uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.23.2" - -[[InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.0" - -[[LibGit2]] -uuid = 
"76f85450-5226-5b5a-8eaa-529ad045b433" - -[[Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[Parsers]] -deps = ["Dates", "Test"] -git-tree-sha1 = "db2b35dedab3c0e46dc15996d170af07a5ab91c9" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "0.3.6" - -[[Pkg]] -deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[Random]] -deps = ["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" - -[[Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[Test]] -deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/docs/Project.toml b/docs/Project.toml index dfa65cd1..1b9ab1f8 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,2 +1,5 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" + +[compat] +Documenter = "0.24" diff --git a/docs/make.jl b/docs/make.jl index e42d8217..03fbf413 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,9 +1,3 @@ -using Pkg; -Pkg.activate(joinpath(@__DIR__, "..")); Pkg.instantiate() -Pkg.activate(); Pkg.instantiate() - -pushfirst!(LOAD_PATH, joinpath(@__DIR__, "..")) - using Documenter, Flux, NNlib makedocs(modules=[Flux, NNlib], @@ -30,4 +24,6 @@ makedocs(modules=[Flux, NNlib], analytics = "UA-36890222-9", prettyurls = haskey(ENV, "CI"))) -deploydocs(repo = "github.com/FluxML/Flux.jl.git") +deploydocs(repo = "github.com/FluxML/Flux.jl.git", + target = "build", + push_preview = true) diff --git a/docs/src/models/nnlib.md b/docs/src/models/nnlib.md index 9e570cb3..698a95ae 100644 --- a/docs/src/models/nnlib.md +++ b/docs/src/models/nnlib.md @@ -1,7 +1,9 @@ # NNlib + Flux re-exports all of the functions exported by the [NNlib](https://github.com/FluxML/NNlib.jl) package. ## Activation Functions + Non-linearities that go between layers of your model. Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on. ```@docs @@ -19,19 +21,30 @@ NNlib.swish ``` ## Softmax + ```@docs NNlib.softmax NNlib.logsoftmax ``` ## Pooling + ```@docs NNlib.maxpool NNlib.meanpool ``` ## Convolution + ```@docs NNlib.conv NNlib.depthwiseconv +``` + +## Batched Operations + +```@docs +NNlib.batched_mul +NNlib.batched_mul! 
+NNlib.batched_adjoint ``` \ No newline at end of file From ffea8b616dcf0576e09a5f3ec61f0b277570c4b9 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Mon, 2 Mar 2020 15:07:50 +0100 Subject: [PATCH 058/113] fix docs --- docs/Manifest.toml | 122 +++++++++++++++++++++++++++++++++++++++++++++ docs/Project.toml | 1 + 2 files changed, 123 insertions(+) create mode 100644 docs/Manifest.toml diff --git a/docs/Manifest.toml b/docs/Manifest.toml new file mode 100644 index 00000000..82d46743 --- /dev/null +++ b/docs/Manifest.toml @@ -0,0 +1,122 @@ +# This file is machine-generated - editing it directly is not advised + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "SHA"] +git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.8" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2", "Markdown", "Pkg", "Test"] +git-tree-sha1 = "88bb0edb352b16608036faadcc071adda068582a" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.1" + +[[Documenter]] +deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] +git-tree-sha1 = "d497bcc45bb98a1fbe19445a774cfafeabc6c6df" +uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +version = "0.24.5" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.0" + +[[LibGit2]] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[NNlib]] +deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"] +git-tree-sha1 = "21a3c22bc197b6ae2f8d4d75631876e2b6506dbe" +uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +version = "0.6.5" + +[[Parsers]] +deps = ["Dates", "Test"] +git-tree-sha1 = "0c16b3179190d3046c073440d94172cfc3bb0553" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "0.3.12" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "d37400976e98018ee840e0ca4f9d20baa231dc6b" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.0.1" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[Statistics]] +deps = ["LinearAlgebra", 
"SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/docs/Project.toml b/docs/Project.toml index 1b9ab1f8..670a65be 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,6 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" [compat] Documenter = "0.24" From f5da4d0c70140f573ded4233073c336389cf212a Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Mon, 2 Mar 2020 15:10:08 +0100 Subject: [PATCH 059/113] remove docs manifest --- docs/Manifest.toml | 122 --------------------------------------------- 1 file changed, 122 deletions(-) delete mode 100644 docs/Manifest.toml diff --git a/docs/Manifest.toml b/docs/Manifest.toml deleted file mode 100644 index 82d46743..00000000 --- a/docs/Manifest.toml +++ /dev/null @@ -1,122 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[BinaryProvider]] -deps = ["Libdl", "SHA"] -git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" -uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.8" - -[[Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[DocStringExtensions]] -deps = ["LibGit2", "Markdown", "Pkg", "Test"] -git-tree-sha1 = "88bb0edb352b16608036faadcc071adda068582a" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.1" - -[[Documenter]] -deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "d497bcc45bb98a1fbe19445a774cfafeabc6c6df" -uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.24.5" - -[[InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.0" - -[[LibGit2]] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[LinearAlgebra]] -deps = ["Libdl"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[NNlib]] -deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"] -git-tree-sha1 = "21a3c22bc197b6ae2f8d4d75631876e2b6506dbe" -uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.6.5" - -[[Parsers]] -deps = ["Dates", "Test"] -git-tree-sha1 = "0c16b3179190d3046c073440d94172cfc3bb0553" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "0.3.12" - -[[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[Random]] -deps = 
["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[Requires]] -deps = ["UUIDs"] -git-tree-sha1 = "d37400976e98018ee840e0ca4f9d20baa231dc6b" -uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.0.1" - -[[SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" - -[[Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] -uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" - -[[Statistics]] -deps = ["LinearAlgebra", "SparseArrays"] -uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" - -[[Test]] -deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" From 2f05094068067ee2738adbe2e6e455909adfff0d Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Mon, 2 Mar 2020 20:00:47 +0530 Subject: [PATCH 060/113] =?UTF-8?q?Added=20consistency=20with=20y=CC=82=20?= =?UTF-8?q?and=20unicode=20chars?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/layers/stateless.jl | 54 ++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 01b26a8a..5f457057 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -2,7 +2,7 @@ """ mae(ŷ, y) -Return the mean of absolute error `sum(abs.(ŷ .- y)) * 1 / length(y)` +Return the mean of absolute error `sum(abs.(ŷ .- y)) / length(y)` """ mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y) @@ -16,23 +16,25 @@ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) """ - msle(ŷ, y; ϵ1=eps.(Float64.(ŷ))) + msle(ŷ, y; ϵ = eps.(Float64.(ŷ))) -Mean Squared Logarithmic Error. Returns the mean of the squared logarithmic errors `sum((log.(ŷ+ϵ1) .- log.(y+ϵ2)).^2) * 1 / length(y)`.
-The `ϵ` term provides numerical stability. This error penalizes an under-predicted estimate greater than an over-predicted estimate. +Returns the mean of the squared logarithmic errors `sum((log.(ŷ + ϵ) .- log.(y + ϵ)).^2) / length(y)`. +The `ϵ` term provides numerical stability. + +This error penalizes an under-predicted estimate greater than an over-predicted estimate. """ -msle(ŷ, y; ϵ=eps.(ŷ)) = sum((log.(ŷ+ϵ).-log.(y+ϵ)).^2) * 1 // length(y) +msle(ŷ, y; ϵ = eps.(ŷ)) = sum((log.(ŷ + ϵ).-log.(y + ϵ)).^2) * 1 // length(y) """ - huber_loss(ŷ, y; delta=1.0) + huber_loss(ŷ, y; delta = 1.0) Computes the mean of the Huber loss given the prediction `ŷ` and true values `y`. By default, delta is set to 1.0. - | 0.5*|(ŷ-y)|, for |ŷ-y|<=delta + | 0.5*|ŷ - y|, for |ŷ - y| <= delta Hubber loss = | - | delta*(|ŷ-y| - 0.5*delta), otherwise + | delta*(|ŷ- y| - 0.5*delta), otherwise [`Huber Loss`](https://en.wikipedia.org/wiki/Huber_loss). """ @@ -151,6 +153,7 @@ end poisson(ŷ, y) Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. +Returns `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)` [Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). """ @@ -160,48 +163,49 @@ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) hinge(ŷ, y) Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1). -Returns `sum((max.(0,1 .-ŷ .* y))) *1 // size(y, 2)` +Returns `sum((max.(0, 1 .- ŷ .* y))) / size(y, 2)` [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss) See also [`squared_hinge`](@ref). """ -hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) +hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y, 2) """ squared_hinge(ŷ, y) Computes squared hinge loss given the prediction `ŷ` and true labels `y` (conatining 1 or -1). -Returns `sum((max.(0,1 .-ŷ .* y)).^2) *1 // size(y, 2)` +Returns `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)` See also [`hinge`](@ref). """ -squared_hinge(ŷ, y) = sum((max.(0,1 .-ŷ .* y)).^2) *1//size(y,2) +squared_hinge(ŷ, y) = sum((max.(0, 1 .- ŷ .* y)).^2) *1 // size(y, 2) """ - dice_coeff_loss(y_pred, y_true, smooth = 1) + dice_coeff_loss(ŷ, y, smooth = 1) Loss function used in Image Segmentation. Calculates loss based on dice coefficient. Similar to F1_score. - Dice_Coefficient(A,B) = 2 * sum( |A*B| + smooth) / (sum( A^2 ) + sum( B^2 )+ smooth) + Dice_Coefficient(ŷ, y) = 2 * sum( |ŷ.* y| + smooth) / (sum( ŷ.^2 ) + sum( y.^2 ) + smooth) Dice_loss = 1 - Dice_Coefficient -Ref: [V-Net: Fully Convolutional Neural Networks forVolumetric Medical Image Segmentation](https://arxiv.org/pdf/1606.04797v1.pdf) +[V-Net: Fully Convolutional Neural Networks forVolumetric Medical Image Segmentation](https://arxiv.org/pdf/1606.04797v1.pdf) """ -function dice_coeff_loss(y_pred, y_true; smooth=eltype(y_pred)(1.0)) - intersection = sum(y_true.*y_pred) - return 1 - (2*intersection + smooth)/(sum(y_true.^2) + sum(y_pred.^2)+smooth) +function dice_coeff_loss(ŷ, y; smooth = eltype(ŷ)(1.0)) + intersection = sum(y.*ŷ) + return 1 - (2*intersection + smooth) / (sum(y.^2) + sum(ŷ.^2) + smooth) end """ - tversky_loss(y_pred, y_true, beta = 0.7) + tversky_loss(ŷ, y, β = 0.7) -Used with imbalanced data to give more weightage to False negatives. Larger β weigh recall higher than precision (by placing more emphasis on false negatives) +Used with imbalanced data to give more weightage to False negatives. 
+Larger β weigh recall higher than precision (by placing more emphasis on false negatives) - tversky_loss(ŷ,y,beta) = 1 - sum(|y.*ŷ| + 1) / (sum(y.*ŷ + beta*(1 .- y).*ŷ + (1 .- beta)*y.*(1 .- ŷ))+ 1) + tversky_loss(ŷ, y, β) = 1 - sum(|y.*ŷ| + 1) / (sum(y.*ŷ + β *(1 .- y).*ŷ + (1 - β).*y.*(1 .- ŷ))+ 1) -Ref: [Tversky loss function for image segmentation using 3D fully convolutional deep networks](https://arxiv.org/pdf/1706.05721.pdf) +[Tversky loss function for image segmentation using 3D fully convolutional deep networks](https://arxiv.org/pdf/1706.05721.pdf) """ -function tversky_loss(y_pred, y_true; beta = eltype(y_pred)(0.7)) - intersection = sum(y_true.*y_pred) - return 1 - (intersection+1)/(sum(y_true.*y_pred + beta*(1 .- y_true).* y_pred + (1-beta).*y_true.*(1 .- y_pred))+1) +function tversky_loss(ŷ, y; β = eltype(ŷ)(0.7)) + intersection = sum(y.*ŷ) + return 1 - (intersection + 1) / (sum(y.* ŷ + β *(1 .- y).* ŷ + (1 - β).*y.*(1 .- ŷ)) + 1) end From 92e09e204d0684258f76aac92e509aa89935b6ec Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Mon, 2 Mar 2020 20:33:12 +0530 Subject: [PATCH 061/113] =?UTF-8?q?Test=20argument=20consistency=20with=20?= =?UTF-8?q?=C5=B7=20and=20y?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/layers/stateless.jl | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 702288b6..ce940bf9 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -22,9 +22,9 @@ const ϵ = 1e-7 end y = [123.0,456.0,789.0] - y1 = [345.0,332.0,789.0] + ŷ = [345.0,332.0,789.0] @testset "msle" begin - @test Flux.msle(y1, y) ≈ 0.38813985859136585 + @test Flux.msle(ŷ, y) ≈ 0.38813985859136585 end # Now onehot y's @@ -65,49 +65,50 @@ const ϵ = 1e-7 end y = [1 2 3] - y1 = [4.0 5.0 6.0] + ŷ = [4.0 5.0 6.0] @testset "kldivergence" begin - @test Flux.kldivergence(y, y1) ≈ 4.761838062403337 + @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457 @test Flux.kldivergence(y, y) ≈ 0 end y = [1 2 3 4] - y1 = [5.0 6.0 7.0 8.0] + ŷ = [5.0 6.0 7.0 8.0] @testset "hinge" begin - @test Flux.hinge(y, y1) ≈ 0 + @test Flux.hinge(ŷ, y) ≈ 0 @test Flux.hinge(y, 0.5 .* y) ≈ 0.125 end @testset "squared_hinge" begin - @test Flux.squared_hinge(y, y1) ≈ 0 + @test Flux.squared_hinge(ŷ, y) ≈ 0 @test Flux.squared_hinge(y, 0.5 .* y) ≈ 0.0625 end y = [0.1 0.2 0.3] - y1 = [0.4 0.5 0.6] + ŷ = [0.4 0.5 0.6] @testset "poisson" begin - @test Flux.poisson(y, y1) ≈ 1.0160455586700767 + @test Flux.poisson(ŷ, y) ≈ 0.6278353988097339 @test Flux.poisson(y, y) ≈ 0.5044459776946685 end y = [1.0 0.5 0.3 2.4] - y1 = [0 1.4 0.5 1.2] + ŷ = [0 1.4 0.5 1.2] @testset "dice_coeff_loss" begin - @test Flux.dice_coeff_loss(y, y1) ≈ 0.2799999999999999 - @test Flux.dice_coeff_loss(y,y) ≈ 0.0 + @test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999 + @test Flux.dice_coeff_loss(y, y) ≈ 0.0 end @testset "tversky_loss" begin - @test Flux.tversky_loss(y,y1) ≈ 0.028747433264887046 - @test Flux.tversky_loss(y,y1,beta = 0.8) ≈ 0.050200803212851364 - @test Flux.tversky_loss(y,y) ≈ -0.5576923076923075 + @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383 + @test Flux.tversky_loss(ŷ, y, β = 0.8) ≈ -0.09490740740740744 + @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075 end @testset "no spurious promotions" begin for T in (Float32, Float64) y = rand(T, 2) ŷ = rand(T, 2) - for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, 
Flux.hinge, Flux.poisson,Flux.mae,Flux.huber_loss,Flux.msle,Flux.squared_hinge,Flux.dice_coeff_loss,Flux.tversky_loss) + for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson, + Flux.mae, Flux.huber_loss, Flux.msle, Flux.squared_hinge, Flux.dice_coeff_loss, Flux.tversky_loss) fwd, back = Flux.pullback(f, ŷ, y) @test fwd isa T @test eltype(back(one(T))[1]) == T From af99ca27eee01101ebeb06425c6b8fab495b7b2c Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Tue, 3 Mar 2020 07:52:20 +0100 Subject: [PATCH 062/113] docs update --- Manifest.toml | 16 +++++++++++----- docs/src/models/nnlib.md | 3 ++- docs/src/performance.md | 17 ++++++++--------- src/data/dataloader.jl | 16 ++++++++++------ 4 files changed, 31 insertions(+), 21 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index dac05aec..04465cae 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -18,6 +18,12 @@ git-tree-sha1 = "c88cfc7f9c1f9f8633cddf0b56e86302b70f64c5" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" version = "1.0.1" +[[ArrayLayouts]] +deps = ["FillArrays", "LinearAlgebra"] +git-tree-sha1 = "bc779df8d73be70e4e05a63727d3a4dfb4c52b1f" +uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a" +version = "0.1.5" + [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -230,9 +236,9 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[NNlib]] deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"] -git-tree-sha1 = "21a3c22bc197b6ae2f8d4d75631876e2b6506dbe" +git-tree-sha1 = "d9f196d911f55aeaff11b11f681b135980783824" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.6.5" +version = "0.6.6" [[NaNMath]] git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f" @@ -360,10 +366,10 @@ uuid = "83775a58-1f1d-513f-b197-d71354ab007a" version = "1.2.11+8" [[Zygote]] -deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "f8329b595c465caf3ca87c4f744e6041a4983e43" +deps = ["ArrayLayouts", "DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] +git-tree-sha1 = "7dc5fdb4917ac5a84e199ae654316a01cd4a278b" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.4.8" +version = "0.4.9" [[ZygoteRules]] deps = ["MacroTools"] diff --git a/docs/src/models/nnlib.md b/docs/src/models/nnlib.md index 698a95ae..6dbfd4f4 100644 --- a/docs/src/models/nnlib.md +++ b/docs/src/models/nnlib.md @@ -12,9 +12,9 @@ NNlib.gelu NNlib.leakyrelu NNlib.logcosh NNlib.logsigmoid -NNlib.sigmoid NNlib.relu NNlib.selu +NNlib.sigmoid NNlib.softplus NNlib.softsign NNlib.swish @@ -47,4 +47,5 @@ NNlib.depthwiseconv NNlib.batched_mul NNlib.batched_mul! NNlib.batched_adjoint +NNlib.batched_transpose ``` \ No newline at end of file diff --git a/docs/src/performance.md b/docs/src/performance.md index 06a4f690..0af8ef3b 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -4,7 +4,7 @@ All the usual [Julia performance tips apply](https://docs.julialang.org/en/v1/ma As always [profiling your code](https://docs.julialang.org/en/v1/manual/profile/#Profiling-1) is generally a useful way of finding bottlenecks. Below follow some Flux specific tips/reminders. -## Don't use more precision than you need. 
+## Don't use more precision than you need Flux works great with all kinds of number types. But often you do not need to be working with say `Float64` (let alone `BigFloat`). @@ -14,7 +14,8 @@ Which means allocations occur much faster. And you use less memory. -## Make sure your activation and loss functions preserve the type of their inputs +## Preserve inputs' types + Not only should your activation and loss functions be [type-stable](https://docs.julialang.org/en/v1/manual/performance-tips/#Write-%22type-stable%22-functions-1), they should also preserve the type of their inputs. @@ -29,24 +30,22 @@ because it results in having to use slow mixed type multiplication in the dense Similar situations can occur in the loss function during backpropagation. Which means if you change your data say from `Float64` to `Float32` (which should give a speedup: see above), -you will see a large slow-down +you will see a large slow-down. This can occur sneakily, because you can cause type-promotion by interacting with a numeric literals. E.g. the following will have run into the same problem as above: ``` - leaky_tanh(x) = 0.01x + tanh(x) + leaky_tanh(x) = 0.01*x + tanh(x) ``` -While one could change your activation function (e.g. to use `0.01f0x`) to avoid this when ever your inputs change, -the idiomatic (and safe way) is to use `oftype`. - +While one could change the activation function (e.g. to use `0.01f0x`), the idiomatic (and safe way) to avoid type casts whenever inputs changes is to use `oftype`: ``` - leaky_tanh(x) = oftype(x/1, 0.01)x + tanh(x) + leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x) ``` -## Evaluate batches as Matrices of features, rather than sequences of Vector features +## Evaluate batches as Matrices of features While it can sometimes be tempting to process your observations (feature vectors) one at a time e.g. diff --git a/src/data/dataloader.jl b/src/data/dataloader.jl index 8868a9b0..9da14650 100644 --- a/src/data/dataloader.jl +++ b/src/data/dataloader.jl @@ -23,21 +23,25 @@ dimension. If `shuffle=true`, shuffles the observations each time iterations are re-started. If `partial=false`, drops the last mini-batch if it is smaller than the batchsize. +The original data is preserved as a tuple in the `data` field of the DataLoader. + Example usage: Xtrain = rand(10, 100) - dtrain = DataLoader(Xtrain, batchsize=2) - # iterate over 50 mini-batches - for x in dtrain: + train_loader = DataLoader(Xtrain, batchsize=2) + # iterate over 50 mini-batches of size 2 + for x in train_loader: @assert size(x) == (10, 2) ... end + train_loader.data # original dataset + Xtrain = rand(10, 100) Ytrain = rand(100) - dtrain = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true) + train_loader = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true) for epoch in 1:100 - for (x, y) in dtrain: + for (x, y) in train_loader: @assert size(x) == (10, 2) @assert size(y) == (2,) ... 
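A runnable variant of the loop sketched above, assuming `IterTools` is available for `ncycle` (array sizes and hyper-parameters are arbitrary):

```julia
using Flux
using Flux.Data: DataLoader
using IterTools: ncycle

X = rand(Float32, 10, 100)            # dummy features
Y = rand(Float32, 1, 100)             # dummy targets
train_loader = DataLoader(X, Y, batchsize = 16, shuffle = true)

m = Dense(10, 1)
loss(x, y) = Flux.mse(m(x), y)

# Ten passes over the data with a single train! call:
Flux.train!(loss, Flux.params(m), ncycle(train_loader, 10), Descent(0.01))
```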
@@ -46,7 +50,7 @@ Example usage: # train for 10 epochs using IterTools: ncycle - Flux.train!(loss, ps, ncycle(dtrain, 10), opt) + Flux.train!(loss, ps, ncycle(train_loader, 10), opt) """ function DataLoader(data...; batchsize=1, shuffle=false, partial=true) length(data) > 0 || throw(ArgumentError("Need at least one data input")) From 6e5c18bddffd447d7d6f84eb07f04724ab16a099 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Tue, 3 Mar 2020 16:02:57 +0530 Subject: [PATCH 063/113] Updated loss functions --- src/layers/stateless.jl | 51 +++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 5f457057..2fd98815 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -16,33 +16,33 @@ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) """ - msle(ŷ, y; ϵ = eps.(Float64.(ŷ))) + msle(ŷ, y; ϵ=eps(eltype(ŷ))) -Returns the mean of the squared logarithmic errors `sum((log.(ŷ + ϵ) .- log.(y + ϵ)).^2) / length(y)`. +Returns the mean of the squared logarithmic errors `sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) / length(y)`. The `ϵ` term provides numerical stability. This error penalizes an under-predicted estimate greater than an over-predicted estimate. """ -msle(ŷ, y; ϵ = eps.(ŷ)) = sum((log.(ŷ + ϵ).-log.(y + ϵ)).^2) * 1 // length(y) +msle(ŷ, y; ϵ=eps(eltype(ŷ))) = sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) * 1 // length(y) """ - huber_loss(ŷ, y; delta = 1.0) + huber_loss(ŷ, y; δ=1.0) -Computes the mean of the Huber loss given the prediction `ŷ` and true values `y`. By default, delta is set to 1.0. +Computes the mean of the Huber loss given the prediction `ŷ` and true values `y`. By default, δ is set to 1.0. - | 0.5*|ŷ - y|, for |ŷ - y| <= delta + | 0.5*|ŷ - y|, for |ŷ - y| <= δ Hubber loss = | - | delta*(|ŷ- y| - 0.5*delta), otherwise + | δ*(|ŷ - y| - 0.5*δ), otherwise [`Huber Loss`](https://en.wikipedia.org/wiki/Huber_loss). """ -function huber_loss(ŷ, y; delta = eltype(ŷ)(1)) - abs_error = abs.(ŷ.-y) - temp = abs_error. Date: Tue, 3 Mar 2020 22:07:05 +0530 Subject: [PATCH 064/113] Updated activation functions in NNlib doc --- docs/src/models/nnlib.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/src/models/nnlib.md b/docs/src/models/nnlib.md index 6dbfd4f4..7ede2682 100644 --- a/docs/src/models/nnlib.md +++ b/docs/src/models/nnlib.md @@ -7,17 +7,27 @@ Flux re-exports all of the functions exported by the [NNlib](https://github.com/ Non-linearities that go between layers of your model. Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on. ```@docs +NNlib.celu NNlib.elu NNlib.gelu +NNlib.hardsigmoid +NNlib.hardtanh NNlib.leakyrelu +NNlib.lisht NNlib.logcosh NNlib.logsigmoid +NNlib.mish NNlib.relu +NNlib.relu6 +NNlib.rrelu NNlib.selu NNlib.sigmoid NNlib.softplus +NNlib.softshrink NNlib.softsign NNlib.swish +NNlib.tanhshrink +NNlib.trelu ``` ## Softmax @@ -48,4 +58,4 @@ NNlib.batched_mul NNlib.batched_mul! 
NNlib.batched_adjoint NNlib.batched_transpose -``` \ No newline at end of file +``` From 0def3523839f319e7b6b0e0f5343df657060fa72 Mon Sep 17 00:00:00 2001 From: Ian Date: Tue, 3 Mar 2020 11:49:34 -0500 Subject: [PATCH 065/113] Prevent breakage due to new `active` field in BatchNorm --- src/layers/normalise.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index fc781f70..0647e6b4 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -157,6 +157,8 @@ mutable struct BatchNorm{F,V,W,N} active::Union{Bool, Nothing} end +BatchNorm(λ, β, γ, μ, σ², ϵ, momentum) = BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing) + BatchNorm(chs::Integer, λ = identity; initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = BatchNorm(λ, initβ(chs), initγ(chs), From d9ea5fba761bb8471bac878237a7b4b836dbcf00 Mon Sep 17 00:00:00 2001 From: Ian Date: Tue, 3 Mar 2020 11:55:39 -0500 Subject: [PATCH 066/113] add `active` helpers for other normalise layers --- src/layers/normalise.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 0647e6b4..f9ef4de8 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -40,6 +40,8 @@ mutable struct Dropout{F,D} active::Union{Bool, Nothing} end +Dropout(p, dims) = Dropout(p, dims, nothing) + function Dropout(p; dims = :) @assert 0 ≤ p ≤ 1 Dropout{typeof(p),typeof(dims)}(p, dims, nothing) @@ -77,6 +79,8 @@ mutable struct AlphaDropout{F} end end +AlphaDropout(p) = AlphaDropout(p, nothing) + function (a::AlphaDropout)(x) _isactive(a) || return x λ = eltype(x)(1.0507009873554804934193349852946) @@ -253,6 +257,8 @@ mutable struct InstanceNorm{F,V,W,N} active::Union{Bool, Nothing} end +InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum) = InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing) + InstanceNorm(chs::Integer, λ = identity; initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = InstanceNorm(λ, initβ(chs), initγ(chs), @@ -344,6 +350,8 @@ mutable struct GroupNorm{F,V,W,N,T} active::Union{Bool, Nothing} end +GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum) = GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum, nothing) + GroupNorm(chs::Integer, G::Integer, λ = identity; initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = GroupNorm(G, λ, initβ(chs), initγ(chs), From d63fcf2cb46856fa091f6c353b33e09a649dd314 Mon Sep 17 00:00:00 2001 From: Ian Date: Tue, 3 Mar 2020 13:05:03 -0500 Subject: [PATCH 067/113] add depreciation reminder --- src/layers/normalise.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index f9ef4de8..858d4986 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -40,6 +40,7 @@ mutable struct Dropout{F,D} active::Union{Bool, Nothing} end +# TODO: deprecate in v0.11 Dropout(p, dims) = Dropout(p, dims, nothing) function Dropout(p; dims = :) @@ -79,6 +80,7 @@ mutable struct AlphaDropout{F} end end +# TODO: deprecate in v0.11 AlphaDropout(p) = AlphaDropout(p, nothing) function (a::AlphaDropout)(x) @@ -161,6 +163,7 @@ mutable struct BatchNorm{F,V,W,N} active::Union{Bool, Nothing} end +# TODO: deprecate in v0.11 BatchNorm(λ, β, γ, μ, σ², ϵ, momentum) = BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing) BatchNorm(chs::Integer, λ = identity; @@ -257,6 +260,7 @@ mutable struct InstanceNorm{F,V,W,N} active::Union{Bool, Nothing} end +# TODO: deprecate in v0.11 
InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum) = InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing) InstanceNorm(chs::Integer, λ = identity; @@ -350,6 +354,7 @@ mutable struct GroupNorm{F,V,W,N,T} active::Union{Bool, Nothing} end +# TODO: deprecate in v0.11 GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum) = GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum, nothing) GroupNorm(chs::Integer, G::Integer, λ = identity; From 078ad7dd500f75dfa3f125b72742e9a8c07b5f6a Mon Sep 17 00:00:00 2001 From: Ian Date: Tue, 3 Mar 2020 13:05:23 -0500 Subject: [PATCH 068/113] bump version to 0.10.3 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index a27d766b..451a73b7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.10.2" +version = "0.10.3" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" From 61f66e3dcdfe64bfe630bc2183420197bc6babe0 Mon Sep 17 00:00:00 2001 From: Ian Date: Tue, 3 Mar 2020 13:20:02 -0500 Subject: [PATCH 069/113] remove unnecessary helper for AlphaDropout --- src/layers/normalise.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 858d4986..250a06fc 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -80,9 +80,6 @@ mutable struct AlphaDropout{F} end end -# TODO: deprecate in v0.11 -AlphaDropout(p) = AlphaDropout(p, nothing) - function (a::AlphaDropout)(x) _isactive(a) || return x λ = eltype(x)(1.0507009873554804934193349852946) From 12106ff4ccd85be390f8714ac89ee5b761b641af Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Wed, 4 Mar 2020 04:45:41 +0100 Subject: [PATCH 070/113] update freeze docs --- docs/src/models/advanced.md | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/docs/src/models/advanced.md b/docs/src/models/advanced.md index 4a023709..cbdb226c 100644 --- a/docs/src/models/advanced.md +++ b/docs/src/models/advanced.md @@ -39,23 +39,35 @@ However, doing this requires the `struct` to have a corresponding constructor th When it is desired to not include all the model parameters (for e.g. transfer learning), we can simply not pass in those layers into our call to `params`. -Consider the simple multi-layer model where we want to omit optimising the first two `Dense` layers. This setup would look something like so: +Consider a simple multi-layer perceptron model where we want to avoid optimising the first two `Dense` layers. We can obtain +this using the slicing features `Chain` provides: ```julia m = Chain( - Dense(784, 64, σ), - Dense(64, 32), - Dense(32, 10), softmax) + Dense(784, 64, relu), + Dense(64, 64, relu), + Dense(32, 10) + ) ps = Flux.params(m[3:end]) ``` -`ps` now holds a reference to only the parameters of the layers passed to it. +The `Zygote.Params` object `ps` now holds a reference to only the parameters of the layers passed to it. -During training, now the gradients would only be applied to the last `Dense` layer (and the `softmax` layer, but that is stateless so doesn't have any parameters), so only that would have its parameters changed. +During training, the gradients will only be computed for (and applied to) the last `Dense` layer, therefore only that would have its parameters changed. `Flux.params` also takes multiple inputs to make it easy to collect parameters from heterogenous models with a single call. 
A simple demonstration would be if we wanted to omit optimising the second `Dense` layer in the previous example. It would look something like this: ```julia Flux.params(m[1], m[3:end]) ``` + +Sometimes, a more fine-tuned control is needed. +We can freeze a specific parameter of a specific layer which already entered a `Params` object `ps`, +by simply deleting it from `ps`: + +```julia +ps = params(m) +delete!(ps, m[2].b) +``` + From 5a4f1932a6f1d7e09aa0f70497ee689a428a2421 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 4 Mar 2020 17:22:45 +0530 Subject: [PATCH 071/113] closes #1071 --- src/layers/basic.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 24fab689..96d67b45 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -90,7 +90,7 @@ julia> d = Dense(5, 2) Dense(5, 2) julia> d(rand(5)) -Tracked 2-element Array{Float64,1}: +Array{Float64,1}: 0.00257447 -0.00449443 ``` From 3e14bd878c0bc2966579b20eea4bf69e52f53b6c Mon Sep 17 00:00:00 2001 From: Garben Tanghe Date: Mon, 2 Dec 2019 13:31:25 +0100 Subject: [PATCH 072/113] added GlobalMaxPool, GlobalMeanPool, and Flatten layers --- docs/src/models/layers.md | 3 ++ src/Flux.jl | 3 +- src/layers/conv.jl | 80 ++++++++++++++++++++++++++++++++++++--- test/layers/conv.jl | 6 +++ 4 files changed, 86 insertions(+), 6 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index b4b745b2..5f12d41a 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -14,10 +14,13 @@ These layers are used to build convolutional neural networks (CNNs). ```@docs Conv MaxPool +GlobalMaxPool MeanPool +GlobalMeanPool DepthwiseConv ConvTranspose CrossCor +Flatten ``` ## Recurrent Layers diff --git a/src/Flux.jl b/src/Flux.jl index 5afa1fc0..725abfa7 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -10,7 +10,8 @@ using Zygote: Params, @adjoint, gradient, pullback, @nograd export gradient -export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, +export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, + GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, Flatten, DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode! diff --git a/src/layers/conv.jl b/src/layers/conv.jl index ef167f71..67004b4a 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -95,8 +95,9 @@ outdims(l::Conv, isize) = Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. -Data should be stored in WHCN order. In other words, a 100×100 RGB image would -be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. +Data should be stored in WHCN order (width, height, # channels, # batches). +In other words, a 100×100 RGB image would be a `100×100×3×1` array, +and a batch of 50 would be a `100×100×3×50` array. Takes the keyword arguments `pad`, `stride` and `dilation`. """ @@ -171,8 +172,9 @@ Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. Note that `out` must be an integer multiple of `in`. -Data should be stored in WHCN order. In other words, a 100×100 RGB image would -be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. 
+Data should be stored in WHCN order (width, height, # channels, # batches). +In other words, a 100×100 RGB image would be a `100×100×3×1` array, +and a batch of 50 would be a `100×100×3×50` array. Takes the keyword arguments `pad`, `stride` and `dilation`. """ @@ -304,6 +306,56 @@ end outdims(l::CrossCor, isize) = output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) +""" + GlobalMaxPool() + +Global max pooling layer. + +Transforms (w,h,c,b)-shaped input into (1,1,c,b)-shaped output, +by performing max pooling on the complete (w,h)-shaped feature maps. +""" +struct GlobalMaxPool end + +function (g::GlobalMaxPool)(x) + # Input size + x_size = size(x) + # Kernel size + k = x_size[1:end-2] + # Pooling dimensions + pdims = PoolDims(x, k) + + return maxpool(x, pdims) +end + +function Base.show(io::IO, g::GlobalMaxPool) + print(io, "GlobalMaxPool()") +end + +""" + GlobalMeanPool() + +Global mean pooling layer. + +Transforms (w,h,c,b)-shaped input into (1,1,c,b)-shaped output, +by performing mean pooling on the complete (w,h)-shaped feature maps. +""" +struct GlobalMeanPool end + +function (g::GlobalMeanPool)(x) + # Input size + x_size = size(x) + # Kernel size + k = x_size[1:end-2] + # Pooling dimensions + pdims = PoolDims(x, k) + + return meanpool(x, pdims) +end + +function Base.show(io::IO, g::GlobalMeanPool) + print(io, "GlobalMeanPool()") +end + """ MaxPool(k) @@ -363,4 +415,22 @@ function Base.show(io::IO, m::MeanPool) print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end -outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) \ No newline at end of file +outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) + +""" + Flatten() + +Flattening layer. + +Transforms (w,h,c,b)-shaped input into (w*h*c,b)-shaped output, +by linearizing all values for each element in the batch. +""" +struct Flatten end + +function (f::Flatten)(x) + return reshape(x, :, size(x)[end]) +end + +function Base.show(io::IO, f::Flatten) + print(io, "Flatten()") +end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 03a0d1a4..60e1898d 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -4,10 +4,16 @@ using Flux: gradient @testset "Pooling" begin x = randn(Float32, 10, 10, 3, 2) + gmp = GlobalMaxPool() + @test size(gmp(x)) == (1, 1, 3, 2) + gmp = GlobalMeanPool() + @test size(gmp(x)) == (1, 1, 3, 2) mp = MaxPool((2, 2)) @test mp(x) == maxpool(x, PoolDims(x, 2)) mp = MeanPool((2, 2)) @test mp(x) == meanpool(x, PoolDims(x, 2)) + f = Flatten() + @test size(f(x)) == (300, 2) end @testset "CNN" begin From 82e16a5b291fc115e485bd2fcb1cea731c70c0e4 Mon Sep 17 00:00:00 2001 From: Garben Tanghe Date: Thu, 5 Dec 2019 14:16:12 +0100 Subject: [PATCH 073/113] split up Flatten layer to use the flatten function --- src/layers/conv.jl | 16 ++++++++++++---- src/layers/stateless.jl | 26 ++++++++++++++++++-------- test/layers/stateless.jl | 9 ++++++++- 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 67004b4a..faca0895 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -425,12 +425,20 @@ Flattening layer. Transforms (w,h,c,b)-shaped input into (w*h*c,b)-shaped output, by linearizing all values for each element in the batch. 
""" -struct Flatten end +struct Flatten{F} + σ::F + function Flatten(σ::F = identity) where {F} + return new{F}(σ) + end +end -function (f::Flatten)(x) - return reshape(x, :, size(x)[end]) +function (f::Flatten)(x::AbstractArray) + σ = f.σ + σ(flatten(x)) end function Base.show(io::IO, f::Flatten) - print(io, "Flatten()") + print(io, "Flatten(") + f.σ == identity || print(io, f.σ) + print(io, ")") end diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 2fd98815..a9e6c6e5 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -2,7 +2,7 @@ """ mae(ŷ, y) -Return the mean of absolute error `sum(abs.(ŷ .- y)) / length(y)` +Return the mean of absolute error `sum(abs.(ŷ .- y)) / length(y)` """ mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y) @@ -10,7 +10,7 @@ mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y) """ mse(ŷ, y) -Return the mean squared error `sum((ŷ .- y).^2) / length(y)`. +Return the mean squared error `sum((ŷ .- y).^2) / length(y)`. """ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) @@ -19,7 +19,7 @@ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) msle(ŷ, y; ϵ=eps(eltype(ŷ))) Returns the mean of the squared logarithmic errors `sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) / length(y)`. -The `ϵ` term provides numerical stability. +The `ϵ` term provides numerical stability. This error penalizes an under-predicted estimate greater than an over-predicted estimate. """ @@ -60,7 +60,7 @@ end """ crossentropy(ŷ, y; weight=1) -Return the crossentropy computed as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`. +Return the crossentropy computed as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`. See also [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref). """ @@ -69,7 +69,7 @@ crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _cros """ logitcrossentropy(ŷ, y; weight=1) -Return the crossentropy computed after a [softmax](@ref) operation: +Return the crossentropy computed after a [softmax](@ref) operation: -sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2) @@ -97,7 +97,7 @@ CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 `logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to `binarycrossentropy(σ(ŷ), y)` but it is more numerically stable. -See also [`binarycrossentropy`](@ref), [`sigmoid`](@ref), [`logsigmoid`](@ref). +See also [`binarycrossentropy`](@ref), [`sigmoid`](@ref), [`logsigmoid`](@ref). """ logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ) @@ -162,7 +162,7 @@ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) * 1 // size(y,2) """ hinge(ŷ, y) -Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1). +Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1). Returns `sum((max.(0, 1 .- ŷ .* y))) / size(y, 2)` [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss) @@ -193,10 +193,20 @@ dice_coeff_loss(ŷ, y; smooth=eltype(ŷ)(1.0)) = 1 - (2*sum(y .* ŷ) + smooth """ tversky_loss(ŷ, y; β=0.7) -Used with imbalanced data to give more weightage to False negatives. +Used with imbalanced data to give more weightage to False negatives. 
Larger β weigh recall higher than precision (by placing more emphasis on false negatives) Returns `1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)` [Tversky loss function for image segmentation using 3D fully convolutional deep networks](https://arxiv.org/pdf/1706.05721.pdf) """ tversky_loss(ŷ, y; β=eltype(ŷ)(0.7)) = 1 - (sum(y .* ŷ) + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1) + +""" + flatten(x::AbstractArray) + +Transforms (w,h,c,b)-shaped input into (w*h*c,b)-shaped output, +by linearizing all values for each element in the batch. +""" +function flatten(x::AbstractArray) + return reshape(x, :, size(x)[end]) +end diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index ce940bf9..ebcd815c 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -1,6 +1,6 @@ using Test using Flux: onehotbatch, mse, crossentropy, logitcrossentropy, - σ, binarycrossentropy, logitbinarycrossentropy + σ, binarycrossentropy, logitbinarycrossentropy, flatten const ϵ = 1e-7 @@ -116,3 +116,10 @@ const ϵ = 1e-7 end end end + +@testset "helpers" begin + @testset "flatten" begin + x = randn(Float32, 10, 10, 3, 2) + @test size(flatten(x)) == (300, 2) + end +end From 746e3310f18485c0c30c9975c71c88d53d00fe26 Mon Sep 17 00:00:00 2001 From: Garben Tanghe Date: Thu, 27 Feb 2020 12:44:17 +0100 Subject: [PATCH 074/113] removed Flatten struct updated documentation --- docs/src/models/layers.md | 2 +- src/Flux.jl | 2 +- src/layers/conv.jl | 26 -------------------------- test/layers/conv.jl | 2 -- 4 files changed, 2 insertions(+), 30 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 5f12d41a..2b5c1591 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -20,7 +20,7 @@ GlobalMeanPool DepthwiseConv ConvTranspose CrossCor -Flatten +flatten ``` ## Recurrent Layers diff --git a/src/Flux.jl b/src/Flux.jl index 725abfa7..f973dc4c 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -11,7 +11,7 @@ using Zygote: Params, @adjoint, gradient, pullback, @nograd export gradient export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, - GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, Flatten, + GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, flatten, DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode! diff --git a/src/layers/conv.jl b/src/layers/conv.jl index faca0895..742091a6 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -416,29 +416,3 @@ function Base.show(io::IO, m::MeanPool) end outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) - -""" - Flatten() - -Flattening layer. - -Transforms (w,h,c,b)-shaped input into (w*h*c,b)-shaped output, -by linearizing all values for each element in the batch. 
-""" -struct Flatten{F} - σ::F - function Flatten(σ::F = identity) where {F} - return new{F}(σ) - end -end - -function (f::Flatten)(x::AbstractArray) - σ = f.σ - σ(flatten(x)) -end - -function Base.show(io::IO, f::Flatten) - print(io, "Flatten(") - f.σ == identity || print(io, f.σ) - print(io, ")") -end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 60e1898d..e7b3963d 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -12,8 +12,6 @@ using Flux: gradient @test mp(x) == maxpool(x, PoolDims(x, 2)) mp = MeanPool((2, 2)) @test mp(x) == meanpool(x, PoolDims(x, 2)) - f = Flatten() - @test size(f(x)) == (300, 2) end @testset "CNN" begin From fc3af681ec45f9fb8ea405a447908c1b8bc9bbed Mon Sep 17 00:00:00 2001 From: Garben Tanghe Date: Sun, 8 Mar 2020 14:17:00 +0100 Subject: [PATCH 075/113] updated documentation --- src/layers/stateless.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index a9e6c6e5..eebbbe98 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -204,7 +204,7 @@ tversky_loss(ŷ, y; β=eltype(ŷ)(0.7)) = 1 - (sum(y .* ŷ) + 1) / (sum(y .* """ flatten(x::AbstractArray) -Transforms (w,h,c,b)-shaped input into (w*h*c,b)-shaped output, +Transforms (w,h,c,b)-shaped input into (w x h x c,b)-shaped output, by linearizing all values for each element in the batch. """ function flatten(x::AbstractArray) From f0d866b2fd3418e53c60bcd8216236df2bd180eb Mon Sep 17 00:00:00 2001 From: AzamatB Date: Tue, 10 Mar 2020 12:44:19 +0600 Subject: [PATCH 076/113] fix typo in the Dropout docs --- src/layers/normalise.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 250a06fc..163dac12 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -32,7 +32,7 @@ end A Dropout layer. In the forward pass, applies the [`dropout`](@ref) function on the input. -Does nothing to the input once [`testmode!`](@ref) is false. +Does nothing to the input once [`testmode!`](@ref) is true. 
""" mutable struct Dropout{F,D} p::F From 69e96ed1c11728fdc9c04b8c93b702114cbee225 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 13 Mar 2020 00:13:04 +0000 Subject: [PATCH 077/113] CompatHelper: bump compat for "CodecZlib" to "0.7" --- Manifest.toml | 26 +++++++++++++------------- Project.toml | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 04465cae..c5936885 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -57,10 +57,10 @@ uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" version = "2.10.2" [[CodecZlib]] -deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] -git-tree-sha1 = "05916673a2627dd91b4969ff8ba6941bc85a960e" +deps = ["TranscodingStreams", "Zlib_jll"] +git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da" uuid = "944b1d66-785c-5afd-91f1-9de20f533193" -version = "0.6.0" +version = "0.7.0" [[ColorTypes]] deps = ["FixedPointNumbers", "Random"] @@ -88,9 +88,9 @@ version = "0.2.0+1" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "7c20c5a45bb245cf248f454d26966ea70255b271" +git-tree-sha1 = "7fa1331a0e0cd10e43b94b280027bda45990cb63" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.7.2" +version = "1.7.3" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -180,15 +180,15 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[Juno]] deps = ["Base64", "Logging", "Media", "Profile"] -git-tree-sha1 = "4f2249fb58cfb140eeb89428e31791e2f8959d8c" +git-tree-sha1 = "e1ba2a612645b3e07c773c3a208f215745081fe6" uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" -version = "0.8.0" +version = "0.8.1" [[LLVM]] deps = ["CEnum", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "1d08d7e4250f452f6cb20e4574daaebfdbee0ff7" +git-tree-sha1 = "b6b86801ae2f2682e0a4889315dc76b68db2de71" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "1.3.3" +version = "1.3.4" [[LibGit2]] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" @@ -247,9 +247,9 @@ version = "0.3.3" [[OpenSpecFun_jll]] deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] -git-tree-sha1 = "d110040968b9afe95c6bd9c6233570b0fe8abd22" +git-tree-sha1 = "d51c416559217d974a1113522d5919235ae67a87" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.3+2" +version = "0.5.3+3" [[OrderedCollections]] deps = ["Random", "Serialization", "Test"] @@ -326,9 +326,9 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[StatsBase]] deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9" +git-tree-sha1 = "19bfcb46245f69ff4013b3df3b977a289852c3a1" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.32.1" +version = "0.32.2" [[Test]] deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] diff --git a/Project.toml b/Project.toml index 451a73b7..131e66f5 100644 --- a/Project.toml +++ b/Project.toml @@ -26,7 +26,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] AbstractTrees = "0.2, 0.3" Adapt = "1" -CodecZlib = "0.5, 0.6" +CodecZlib = "0.5, 0.6, 0.7" Colors = "0.8, 0.9, 0.10, 0.11" CuArrays = "1.6" Juno = "0.5, 0.6, 0.7, 0.8" From bca74213ee50975e1382572720838f4681b73683 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" 
<41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 14 Mar 2020 00:12:33 +0000 Subject: [PATCH 078/113] CompatHelper: bump compat for "Colors" to "0.12" --- Manifest.toml | 14 +++++++------- Project.toml | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index c5936885..c57ff918 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -64,15 +64,15 @@ version = "0.7.0" [[ColorTypes]] deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "b9de8dc6106e09c79f3f776c27c62360d30e5eb8" +git-tree-sha1 = "c4c1cca28748906265ed62c788d6fe6f0134d264" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.9.1" +version = "0.10.0" [[Colors]] -deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"] -git-tree-sha1 = "177d8b959d3c103a6d57574c38ee79c81059c31b" +deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"] +git-tree-sha1 = "2fdeb981ebcf52cd800ddb6a0aa5eac34153552d" uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" -version = "0.11.2" +version = "0.12.0" [[CommonSubexpressions]] deps = ["Test"] @@ -146,9 +146,9 @@ uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" version = "0.8.5" [[FixedPointNumbers]] -git-tree-sha1 = "4aaea64dd0c30ad79037084f8ca2b94348e65eaa" +git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238" uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" -version = "0.7.1" +version = "0.8.0" [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] diff --git a/Project.toml b/Project.toml index 131e66f5..db885e2a 100644 --- a/Project.toml +++ b/Project.toml @@ -27,7 +27,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" AbstractTrees = "0.2, 0.3" Adapt = "1" CodecZlib = "0.5, 0.6, 0.7" -Colors = "0.8, 0.9, 0.10, 0.11" +Colors = "0.8, 0.9, 0.10, 0.11, 0.12" CuArrays = "1.6" Juno = "0.5, 0.6, 0.7, 0.8" MacroTools = "0.3, 0.4, 0.5" From 85a9493722442ad77ec575b57600cc4328f36c38 Mon Sep 17 00:00:00 2001 From: AzamatB Date: Sat, 14 Mar 2020 15:42:00 +0600 Subject: [PATCH 079/113] Fix typo in the docstrings of AlphaDropout --- src/layers/normalise.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 163dac12..3828748f 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -69,7 +69,7 @@ A dropout layer. It is used in Self-Normalizing Neural Networks. (https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf) The AlphaDropout layer ensures that mean and variance of activations remains the same as before. -Does nothing to the input once [`testmode!`](@ref) is false. +Does nothing to the input once [`testmode!`](@ref) is true. """ mutable struct AlphaDropout{F} p::F From 1511778267a964d261c7bbf9f15233cb92adf3ab Mon Sep 17 00:00:00 2001 From: yuebanyishenqiu Date: Sun, 22 Mar 2020 09:41:29 +0800 Subject: [PATCH 080/113] fix typos --- src/data/dataloader.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/data/dataloader.jl b/src/data/dataloader.jl index 9da14650..07c8f1fd 100644 --- a/src/data/dataloader.jl +++ b/src/data/dataloader.jl @@ -30,7 +30,7 @@ Example usage: Xtrain = rand(10, 100) train_loader = DataLoader(Xtrain, batchsize=2) # iterate over 50 mini-batches of size 2 - for x in train_loader: + for x in train_loader @assert size(x) == (10, 2) ... 
end @@ -41,7 +41,7 @@ Example usage: Ytrain = rand(100) train_loader = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true) for epoch in 1:100 - for (x, y) in train_loader: + for (x, y) in train_loader @assert size(x) == (10, 2) @assert size(y) == (2,) ... @@ -89,4 +89,4 @@ end function Base.length(d::DataLoader) n = d.nobs / d.batchsize d.partial ? ceil(Int,n) : floor(Int,n) -end \ No newline at end of file +end From 347f53adf6b034f84e5ea8da36cd1b8352cc007c Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Wed, 25 Mar 2020 10:58:39 -0700 Subject: [PATCH 081/113] Allow CuArrays v2.x --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index db885e2a..052f61ad 100644 --- a/Project.toml +++ b/Project.toml @@ -28,7 +28,7 @@ AbstractTrees = "0.2, 0.3" Adapt = "1" CodecZlib = "0.5, 0.6, 0.7" Colors = "0.8, 0.9, 0.10, 0.11, 0.12" -CuArrays = "1.6" +CuArrays = "1.6, 2" Juno = "0.5, 0.6, 0.7, 0.8" MacroTools = "0.3, 0.4, 0.5" NNlib = "0.6" From 49ba1211598dab894127b358a1ec441c86e4c1c1 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Wed, 25 Mar 2020 12:48:29 -0700 Subject: [PATCH 082/113] Update Manifest.toml --- Manifest.toml | 69 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index c57ff918..9b122866 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -40,21 +40,27 @@ version = "0.2.0" [[CUDAapi]] deps = ["Libdl", "Logging"] -git-tree-sha1 = "d7ceadd8f821177d05b897c0517e94633db535fe" +git-tree-sha1 = "831b825d10104bd29e28f6da93312a976830717b" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" -version = "3.1.0" +version = "4.0.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "01e90fa34e25776bc7c8661183d4519149ebfe59" +git-tree-sha1 = "65001097f4a964f1407d546589821cc305a3fa59" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "6.0.0" +version = "6.2.1" [[CUDAnative]] -deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "f86269ff60ebe082a2806ecbce51f3cadc68afe9" +deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "MacroTools", "Pkg", "Printf", "TimerOutputs"] +git-tree-sha1 = "4168c40ca3ff3475bc29a20a09ab7b910c4b8ef0" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.10.2" +version = "3.0.1" + +[[CodeTracking]] +deps = ["InteractiveUtils", "UUIDs"] +git-tree-sha1 = "0becdab7e6fbbcb7b88d8de5b72e5bb2f28239f3" +uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" +version = "0.5.8" [[CodecZlib]] deps = ["TranscodingStreams", "Zlib_jll"] @@ -80,17 +86,29 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" version = "0.2.0" +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "2.2.0" + [[CompilerSupportLibraries_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "b57c5d019367c90f234a7bc7e24ff0a84971da5d" +git-tree-sha1 = "067567a322fe466c5ec8d01413eee7127bd11699" uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "0.2.0+1" +version = "0.3.1+0" + 
+[[Cthulhu]] +deps = ["CodeTracking", "InteractiveUtils", "TerminalMenus", "Unicode"] +git-tree-sha1 = "5e0f928ccaab1fa2911fc4e204e8a6f5b0213eaf" +uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f" +version = "1.0.0" [[CuArrays]] -deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "7fa1331a0e0cd10e43b94b280027bda45990cb63" +deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"] +git-tree-sha1 = "025687917ae0f7816005f8fee08e45a91feb368d" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.7.3" +version = "2.0.0" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -158,9 +176,9 @@ version = "0.10.9" [[GPUArrays]] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] -git-tree-sha1 = "e756da6cee76a5f1436a05827fa8fdf3badc577f" +git-tree-sha1 = "d586762b08dcda13228df8967119b9cb6f22ade5" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "2.0.1" +version = "3.1.0" [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] @@ -191,6 +209,7 @@ uuid = "929cbde3-209d-540e-8aea-75f648917ca0" version = "1.3.4" [[LibGit2]] +deps = ["Printf"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[Libdl]] @@ -210,10 +229,10 @@ uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7" version = "2019.0.117+2" [[MacroTools]] -deps = ["DataStructures", "Markdown", "Random"] -git-tree-sha1 = "07ee65e03e28ca88bc9a338a3726ae0c3efaa94b" +deps = ["Markdown", "Random"] +git-tree-sha1 = "f7d2e3f654af75f01ec49be82c231c382214223a" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.4" +version = "0.5.5" [[Markdown]] deps = ["Base64"] @@ -258,7 +277,7 @@ uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.1.0" [[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] @@ -295,6 +314,10 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" @@ -330,6 +353,12 @@ git-tree-sha1 = "19bfcb46245f69ff4013b3df3b977a289852c3a1" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" version = "0.32.2" +[[TerminalMenus]] +deps = ["Compat", "REPL", "Test"] +git-tree-sha1 = "9ae6ed0c94eee4d898e049820942af21daf15efc" +uuid = "dc548174-15c3-5faf-af27-7997cfbde655" +version = "0.1.0" + [[Test]] deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" @@ -367,9 +396,9 @@ version = "1.2.11+8" [[Zygote]] deps = ["ArrayLayouts", "DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "7dc5fdb4917ac5a84e199ae654316a01cd4a278b" +git-tree-sha1 = "9688fce24bd8a9468fed12f3d5206099a39054dc" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" 
-version = "0.4.9" +version = "0.4.12" [[ZygoteRules]] deps = ["MacroTools"] From e85a5d8573699b1cb1bebd7e43a707c2fe66637c Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Wed, 25 Mar 2020 15:23:07 -0700 Subject: [PATCH 083/113] Update CUDAdrv for Tim's bug fix --- Manifest.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 9b122866..8444a47a 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -46,9 +46,9 @@ version = "4.0.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "65001097f4a964f1407d546589821cc305a3fa59" +git-tree-sha1 = "e650cbaee92b60433313157926b1e80d0c3a0e2e" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "6.2.1" +version = "6.2.2" [[CUDAnative]] deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "MacroTools", "Pkg", "Printf", "TimerOutputs"] From 6939e03fc6e2303d2ce0ed60dd469b936b2d4567 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 26 Mar 2020 14:03:55 +0530 Subject: [PATCH 084/113] bump CuArrays version --- Manifest.toml | 5 ++--- Project.toml | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 8444a47a..2ba02f84 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -106,9 +106,9 @@ version = "1.0.0" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"] -git-tree-sha1 = "025687917ae0f7816005f8fee08e45a91feb368d" +git-tree-sha1 = "e8c55b38dcca955f5aed8ec4479cdc95810db1e1" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "2.0.0" +version = "2.0.1" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -209,7 +209,6 @@ uuid = "929cbde3-209d-540e-8aea-75f648917ca0" version = "1.3.4" [[LibGit2]] -deps = ["Printf"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[Libdl]] diff --git a/Project.toml b/Project.toml index 052f61ad..e927e14a 100644 --- a/Project.toml +++ b/Project.toml @@ -28,7 +28,7 @@ AbstractTrees = "0.2, 0.3" Adapt = "1" CodecZlib = "0.5, 0.6, 0.7" Colors = "0.8, 0.9, 0.10, 0.11, 0.12" -CuArrays = "1.6, 2" +CuArrays = "2" Juno = "0.5, 0.6, 0.7, 0.8" MacroTools = "0.3, 0.4, 0.5" NNlib = "0.6" From 1bf8dc2d5be962322cb757dceabc5de78e6d9f2d Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 31 Aug 2019 13:03:46 +0200 Subject: [PATCH 085/113] Update Documenter version and fix warnings 0.23.2 -> 0.23.3 --- docs/make.jl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 240d5a9a..0dfb5dbb 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,7 +1,13 @@ using Documenter, Flux, NNlib makedocs(modules=[Flux, NNlib], + doctest = true, sitename = "Flux", + format = Documenter.HTML( + analytics = "UA-36890222-9", + assets = ["assets/flux.css"], + prettyurls = get(ENV, "CI", nothing) == "true", + ), pages = ["Home" => "index.md", "Building Models" => ["Basics" => "models/basics.md", @@ -21,10 +27,8 @@ makedocs(modules=[Flux, NNlib], "The Julia Ecosystem" => "ecosystem.md", "Performance Tips" => "performance.md", "Community" => "community.md"], - format = Documenter.HTML(assets = ["assets/flux.css"], - analytics = "UA-36890222-9", - prettyurls = haskey(ENV, "CI"))) + ) -deploydocs(repo = "github.com/FluxML/Flux.jl.git", +deploydocs(repo = 
"github.com/FluxML/Flux.jl.git", target = "build", push_preview = true) From 9b68423e649f3937b352fc9fec24092033c80910 Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 31 Aug 2019 13:05:04 +0200 Subject: [PATCH 086/113] Import (`using`) Flux for all doctests --- docs/make.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/make.jl b/docs/make.jl index 0dfb5dbb..e67de41c 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,5 +1,6 @@ using Documenter, Flux, NNlib +DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true) makedocs(modules=[Flux, NNlib], doctest = true, sitename = "Flux", From 2f955a33cd11d2f15144d822d7bef85d561b5dcd Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 31 Aug 2019 11:08:25 +0200 Subject: [PATCH 087/113] `src/layers/stateless.jl`: Add missing docstrings --- src/layers/stateless.jl | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index eebbbe98..b598fdd4 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -10,7 +10,14 @@ mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y) """ mse(ŷ, y) -Return the mean squared error `sum((ŷ .- y).^2) / length(y)`. +Return the mean squared error between ŷ and y; +defined as ``\\frac{1}{n} \\sum_{i=1}^n (ŷ_i - y_i)^2``. + +# Examples +```jldoctest +julia> Flux.mse([0, 2], [1, 1]) +1//1 +``` """ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) @@ -58,22 +65,40 @@ function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Abstr end """ - crossentropy(ŷ, y; weight=1) + crossentropy(ŷ, y; weight = nothing) -Return the crossentropy computed as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`. +Return the cross entropy between the given probability distributions; +computed as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`. + +`weight` can be `Nothing`, a `Number` or an `AbstractVector`. +`weight=nothing` acts like `weight=1` but is faster. See also [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref). + +# Examples +```jldoctest +julia> Flux.crossentropy(softmax([-1.1491, 0.8619, 0.3127]), [1, 1, 0]) +3.085467254747739 +``` """ crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight) """ - logitcrossentropy(ŷ, y; weight=1) + logitcrossentropy(ŷ, y; weight = 1) -Return the crossentropy computed after a [softmax](@ref) operation: +Return the crossentropy computed after a [`logsoftmax`](@ref) operation; +computed as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`. - -sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2) +`logitcrossentropy(ŷ, y)` is mathematically equivalent to +[`crossentropy(softmax(log(ŷ)), y)`](@ref) but it is more numerically stable. See also [`crossentropy`](@ref), [`binarycrossentropy`](@ref). + +# Examples +```jldoctest +julia> Flux.logitcrossentropy([-1.1491, 0.8619, 0.3127], [1, 1, 0]) +3.085467254747738 +``` """ function logitcrossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) return -sum(y .* logsoftmax(ŷ) .* weight) * 1 // size(y, 2) From c222e1b1245106d6aff307b467238b97d91c8df3 Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 31 Aug 2019 11:11:52 +0200 Subject: [PATCH 088/113] Add missing docstrings to `src/utils.jl` Not sure about the `stack`, `unstack` and `unsqueeze` functions. 
--- src/utils.jl | 129 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 125 insertions(+), 4 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index f483c5d9..25be1063 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,10 +1,40 @@ # Arrays -nfan() = 1, 1 #fan_in, fan_out -nfan(n) = 1, n #A vector is treated as a n×1 matrix -nfan(n_out, n_in) = n_in, n_out #In case of Dense kernels: arranged as matrices -nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of convolution kernels +nfan() = 1, 1 # fan_in, fan_out +nfan(n) = 1, n # A vector is treated as a n×1 matrix +nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices +nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) # In case of convolution kernels +""" + glorot_uniform(dims...) + +Return an `Array` of size `dims` containing random variables taken from a uniform +distribution in the interval ``[-x, x]``, where `x = sqrt(24 / sum(dims)) / 2`. + +# Examples +```jldoctest; setup = :(using Random; Random.seed!(0)) +julia> Flux.glorot_uniform(2, 3) +2×3 Array{Float32,2}: + 0.601094 -0.57414 -0.814925 + 0.900868 0.805994 0.057514 +``` +""" glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...))) + +""" + glorot_normal(dims...) + +Return an `Array` of size `dims` containing random variables taken from a normal +distribution with mean 0 and standard deviation `(2 / sum(dims))`. + +# Examples +```jldoctest; setup = :(using Random; Random.seed!(0)) +julia> Flux.glorot_normal(3, 2) +3×2 Array{Float32,2}: + 0.429505 -0.0852891 + 0.523935 0.371009 + -0.223261 0.188052 +``` +""" glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...))) ones(T::Type, dims...) = Base.ones(T, dims...) @@ -13,9 +43,81 @@ zeros(T::Type, dims...) = Base.zeros(T, dims...) ones(dims...) = Base.ones(Float32, dims...) zeros(dims...) = Base.zeros(Float32, dims...) +""" + unsqueeze(xs, dim) + +Return `xs` reshaped into an `Array` one dimensionality higher than `xs`, +where `dim` indicates in which dimension `xs` is extended. + +# Examples +```jldoctest +julia> xs = [[1, 2], [3, 4], [5, 6]] +3-element Array{Array{Int64,1},1}: + [1, 2] + [3, 4] + [5, 6] + +julia> Flux.unsqueeze(xs, 1) +1×3 Array{Array{Int64,1},2}: + [1, 2] [3, 4] [5, 6] + +julia> Flux.unsqueeze([1 2; 3 4], 2) +2×1×2 Array{Int64,3}: +[:, :, 1] = + 1 + 3 + +[:, :, 2] = + 2 + 4 +``` +""" unsqueeze(xs, dim) = reshape(xs, (size(xs)[1:dim-1]..., 1, size(xs)[dim:end]...)) +""" + stack(xs, dim) + +Concatenate the given `Array` of `Array`s `xs` into a single `Array` along the +given dimension `dim`. + +# Examples +```jldoctest +julia> xs = [[1, 2], [3, 4], [5, 6]] +3-element Array{Array{Int64,1},1}: + [1, 2] + [3, 4] + [5, 6] + +julia> Flux.stack(xs, 1) +3×2 Array{Int64,2}: + 1 2 + 3 4 + 5 6 + +julia> cat(xs, dims=1) +3-element Array{Array{Int64,1},1}: + [1, 2] + [3, 4] + [5, 6] +``` +""" stack(xs, dim) = cat(unsqueeze.(xs, dim)..., dims=dim) + +""" + unstack(xs, dim) + +Unroll the given `xs` into an `Array` of `Array`s along the given dimension `dim`. + +# Examples +```jldoctest +julia> Flux.unstack([1 3 5 7; 2 4 6 8], 2) +4-element Array{Array{Int64,1},1}: + [1, 2] + [3, 4] + [5, 6] + [7, 8] +``` +""" unstack(xs, dim) = [copy(selectdim(xs, dim, i)) for i in 1:size(xs, dim)] """ @@ -82,6 +184,25 @@ function batch(xs) return data end +""" +Return the given sequence padded with `p` up to a maximum length of `n`. 
+ +# Examples +```jldoctest +julia> rpad([1, 2], 4, 0) +4-element Array{Int64,1}: + 1 + 2 + 0 + 0 + +julia> rpad([1, 2, 3], 2, 0) +3-element Array{Int64,1}: + 1 + 2 + 3 +``` +""" Base.rpad(v::AbstractVector, n::Integer, p) = [v; fill(p, max(n - length(v), 0))] """ From c76b7315ac401d4a4e8bf9581be7932908780d56 Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 31 Aug 2019 11:20:32 +0200 Subject: [PATCH 089/113] Add loss and utility functions to docs --- docs/make.jl | 2 ++ docs/src/training/loss_functions.md | 13 +++++++++ docs/src/training/training.md | 3 +- docs/src/utilities.md | 43 +++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 docs/src/training/loss_functions.md create mode 100644 docs/src/utilities.md diff --git a/docs/make.jl b/docs/make.jl index e67de41c..f72237bc 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -22,10 +22,12 @@ makedocs(modules=[Flux, NNlib], "DataLoader" => "data/dataloader.md"], "Training Models" => ["Optimisers" => "training/optimisers.md", + "Loss Functions" => "training/loss_functions.md", "Training" => "training/training.md"], "GPU Support" => "gpu.md", "Saving & Loading" => "saving.md", "The Julia Ecosystem" => "ecosystem.md", + "Utility Functions" => "utilities.md", "Performance Tips" => "performance.md", "Community" => "community.md"], ) diff --git a/docs/src/training/loss_functions.md b/docs/src/training/loss_functions.md new file mode 100644 index 00000000..ed002a41 --- /dev/null +++ b/docs/src/training/loss_functions.md @@ -0,0 +1,13 @@ +# Loss Functions + +The following functions provide basic loss (or cost) functions. + +```@docs +Flux.mse +Flux.crossentropy +Flux.logitcrossentropy +Flux.binarycrossentropy +Flux.logitbinarycrossentropy +Flux.normalise +``` + diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 903b8197..1fe10783 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -15,7 +15,7 @@ Flux.Optimise.train! There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). -## Loss Functions +## Loss The objective function must return a number representing how far the model is from its target – the *loss* of the model. The `loss` function that we defined in [basics](../models/basics.md) will work as an objective. We can also define an objective in terms of some model: @@ -32,6 +32,7 @@ Flux.train!(loss, ps, data, opt) ``` The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want. +For a list of all built-in loss functions, check out the [reference](loss_functions.md). At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimizing the calculations. 
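As a minimal sketch of the pattern described above — the model size, the single data point, and the optimiser are illustrative assumptions, not something fixed by these docs — the objective closes over the model and uses one of the built-in cost functions:

```julia
using Flux

m = Chain(Dense(10, 5, relu), Dense(5, 2))

# the objective closes over the model; the cost measures how far m(x) is from y
loss(x, y) = Flux.mse(m(x), y)

ps = Flux.params(m)                              # parameters to be updated
data = [(rand(Float32, 10), rand(Float32, 2))]   # a single (x, y) pair
opt = Descent(0.1)

Flux.train!(loss, ps, data, opt)                 # one pass over `data`
```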
diff --git a/docs/src/utilities.md b/docs/src/utilities.md new file mode 100644 index 00000000..d788e69f --- /dev/null +++ b/docs/src/utilities.md @@ -0,0 +1,43 @@ +# Utility Functions + +Flux contains some utility functions for working with data; these functions +help create inputs for your models or batch your dataset. +Other functions can be used to initialize your layers or to regularly execute +callback functions. + +## Working with Data + +```@docs +Flux.unsqueeze +Flux.stack +Flux.unstack +Flux.chunk +Flux.frequencies +Flux.batch +Flux.batchseq +Base.rpad(v::AbstractVector, n::Integer, p) +``` + +## Layer Initialization + +These are primarily useful if you are planning to write your own layers. +Flux initializes convolutional layers and recurrent cells with `glorot_uniform` +by default. +To change the default on an applicable layer, pass the desired function with the +`init` keyword. For example: +```jldoctest; setup = :(using Flux) +julia> conv = Conv((3, 3), 1 => 8, relu; init=Flux.glorot_normal) +Conv((3, 3), 1=>8, relu) +``` + +```@docs +Flux.glorot_uniform +Flux.glorot_normal +``` + +## Callback Helpers + +```@docs +Flux.throttle +``` + From ab86e350f2d719f7972fdf9f07b47ad3e70023cd Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 31 Aug 2019 11:39:28 +0200 Subject: [PATCH 090/113] Improve docstrings Improvements like... - fixing typos, - removing trailing and double whitespaces, - using `jldoctest` blocks where applicable, - fixing, updating or correctly setting up existing doctests, - improving consistency (for example, always use "# Examples" instead of other variants), - removing empty lines between docstrings and functions, - instead of mentioning keywords, put them into the docstring, - adding some missing but useful keywords, - adding references (`@ref`), - using LaTeX math where applicable, and - linking papers. Debatable stuff that is untouched: - BE/AE s/z irregularities ("normalise" versus "normalize") since most papers use the AE version while the Flux source code was written with BE spelling. - Names of normalization functions are capitalized ("Batch Normalization" instead of "batch normalization"). --- src/data/fashion-mnist.jl | 9 +- src/data/iris.jl | 19 +-- src/data/mnist.jl | 9 +- src/layers/basic.jl | 69 +++++----- src/layers/conv.jl | 80 ++++++------ src/layers/normalise.jl | 55 ++++---- src/layers/recurrent.jl | 10 +- src/layers/stateless.jl | 63 ++++++--- src/onehot.jl | 30 ++--- src/optimise/optimisers.jl | 256 ++++++++++++++++++------------------- src/optimise/train.jl | 16 +-- src/utils.jl | 36 ++++-- 12 files changed, 337 insertions(+), 315 deletions(-) diff --git a/src/data/fashion-mnist.jl b/src/data/fashion-mnist.jl index da78b605..5eaa1b29 100644 --- a/src/data/fashion-mnist.jl +++ b/src/data/fashion-mnist.jl @@ -33,9 +33,10 @@ const TESTLABELS = joinpath(dir, "t10k-labels-idx1-ubyte") Load the Fashion-MNIST images. -Each image is a 28×28 array of `Gray` colour values (see Colors.jl). +Each image is a 28×28 array of `Gray` colour values +(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)). -Returns the 60,000 training images by default; pass `:test` to retreive the +Return the 60,000 training images by default; pass `:test` to retrieve the 10,000 test images. """ function images(set = :train) @@ -49,10 +50,10 @@ end labels() labels(:test) -Load the labels corresponding to each of the images returned from `images()`. +Load the labels corresponding to each of the images returned from [`images()`](@ref). Each label is a number from 0-9. 
-Returns the 60,000 training labels by default; pass `:test` to retreive the +Return the 60,000 training labels by default; pass `:test` to retrieve the 10,000 test labels. """ function labels(set = :train) diff --git a/src/data/iris.jl b/src/data/iris.jl index f74e0709..76609677 100644 --- a/src/data/iris.jl +++ b/src/data/iris.jl @@ -2,13 +2,12 @@ Fisher's classic iris dataset. Measurements from 3 different species of iris: setosa, versicolor and -virginica. There are 50 examples of each species. +virginica. There are 50 examples of each species. -There are 4 measurements for each example: sepal length, sepal width, petal -length and petal width. The measurements are in centimeters. +There are 4 measurements for each example: sepal length, sepal width, +petal length and petal width. The measurements are in centimeters. The module retrieves the data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/iris). - """ module Iris @@ -33,9 +32,7 @@ end Get the labels of the iris dataset, a 150 element array of strings listing the species of each example. -```jldoctest -julia> using Flux - +```jldoctest; setup = :(Flux.Data.Iris.load()) julia> labels = Flux.Data.Iris.labels(); julia> summary(labels) @@ -54,13 +51,11 @@ end """ features() -Get the features of the iris dataset. This is a 4x150 matrix of Float64 -elements. It has a row for each feature (sepal length, sepal width, +Get the features of the iris dataset. This is a 4x150 matrix of Float64 +elements. It has a row for each feature (sepal length, sepal width, petal length, petal width) and a column for each example. -```jldoctest -julia> using Flux - +```jldoctest; setup = :(Flux.Data.Iris.load()) julia> features = Flux.Data.Iris.features(); julia> summary(features) diff --git a/src/data/mnist.jl b/src/data/mnist.jl index b9c0540a..909814e0 100644 --- a/src/data/mnist.jl +++ b/src/data/mnist.jl @@ -83,9 +83,10 @@ getfeatures(io::IO, index::Integer) = vec(getimage(io, index)) Load the MNIST images. -Each image is a 28×28 array of `Gray` colour values (see Colors.jl). +Each image is a 28×28 array of `Gray` colour values +(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)). -Returns the 60,000 training images by default; pass `:test` to retreive the +Return the 60,000 training images by default; pass `:test` to retrieve the 10,000 test images. """ function images(set = :train) @@ -99,10 +100,10 @@ end labels() labels(:test) -Load the labels corresponding to each of the images returned from `images()`. +Load the labels corresponding to each of the images returned from [`images()`](@ref). Each label is a number from 0-9. -Returns the 60,000 training labels by default; pass `:test` to retreive the +Return the 60,000 training labels by default; pass `:test` to retrieve the 10,000 test labels. """ function labels(set = :train) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 96d67b45..4b0b4726 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -4,17 +4,23 @@ Chain multiple layers / functions together, so that they are called in sequence on a given input. -```julia -m = Chain(x -> x^2, x -> x+1) -m(5) == 26 - -m = Chain(Dense(10, 5), Dense(5, 2)) -x = rand(10) -m(x) == m[2](m[1](x)) -``` - `Chain` also supports indexing and slicing, e.g. `m[2]` or `m[1:end-1]`. `m[1:3](x)` will calculate the output of the first three layers. 
+ +# Examples +```jldoctest +julia> m = Chain(x -> x^2, x -> x+1); + +julia> m(5) == 26 +true + +julia> m = Chain(Dense(10, 5), Dense(5, 2)); + +julia> x = rand(10); + +julia> m(x) == m[2](m[1](x)) +true +``` """ struct Chain{T<:Tuple} layers::T @@ -60,6 +66,7 @@ outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers))( # only slightly changed to better handle interaction with Zygote @dsweber2 """ activations(c::Chain, input) + Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) @@ -78,14 +85,15 @@ extraChain(::Tuple{}, x) = () """ Dense(in::Integer, out::Integer, σ = identity) -Creates a traditional `Dense` layer with parameters `W` and `b`. +Create a traditional `Dense` layer with parameters `W` and `b`. y = σ.(W * x .+ b) The input `x` must be a vector of length `in`, or a batch of vectors represented as an `in × N` matrix. The out `y` will be a vector or batch of length `out`. -```julia +# Examples +```jldoctest; setup = :(using Random; Random.seed!(0)) julia> d = Dense(5, 2) Dense(5, 2) @@ -145,7 +153,7 @@ outdims(l::Dense, isize) = (size(l.W)[1],) """ Diagonal(in::Integer) -Creates an element-wise linear transformation layer with learnable +Create an element-wise linear transformation layer with learnable vectors `α` and `β`: y = α .* x .+ β @@ -176,8 +184,8 @@ outdims(l::Diagonal, isize) = (length(l.α),) """ Maxout(over) -`Maxout` is a neural network layer, which has a number of internal layers, -which all have the same input, and the maxout returns the elementwise maximium +`Maxout` is a neural network layer which has a number of internal layers +which all receive the same input. The layer returns the elementwise maximium of the internal layers' outputs. Maxout over linear dense layers satisfies the univeral approximation theorem. @@ -196,17 +204,18 @@ end """ Maxout(f, n_alts) -Constructs a Maxout layer over `n_alts` instances of the layer given by `f`. -The function takes no arguement and should return some callable layer. -Conventionally this is a linear dense layer. +Construct a Maxout layer over `n_alts` instances of the layer given by `f`. +The function takes no arguments and should return some callable layer. +Conventionally, this is a linear dense layer. -For example the following example which -will construct a `Maxout` layer over 4 internal dense linear layers, -each identical in structure (784 inputs, 128 outputs). +# Examples + +This constructs a `Maxout` layer over 4 internal dense linear layers, each +identical in structure (784 inputs, 128 outputs): ```julia - insize = 784 - outsize = 128 - Maxout(()->Dense(insize, outsize), 4) +insize = 784 +outsize = 128 +Maxout(()->Dense(insize, outsize), 4) ``` """ function Maxout(f, n_alts) @@ -223,16 +232,18 @@ end outdims(l::Maxout, isize) = outdims(first(l.over), isize) """ - SkipConnection(layers, connection) + SkipConnection(layer, connection) -Creates a Skip Connection, of a layer or `Chain` of consecutive layers -plus a shortcut connection. The connection function will combine the result of the layers -with the original input, to give the final output. +Create a skip connection which consists of a layer or `Chain` of consecutive +layers and a shortcut connection linking the block's input to the output +through a user-supplied 2-argument callable. The first argument to the callable +will be propagated through the given `layer` while the second is the unchanged, +"skipped" input. 
-The simplest 'ResNet'-type connection is just `SkipConnection(layer, +)`, +The simplest "ResNet"-type connection is just `SkipConnection(layer, +)`, and requires the output of the layers to be the same shape as the input. Here is a more complicated example: -``` +```julia m = Conv((3,3), 4=>7, pad=(1,1)) x = ones(5,5,4,10); size(m(x)) == (5, 5, 7, 10) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 742091a6..60666aa2 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -8,25 +8,26 @@ _convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+ expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) """ - Conv(size, in=>out) - Conv(size, in=>out, relu) + Conv(size, in => out, σ = identity; init = glorot_uniform, + stride = 1, pad = 0, dilation = 1) Standard convolutional layer. `size` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. -Example: Applying Conv layer to a 1-channel input using a 2x2 window size, - giving us a 16-channel output. Output is activated with ReLU. - - size = (2,2) - in = 1 - out = 16 - Conv((2, 2), 1=>16, relu) - Data should be stored in WHCN order (width, height, # channels, batch size). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `pad`, `stride` and `dilation`. +# Examples + +Apply a `Conv` layer to a 1-channel input using a 2×2 window size, giving us a +16-channel output. Output is activated with ReLU. +```julia +size = (2,2) +in = 1 +out = 16 +Conv(size, in => out, relu) +``` """ struct Conv{N,M,F,A,V} σ::F @@ -76,8 +77,8 @@ end """ outdims(l::Conv, isize::Tuple) -Calculate the output dimensions given the input dimensions, `isize`. -Batch size and channel size are ignored as per `NNlib.jl`. +Calculate the output dimensions given the input dimensions `isize`. +Batch size and channel size are ignored as per [NNlib.jl](https://github.com/FluxML/NNlib.jl). ```julia m = Conv((3, 3), 3 => 16) @@ -89,17 +90,15 @@ outdims(l::Conv, isize) = output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ - ConvTranspose(size, in=>out) - ConvTranspose(size, in=>out, relu) + ConvTranspose(size, in => out, σ = identity; init = glorot_uniform, + stride = 1, pad = 0, dilation = 1) Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. -Data should be stored in WHCN order (width, height, # channels, # batches). +Data should be stored in WHCN order (width, height, # channels, batch size). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. - -Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct ConvTranspose{N,M,F,A,V} σ::F @@ -165,18 +164,16 @@ end outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad) """ - DepthwiseConv(size, in=>out) - DepthwiseConv(size, in=>out, relu) + DepthwiseConv(size, in => out, σ = identity; init = glorot_uniform, + stride = 1, pad = 0, dilation = 1) Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. Note that `out` must be an integer multiple of `in`. 
-Data should be stored in WHCN order (width, height, # channels, # batches). +Data should be stored in WHCN order (width, height, # channels, batch size). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. - -Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct DepthwiseConv{N,M,F,A,V} σ::F @@ -233,25 +230,26 @@ outdims(l::DepthwiseConv, isize) = output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ - CrossCor(size, in=>out) - CrossCor(size, in=>out, relu) + CrossCor(size, in => out, σ = identity; init = glorot_uniform, + stride = 1, pad = 0, dilation = 1) Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. -Example: Applying CrossCor layer to a 1-channel input using a 2x2 window size, - giving us a 16-channel output. Output is activated with ReLU. - - size = (2,2) - in = 1 - out = 16 - CrossCor((2, 2), 1=>16, relu) - -Data should be stored in WHCN order (width, height, # channels, # batches). +Data should be stored in WHCN order (width, height, # channels, batch size). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `pad`, `stride` and `dilation`. +# Examples + +Apply a `CrossCor` layer to a 1-channel input using a 2×2 window size, giving us a +16-channel output. Output is activated with ReLU. +```julia +size = (2,2) +in = 1 +out = 16 +CrossCor((2, 2), 1=>16, relu) +``` """ struct CrossCor{N,M,F,A,V} σ::F @@ -357,11 +355,9 @@ function Base.show(io::IO, g::GlobalMeanPool) end """ - MaxPool(k) + MaxPool(k; pad = 0, stride = k) -Max pooling layer. `k` stands for the size of the window for each dimension of the input. - -Takes the keyword arguments `pad` and `stride`. +Max pooling layer. `k` is the size of the window for each dimension of the input. """ struct MaxPool{N,M} k::NTuple{N,Int} @@ -388,11 +384,9 @@ end outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) """ - MeanPool(k) + MeanPool(k; pad = 0, stride = k) -Mean pooling layer. `k` stands for the size of the window for each dimension of the input. - -Takes the keyword arguments `pad` and `stride`. +Mean pooling layer. `k` is the size of the window for each dimension of the input. """ struct MeanPool{N,M} k::NTuple{N,Int} diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 3828748f..76d312bf 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -10,14 +10,14 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(s _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0) """ - dropout(p, dims = :) + dropout(x, p; dims = :) -Dropout function. For each input, either sets that input to `0` (with probability -`p`) or scales it by `1/(1-p)`. The `dims` argument is to specify the unbroadcasted -dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is -used as a regularisation, i.e. it reduces overfitting during training. - -See also [`Dropout`](@ref). +The dropout function. For each input, either sets that input to `0` (with probability +`p`) or scales it by `1 / (1 - p)`. `dims` specifies the unbroadcasted dimensions, +e.g. 
`dims=1` applies dropout along columns and `dims=2` along rows. +This is used as a regularisation, i.e. it reduces overfitting during training. + +See also the [`Dropout`](@ref) layer. """ dropout(x, p; dims = :) = x @@ -32,7 +32,7 @@ end A Dropout layer. In the forward pass, applies the [`dropout`](@ref) function on the input. -Does nothing to the input once [`testmode!`](@ref) is true. +Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. """ mutable struct Dropout{F,D} p::F @@ -64,9 +64,9 @@ end """ AlphaDropout(p) - -A dropout layer. It is used in Self-Normalizing Neural Networks. -(https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf) + +A dropout layer. It is used in +[Self-Normalizing Neural Networks](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf). The AlphaDropout layer ensures that mean and variance of activations remains the same as before. Does nothing to the input once [`testmode!`](@ref) is true. @@ -100,8 +100,8 @@ testmode!(m::AlphaDropout, mode = true) = LayerNorm(h::Integer) A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be -used with recurrent hidden states of size `h`. Normalises the mean/stddev of -each input before applying a per-neuron gain/bias. +used with recurrent hidden states of size `h`. Normalises the mean and standard +deviation of each input before applying a per-neuron gain/bias. """ struct LayerNorm{T} diag::Diagonal{T} @@ -139,7 +139,7 @@ Use [`testmode!`](@ref) during inference. See [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/pdf/1502.03167.pdf). -Example: +# Examples ```julia m = Chain( Dense(28^2, 64), @@ -234,7 +234,7 @@ Use [`testmode!`](@ref) during inference. See [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022). -Example: +# Examples ```julia m = Chain( Dense(28^2, 64), @@ -316,28 +316,27 @@ function Base.show(io::IO, l::InstanceNorm) end """ -Group Normalization. -This layer can outperform Batch-Normalization and Instance-Normalization. + GroupNorm(chs::Integer, G::Integer, λ = identity; + initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), + ϵ = 1f-5, momentum = 0.1f0) - GroupNorm(chs::Integer, G::Integer, λ = identity; - initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), - ϵ = 1f-5, momentum = 0.1f0) +[Group Normalization](https://arxiv.org/pdf/1803.08494.pdf) layer. +This layer can outperform Batch Normalization and Instance Normalization. -``chs`` is the number of channels, the channel dimension of your input. -For an array of N dimensions, the (N-1)th index is the channel dimension. +`chs` is the number of channels, the channel dimension of your input. +For an array of N dimensions, the `N-1`th index is the channel dimension. -``G`` is the number of groups along which the statistics would be computed. +`G` is the number of groups along which the statistics are computed. The number of channels must be an integer multiple of the number of groups. Use [`testmode!`](@ref) during inference. 
-Example: -``` +# Examples +```julia m = Chain(Conv((3,3), 1=>32, leakyrelu;pad = 1), - GroupNorm(32,16)) # 32 channels, 16 groups (G = 16), thus 2 channels per group used + GroupNorm(32,16)) + # 32 channels, 16 groups (G = 16), thus 2 channels per group used ``` - -Link : https://arxiv.org/pdf/1803.08494.pdf """ mutable struct GroupNorm{F,V,W,N,T} G::T # number of groups diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 647dda25..05466b31 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -12,7 +12,7 @@ in the background. `cell` should be a model of the form: h, y = cell(h, x...) -For example, here's a recurrent network that keeps a running total of its inputs. +For example, here's a recurrent network that keeps a running total of its inputs: ```julia accum(h, x) = (h+x, x) @@ -135,8 +135,8 @@ Base.show(io::IO, l::LSTMCell) = """ LSTM(in::Integer, out::Integer) -Long Short Term Memory recurrent layer. Behaves like an RNN but generally -exhibits a longer memory span over sequences. +[Long Short Term Memory](https://www.researchgate.net/publication/13853244_Long_Short-term_Memory) +recurrent layer. Behaves like an RNN but generally exhibits a longer memory span over sequences. See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) for a good overview of the internals. @@ -176,8 +176,8 @@ Base.show(io::IO, l::GRUCell) = """ GRU(in::Integer, out::Integer) -Gated Recurrent Unit layer. Behaves like an RNN but generally -exhibits a longer memory span over sequences. +[Gated Recurrent Unit](https://arxiv.org/abs/1406.1078) layer. Behaves like an +RNN but generally exhibits a longer memory span over sequences. See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) for a good overview of the internals. diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index b598fdd4..b566c683 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -73,7 +73,7 @@ computed as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`. `weight` can be `Nothing`, a `Number` or an `AbstractVector`. `weight=nothing` acts like `weight=1` but is faster. -See also [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref). +See also: [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref) # Examples ```jldoctest @@ -86,13 +86,13 @@ crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _cros """ logitcrossentropy(ŷ, y; weight = 1) -Return the crossentropy computed after a [`logsoftmax`](@ref) operation; +Return the crossentropy computed after a [`Flux.logsoftmax`](@ref) operation; computed as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`. `logitcrossentropy(ŷ, y)` is mathematically equivalent to -[`crossentropy(softmax(log(ŷ)), y)`](@ref) but it is more numerically stable. +[`Flux.crossentropy(softmax(log(ŷ)), y)`](@ref) but it is more numerically stable. -See also [`crossentropy`](@ref), [`binarycrossentropy`](@ref). +See also: [`Flux.crossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref) # Examples ```jldoctest @@ -107,9 +107,20 @@ end """ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) -Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerical stability. +Return ``-y*\\log(ŷ + ϵ) - (1-y)*\\log(1-ŷ + ϵ)``. The `ϵ` term provides numerical stability. Typically, the prediction `ŷ` is given by the output of a [`sigmoid`](@ref) activation. 
+ +See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref) + +# Examples +```jldoctest +julia> Flux.binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0]) +3-element Array{Float64,1}: + 1.424397097347566 + 0.35231664672364077 + 0.8616703662235441 +``` """ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) @@ -119,10 +130,19 @@ CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 """ logitbinarycrossentropy(ŷ, y) -`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to `binarycrossentropy(σ(ŷ), y)` -but it is more numerically stable. +`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to +[`Flux.binarycrossentropy(σ(log(ŷ)), y)`](@ref) but it is more numerically stable. -See also [`binarycrossentropy`](@ref), [`sigmoid`](@ref), [`logsigmoid`](@ref). +See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref) + +# Examples +```jldoctest +julia> Flux.logitbinarycrossentropy.([-1.1491, 0.8619, 0.3127], [1, 1, 0]) +3-element Array{Float64,1}: + 1.4243970973475661 + 0.35231664672364094 + 0.8616703662235443 +``` """ logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ) @@ -132,26 +152,27 @@ CuArrays.@cufunc logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ) """ normalise(x; dims=1) -Normalises `x` to mean 0 and standard deviation 1, across the dimensions given by `dims`. Defaults to normalising over columns. +Normalise `x` to mean 0 and standard deviation 1 across the dimensions given by `dims`. +Defaults to normalising over columns. -```julia-repl +```jldoctest julia> a = reshape(collect(1:9), 3, 3) 3×3 Array{Int64,2}: - 1 4 7 - 2 5 8 - 3 6 9 + 1 4 7 + 2 5 8 + 3 6 9 -julia> normalise(a) +julia> Flux.normalise(a) 3×3 Array{Float64,2}: - -1.22474 -1.22474 -1.22474 + -1.22474 -1.22474 -1.22474 0.0 0.0 0.0 1.22474 1.22474 1.22474 -julia> normalise(a, dims=2) +julia> Flux.normalise(a, dims=2) 3×3 Array{Float64,2}: - -1.22474 0.0 1.22474 - -1.22474 0.0 1.22474 - -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 ``` """ function normalise(x::AbstractArray; dims=1) @@ -191,7 +212,7 @@ Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 o Returns `sum((max.(0, 1 .- ŷ .* y))) / size(y, 2)` [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss) -See also [`squared_hinge`](@ref). +See also: [`squared_hinge`](@ref) """ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) * 1 // size(y, 2) @@ -201,7 +222,7 @@ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) * 1 // size(y, 2) Computes squared hinge loss given the prediction `ŷ` and true labels `y` (conatining 1 or -1). Returns `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)` -See also [`hinge`](@ref). +See also: [`hinge`](@ref) """ squared_hinge(ŷ, y) = sum((max.(0, 1 .- ŷ .* y)).^2) * 1 // size(y, 2) diff --git a/src/onehot.jl b/src/onehot.jl index b480d9c0..7a046dc1 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -45,22 +45,20 @@ cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.d """ onehot(l, labels[, unk]) -Create an [`OneHotVector`](@ref) wtih `l`-th element be `true` based on possible `labels` set. -If `unk` is given, it retruns `onehot(unk, labels)` if the input label `l` is not find in `labels`; otherwise -it will error. - -## Examples +Create a [`OneHotVector`](@ref) with its `l`-th element `true` based on +possible `labels` set. 
+If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found +in `labels`; otherwise it will error. +# Examples ```jldoctest -julia> using Flux: onehot - -julia> onehot(:b, [:a, :b, :c]) +julia> Flux.onehot(:b, [:a, :b, :c]) 3-element Flux.OneHotVector: 0 1 0 -julia> onehot(:c, [:a, :b, :c]) +julia> Flux.onehot(:c, [:a, :b, :c]) 3-element Flux.OneHotVector: 0 0 @@ -85,12 +83,9 @@ end Create an [`OneHotMatrix`](@ref) with a batch of labels based on possible `labels` set, returns the `onehot(unk, labels)` if given labels `ls` is not found in set `labels`. -## Examples - +# Examples ```jldoctest -julia> using Flux: onehotbatch - -julia> onehotbatch([:b, :a, :b], [:a, :b, :c]) +julia> Flux.onehotbatch([:b, :a, :b], [:a, :b, :c]) 3×3 Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}: 0 1 0 1 0 1 @@ -107,13 +102,12 @@ Base.argmax(xs::OneHotVector) = xs.ix Inverse operations of [`onehot`](@ref). +# Examples ```jldoctest -julia> using Flux: onecold - -julia> onecold([true, false, false], [:a, :b, :c]) +julia> Flux.onecold([true, false, false], [:a, :b, :c]) :a -julia> onecold([0.3, 0.2, 0.5], [:a, :b, :c]) +julia> Flux.onecold([0.3, 0.2, 0.5], [:a, :b, :c]) :c ``` """ diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 7db5bff5..4f121edf 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -6,19 +6,20 @@ const ϵ = 1e-8 # TODO: should use weak refs """ - Descent(η) + Descent(η = 0.1) Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` -## Parameters - - Learning Rate (η): The amount by which the gradients are discounted before updating the weights. Defaults to `0.1`. +# Parameters + - Learning rate (`η`): Amount by which the gradients are discounted before updating + the weights. -## Example -```julia-repl -opt = Descent() # uses default η (0.1) +# Examples +```julia +opt = Descent() -opt = Descent(0.3) # use provided η +opt = Descent(0.3) ps = params(model) @@ -40,17 +41,19 @@ function apply!(o::Descent, x, Δ) end """ - Momentum(η, ρ) + Momentum(η = 0.01, ρ = 0.9) -Gradient descent with learning rate `η` and momentum `ρ`. +Gradient descent optimizer with learning rate `η` and momentum `ρ`. -## Parameters - - Learning Rate (`η`): Amount by which gradients are discounted before updating the weights. Defaults to `0.01`. - - Momentum (`ρ`): Parameter that accelerates descent in the relevant direction and dampens oscillations. Defaults to `0.9`. +# Parameters + - Learning rate (`η`): Amount by which gradients are discounted before updating the + weights. + - Momentum (`ρ`): Controls the acceleration of gradient descent in the relevant direction + and therefore the dampening of oscillations. -## Examples +# Examples ```julia -opt = Momentum() # uses defaults of η = 0.01 and ρ = 0.9 +opt = Momentum() opt = Momentum(0.01, 0.99) ``` @@ -71,17 +74,18 @@ function apply!(o::Momentum, x, Δ) end """ - Nesterov(η, ρ) + Nesterov(η = 0.001, ρ = 0.9) -Gradient descent with learning rate `η` and Nesterov momentum `ρ`. +Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`. -## Parameters - - Learning Rate (η): Amount by which the gradients are dicsounted berfore updating the weights. Defaults to `0.001`. - - Nesterov Momentum (ρ): Parameters controlling the amount of nesterov momentum to be applied. Defaults to `0.9`. +# Parameters + - Learning rate (`η`): Amount by which the gradients are discounted before updating the + weights. 
+ - Nesterov momentum (`ρ`): The amount of Nesterov momentum to be applied. -## Examples +# Examples ```julia -opt = Nesterov() # uses defaults η = 0.001 and ρ = 0.9 +opt = Nesterov() opt = Nesterov(0.003, 0.95) ``` @@ -103,23 +107,23 @@ function apply!(o::Nesterov, x, Δ) end """ - RMSProp(η, ρ) + RMSProp(η = 0.001, ρ = 0.9) -Implements the RMSProp algortihm. Often a good choice for recurrent networks. Parameters other than learning rate generally don't need tuning. +Optimizer using the +[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) +algorithm. Often a good choice for recurrent networks. Parameters other than learning rate +generally don't need tuning. -## Parameters - - Learning Rate (η): Defaults to `0.001`. - - Rho (ρ): Defaults to `0.9`. +# Parameters + - Learning rate (`η`) + - Momentum (`ρ`) -## Examples +# Examples ```julia -opt = RMSProp() # uses default η = 0.001 and ρ = 0.9 +opt = RMSProp() opt = RMSProp(0.002, 0.95) ``` - -## References -[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) """ mutable struct RMSProp eta::Float64 @@ -137,23 +141,21 @@ function apply!(o::RMSProp, x, Δ) end """ - ADAM(η, β::Tuple) + ADAM(η = 0.001, β::Tuple = (0.9, 0.999)) -Implements the ADAM optimiser. +[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. -## Paramters - - Learning Rate (`η`): Defaults to `0.001`. - - Beta (`β::Tuple`): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. - -## Examples +# Parameters + - Learning rate (`η`) + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. +# Examples ```julia -opt = ADAM() # uses the default η = 0.001 and β = (0.9, 0.999) +opt = ADAM() opt = ADAM(0.001, (0.9, 0.8)) ``` -## References -[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. """ mutable struct ADAM eta::Float64 @@ -174,24 +176,21 @@ function apply!(o::ADAM, x, Δ) end """ - RADAM(η, β::Tuple) + RADAM(η = 0.001, β::Tuple = (0.9, 0.999)) -Implements the rectified ADAM optimizer. +[Rectified ADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimizer. -## Parameters - - Learning Rate (η): Defaults to `0.001` - - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. - -## Examples +# Parameters + - Learning rate (`η`) + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. +# Examples ```julia -opt = RADAM() # uses the default η = 0.001 and β = (0.9, 0.999) +opt = RADAM() opt = RADAM(0.001, (0.9, 0.8)) ``` - -## References -[RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM). """ mutable struct RADAM eta::Float64 @@ -219,22 +218,21 @@ function apply!(o::RADAM, x, Δ) end """ - AdaMax(η, β::Tuple) + AdaMax(η = 0.001, β::Tuple = (0.9, 0.999)) -Variant of ADAM based on ∞-norm. +[AdaMax](https://arxiv.org/abs/1412.6980v9) is a variant of ADAM based on the ∞-norm. -## Parameters - - Learning Rate (η): Defaults to `0.001` - - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. +# Parameters + - Learning rate (`η`) + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. -## Examples +# Examples ```julia -opt = AdaMax() # uses default η and β +opt = AdaMax() opt = AdaMax(0.001, (0.9, 0.995)) ``` -## References -[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. 
""" mutable struct AdaMax eta::Float64 @@ -255,23 +253,21 @@ function apply!(o::AdaMax, x, Δ) end """ - ADAGrad(η) + ADAGrad(η = 0.1) -Implements AdaGrad. It has parameter specific learning rates based on how frequently it is updated. +[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has +parameter specific learning rates based on how frequently it is updated. +Parameters don't need tuning. -## Parameters - - Learning Rate (η): Defaults to `0.1` +# Parameters + - Learning rate (`η`) -## Examples +# Examples ```julia -opt = ADAGrad() # uses default η = 0.1 +opt = ADAGrad() opt = ADAGrad(0.001) ``` - -## References -[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. -Parameters don't need tuning. """ mutable struct ADAGrad eta::Float64 @@ -288,21 +284,21 @@ function apply!(o::ADAGrad, x, Δ) end """ - ADADelta(ρ) + ADADelta(ρ = 0.9) -Version of ADAGrad that adapts learning rate based on a window of past gradient updates. Parameters don't need tuning. +[ADADelta](https://arxiv.org/abs/1212.5701) is a version of ADAGrad adapting its learning +rate based on a window of past gradient updates. +Parameters don't need tuning. -## Parameters - - Rho (ρ): Factor by which gradient is decayed at each time step. Defaults to `0.9`. +# Parameters + - Rho (`ρ`): Factor by which gradient is decayed at each time step. -## Examples +# Examples ```julia -opt = ADADelta() # uses default ρ = 0.9 +opt = ADADelta() + opt = ADADelta(0.89) ``` - -## References -[ADADelta](https://arxiv.org/abs/1212.5701) optimiser. """ mutable struct ADADelta rho::Float64 @@ -321,22 +317,22 @@ function apply!(o::ADADelta, x, Δ) end """ - AMSGrad(η, β::Tuple) + AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999)) -Implements AMSGrad version of the ADAM optimiser. Parameters don't need tuning. +The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM +optimiser. Parameters don't need tuning. -## Parameters - - Learning Rate (η): Defaults to `0.001`. - - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. +# Parameters + - Learning Rate (`η`) + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. -## Examples +# Examples ```julia -opt = AMSGrad() # uses default η and β +opt = AMSGrad() + opt = AMSGrad(0.001, (0.89, 0.995)) ``` - -## References -[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. """ mutable struct AMSGrad eta::Float64 @@ -356,22 +352,22 @@ function apply!(o::AMSGrad, x, Δ) end """ - NADAM(η, β::Tuple) + NADAM(η = 0.001, β::Tuple = (0.9, 0.999)) -Nesterov variant of ADAM. Parameters don't need tuning. +[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) is a Nesterov variant of ADAM. +Parameters don't need tuning. -## Parameters - - Learning Rate (η): Defaults to `0.001`. - - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. +# Parameters + - Learning rate (`η`) + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. -## Examples +# Examples ```julia -opt = NADAM() # uses default η and β +opt = NADAM() + opt = NADAM(0.002, (0.89, 0.995)) ``` - -## References -[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. 
""" mutable struct NADAM eta::Float64 @@ -392,23 +388,23 @@ function apply!(o::NADAM, x, Δ) end """ - ADAMW(η, β::Tuple, decay) + ADAMW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0) -Variant of ADAM defined by fixing weight decay regularization. +[ADAMW](https://arxiv.org/abs/1711.05101) is a variant of ADAM fixing (as in repairing) its +weight decay regularization. -## Parameters - - Learning Rate (η): Defaults to `0.001`. - - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to (0.9, 0.999). - - decay: Decay applied to weights during optimisation. Defaults to 0. +# Parameters + - Learning rate (`η`) + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. + - `decay`: Decay applied to weights during optimisation. -## Examples +# Examples ```julia -opt = ADAMW() # uses default η, β and decay +opt = ADAMW() + opt = ADAMW(0.001, (0.89, 0.995), 0.1) ``` - -## References -[ADAMW](https://arxiv.org/abs/1711.05101) """ ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) = Optimiser(ADAM(η, β), WeightDecay(decay)) @@ -441,14 +437,13 @@ function apply!(o::Optimiser, x, Δ) end """ - InvDecay(γ) + InvDecay(γ = 0.001) -Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified. +Apply inverse time decay to an optimiser, so that the effective step size at +iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. +The wrapped optimiser's step size is not modified. -## Parameters - - gamma (γ): Defaults to `0.001` - -## Example +# Examples ```julia Optimiser(InvDecay(..), Opt(..)) ``` @@ -469,20 +464,23 @@ function apply!(o::InvDecay, x, Δ) end """ - ExpDecay(eta, decay, decay_step, clip) + ExpDecay(eta = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4) -Discount the learning rate `eta` by a multiplicative factor `decay` every `decay_step` till a minimum of `clip`. +Discount the learning rate `eta` by the factor `decay` every `decay_step` steps till +a minimum of `clip`. -## Parameters - - Learning Rate (eta): Defaults to `0.001`. - - decay: Factor by which the learning rate is discounted. Defaults to `0.1`. - - decay_step: Schedules decay operations by setting number of steps between two decay operations. Defaults to `1000`. - - clip: Minimum value of learning rate. Defaults to `1e-4`. +# Parameters + - Learning rate (`eta`) + - `decay`: Factor by which the learning rate is discounted. + - `decay_step`: Schedule decay operations by setting number of steps between two decay + operations. + - `clip`: Minimum value of learning rate. -## Example +# Examples To apply exponential decay to an optimiser: ```julia Optimiser(ExpDecay(..), Opt(..)) + opt = Optimiser(ExpDecay(), ADAM()) ``` """ @@ -507,12 +505,12 @@ function apply!(o::ExpDecay, x, Δ) end """ - WeightDecay(wd) + WeightDecay(wd = 0) -Decays the weight by `wd` +Decay weights by `wd`. -## Parameters - - weight decay (wd): 0 +# Parameters + - Weight decay (`wd`) """ mutable struct WeightDecay wd::Real diff --git a/src/optimise/train.jl b/src/optimise/train.jl index e12ab27b..9c3c29bd 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -43,9 +43,8 @@ struct StopException <: Exception end Call `Flux.stop()` in a callback to indicate when a callback condition is met. This would trigger the train loop to stop and exit. 
+# Examples ```julia -# Example callback: - cb = function () accuracy() > 0.9 && Flux.stop() end @@ -65,12 +64,12 @@ In case datapoints `d` are of numeric array type, assumes no splatting is needed and computes the gradient of `loss(d)`. Takes a callback as keyword argument `cb`. For example, this will print "training" -every 10 seconds: +every 10 seconds (using [`throttle`](@ref)): train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10)) -The callback can call `Flux.stop()` to interrupt the training loop. +The callback can call [`Flux.stop()`](@ref) to interrupt the training loop. Multiple optimisers and callbacks can be passed to `opt` and `cb` as arrays. """ @@ -106,11 +105,12 @@ end Run `body` `N` times. Mainly useful for quickly doing multiple epochs of training in a REPL. -```julia -julia> @epochs 2 println("hello") -INFO: Epoch 1 +# Examples +```jldoctest +julia> Flux.@epochs 2 println("hello") +[ Info: Epoch 1 hello -INFO: Epoch 2 +[ Info: Epoch 2 hello ``` """ diff --git a/src/utils.jl b/src/utils.jl index 25be1063..40f0ae9c 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -125,8 +125,9 @@ unstack(xs, dim) = [copy(selectdim(xs, dim, i)) for i in 1:size(xs, dim)] Split `xs` into `n` parts. -```julia -julia> chunk(1:10, 3) +# Examples +```jldoctest +julia> Flux.chunk(1:10, 3) 3-element Array{Array{Int64,1},1}: [1, 2, 3, 4] [5, 6, 7, 8] @@ -142,11 +143,12 @@ batchindex(xs, i) = (reverse(Base.tail(reverse(axes(xs))))..., i) Count the number of times that each element of `xs` appears. -```julia -julia> frequencies(['a','b','b']) +# Examples +```jldoctest +julia> Flux.frequencies(['a','b','b']) Dict{Char,Int64} with 2 entries: - 'b' => 2 'a' => 1 + 'b' => 2 ``` """ function frequencies(xs) @@ -166,8 +168,9 @@ squeezebatch(x) = reshape(x, head(size(x))) Batch the arrays in `xs` into a single array. -```julia -julia> batch([[1,2,3],[4,5,6]]) +# Examples +```jldoctest +julia> Flux.batch([[1,2,3],[4,5,6]]) 3×2 Array{Int64,2}: 1 4 2 5 @@ -211,8 +214,9 @@ Base.rpad(v::AbstractVector, n::Integer, p) = [v; fill(p, max(n - length(v), 0)) Take a list of `N` sequences, and turn them into a single sequence where each item is a batch of `N`. Short sequences will be padded by `pad`. -```julia -julia> batchseq([[1, 2, 3], [4, 5]], 0) +# Examples +```jldoctest +julia> Flux.batchseq([[1, 2, 3], [4, 5]], 0) 3-element Array{Array{Int64,1},1}: [1, 4] [2, 5] @@ -269,11 +273,15 @@ end # Other """ -Returns a function that when invoked, will only be triggered at most once -during `timeout` seconds. Normally, the throttled function will run -as much as it can, without ever going more than once per `wait` duration; -but if you'd like to disable the execution on the leading edge, pass -`leading=false`. To enable execution on the trailing edge, ditto. + throttle(f, timeout; leading=true, trailing=false) + +Return a function that when invoked, will only be triggered at most once +during `timeout` seconds. + +Normally, the throttled function will run as much as it can, without ever +going more than once per `wait` duration; but if you'd like to disable the +execution on the leading edge, pass `leading=false`. To enable execution on +the trailing edge, pass `trailing=true`. 
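# Examples
For instance, progress logging inside a training loop could be throttled as below;
the loop body is only a placeholder:
```julia
log_progress = Flux.throttle(() -> println("still training..."), 10)

for epoch in 1:10_000
    # ... do the work for this epoch here ...
    log_progress()   # prints at most once every 10 seconds
end
```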
""" function throttle(f, timeout; leading=true, trailing=false) cooldown = true From ba80c2e8abfbee95d13fd60790689a67b3a59075 Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 31 Aug 2019 11:43:34 +0200 Subject: [PATCH 091/113] Improve whitespaces in docs --- docs/src/performance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/performance.md b/docs/src/performance.md index 0af8ef3b..4601e90c 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -52,7 +52,7 @@ e.g. ```julia function loss_total(xs::AbstractVector{<:Vector}, ys::AbstractVector{<:Vector}) sum(zip(xs, ys)) do (x, y_target) - y_pred = model(x) # evaluate the model + y_pred = model(x) # evaluate the model return loss(y_pred, y_target) end end From 740a59d0a67c92b3f6dfa5302341081e6dc01369 Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 31 Aug 2019 12:49:40 +0200 Subject: [PATCH 092/113] Add missing docstrings to `src/data`. --- src/data/cmudict.jl | 25 +++++++++++++++++++++++++ src/data/sentiment.jl | 21 +++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/src/data/cmudict.jl b/src/data/cmudict.jl index e6266540..0ed724d4 100644 --- a/src/data/cmudict.jl +++ b/src/data/cmudict.jl @@ -24,18 +24,35 @@ function load() end end +""" + phones() + +Return a `Vector` containing the phones used in the dataset. +""" function phones() load() Symbol.(first.(split.(split(read(deps("cmudict", "cmudict.phones"),String), "\n", keepempty = false), "\t"))) end +""" + symbols() + +Return a `Vector` containing the symbols used in the dataset. +A symbol is a phone with optional auxiliary symbols, indicating for example the +amount of stress on the phone. +""" function symbols() load() Symbol.(split(read(deps("cmudict", "cmudict.symbols"),String), "\n", keepempty = false)) end +""" + rawdict() + +Return the unfiltered CMU Pronouncing Dictionary. +""" function rawdict() load() Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in @@ -44,6 +61,14 @@ end validword(s) = isascii(s) && occursin(r"^[\w\-\.]+$", s) +""" + cmudict() + +Return a filtered CMU Pronouncing Dictionary. + +It is filtered so each word contains only ASCII characters and a combination of +word characters (as determined by the regex engine using `\\w`), '-' and '.'. +""" cmudict() = filter(p -> validword(p.first), rawdict()) alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.'] diff --git a/src/data/sentiment.jl b/src/data/sentiment.jl index ecb1ab8d..058dcf07 100644 --- a/src/data/sentiment.jl +++ b/src/data/sentiment.jl @@ -1,3 +1,4 @@ +"Stanford Sentiment Treebank dataset." module Sentiment using ZipFile @@ -39,8 +40,28 @@ function gettrees(name) return parsetree.(ss) end +""" + train() + +Return the train split of the Stanford Sentiment Treebank. +The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format. +""" train() = gettrees("train") + +""" + test() + +Return the test split of the Stanford Sentiment Treebank. +The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format. +""" test() = gettrees("test") + +""" + dev() + +Return the dev split of the Stanford Sentiment Treebank. +The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format. +""" dev() = gettrees("dev") end From ff9198b93977c78a6e70c4b5e19c590a47bd6b3e Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 31 Aug 2019 12:51:37 +0200 Subject: [PATCH 093/113] Add datasets to docs All the relevant functions. Perhaps discuss a consistent API, describe it in the docs and then only document the modules. 
--- docs/make.jl | 10 +++++----- docs/src/datasets.md | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 docs/src/datasets.md diff --git a/docs/make.jl b/docs/make.jl index f72237bc..0ee0ccab 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -4,11 +4,6 @@ DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true) makedocs(modules=[Flux, NNlib], doctest = true, sitename = "Flux", - format = Documenter.HTML( - analytics = "UA-36890222-9", - assets = ["assets/flux.css"], - prettyurls = get(ENV, "CI", nothing) == "true", - ), pages = ["Home" => "index.md", "Building Models" => ["Basics" => "models/basics.md", @@ -29,7 +24,12 @@ makedocs(modules=[Flux, NNlib], "The Julia Ecosystem" => "ecosystem.md", "Utility Functions" => "utilities.md", "Performance Tips" => "performance.md", + "Datasets" => "datasets.md", "Community" => "community.md"], + format = Documenter.HTML( + analytics = "UA-36890222-9", + assets = ["assets/flux.css"], + prettyurls = get(ENV, "CI", nothing) == "true"), ) deploydocs(repo = "github.com/FluxML/Flux.jl.git", diff --git a/docs/src/datasets.md b/docs/src/datasets.md new file mode 100644 index 00000000..45e29a75 --- /dev/null +++ b/docs/src/datasets.md @@ -0,0 +1,20 @@ +# Datasets + +Flux includes several standard machine learning datasets. + +```@docs +Flux.Data.Iris.features() +Flux.Data.Iris.labels() +Flux.Data.MNIST.images() +Flux.Data.MNIST.labels() +Flux.Data.FashionMNIST.images() +Flux.Data.FashionMNIST.labels() +Flux.Data.CMUDict.phones() +Flux.Data.CMUDict.symbols() +Flux.Data.CMUDict.rawdict() +Flux.Data.CMUDict.cmudict() +Flux.Data.Sentiment.train() +Flux.Data.Sentiment.test() +Flux.Data.Sentiment.dev() +``` + From 3b913cd501c2e76a5b5f57039dea760f4a0be895 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 7 Oct 2019 16:43:20 +0200 Subject: [PATCH 094/113] Fix rebase changes - Remove `Flux.testmode!` reference (the function no longer exists). - Change TrackedArray to Array in doctest (Tracker -> Zygote). --- src/layers/normalise.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 76d312bf..29725066 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -30,7 +30,7 @@ end """ Dropout(p, dims = :) -A Dropout layer. In the forward pass, applies the [`dropout`](@ref) function on the input. +Dropout layer. In the forward pass, applies the [`Flux.dropout`](@ref) function on the input. Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. """ From aaa0a82b749817c751fa2287b1bc92a1a168417f Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 24 Oct 2019 22:35:59 +0200 Subject: [PATCH 095/113] Slight modifications in `recurrent` docstrings --- src/layers/recurrent.jl | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 05466b31..d9de9884 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -15,13 +15,13 @@ in the background. 
`cell` should be a model of the form: For example, here's a recurrent network that keeps a running total of its inputs: ```julia -accum(h, x) = (h+x, x) +accum(h, x) = (h + x, x) rnn = Flux.Recur(accum, 0) -rnn(2) # 2 -rnn(3) # 3 -rnn.state # 5 -rnn.(1:10) # apply to a sequence -rnn.state # 60 +rnn(2) # 2 +rnn(3) # 3 +rnn.state # 5 +rnn.(1:10) # apply to a sequence +rnn.state # 60 ``` """ mutable struct Recur{T} @@ -47,9 +47,10 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")") Reset the hidden state of a recurrent layer back to its original value. -Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to - - rnn.state = hidden(rnn.cell) +Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to: +``` +rnn.state = hidden(rnn.cell) +``` """ reset!(m::Recur) = (m.state = m.init) reset!(m) = foreach(reset!, functor(m)[1]) From a614983e0b4d67e100a270099ac26561b441deca Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 25 Oct 2019 13:23:27 +0200 Subject: [PATCH 096/113] Improve parameter lists in optimisers.jl --- src/optimise/optimisers.jl | 93 +++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 41 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 4f121edf..611edddb 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -12,8 +12,8 @@ Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` # Parameters - - Learning rate (`η`): Amount by which the gradients are discounted before updating - the weights. +- Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. # Examples ```julia @@ -24,7 +24,7 @@ opt = Descent(0.3) ps = params(model) gs = gradient(ps) do - loss(x, y) + loss(x, y) end Flux.Optimise.update!(opt, ps, gs) @@ -46,10 +46,10 @@ end Gradient descent optimizer with learning rate `η` and momentum `ρ`. # Parameters - - Learning rate (`η`): Amount by which gradients are discounted before updating the - weights. - - Momentum (`ρ`): Controls the acceleration of gradient descent in the relevant direction - and therefore the dampening of oscillations. +- Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. +- Momentum (`ρ`): Controls the acceleration of gradient descent in the + prominent direction, in effect dampening oscillations. # Examples ```julia @@ -79,9 +79,10 @@ end Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`. # Parameters - - Learning rate (`η`): Amount by which the gradients are discounted before updating the - weights. - - Nesterov momentum (`ρ`): The amount of Nesterov momentum to be applied. +- Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. +- Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the + prominent direction, in effect dampening oscillations. # Examples ```julia @@ -115,8 +116,10 @@ algorithm. Often a good choice for recurrent networks. Parameters other than lea generally don't need tuning. # Parameters - - Learning rate (`η`) - - Momentum (`ρ`) +- Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. +- Momentum (`ρ`): Controls the acceleration of gradient descent in the + prominent direction, in effect dampening oscillations. # Examples ```julia @@ -146,9 +149,10 @@ end [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. 
# Parameters - - Learning rate (`η`) - - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. +- Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. +- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. # Examples ```julia @@ -181,9 +185,10 @@ end [Rectified ADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimizer. # Parameters - - Learning rate (`η`) - - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. +- Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. +- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. # Examples ```julia @@ -223,9 +228,10 @@ end [AdaMax](https://arxiv.org/abs/1412.6980v9) is a variant of ADAM based on the ∞-norm. # Parameters - - Learning rate (`η`) - - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. +- Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. +- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. # Examples ```julia @@ -260,7 +266,8 @@ parameter specific learning rates based on how frequently it is updated. Parameters don't need tuning. # Parameters - - Learning rate (`η`) +- Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. # Examples ```julia @@ -291,7 +298,7 @@ rate based on a window of past gradient updates. Parameters don't need tuning. # Parameters - - Rho (`ρ`): Factor by which gradient is decayed at each time step. +- Rho (`ρ`): Factor by which the gradient is decayed at each time step. # Examples ```julia @@ -323,9 +330,10 @@ The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM optimiser. Parameters don't need tuning. # Parameters - - Learning Rate (`η`) - - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. +- Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. +- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. # Examples ```julia @@ -358,9 +366,10 @@ end Parameters don't need tuning. # Parameters - - Learning rate (`η`) - - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. +- Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. +- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. # Examples ```julia @@ -394,10 +403,11 @@ end weight decay regularization. # Parameters - - Learning rate (`η`) - - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. - - `decay`: Decay applied to weights during optimisation. +- Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. +- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. +- `decay`: Decay applied to weights during optimisation. 
# Examples ```julia @@ -464,17 +474,18 @@ function apply!(o::InvDecay, x, Δ) end """ - ExpDecay(eta = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4) + ExpDecay(η = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4) -Discount the learning rate `eta` by the factor `decay` every `decay_step` steps till +Discount the learning rate `η` by the factor `decay` every `decay_step` steps till a minimum of `clip`. # Parameters - - Learning rate (`eta`) - - `decay`: Factor by which the learning rate is discounted. - - `decay_step`: Schedule decay operations by setting number of steps between two decay - operations. - - `clip`: Minimum value of learning rate. +- Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. +- `decay`: Factor by which the learning rate is discounted. +- `decay_step`: Schedule decay operations by setting the number of steps between + two decay operations. +- `clip`: Minimum value of learning rate. # Examples To apply exponential decay to an optimiser: @@ -510,7 +521,7 @@ end Decay weights by `wd`. # Parameters - - Weight decay (`wd`) +- Weight decay (`wd`) """ mutable struct WeightDecay wd::Real From e16c24a9b8872c29552f5e1a4d390dc35a4d81e8 Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 4 Apr 2020 19:43:28 +0200 Subject: [PATCH 097/113] General minuscule improvements --- src/data/cmudict.jl | 4 ++-- src/layers/normalise.jl | 2 +- src/layers/recurrent.jl | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/data/cmudict.jl b/src/data/cmudict.jl index 0ed724d4..9ddecbcd 100644 --- a/src/data/cmudict.jl +++ b/src/data/cmudict.jl @@ -27,7 +27,7 @@ end """ phones() -Return a `Vector` containing the phones used in the dataset. +Return a `Vector` containing the phones used in the CMU Pronouncing Dictionary. """ function phones() load() @@ -38,7 +38,7 @@ end """ symbols() -Return a `Vector` containing the symbols used in the dataset. +Return a `Vector` containing the symbols used in the CMU Pronouncing Dictionary. A symbol is a phone with optional auxiliary symbols, indicating for example the amount of stress on the phone. """ diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 29725066..b81e4967 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -30,7 +30,7 @@ end """ Dropout(p, dims = :) -Dropout layer. In the forward pass, applies the [`Flux.dropout`](@ref) function on the input. +Dropout layer. In the forward pass, apply the [`Flux.dropout`](@ref) function on the input. Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. """ diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index d9de9884..a93c4a0a 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -48,7 +48,7 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")") Reset the hidden state of a recurrent layer back to its original value. 
Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to: -``` +```julia rnn.state = hidden(rnn.cell) ``` """ From 64ce32ddcf5c8e242e99ff74f82958338112afb8 Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 4 Apr 2020 22:55:14 +0200 Subject: [PATCH 098/113] Fix problems due to rebase --- docs/make.jl | 1 - docs/src/training/loss_functions.md | 13 ------------- src/layers/basic.jl | 7 +++---- src/utils.jl | 8 +++++++- 4 files changed, 10 insertions(+), 19 deletions(-) delete mode 100644 docs/src/training/loss_functions.md diff --git a/docs/make.jl b/docs/make.jl index 0ee0ccab..be4522eb 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -17,7 +17,6 @@ makedocs(modules=[Flux, NNlib], "DataLoader" => "data/dataloader.md"], "Training Models" => ["Optimisers" => "training/optimisers.md", - "Loss Functions" => "training/loss_functions.md", "Training" => "training/training.md"], "GPU Support" => "gpu.md", "Saving & Loading" => "saving.md", diff --git a/docs/src/training/loss_functions.md b/docs/src/training/loss_functions.md deleted file mode 100644 index ed002a41..00000000 --- a/docs/src/training/loss_functions.md +++ /dev/null @@ -1,13 +0,0 @@ -# Loss Functions - -The following functions provide basic loss (or cost) functions. - -```@docs -Flux.mse -Flux.crossentropy -Flux.logitcrossentropy -Flux.binarycrossentropy -Flux.logitbinarycrossentropy -Flux.normalise -``` - diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 4b0b4726..4c58b9d7 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -98,10 +98,9 @@ julia> d = Dense(5, 2) Dense(5, 2) julia> d(rand(5)) -Array{Float64,1}: - 0.00257447 - -0.00449443 -``` +2-element Array{Float32,1}: + -0.16210233 + 0.12311903``` """ struct Dense{F,S,T} W::S diff --git a/src/utils.jl b/src/utils.jl index 40f0ae9c..c666caca 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -128,7 +128,13 @@ Split `xs` into `n` parts. # Examples ```jldoctest julia> Flux.chunk(1:10, 3) -3-element Array{Array{Int64,1},1}: +3-element Array{UnitRange{Int64},1}: + 1:4 + 5:8 + 9:10 + +julia> Flux.chunk(collect(1:10), 3) +3-element Array{SubArray{Int64,1,Array{Int64,1},Tuple{UnitRange{Int64}},true},1}: [1, 2, 3, 4] [5, 6, 7, 8] [9, 10] From 2ce5f6d9bfda56b07fa01eb63afb77b9481ead94 Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 4 Apr 2020 22:59:45 +0200 Subject: [PATCH 099/113] Further docstring improvements in src/ Some had to be re-done after the rebase --- src/layers/basic.jl | 13 ++------ src/layers/normalise.jl | 71 +++++++++++++++++++--------------------- src/layers/stateless.jl | 72 ++++++++++++++++++++++------------------- src/onehot.jl | 10 +++--- src/optimise/train.jl | 36 +++++++++++---------- 5 files changed, 100 insertions(+), 102 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 4c58b9d7..905844d7 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -183,18 +183,11 @@ outdims(l::Diagonal, isize) = (length(l.α),) """ Maxout(over) -`Maxout` is a neural network layer which has a number of internal layers -which all receive the same input. The layer returns the elementwise maximium -of the internal layers' outputs. +The [Maxout](https://arxiv.org/pdf/1302.4389.pdf) layer has a number of +internal layers which all receive the same input. It returns the elementwise +maximum of the internal layers' outputs. Maxout over linear dense layers satisfies the univeral approximation theorem. - -Reference: -Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville, and Yoshua Bengio. -2013. Maxout networks. 
-In Proceedings of the 30th International Conference on International Conference on Machine Learning - Volume 28 (ICML'13), -Sanjoy Dasgupta and David McAllester (Eds.), Vol. 28. JMLR.org III-1319-III-1327. -https://arxiv.org/pdf/1302.4389.pdf """ struct Maxout{FS<:Tuple} over::FS diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index b81e4967..0b5e04fb 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -65,9 +65,10 @@ end """ AlphaDropout(p) -A dropout layer. It is used in +A dropout layer. Used in [Self-Normalizing Neural Networks](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf). -The AlphaDropout layer ensures that mean and variance of activations remains the same as before. +The AlphaDropout layer ensures that mean and variance of activations +remain the same as before. Does nothing to the input once [`testmode!`](@ref) is true. """ @@ -123,8 +124,8 @@ end initβ = zeros, initγ = ones, ϵ = 1e-8, momentum = .1) -Batch Normalization layer. The `channels` input should be the size of the -channel dimension in your data (see below). +[Batch Normalization](https://arxiv.org/pdf/1502.03167.pdf) layer. +`channels` should be the size of the channel dimension in your data (see below). Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For a batch of feature vectors this is just the data dimension, for `WHCN` images @@ -136,9 +137,6 @@ per-channel `bias` and `scale` parameters). Use [`testmode!`](@ref) during inference. -See [Batch Normalization: Accelerating Deep Network Training by Reducing -Internal Covariate Shift](https://arxiv.org/pdf/1502.03167.pdf). - # Examples ```julia m = Chain( @@ -213,37 +211,6 @@ function Base.show(io::IO, l::BatchNorm) print(io, ")") end - -""" - InstanceNorm(channels::Integer, σ = identity; - initβ = zeros, initγ = ones, - ϵ = 1e-8, momentum = .1) - -Instance Normalization layer. The `channels` input should be the size of the -channel dimension in your data (see below). - -Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For -a batch of feature vectors this is just the data dimension, for `WHCN` images -it's the usual channel dimension.) - -`InstanceNorm` computes the mean and variance for each each `W×H×1×1` slice and -shifts them to have a new mean and variance (corresponding to the learnable, -per-channel `bias` and `scale` parameters). - -Use [`testmode!`](@ref) during inference. - -See [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022). - -# Examples -```julia -m = Chain( - Dense(28^2, 64), - InstanceNorm(64, relu), - Dense(64, 10), - InstanceNorm(10), - softmax) -``` -""" expand_inst = (x, as) -> reshape(repeat(x, outer=[1, as[length(as)]]), as...) mutable struct InstanceNorm{F,V,W,N} @@ -258,6 +225,34 @@ mutable struct InstanceNorm{F,V,W,N} end # TODO: deprecate in v0.11 +""" + InstanceNorm(channels::Integer, σ = identity; + initβ = zeros, initγ = ones, + ϵ = 1e-8, momentum = .1) + +[Instance Normalization](https://arxiv.org/abs/1607.08022) layer. +`channels` should be the size of the channel dimension in your data (see below). + +Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For +a batch of feature vectors this is just the data dimension, for `WHCN` images +it's the usual channel dimension.) 
+ +`InstanceNorm` computes the mean and variance for each each `W×H×1×1` slice and +shifts them to have a new mean and variance (corresponding to the learnable, +per-channel `bias` and `scale` parameters). + +Use [`testmode!`](@ref) during inference. + +# Examples +```julia +m = Chain( + Dense(28^2, 64), + InstanceNorm(64, relu), + Dense(64, 10), + InstanceNorm(10), + softmax) +``` +""" InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum) = InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing) InstanceNorm(chs::Integer, λ = identity; diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index b566c683..3f97e1fd 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -2,7 +2,8 @@ """ mae(ŷ, y) -Return the mean of absolute error `sum(abs.(ŷ .- y)) / length(y)` +Return the mean of absolute error; calculated as +`sum(abs.(ŷ .- y)) / length(y)`. """ mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y) @@ -10,8 +11,8 @@ mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y) """ mse(ŷ, y) -Return the mean squared error between ŷ and y; -defined as ``\\frac{1}{n} \\sum_{i=1}^n (ŷ_i - y_i)^2``. +Return the mean squared error between ŷ and y; calculated as +`sum((ŷ .- y).^2) / length(y)`. # Examples ```jldoctest @@ -25,10 +26,11 @@ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) """ msle(ŷ, y; ϵ=eps(eltype(ŷ))) -Returns the mean of the squared logarithmic errors `sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) / length(y)`. +Return the mean of the squared logarithmic errors; calculated as +`sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) / length(y)`. The `ϵ` term provides numerical stability. -This error penalizes an under-predicted estimate greater than an over-predicted estimate. +Penalizes an under-predicted estimate greater than an over-predicted estimate. """ msle(ŷ, y; ϵ=eps(eltype(ŷ))) = sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) * 1 // length(y) @@ -37,13 +39,12 @@ msle(ŷ, y; ϵ=eps(eltype(ŷ))) = sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) * """ huber_loss(ŷ, y; δ=1.0) -Computes the mean of the Huber loss given the prediction `ŷ` and true values `y`. By default, δ is set to 1.0. +Return the mean of the [Huber loss](https://en.wikipedia.org/wiki/Huber_loss) +given the prediction `ŷ` and true values `y`. - | 0.5*|ŷ - y|, for |ŷ - y| <= δ - Hubber loss = | - | δ*(|ŷ - y| - 0.5*δ), otherwise - -[`Huber Loss`](https://en.wikipedia.org/wiki/Huber_loss). + | 0.5 * |ŷ - y|, for |ŷ - y| <= δ + Huber loss = | + | δ * (|ŷ - y| - 0.5 * δ), otherwise """ function huber_loss(ŷ, y; δ=eltype(ŷ)(1)) abs_error = abs.(ŷ .- y) @@ -68,7 +69,7 @@ end crossentropy(ŷ, y; weight = nothing) Return the cross entropy between the given probability distributions; -computed as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`. +calculated as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`. `weight` can be `Nothing`, a `Number` or an `AbstractVector`. `weight=nothing` acts like `weight=1` but is faster. @@ -87,7 +88,7 @@ crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _cros logitcrossentropy(ŷ, y; weight = 1) Return the crossentropy computed after a [`Flux.logsoftmax`](@ref) operation; -computed as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`. +calculated as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`. `logitcrossentropy(ŷ, y)` is mathematically equivalent to [`Flux.crossentropy(softmax(log(ŷ)), y)`](@ref) but it is more numerically stable. @@ -184,10 +185,14 @@ end """ kldivergence(ŷ, y) -KLDivergence is a measure of how much one probability distribution is different from the other. 
-It is always non-negative and zero only when both the distributions are equal everywhere. +Return the +[Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) +between the given probability distributions. -[KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence). +KL divergence is a measure of how much one probability distribution is different +from the other. +It is always non-negative and zero only when both the distributions are equal +everywhere. """ function kldivergence(ŷ, y) entropy = sum(y .* log.(y)) * 1 //size(y,2) @@ -198,20 +203,20 @@ end """ poisson(ŷ, y) -Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. -Returns `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)` +Return how much the predicted distribution `ŷ` diverges from the expected Poisson +distribution `y`; calculated as `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`. -[Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). +[More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). """ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) * 1 // size(y,2) """ hinge(ŷ, y) -Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1). -Returns `sum((max.(0, 1 .- ŷ .* y))) / size(y, 2)` +Return the [hinge loss](https://en.wikipedia.org/wiki/Hinge_loss) given the +prediction `ŷ` and true labels `y` (containing 1 or -1); calculated as +`sum(max.(0, 1 .- ŷ .* y)) / size(y, 2)`. -[Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss) See also: [`squared_hinge`](@ref) """ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) * 1 // size(y, 2) @@ -219,8 +224,8 @@ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) * 1 // size(y, 2) """ squared_hinge(ŷ, y) -Computes squared hinge loss given the prediction `ŷ` and true labels `y` (conatining 1 or -1). -Returns `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)` +Return the squared hinge loss given the prediction `ŷ` and true labels `y` +(containing 1 or -1); calculated as `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)`. See also: [`hinge`](@ref) """ @@ -229,28 +234,29 @@ squared_hinge(ŷ, y) = sum((max.(0, 1 .- ŷ .* y)).^2) * 1 // size(y, 2) """ dice_coeff_loss(ŷ, y; smooth=1) -Loss function used in Image Segmentation. Calculates loss based on dice coefficient. Similar to F1_score. -Returns `1 - 2*sum(|ŷ .* y| + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth)` - -[V-Net: Fully Convolutional Neural Networks forVolumetric Medical Image Segmentation](https://arxiv.org/pdf/1606.04797v1.pdf) +Return a loss based on the dice coefficient. +Used in the [V-Net](https://arxiv.org/pdf/1606.04797v1.pdf) image segmentation +architecture. +Similar to the F1_score. Calculated as: + 1 - 2*sum(|ŷ .* y| + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth)` """ dice_coeff_loss(ŷ, y; smooth=eltype(ŷ)(1.0)) = 1 - (2*sum(y .* ŷ) + smooth) / (sum(y.^2) + sum(ŷ.^2) + smooth) """ tversky_loss(ŷ, y; β=0.7) -Used with imbalanced data to give more weightage to False negatives. +Return the [Tversky loss](https://arxiv.org/pdf/1706.05721.pdf). +Used with imbalanced data to give more weight to false negatives. 
Larger β weigh recall higher than precision (by placing more emphasis on false negatives) -Returns `1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)` - -[Tversky loss function for image segmentation using 3D fully convolutional deep networks](https://arxiv.org/pdf/1706.05721.pdf) +Calculated as: + 1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1) """ tversky_loss(ŷ, y; β=eltype(ŷ)(0.7)) = 1 - (sum(y .* ŷ) + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1) """ flatten(x::AbstractArray) -Transforms (w,h,c,b)-shaped input into (w x h x c,b)-shaped output, +Transform (w, h, c, b)-shaped input into (w × h × c, b)-shaped output by linearizing all values for each element in the batch. """ function flatten(x::AbstractArray) diff --git a/src/onehot.jl b/src/onehot.jl index 7a046dc1..551e1f37 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -45,8 +45,8 @@ cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.d """ onehot(l, labels[, unk]) -Create a [`OneHotVector`](@ref) with its `l`-th element `true` based on -possible `labels` set. +Create a [`OneHotVector`](@ref) with its `l`-th element `true` based on the +possible set of `labels`. If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found in `labels`; otherwise it will error. @@ -80,8 +80,10 @@ end """ onehotbatch(ls, labels[, unk...]) -Create an [`OneHotMatrix`](@ref) with a batch of labels based on possible `labels` set, returns the -`onehot(unk, labels)` if given labels `ls` is not found in set `labels`. +Create a [`OneHotMatrix`](@ref) with a batch of labels based on the +possible set of `labels`. +If `unk` is given, return [`onehot(unk, labels)`](@ref) if one of the input +labels `ls` is not found in `labels`; otherwise it will error. # Examples ```jldoctest diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 9c3c29bd..98ef8fd5 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -2,23 +2,25 @@ using Juno import Zygote: Params, gradient + """ - update!(opt, p, g) - update!(opt, ps::Params, gs) - -Perform an update step of the parameters `ps` (or the single parameter `p`) -according to optimizer `opt` and the gradients `gs` (the gradient `g`). - -As a result, the parameters are mutated and the optimizer's internal state may change. - update!(x, x̄) - + Update the array `x` according to `x .-= x̄`. """ function update!(x::AbstractArray, x̄) x .-= x̄ end +""" + update!(opt, p, g) + update!(opt, ps::Params, gs) + +Perform an update step of the parameters `ps` (or the single parameter `p`) +according to optimizer `opt` and the gradients `gs` (the gradient `g`). + +As a result, the parameters are mutated and the optimizer's internal state may change. +""" function update!(opt, x, x̄) x .-= apply!(opt, x, x̄) end @@ -41,7 +43,7 @@ struct StopException <: Exception end stop() Call `Flux.stop()` in a callback to indicate when a callback condition is met. -This would trigger the train loop to stop and exit. +This will trigger the train loop to stop and exit. # Examples ```julia @@ -57,19 +59,19 @@ end """ train!(loss, params, data, opt; cb) -For each datapoint `d` in `data` computes the gradient of `loss(d...)` through -backpropagation and calls the optimizer `opt`. +For each datapoint `d` in `data` compute the gradient of `loss(d...)` through +backpropagation and call the optimizer `opt`. 
-In case datapoints `d` are of numeric array type, assumes no splatting is needed -and computes the gradient of `loss(d)`. +In case datapoints `d` are of numeric array type, assume no splatting is needed +and compute the gradient of `loss(d)`. -Takes a callback as keyword argument `cb`. For example, this will print "training" -every 10 seconds (using [`throttle`](@ref)): +A callback is given with the keyword argument `cb`. For example, this will print +"training" every 10 seconds (using [`Flux.throttle`](@ref)): train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10)) -The callback can call [`Flux.stop()`](@ref) to interrupt the training loop. +The callback can call [`Flux.stop`](@ref) to interrupt the training loop. Multiple optimisers and callbacks can be passed to `opt` and `cb` as arrays. """ From 73d631f5cdf8d64c563d857455854fbe78aba29a Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 4 Apr 2020 23:00:34 +0200 Subject: [PATCH 100/113] Fix and improve docs Add missing docstrings, improve existing ones, fix links to functions or files. --- docs/src/data/dataloader.md | 2 +- docs/src/data/onehot.md | 9 +++++++++ docs/src/models/basics.md | 4 ++-- docs/src/models/layers.md | 9 ++++++--- docs/src/models/regularisation.md | 4 ++++ docs/src/training/optimisers.md | 1 + docs/src/training/training.md | 6 +++++- docs/src/utilities.md | 8 +++++++- 8 files changed, 35 insertions(+), 8 deletions(-) diff --git a/docs/src/data/dataloader.md b/docs/src/data/dataloader.md index 70a883c9..f6edc709 100644 --- a/docs/src/data/dataloader.md +++ b/docs/src/data/dataloader.md @@ -3,4 +3,4 @@ Flux provides the `DataLoader` type in the `Flux.Data` module to handle iteratio ```@docs Flux.Data.DataLoader -``` \ No newline at end of file +``` diff --git a/docs/src/data/onehot.md b/docs/src/data/onehot.md index 0bc3531b..23d6f196 100644 --- a/docs/src/data/onehot.md +++ b/docs/src/data/onehot.md @@ -31,6 +31,11 @@ julia> onecold([0.3, 0.2, 0.5], [:a, :b, :c]) :c ``` +```@docs +Flux.onehot +Flux.onecold +``` + ## Batches `onehotbatch` creates a batch (matrix) of one-hot vectors, and `onecold` treats matrices as batches. @@ -52,3 +57,7 @@ julia> onecold(ans, [:a, :b, :c]) ``` Note that these operations returned `OneHotVector` and `OneHotMatrix` rather than `Array`s. `OneHotVector`s behave like normal vectors but avoid any unnecessary cost compared to using an integer index directly. For example, multiplying a matrix with a one-hot vector simply slices out the relevant row of the matrix under the hood. + +```@docs +Flux.onehotbatch +``` diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index 24230ab1..06901d99 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -220,7 +220,7 @@ Flux.@functor Affine This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md). -For some more helpful tricks, including parameter freezing, please checkout the [advanced usage guide](advacned.md). +For some more helpful tricks, including parameter freezing, please checkout the [advanced usage guide](advanced.md). ## Utility functions @@ -240,5 +240,5 @@ Currently limited to the following layers: - `MeanPool` ```@docs -outdims +Flux.outdims ``` diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 2b5c1591..54ce5791 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -32,6 +32,7 @@ RNN LSTM GRU Flux.Recur +Flux.reset! 
``` ## Other General Purpose Layers @@ -49,20 +50,22 @@ SkipConnection These layers don't affect the structure of the network but may improve training times or reduce overfitting. ```@docs +Flux.normalise BatchNorm -Dropout Flux.dropout +Dropout AlphaDropout LayerNorm +InstanceNorm GroupNorm ``` ### Testmode -Many normalisation layers behave differently under training and inference (testing). By default, Flux will automatically determine when a layer evaluation is part of training or inference. Still, depending on your use case, it may be helpful to manually specify when these layers should be treated as being trained or not. For this, Flux provides `testmode!`. When called on a model (e.g. a layer or chain of layers), this function will place the model into the mode specified. +Many normalisation layers behave differently under training and inference (testing). By default, Flux will automatically determine when a layer evaluation is part of training or inference. Still, depending on your use case, it may be helpful to manually specify when these layers should be treated as being trained or not. For this, Flux provides `Flux.testmode!`. When called on a model (e.g. a layer or chain of layers), this function will place the model into the mode specified. ```@docs -testmode! +Flux.testmode! trainmode! ``` diff --git a/docs/src/models/regularisation.md b/docs/src/models/regularisation.md index 02aa3da8..535dd096 100644 --- a/docs/src/models/regularisation.md +++ b/docs/src/models/regularisation.md @@ -64,3 +64,7 @@ julia> activations(c, rand(10)) julia> sum(norm, ans) 2.1166067f0 ``` + +```@docs +Flux.activations +``` diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 1ee526b3..5ed083ee 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -52,6 +52,7 @@ Momentum Nesterov RMSProp ADAM +RADAM AdaMax ADAGrad ADADelta diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 1fe10783..48b7b42d 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -32,7 +32,7 @@ Flux.train!(loss, ps, data, opt) ``` The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want. -For a list of all built-in loss functions, check out the [reference](loss_functions.md). +For a list of all built-in loss functions, check out the [layer reference](../models/layers.md). At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimizing the calculations. @@ -95,6 +95,10 @@ julia> @epochs 2 Flux.train!(...) # Train for two epochs ``` +```@docs +Flux.@epochs +``` + ## Callbacks `train!` takes an additional argument, `cb`, that's used for callbacks so that you can observe the training process. 
For example: diff --git a/docs/src/utilities.md b/docs/src/utilities.md index d788e69f..7986ec23 100644 --- a/docs/src/utilities.md +++ b/docs/src/utilities.md @@ -35,9 +35,15 @@ Flux.glorot_uniform Flux.glorot_normal ``` +## Model Abstraction + +```@docs +Flux.destructure +``` + ## Callback Helpers ```@docs Flux.throttle +Flux.stop ``` - From 8d2d15aa70f617c16c4a70efe1eb6550d0bd3c88 Mon Sep 17 00:00:00 2001 From: janEbert Date: Sat, 4 Apr 2020 23:06:56 +0200 Subject: [PATCH 101/113] Remove links to OneHot{Vector,Matrix} Since they aren't documented, we only get a 404 link. --- src/onehot.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/onehot.jl b/src/onehot.jl index 551e1f37..4b7e5e36 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -45,7 +45,7 @@ cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.d """ onehot(l, labels[, unk]) -Create a [`OneHotVector`](@ref) with its `l`-th element `true` based on the +Create a `OneHotVector` with its `l`-th element `true` based on the possible set of `labels`. If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found in `labels`; otherwise it will error. @@ -80,7 +80,7 @@ end """ onehotbatch(ls, labels[, unk...]) -Create a [`OneHotMatrix`](@ref) with a batch of labels based on the +Create a `OneHotMatrix` with a batch of labels based on the possible set of `labels`. If `unk` is given, return [`onehot(unk, labels)`](@ref) if one of the input labels `ls` is not found in `labels`; otherwise it will error. From 2a65a303993eea73d452ebbaf4515586de5d0800 Mon Sep 17 00:00:00 2001 From: janEbert Date: Sun, 5 Apr 2020 13:58:27 +0200 Subject: [PATCH 102/113] Fix doctests in runtests.jl --- test/runtests.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/runtests.jl b/test/runtests.jl index 81182f0d..8f3ea015 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -42,6 +42,7 @@ Random.seed!(0) @testset "Docs" begin if VERSION >= v"1.2" + DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true) doctest(Flux) end end From 18ea4803888c5b3c253c490853a403153bbd1a5d Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Mon, 6 Apr 2020 09:26:38 +0200 Subject: [PATCH 103/113] fix tests and new version --- Manifest.toml | 85 ++++++++++++++------------------------------------- Project.toml | 4 +-- test/data.jl | 11 ++++--- 3 files changed, 31 insertions(+), 69 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 2ba02f84..bc74cb6e 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -20,9 +20,9 @@ version = "1.0.1" [[ArrayLayouts]] deps = ["FillArrays", "LinearAlgebra"] -git-tree-sha1 = "bc779df8d73be70e4e05a63727d3a4dfb4c52b1f" +git-tree-sha1 = "41956a49a8a4fefa1bf6664bca4a3035aba4c3a0" uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a" -version = "0.1.5" +version = "0.2.3" [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -52,9 +52,9 @@ version = "6.2.2" [[CUDAnative]] deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "MacroTools", "Pkg", "Printf", "TimerOutputs"] -git-tree-sha1 = "4168c40ca3ff3475bc29a20a09ab7b910c4b8ef0" +git-tree-sha1 = "d1fc99635d0002c8a819b78cb1f441eb44310725" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "3.0.1" +version = "3.0.2" [[CodeTracking]] deps = ["InteractiveUtils", "UUIDs"] @@ -86,23 +86,17 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" version = "0.2.0" -[[Compat]] -deps = 
["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "2.2.0" - [[CompilerSupportLibraries_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "067567a322fe466c5ec8d01413eee7127bd11699" +git-tree-sha1 = "7c4f882c41faa72118841185afc58a2eb00ef612" uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "0.3.1+0" +version = "0.3.3+0" [[Cthulhu]] -deps = ["CodeTracking", "InteractiveUtils", "TerminalMenus", "Unicode"] -git-tree-sha1 = "5e0f928ccaab1fa2911fc4e204e8a6f5b0213eaf" +deps = ["CodeTracking", "InteractiveUtils", "REPL", "Unicode"] +git-tree-sha1 = "484790098c85c26f8e59051f8ff1a0745c034a7d" uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f" -version = "1.0.0" +version = "1.0.1" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"] @@ -117,9 +111,9 @@ version = "1.1.0" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8" +git-tree-sha1 = "73eb18320fe3ba58790c8b8f6f89420f0a622773" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.10" +version = "0.17.11" [[Dates]] deps = ["Printf"] @@ -145,23 +139,11 @@ version = "1.0.1" deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" -[[FFTW]] -deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"] -git-tree-sha1 = "109d82fa4b00429f9afcce873e9f746f11f018d3" -uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" -version = "1.2.0" - -[[FFTW_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "ddb57f4cf125243b4aa4908c94d73a805f3cbf2c" -uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a" -version = "3.3.9+4" - [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" +git-tree-sha1 = "51cc2f9bc4eb9c6c0e81ec2f779d1085583cc956" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.5" +version = "0.8.7" [[FixedPointNumbers]] git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238" @@ -170,9 +152,9 @@ version = "0.8.0" [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "88b082d492be6b63f967b6c96b352e25ced1a34c" +git-tree-sha1 = "869540e4367122fbffaace383a5bdc34d6e5e5ac" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.9" +version = "0.10.10" [[GPUArrays]] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] @@ -186,12 +168,6 @@ git-tree-sha1 = "1a4355e4b5b50be2311ebb644f34f3306dbd0410" uuid = "7869d1d1-7146-5819-86e3-90919afe41df" version = "0.3.1" -[[IntelOpenMP_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "fb8e1c7a5594ba56f9011310790e03b5384998d6" -uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0" -version = "2018.0.3+0" - [[InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" @@ -209,6 +185,7 @@ uuid = "929cbde3-209d-540e-8aea-75f648917ca0" version = "1.3.4" [[LibGit2]] +deps = ["Printf"] uuid = 
"76f85450-5226-5b5a-8eaa-529ad045b433" [[Libdl]] @@ -221,12 +198,6 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[MKL_jll]] -deps = ["IntelOpenMP_jll", "Libdl", "Pkg"] -git-tree-sha1 = "720629cc8cbd12c146ca01b661fd1a6cf66e2ff4" -uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7" -version = "2019.0.117+2" - [[MacroTools]] deps = ["Markdown", "Random"] git-tree-sha1 = "f7d2e3f654af75f01ec49be82c231c382214223a" @@ -313,10 +284,6 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" @@ -348,15 +315,9 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[StatsBase]] deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "19bfcb46245f69ff4013b3df3b977a289852c3a1" +git-tree-sha1 = "a6102b1f364befdb05746f386b67c6b7e3262c45" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.32.2" - -[[TerminalMenus]] -deps = ["Compat", "REPL", "Test"] -git-tree-sha1 = "9ae6ed0c94eee4d898e049820942af21daf15efc" -uuid = "dc548174-15c3-5faf-af27-7997cfbde655" -version = "0.1.0" +version = "0.33.0" [[Test]] deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] @@ -389,15 +350,15 @@ version = "0.9.1" [[Zlib_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "fd36a6739e256527287c5444960d0266712cd49e" +git-tree-sha1 = "2f6c3e15e20e036ee0a0965879b31442b7ec50fa" uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.11+8" +version = "1.2.11+9" [[Zygote]] -deps = ["ArrayLayouts", "DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "9688fce24bd8a9468fed12f3d5206099a39054dc" +deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] +git-tree-sha1 = "1ccbfbe8930376e31752b812daa2532c723dc332" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.4.12" +version = "0.4.13" [[ZygoteRules]] deps = ["MacroTools"] diff --git a/Project.toml b/Project.toml index e927e14a..6fd5834c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.10.3" +version = "0.10.4" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" @@ -35,7 +35,7 @@ NNlib = "0.6" Reexport = "0.2" StatsBase = "0" ZipFile = "0.7, 0.8, 0.9" -Zygote = "0.4" +Zygote = "0.4.13" julia = "1" [extras] diff --git a/test/data.jl b/test/data.jl index c7a8fdfd..b5faf359 100644 --- a/test/data.jl +++ b/test/data.jl @@ -76,10 +76,11 @@ end @test size(Iris.labels()) == (150,) end -@testset "Housing" begin - @test Housing.features() isa Matrix - @test size(Housing.features()) == (506, 13) - @test Housing.targets() isa Array{Float64} - @test size(Housing.targets()) == (506, 1) +@testset "Housing" begin + @test_broken Housing.features() isa Matrix # test broken due to SSL certifate expiration problem + @test_broken size(Housing.features()) == (506, 13) + + @test_broken Housing.targets() isa Array{Float64} + @test_broken size(Housing.targets()) == 
(506, 1) end From f9e97104466503c3b213347f2bfbc347f02589ff Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Mon, 6 Apr 2020 09:35:34 +0200 Subject: [PATCH 104/113] update travis and bound julia version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 6fd5834c..1883d974 100644 --- a/Project.toml +++ b/Project.toml @@ -36,7 +36,7 @@ Reexport = "0.2" StatsBase = "0" ZipFile = "0.7, 0.8, 0.9" Zygote = "0.4.13" -julia = "1" +julia = "1.3" [extras] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" From d6cb9f055da08538e84780071d2e670fd76b37c1 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Mon, 6 Apr 2020 11:08:20 +0200 Subject: [PATCH 105/113] fix housing download --- src/data/housing.jl | 2 +- test/data.jl | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/data/housing.jl b/src/data/housing.jl index 61391304..06ecc339 100644 --- a/src/data/housing.jl +++ b/src/data/housing.jl @@ -50,7 +50,7 @@ function load() isfile(deps("housing.data")) && return @info "Downloading the Boston housing Dataset" - download_and_verify("$(cache_prefix)https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data", + download_and_verify("$(cache_prefix)http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data", deps("housing.data"), "baadf72995725d76efe787b664e1f083388c79ba21ef9a7990d87f774184735a") diff --git a/test/data.jl b/test/data.jl index b5faf359..20492323 100644 --- a/test/data.jl +++ b/test/data.jl @@ -78,9 +78,9 @@ end @testset "Housing" begin - @test_broken Housing.features() isa Matrix # test broken due to SSL certifate expiration problem - @test_broken size(Housing.features()) == (506, 13) + @test Housing.features() isa Matrix # test broken due to SSL certifate expiration problem + @test size(Housing.features()) == (506, 13) - @test_broken Housing.targets() isa Array{Float64} - @test_broken size(Housing.targets()) == (506, 1) + @test Housing.targets() isa Array{Float64} + @test size(Housing.targets()) == (506, 1) end From c54d71ce56e84bd5cbff8d265d771ef7fc2a8aeb Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Mon, 6 Apr 2020 13:20:28 +0200 Subject: [PATCH 106/113] update travis --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 90cf039b..d2ab83c2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,7 @@ os: julia: - 1.3 + - 1 - nightly notifications: @@ -15,7 +16,7 @@ notifications: jobs: include: - stage: "Documentation" - julia: 1.3 + julia: 1 os: linux script: - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); From 0e9bc826265f267ba05754719b4e035a1802ceca Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 6 Apr 2020 13:52:27 +0200 Subject: [PATCH 107/113] Loss -> Loss Functions --- docs/src/training/training.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 48b7b42d..36da0eb0 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -15,7 +15,7 @@ Flux.Optimise.train! There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). -## Loss +## Loss Functions The objective function must return a number representing how far the model is from its target – the *loss* of the model. The `loss` function that we defined in [basics](../models/basics.md) will work as an objective. 
We can also define an objective in terms of some model: From 684570660a1b6bfd465076f16bb31f95db985cbd Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 6 Apr 2020 13:53:36 +0200 Subject: [PATCH 108/113] Update doctest version guard (1.2 -> 1.4) And add the same to docs/make.jl --- docs/make.jl | 2 +- test/runtests.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index be4522eb..2f24a022 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,7 +2,7 @@ using Documenter, Flux, NNlib DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true) makedocs(modules=[Flux, NNlib], - doctest = true, + doctest = VERSION >= v"1.4", sitename = "Flux", pages = ["Home" => "index.md", "Building Models" => diff --git a/test/runtests.jl b/test/runtests.jl index 8f3ea015..c2ea0715 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -41,7 +41,7 @@ Random.seed!(0) end @testset "Docs" begin - if VERSION >= v"1.2" + if VERSION >= v"1.4" DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true) doctest(Flux) end From be926184730e3c6384e95f60f048eb02fd9b85c0 Mon Sep 17 00:00:00 2001 From: matsueushi Date: Tue, 14 Apr 2020 00:12:06 -0400 Subject: [PATCH 109/113] Fix doc indent --- src/optimise/train.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 98ef8fd5..ade8e862 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -4,7 +4,7 @@ import Zygote: Params, gradient """ - update!(x, x̄) + update!(x, x̄) Update the array `x` according to `x .-= x̄`. """ From db99e41959ba943ec5c7a0e4d561cf97690c2637 Mon Sep 17 00:00:00 2001 From: Bruno Hebling Vieira Date: Thu, 16 Apr 2020 09:50:41 -0300 Subject: [PATCH 110/113] Removed SGD exports --- src/Flux.jl | 2 +- src/optimise/Optimise.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index 9969b323..e453e129 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -16,7 +16,7 @@ export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxP include("optimise/Optimise.jl") using .Optimise using .Optimise: @epochs -export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, +export Descent, ADAM, Momentum, Nesterov, RMSProp, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index 68c18a6f..f25e9a50 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -1,7 +1,7 @@ module Optimise export train!, - SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, + Descent, ADAM, Momentum, Nesterov, RMSProp, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM, InvDecay, ExpDecay, WeightDecay, stop, Optimiser From d53deb9132b06188bea53bee647aa35d62d25c35 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Sat, 18 Apr 2020 03:19:32 +0530 Subject: [PATCH 111/113] Update glorot_normal doc --- src/utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils.jl b/src/utils.jl index c666caca..7842c961 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -24,7 +24,7 @@ glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum glorot_normal(dims...) Return an `Array` of size `dims` containing random variables taken from a normal -distribution with mean 0 and standard deviation `(2 / sum(dims))`. +distribution with mean 0 and standard deviation `sqrt(2 / sum(dims))`. 
# Examples ```jldoctest; setup = :(using Random; Random.seed!(0)) From deff98812a09c4f902bc8705fa9fb4fda5a68a8d Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Fri, 24 Apr 2020 21:59:02 +0200 Subject: [PATCH 112/113] Add v0.11.0 entry and added samepadding option --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 4023c7f2..898bf671 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +# v0.11.0 +* Add option for [same padding](https://github.com/FluxML/Flux.jl/pull/901) to conv and pooling layers by setting `pad=SamePad()`. + # v0.10.0 * The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669) - The dependency on Tracker.jl has been removed. From 4e4f6d9d1f7ed8b30c6d552b817d50d7450608a8 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Fri, 24 Apr 2020 22:07:57 +0200 Subject: [PATCH 113/113] Change next version entry to 0.10.5 --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 898bf671..460a9e5b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ -# v0.11.0 -* Add option for [same padding](https://github.com/FluxML/Flux.jl/pull/901) to conv and pooling layers by setting `pad=SamePad()`. +# v0.10.5 +* Add option for [same padding](https://github.com/FluxML/Flux.jl/pull/901) to conv and pooling layers by setting `pad=SamePad()`. # v0.10.0 * The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669)
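
A minimal usage sketch of the `pad=SamePad()` option introduced by this patch series, assuming the exported `SamePad`, `Conv` and `MaxPool` constructors from the diffs above:

```julia
using Flux

# SamePad() computes the padding automatically from the kernel size and dilation.
# With stride = 1 the spatial output size equals the spatial input size.
c = Conv((3, 3), 1 => 8, relu, pad = SamePad())
x = rand(Float32, 28, 28, 1, 1)   # WHCN-ordered input
size(c(x))                        # expected: (28, 28, 8, 1)

# For pooling layers the spatial output size becomes ceil(inputsize / stride).
p = MaxPool((2, 2), pad = SamePad())
size(p(c(x)))                     # expected: (14, 14, 8, 1)
```

With `stride = 1` the convolution preserves the spatial size; for the pooling layer, whose stride defaults to the window size, the spatial size is reduced to `ceil(inputsize / stride)`.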