From b4ed16ad9cd52905a94ea18b70148724998742ab Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Tue, 3 Dec 2019 22:48:48 -0600 Subject: [PATCH 01/46] Added outdims for some basic layers --- src/layers/basic.jl | 35 +++++++++++++++++++++++++++++++ src/layers/conv.jl | 51 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 2a465208..f2e7645d 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -39,6 +39,17 @@ function Base.show(io::IO, c::Chain) print(io, ")") end +""" + outdims(c::Chain, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) +outdims(m, (10, 10)) == (6, 6) +``` +""" +outdims(c::Chain, isize::Tuple) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers)) # This is a temporary and naive implementation # it might be replaced in the future for better performance @@ -116,6 +127,19 @@ end (a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::Dense, isize) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Dense(10, 5) +outdims(m, (5, 2)) == (5,) +outdims(m, (10,)) == (5,) +``` +""" +outdims(l::Dense, isize) = (size(l.W)[2],) + """ Diagonal(in::Integer) @@ -145,6 +169,17 @@ function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", length(l.α), ")") end +""" + outdims(l::Diagonal, isize) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Diagonal(10) +outdims(m, (10,)) == (10,) +``` +""" +outdims(l::Diagonal, isize) = (length(l.α),) """ Maxout(over) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index f4de3ffc..eeeea82b 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,5 +1,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv +_convoutdims(isize, ksize, ssize, pad) = Int.(floor.((isize .- ksize .+ 2 .* pad) ./ ssize .+ 1)) + expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) """ @@ -68,6 +70,18 @@ end (a::Conv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::Conv, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Conv((3, 3), 3 => 16) +outdims(m, (10, 10)) == (8, 8) +``` +""" +outdims(l::Conv{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) + """ ConvTranspose(size, in=>out) ConvTranspose(size, in=>out, relu) @@ -140,6 +154,7 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) + """ DepthwiseConv(size, in=>out) DepthwiseConv(size, in=>out, relu) @@ -204,6 +219,18 @@ end (a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::DepthwiseConv, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. 
+ +```julia +m = DepthwiseConv((3, 3), 3 => 16) +outdims(m, (10, 10)) == (8, 8) +``` +""" +outdims(l::DepthwiseConv{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) + """ CrossCor(size, in=>out) CrossCor(size, in=>out, relu) @@ -304,6 +331,18 @@ function Base.show(io::IO, m::MaxPool) print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end +""" + outdims(l::MaxPool, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = MaxPool((2, 2)) +outdims(m, (10, 10)) == (5, 5) +``` +""" +outdims(l::MaxPool{N}, isize) where N = _convoutdims(isize, l.weight, l.stride, l.pad[1:N]) + """ MeanPool(k) @@ -331,3 +370,15 @@ end function Base.show(io::IO, m::MeanPool) print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end + +""" + outdims(l::MeanPool, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = MeanPool((2, 2)) +outdims(m, (10, 10)) == (5, 5) +``` +""" +outdims(l::MeanPool{N}, isize) where N = _convoutdims(isize, l.weight, l.stride, l.pad[1:N]) \ No newline at end of file From 31dda0ce6cd8c264d083d453823f4f13fa755da5 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Thu, 5 Dec 2019 21:57:10 -0600 Subject: [PATCH 02/46] Updated with all basic and conv layers outdims --- src/layers/basic.jl | 16 ++++++++++++++-- src/layers/conv.jl | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index f2e7645d..8794b58c 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -40,7 +40,7 @@ function Base.show(io::IO, c::Chain) end """ - outdims(c::Chain, isize::Tuple) + outdims(c::Chain, isize) Calculate the output dimensions given the input dimensions, `isize`. @@ -49,7 +49,7 @@ m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) outdims(m, (10, 10)) == (6, 6) ``` """ -outdims(c::Chain, isize::Tuple) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers)) +outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers)) # This is a temporary and naive implementation # it might be replaced in the future for better performance @@ -228,6 +228,18 @@ function (mo::Maxout)(input::AbstractArray) mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over) end +""" + outdims(c::Maxout, isize) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Maxout(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) +outdims(m, (10, 10)) == (8, 8) +``` +""" +outdims(l::Maxout, isize) = outdims(first(l.over)) + """ SkipConnection(layers, connection) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index eeeea82b..2e3e87d7 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,6 +1,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv _convoutdims(isize, ksize, ssize, pad) = Int.(floor.((isize .- ksize .+ 2 .* pad) ./ ssize .+ 1)) +_convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad)) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -155,6 +156,18 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::ConvTranspose, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. 
+ +```julia +m = ConvTranspose((3, 3), 3 => 16) +outdims(m, (8, 8)) == (10, 10) +``` +""" +outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) + """ DepthwiseConv(size, in=>out) DepthwiseConv(size, in=>out, relu) @@ -302,6 +315,18 @@ end (a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::CrossCor, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = CrossCor((3, 3), 3 => 16) +outdims(m, (10, 10)) == (8, 8) +``` +""" +outdims(l::CrossCor{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) + """ MaxPool(k) From 6265b1fa39c5d7d289ccd5a00c94ae9f448377fc Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Thu, 5 Dec 2019 22:54:25 -0600 Subject: [PATCH 03/46] Added tests for outdims --- src/layers/basic.jl | 8 ++++---- src/layers/conv.jl | 8 ++++---- test/layers/basic.jl | 15 +++++++++++++++ test/layers/conv.jl | 20 ++++++++++++++++++++ 4 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 8794b58c..b62d8bb9 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -49,7 +49,7 @@ m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) outdims(m, (10, 10)) == (6, 6) ``` """ -outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers)) +outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers))(isize) # This is a temporary and naive implementation # it might be replaced in the future for better performance @@ -138,7 +138,7 @@ outdims(m, (5, 2)) == (5,) outdims(m, (10,)) == (5,) ``` """ -outdims(l::Dense, isize) = (size(l.W)[2],) +outdims(l::Dense, isize) = (size(l.W)[1],) """ Diagonal(in::Integer) @@ -234,11 +234,11 @@ end Calculate the output dimensions given the input dimensions, `isize`. ```julia -m = Maxout(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) +m = Maxout(() -> Conv((3, 3), 3 => 16), 2) outdims(m, (10, 10)) == (8, 8) ``` """ -outdims(l::Maxout, isize) = outdims(first(l.over)) +outdims(l::Maxout, isize) = outdims(first(l.over), isize) """ SkipConnection(layers, connection) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 2e3e87d7..6ce9bcbf 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,7 +1,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv _convoutdims(isize, ksize, ssize, pad) = Int.(floor.((isize .- ksize .+ 2 .* pad) ./ ssize .+ 1)) -_convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad)) +_convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -238,7 +238,7 @@ end Calculate the output dimensions given the input dimensions, `isize`. 
```julia -m = DepthwiseConv((3, 3), 3 => 16) +m = DepthwiseConv((3, 3), 3 => 6) outdims(m, (10, 10)) == (8, 8) ``` """ @@ -366,7 +366,7 @@ m = MaxPool((2, 2)) outdims(m, (10, 10)) == (5, 5) ``` """ -outdims(l::MaxPool{N}, isize) where N = _convoutdims(isize, l.weight, l.stride, l.pad[1:N]) +outdims(l::MaxPool{N}, isize) where N = _convoutdims(isize, l.k, l.stride, l.pad[1:N]) """ MeanPool(k) @@ -406,4 +406,4 @@ m = MeanPool((2, 2)) outdims(m, (10, 10)) == (5, 5) ``` """ -outdims(l::MeanPool{N}, isize) where N = _convoutdims(isize, l.weight, l.stride, l.pad[1:N]) \ No newline at end of file +outdims(l::MeanPool{N}, isize) where N = _convoutdims(isize, l.k, l.stride, l.pad[1:N]) \ No newline at end of file diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 0ff1776d..421c7721 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -92,4 +92,19 @@ import Flux: activations @test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4) end end + + @testset "output dimensions" begin + m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) + @test Flux.outdims(m, (10, 10)) == (6, 6) + + m = Dense(10, 5) + @test Flux.outdims(m, (5, 2)) == (5,) + @test Flux.outdims(m, (10,)) == (5,) + + m = Flux.Diagonal(10) + @test Flux.outdims(m, (10,)) == (10,) + + m = Maxout(() -> Conv((3, 3), 3 => 16), 2) + @test Flux.outdims(m, (10, 10)) == (8, 8) + end end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index b4136062..5701df80 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -107,3 +107,23 @@ end true end end + +@testset "conv output dimensions" begin + m = Conv((3, 3), 3 => 16) + @test Flux.outdims(m, (10, 10)) == (8, 8) + + m = ConvTranspose((3, 3), 3 => 16) + @test Flux.outdims(m, (8, 8)) == (10, 10) + + m = DepthwiseConv((3, 3), 3 => 6) + @test Flux.outdims(m, (10, 10)) == (8, 8) + + m = CrossCor((3, 3), 3 => 16) + @test Flux.outdims(m, (10, 10)) == (8, 8) + + m = MaxPool((2, 2)) + @test Flux.outdims(m, (10, 10)) == (5, 5) + + m = MeanPool((2, 2)) + @test Flux.outdims(m, (10, 10)) == (5, 5) +end \ No newline at end of file From a64378b11272444f8803ec0155262d47ab0cef71 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 7 Dec 2019 13:21:26 -0600 Subject: [PATCH 04/46] Switched to using NNlib for conv.jl outdims. --- src/layers/basic.jl | 20 ------------- src/layers/conv.jl | 73 ++++++++++----------------------------------- 2 files changed, 15 insertions(+), 78 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index b62d8bb9..6f056429 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -169,16 +169,6 @@ function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", length(l.α), ")") end -""" - outdims(l::Diagonal, isize) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = Diagonal(10) -outdims(m, (10,)) == (10,) -``` -""" outdims(l::Diagonal, isize) = (length(l.α),) """ @@ -228,16 +218,6 @@ function (mo::Maxout)(input::AbstractArray) mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over) end -""" - outdims(c::Maxout, isize) - -Calculate the output dimensions given the input dimensions, `isize`. 
- -```julia -m = Maxout(() -> Conv((3, 3), 3 => 16), 2) -outdims(m, (10, 10)) == (8, 8) -``` -""" outdims(l::Maxout, isize) = outdims(first(l.over), isize) """ diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 6ce9bcbf..7b32f999 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,6 +1,8 @@ -using NNlib: conv, ∇conv_data, depthwiseconv +using NNlib: conv, ∇conv_data, depthwiseconv, output_size + +# pad dims of x with dims of y until ndims(x) == ndims(y) +_paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...) -_convoutdims(isize, ksize, ssize, pad) = Int.(floor.((isize .- ksize .+ 2 .* pad) ./ ssize .+ 1)) _convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad) expand(N, i::Tuple) = i @@ -75,13 +77,16 @@ end outdims(l::Conv, isize::Tuple) Calculate the output dimensions given the input dimensions, `isize`. +Batch size and channel size are ignored as per `NNlib.jl`. ```julia m = Conv((3, 3), 3 => 16) outdims(m, (10, 10)) == (8, 8) +outdims(m, (10, 10, 1, 3)) == (8, 8) ``` """ -outdims(l::Conv{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::Conv, isize) = + output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ ConvTranspose(size, in=>out) @@ -156,17 +161,7 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -""" - outdims(l::ConvTranspose, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = ConvTranspose((3, 3), 3 => 16) -outdims(m, (8, 8)) == (10, 10) -``` -""" -outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.pad[1:N]) """ DepthwiseConv(size, in=>out) @@ -232,17 +227,8 @@ end (a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -""" - outdims(l::DepthwiseConv, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = DepthwiseConv((3, 3), 3 => 6) -outdims(m, (10, 10)) == (8, 8) -``` -""" -outdims(l::DepthwiseConv{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::DepthwiseConv, isize) = + output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ CrossCor(size, in=>out) @@ -315,17 +301,8 @@ end (a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -""" - outdims(l::CrossCor, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. 
- -```julia -m = CrossCor((3, 3), 3 => 16) -outdims(m, (10, 10)) == (8, 8) -``` -""" -outdims(l::CrossCor{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::CrossCor, isize) = + output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ MaxPool(k) @@ -356,17 +333,7 @@ function Base.show(io::IO, m::MaxPool) print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end -""" - outdims(l::MaxPool, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = MaxPool((2, 2)) -outdims(m, (10, 10)) == (5, 5) -``` -""" -outdims(l::MaxPool{N}, isize) where N = _convoutdims(isize, l.k, l.stride, l.pad[1:N]) +outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) """ MeanPool(k) @@ -396,14 +363,4 @@ function Base.show(io::IO, m::MeanPool) print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end -""" - outdims(l::MeanPool, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = MeanPool((2, 2)) -outdims(m, (10, 10)) == (5, 5) -``` -""" -outdims(l::MeanPool{N}, isize) where N = _convoutdims(isize, l.k, l.stride, l.pad[1:N]) \ No newline at end of file +outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) \ No newline at end of file From 0cdd11c0dc8e8e82a90467cc66e3b8330ad57682 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 7 Dec 2019 14:05:50 -0600 Subject: [PATCH 05/46] Added tests for varying padding, stride, and dilation with outdims. --- src/layers/conv.jl | 4 ++-- test/layers/conv.jl | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 7b32f999..03de438a 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -3,7 +3,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv, output_size # pad dims of x with dims of y until ndims(x) == ndims(y) _paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...) 
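# For illustration (the sizes here are made up): trailing dims of `y` are
# appended to `x` until the two ranks match, e.g.
#   _paddims((10, 10), (3, 3, 3, 16)) == (10, 10, 3, 16)
# which is how a spatial `isize` is promoted to a full WHCN size for NNlib's
# size helpers.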
-_convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad) +_convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+ (ksize .- 1).*dsize .- (pad[1:2:end] .+ pad[2:2:end]) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -161,7 +161,7 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad) """ DepthwiseConv(size, in=>out) diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 5701df80..1a22b385 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -111,19 +111,51 @@ end @testset "conv output dimensions" begin m = Conv((3, 3), 3 => 16) @test Flux.outdims(m, (10, 10)) == (8, 8) + m = Conv((3, 3), 3 => 16; stride = 2) + @test Flux.outdims(m, (5, 5)) == (2, 2) + m = Conv((3, 3), 3 => 16; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = Conv((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (5, 5)) == (4, 4) m = ConvTranspose((3, 3), 3 => 16) @test Flux.outdims(m, (8, 8)) == (10, 10) + m = ConvTranspose((3, 3), 3 => 16; stride = 2) + @test Flux.outdims(m, (2, 2)) == (5, 5) + m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (4, 4)) == (5, 5) m = DepthwiseConv((3, 3), 3 => 6) @test Flux.outdims(m, (10, 10)) == (8, 8) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2) + @test Flux.outdims(m, (5, 5)) == (2, 2) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (5, 5)) == (4, 4) m = CrossCor((3, 3), 3 => 16) @test Flux.outdims(m, (10, 10)) == (8, 8) + m = CrossCor((3, 3), 3 => 16; stride = 2) + @test Flux.outdims(m, (5, 5)) == (2, 2) + m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (5, 5)) == (4, 4) m = MaxPool((2, 2)) @test Flux.outdims(m, (10, 10)) == (5, 5) + m = MaxPool((2, 2); stride = 1) + @test Flux.outdims(m, (5, 5)) == (4, 4) + m = MaxPool((2, 2); stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) m = MeanPool((2, 2)) @test Flux.outdims(m, (10, 10)) == (5, 5) + m = MeanPool((2, 2); stride = 1) + @test Flux.outdims(m, (5, 5)) == (4, 4) + m = MeanPool((2, 2); stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) end \ No newline at end of file From 04991d3261f006f134beb6333f504ad27e11a706 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 7 Dec 2019 14:06:11 -0600 Subject: [PATCH 06/46] Added entry to docs for outdims --- docs/src/models/basics.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index d83fc462..c6dc4e19 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -219,3 +219,13 @@ Flux.@functor Affine ``` This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md). 
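The `## Utility functions` section added just below documents `outdims`; as a concrete sketch of the intended use (the layer sizes here are illustrative, not taken from the patch), it lets you size a `Dense` head without running a dummy forward pass:

```julia
using Flux

convs = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
h, w  = Flux.outdims(convs, (28, 28))   # (24, 24) for 28×28 inputs
model = Chain(convs, x -> reshape(x, :, size(x, 4)), Dense(h * w * 32, 10))
```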
+ +## Utility functions + +Flux provides some utility functions to help you generate models in an automated fashion. + +`outdims` enables you to calculate the spatial output dimensions of layers like `Conv` when applied to input images of a given size. + +```@docs +outdims +``` From 2f854bdfc0d7064f4e28988d6418d9b09324c11e Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Tue, 10 Dec 2019 09:57:08 -0600 Subject: [PATCH 07/46] Recommitting to trigger new build From f00b5325568415aaf32024e7eda0090d2cb0e036 Mon Sep 17 00:00:00 2001 From: aminya Date: Mon, 6 Jan 2020 03:17:25 +0330 Subject: [PATCH 08/46] Adding CompatHelper --- .github/workflows/CompatHelper.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/CompatHelper.yml diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 00000000..9777033d --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,24 @@ +name: CompatHelper + +on: + schedule: + - cron: '00 00 * * *' + +jobs: + CompatHelper: + runs-on: ${{ matrix.os }} + strategy: + matrix: + julia-version: [1.3] + julia-arch: [x64] + os: [ubuntu-latest] + steps: + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.julia-version }} + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: julia -e 'using CompatHelper; CompatHelper.main()' From 9803826a368fa3f527e9c2682876f168e11f75fc Mon Sep 17 00:00:00 2001 From: Chris Rackauckas Date: Mon, 20 Jan 2020 13:53:28 -0500 Subject: [PATCH 09/46] test restructure on the GPU Requires https://github.com/FluxML/Zygote.jl/pull/474 --- test/cuda/cuda.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 1576d88f..911eef93 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -58,6 +58,13 @@ end @test y[3,:] isa CuArray end +@testset "restructure gpu" begin + dudt = Dense(1,1) |> gpu + p,re = Flux.destructure(dudt) + foo(x) = sum(re(p)(x)) + @test gradient(foo, cu(rand(1)))[1] isa CuArray +end + if CuArrays.has_cudnn() @info "Testing Flux/CUDNN" include("cudnn.jl") From 6499344af397db698706b3325d2ba6831178ac65 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Thu, 6 Feb 2020 13:42:17 +0100 Subject: [PATCH 10/46] nograd for onecold, onehot, onehotbatch --- src/onehot.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/onehot.jl b/src/onehot.jl index 754d0607..7a3123ec 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -125,6 +125,4 @@ onecold(y::AbstractMatrix, labels...) = onecold(y::OneHotMatrix, labels...) = mapreduce(x -> Flux.onecold(x, labels...), |, y.data, dims = 2, init = 0) -# TODO probably still want this as a custom adjoint Zygote -# onecold(x::TrackedVector, l...) = onecold(data(x), l...) -# onecold(x::TrackedMatrix, l...) = onecold(data(x), l...) 
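# `onehot`, `onehotbatch` and `onecold` produce discrete outputs, so Zygote's
# `@nograd` (below) makes their pullbacks return `nothing`, i.e. they are
# treated as constants during gradient computation; this supersedes the
# Tracker-era TODO removed above.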
+@nograd onecold, onehot, onehotbatch From 197a1a70c09deba9f4d5ae1bf74bc12a86314288 Mon Sep 17 00:00:00 2001 From: pranjaldatta Date: Fri, 7 Feb 2020 03:47:19 +0530 Subject: [PATCH 11/46] added BostonHousing dataset and testing --- src/data/Data.jl | 3 + src/data/housing.jl | 136 ++++++++++++++++++++++++++++++++++++++++++++ test/data.jl | 8 ++- 3 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 src/data/housing.jl diff --git a/src/data/Data.jl b/src/data/Data.jl index d7cd0303..88af9549 100644 --- a/src/data/Data.jl +++ b/src/data/Data.jl @@ -42,4 +42,7 @@ using .Sentiment include("iris.jl") export Iris +include("housing.jl") +export Housing + end diff --git a/src/data/housing.jl b/src/data/housing.jl new file mode 100644 index 00000000..0d167dc0 --- /dev/null +++ b/src/data/housing.jl @@ -0,0 +1,136 @@ +""" +1. Title: Boston Housing Data + +2. Sources: + (a) Origin: This dataset was taken from the StatLib library which is + maintained at Carnegie Mellon University. + (b) Creator: Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the + demand for clean air', J. Environ. Economics & Management, + vol.5, 81-102, 1978. + (c) Date: July 7, 1993 + +3. Number of Instances: 506 + +4. Number of Attributes: 13 continuous attributes (including "class" + attribute "MEDV"), 1 binary-valued attribute. + +5. Attribute Information: + + 1. CRIM per capita crime rate by town + 2. ZN proportion of residential land zoned for lots over + 25,000 sq.ft. + 3. INDUS proportion of non-retail business acres per town + 4. CHAS Charles River dummy variable (= 1 if tract bounds + river; 0 otherwise) + 5. NOX nitric oxides concentration (parts per 10 million) + 6. RM average number of rooms per dwelling + 7. AGE proportion of owner-occupied units built prior to 1940 + 8. DIS weighted distances to five Boston employment centres + 9. RAD index of accessibility to radial highways + 10. TAX full-value property-tax rate per 10,000 dollars + 11. PTRATIO pupil-teacher ratio by town + 12. B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks + by town + 13. LSTAT % lower status of the population + 14. MEDV Median value of owner-occupied homes in 1000's of dollars + + Downloaded From: https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data + +""" +module Housing + +using DelimitedFiles +using ..Data: deps, download_and_verify + +#Uncomment if package exists +#const cache_prefix = "https://cache.julialang.org/" +const cache_prefix = "" + +function load() + isfile(deps("housing.data")) && return + + @info "Downloading the Boston housing Dataset" + download_and_verify("$(cache_prefix)https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data", + deps("housing.data"), + "baadf72995725d76efe787b664e1f083388c79ba21ef9a7990d87f774184735a") + + #@info "Download complete. 
Working on the files" + path = deps() + isfile(deps("housing.data")) && touch(joinpath(path, "tempfile.data")) + open(joinpath(path, "tempfile.data"), "a") do fout + open(deps("housing.data"), "r") do fin + for line in eachline(fin) + line = replace(lstrip(line), r" +" => s",") + println(fout, line) + end + end + end + mv(joinpath(path, "tempfile.data"), deps("housing.data"), force=true) +end + +""" +Gets the targets for the Boston housing dataset, a 506 element array listing the targets for each example + +```jldoctest +julia> using Flux + +julia> target = Flux.Data.Housing.targets() + +julia> summary(target) +506×1 Array{Float64,2} + +julia> target[1] +24.0 + +""" +function targets() + load() + housing = readdlm(deps("housing.data"), ',') + reshape(Vector{Float64}(housing[1:end,end]), (506, 1)) +end + + +""" +Gets the names of the features provided in the dataset + +""" +function feature_names() + ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"] +end + + +""" +Gets the features of the Boston Housing Dataset. This is a 506x13 Matrix of Float64 datatypes. +The values are in the order ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"]. +It has 506 examples. + +```jldoctest +julia> using Flux + +julia> features = Flux.Data.Housing.features() + +julia> summary(features) +506×13 Array{Float64,2} + +julia> features[1, :] +13-element Array{Float64,1}: +0.00632 +18.0 +2.31 +0.0 +0.538 + ⋮ +296.0 +15.3 +396.9 +4.98 + +""" +function features() + load() + housing = readdlm(deps("housing.data"), ',') + Matrix{Float64}(housing[1:end, 1:13]) +end + + +end \ No newline at end of file diff --git a/test/data.jl b/test/data.jl index 6b777873..aa913806 100644 --- a/test/data.jl +++ b/test/data.jl @@ -16,7 +16,13 @@ using Test @test Data.Sentiment.train() isa Vector{Data.Tree{Any}} @test Iris.features() isa Matrix -@test size(Iris.features()) == (4,150) +@test size(Iris.features()) == (4,150) @test Iris.labels() isa Vector{String} @test size(Iris.labels()) == (150,) + +@test Housing.features() isa Matrix +@test size(Housing.features()) == (506, 13) + +@test Housing.targets() isa Array{Float64} +@test size(Housing.targets()) == (506, 1) \ No newline at end of file From d7b20d1a780d32d111030f4a7a7f62cd62b2eb11 Mon Sep 17 00:00:00 2001 From: Julia TagBot <50554310+JuliaTagBot@users.noreply.github.com> Date: Sat, 8 Feb 2020 20:02:52 +0700 Subject: [PATCH 12/46] Install TagBot as a GitHub Action --- .github/workflows/TagBot.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/workflows/TagBot.yml diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 00000000..d77d3a0c --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,11 @@ +name: TagBot +on: + schedule: + - cron: 0 * * * * +jobs: + TagBot: + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} From c37fc3cfa63a82deec33d40f837b880341440c7a Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 9 Feb 2020 19:45:04 -0600 Subject: [PATCH 13/46] Recommitting to trigger build From ae0455517a57159fb5d05c9b5a0e2531f78ebc93 Mon Sep 17 00:00:00 2001 From: Marco Date: Mon, 10 Feb 2020 00:03:11 -0800 Subject: [PATCH 14/46] Remove outdated reference to truncate! 
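For readers unfamiliar with `reset!`, a brief sketch of its typical use; the layer sizes are illustrative and not taken from this patch:

```julia
using Flux

rnn = LSTM(10, 5)                  # a stateful, Recur-wrapped LSTM cell
xs  = [rand(Float32, 10) for _ in 1:4]
ys  = [rnn(x) for x in xs]         # the hidden state carries across calls
Flux.reset!(rnn)                   # restore the initial hidden state
```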
--- src/layers/recurrent.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 499a21ab..647dda25 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -45,8 +45,7 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")") """ reset!(rnn) -Reset the hidden state of a recurrent layer back to its original value. See also -`truncate!`. +Reset the hidden state of a recurrent layer back to its original value. Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to From 6ea7b95384b34b0b2aacc10bc225480f4a9555a0 Mon Sep 17 00:00:00 2001 From: matsueushi Date: Sat, 15 Feb 2020 20:06:15 -0500 Subject: [PATCH 15/46] Remove unused using --- src/optimise/optimisers.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index fb3b9fc5..cf4496f4 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -1,5 +1,4 @@ using Flux -using Base: @get! using MacroTools: @forward const ϵ = 1e-8 From 9bb388d953ce6676860ff82028b9c1f98c88bbfb Mon Sep 17 00:00:00 2001 From: Helios De Rosario Date: Sun, 16 Feb 2020 18:29:18 +0100 Subject: [PATCH 16/46] update Juno compat --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 96d1d853..f76063bd 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.10.1" +version = "0.10.2" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" @@ -29,7 +29,7 @@ Adapt = "1" CodecZlib = "0.5, 0.6" Colors = "0.8, 0.9" CuArrays = "1.6" -Juno = "0.5, 0.6, 0.7" +Juno = "0.5, 0.6, 0.7, 0.8" MacroTools = "0.3, 0.4, 0.5" NNlib = "0.6" Reexport = "0.2" From f5b9cf659cb14f0b05ab98b2fef70f705adfc8c3 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Thu, 20 Feb 2020 23:38:56 -0600 Subject: [PATCH 17/46] Updated docs to specify exactly what layers support outdims --- docs/src/models/basics.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index c6dc4e19..6e8d0b76 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -225,6 +225,17 @@ This enables a useful extra set of functionality for our `Affine` layer, such as Flux provides some utility functions to help you generate models in an automated fashion. `outdims` enables you to calculate the spatial output dimensions of layers like `Conv` when applied to input images of a given size. +Currently limited to the following layers: +- `Chain` +- `Dense` +- `Conv` +- `Diagonal` +- `Maxout` +- `ConvTranspose` +- `DepthwiseConv` +- `CrossCor` +- `MaxPool` +- `MeanPool` ```@docs outdims From 7c12af065a2d8fb20359321e34f3c0731ae5559f Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Thu, 20 Feb 2020 23:27:36 -0600 Subject: [PATCH 18/46] Added testmode! functionality back to normalization layers. 
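A sketch of how the new API is meant to be used, following the docstrings added in this patch; the model itself is illustrative:

```julia
using Flux

m = Chain(Dense(10, 5, relu), Dropout(0.5), BatchNorm(5))

testmode!(m, true)    # force inference behaviour: dropout off, BatchNorm uses stored statistics
testmode!(m, false)   # force training behaviour
testmode!(m, :auto)   # return to automatic detection during gradient calls
```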
--- src/Flux.jl | 2 +- src/layers/normalise.jl | 72 ++++++++++++++++++++++++++++++------ test/layers/normalisation.jl | 31 +++++++++------- 3 files changed, 79 insertions(+), 26 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index 9969b323..5f9878f3 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -11,7 +11,7 @@ export gradient export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, - SkipConnection, params, fmap, cpu, gpu, f32, f64 + SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode! include("optimise/Optimise.jl") using .Optimise diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index b421d3e7..ee6b6fdd 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -2,6 +2,23 @@ istraining() = false @adjoint istraining() = true, _ -> nothing +_isactive(m) = isnothing(m.active) ? istraining() : m.active +# @adjoint _isactive(m) = _isactive(m), Δ -> nothing + +""" + testmode!(m, mode = :auto) + +Set a layer or model's test mode (see below). +Using `:auto` mode will treat any gradient computation as training. + +Possible values include: +- `false` for training +- `true` for testing +- `:auto` or `nothing` for Flux to detect the mode automatically +""" +testmode!(m, mode) = nothing +testmode!(m::Chain, mode = :auto) = map(x -> testmode!(x, mode), m.layers) + _dropout_shape(s, ::Colon) = size(s) _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...) @@ -22,18 +39,27 @@ A Dropout layer. For each input, either sets that input to `0` (with probability `p`) or scales it by `1/(1-p)`. The `dims` argument is to specified the unbroadcasted dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is used as a regularisation, i.e. it reduces overfitting during training. see also [`dropout`](@ref). + +Does nothing to the input once [`testmode!`](@ref) is false. """ mutable struct Dropout{F,D} p::F dims::D + active::Union{Bool, Nothing} end function Dropout(p; dims = :) @assert 0 ≤ p ≤ 1 - Dropout{typeof(p),typeof(dims)}(p, dims) + Dropout{typeof(p),typeof(dims)}(p, dims, nothing) end -(a::Dropout)(x) = dropout(x, a.p; dims = a.dims) +function (a::Dropout)(x) + _isactive(a) || return x + return dropout(x, a.p; dims = a.dims) +end + +testmode!(m::Dropout, mode = :auto) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) function Base.show(io::IO, d::Dropout) print(io, "Dropout(", d.p) @@ -46,17 +72,20 @@ end A dropout layer. It is used in Self-Normalizing Neural Networks. (https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf) The AlphaDropout layer ensures that mean and variance of activations remains the same as before. + +Does nothing to the input once [`testmode!`](@ref) is false. """ mutable struct AlphaDropout{F} p::F - function AlphaDropout(p) + active::Union{Bool, Nothing} + function AlphaDropout(p, active = nothing) @assert 0 ≤ p ≤ 1 - new{typeof(p)}(p) + new{typeof(p)}(p, active) end end function (a::AlphaDropout)(x) - istraining() || return x + _isactive(a) || return x λ = eltype(x)(1.0507009873554804934193349852946) α = eltype(x)(1.6732632423543772848170429916717) α1 = eltype(x)(-λ*α) @@ -68,6 +97,9 @@ function (a::AlphaDropout)(x) return x end +testmode!(m::AlphaDropout, mode = :auto) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + """ LayerNorm(h::Integer) @@ -106,6 +138,8 @@ it's the usual channel dimension.) 
shifts them to have a new mean and variance (corresponding to the learnable, per-channel `bias` and `scale` parameters). +Use [`testmode!`](@ref) during inference. + See [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/pdf/1502.03167.pdf). @@ -127,12 +161,13 @@ mutable struct BatchNorm{F,V,W,N} σ²::W # moving std ϵ::N momentum::N + active::Union{Bool, Nothing} end BatchNorm(chs::Integer, λ = identity; initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = BatchNorm(λ, initβ(chs), initγ(chs), - zeros(chs), ones(chs), ϵ, momentum) + zeros(chs), ones(chs), ϵ, momentum, nothing) trainable(bn::BatchNorm) = (bn.β, bn.γ) @@ -145,7 +180,7 @@ function (BN::BatchNorm)(x) m = div(prod(size(x)), channels) γ = reshape(BN.γ, affine_shape...) β = reshape(BN.β, affine_shape...) - if !istraining() + if !_isactive(BN) μ = reshape(BN.μ, affine_shape...) σ² = reshape(BN.σ², affine_shape...) ϵ = BN.ϵ @@ -170,6 +205,9 @@ end @functor BatchNorm +testmode!(m::BatchNorm, mode = :auto) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + function Base.show(io::IO, l::BatchNorm) print(io, "BatchNorm($(join(size(l.β), ", "))") (l.λ == identity) || print(io, ", λ = $(l.λ)") @@ -193,6 +231,8 @@ it's the usual channel dimension.) shifts them to have a new mean and variance (corresponding to the learnable, per-channel `bias` and `scale` parameters). +Use [`testmode!`](@ref) during inference. + See [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022). Example: @@ -215,12 +255,13 @@ mutable struct InstanceNorm{F,V,W,N} σ²::W # moving std ϵ::N momentum::N + active::Union{Bool, Nothing} end InstanceNorm(chs::Integer, λ = identity; initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = InstanceNorm(λ, initβ(chs), initγ(chs), - zeros(chs), ones(chs), ϵ, momentum) + zeros(chs), ones(chs), ϵ, momentum, nothing) trainable(in::InstanceNorm) = (in.β, in.γ) @@ -237,7 +278,7 @@ function (in::InstanceNorm)(x) m = div(prod(size(x)), c*bs) γ, β = expand_inst(in.γ, affine_shape), expand_inst(in.β, affine_shape) - if !istraining() + if !_isactive(in) μ = expand_inst(in.μ, affine_shape) σ² = expand_inst(in.σ², affine_shape) ϵ = in.ϵ @@ -263,6 +304,9 @@ end @functor InstanceNorm +testmode!(m::InstanceNorm, mode = :auto) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + function Base.show(io::IO, l::InstanceNorm) print(io, "InstanceNorm($(join(size(l.β), ", "))") (l.λ == identity) || print(io, ", λ = $(l.λ)") @@ -283,6 +327,8 @@ For an array of N dimensions, the (N-1)th index is the channel dimension. ``G`` is the number of groups along which the statistics would be computed. The number of channels must be an integer multiple of the number of groups. +Use [`testmode!`](@ref) during inference. + Example: ``` m = Chain(Conv((3,3), 1=>32, leakyrelu;pad = 1), @@ -300,12 +346,13 @@ mutable struct GroupNorm{F,V,W,N,T} σ²::W # moving std ϵ::N momentum::N + active::Union{Bool, Nothing} end GroupNorm(chs::Integer, G::Integer, λ = identity; initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = GroupNorm(G, λ, initβ(chs), initγ(chs), - zeros(G,1), ones(G,1), ϵ, momentum) + zeros(G,1), ones(G,1), ϵ, momentum, nothing) trainable(gn::GroupNorm) = (gn.β, gn.γ) @@ -329,7 +376,7 @@ function(gn::GroupNorm)(x) β = reshape(gn.β, affine_shape...) 
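  # The reshape below splits the channel dimension into (channels-per-group, groups),
  # so the mean and variance computed afterwards are per group and per batch sample
  # rather than per individual channel.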
y = reshape(x,((size(x))[1:end-2]...,channels_per_group,groups,batches)) - if !istraining() + if !_isactive(gn) og_shape = size(x) μ = reshape(gn.μ, μ_affine_shape...) # Shape : (1,1,...C/G,G,1) σ² = reshape(gn.σ², μ_affine_shape...) # Shape : (1,1,...C/G,G,1) @@ -360,6 +407,9 @@ end @functor GroupNorm +testmode!(m::GroupNorm, mode = :auto) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + function Base.show(io::IO, l::GroupNorm) print(io, "GroupNorm($(join(size(l.β), ", "))") (l.λ == identity) || print(io, ", λ = $(l.λ)") diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 4399a256..594fb586 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -1,30 +1,33 @@ using Flux, Test, Statistics using Zygote: pullback -trainmode(f, x...) = pullback(f, x...)[1] -trainmode(f) = (x...) -> trainmode(f, x...) +evalwgrad(f, x...) = pullback(f, x...)[1] +trainmode(f) = (testmode!(f, false); f) @testset "Dropout" begin x = [1.,2.,3.] @test x == Dropout(0.1)(x) - @test x == trainmode(Dropout(0), x) - @test zero(x) == trainmode(Dropout(1), x) + @test x == evalwgrad(Dropout(0), x) + @test zero(x) == evalwgrad(Dropout(1), x) x = rand(100) m = Dropout(0.9) - y = trainmode(m, x) + y = evalwgrad(m, x) @test count(a->a==0, y) > 50 - y = m(x) + testmode!(m, true) + y = evalwgrad(m, x) # should override istraining @test count(a->a==0, y) == 0 - y = trainmode(m, x) + testmode!(m, false) + y = evalwgrad(m, x) @test count(a->a==0, y) > 50 x = rand(Float32, 100) m = Chain(Dense(100,100), Dropout(0.9)) - y = trainmode(m, x) + y = evalwgrad(m, x) @test count(a->a == 0, y) > 50 - y = m(x) + testmode!(m, true) + y = evalwgrad(m, x) # should override istraining @test count(a->a == 0, y) == 0 x = rand(100, 50) @@ -49,7 +52,7 @@ end # initial m.σ is 1 # initial m.μ is 0 - y = trainmode(m, x) + y = evalwgrad(m, x) @test isapprox(y, [-1.22474 0 1.22474; -1.22474 0 1.22474], atol = 1.0e-5) # julia> x # 2×3 Array{Float64,2}: @@ -117,7 +120,7 @@ end x = Float64.(x) @test m.β == [0, 0] # initβ(2) @test m.γ == [1, 1] # initγ(2) - y = trainmode(m, x) + y = evalwgrad(m, x) #julia> x #[:, :, 1] = @@ -172,7 +175,7 @@ end # check that μ, σ², and the output are the correct size for higher rank tensors let m = InstanceNorm(2), sizes = (5, 5, 3, 4, 2, 6), x = reshape(Float32.(collect(1:prod(sizes))), sizes) - y = trainmode(m, x) + y = evalwgrad(m, x) @test size(m.μ) == (sizes[end - 1], ) @test size(m.σ²) == (sizes[end - 1], ) @test size(y) == sizes @@ -204,7 +207,7 @@ if VERSION >= v"1.1" @test m.β == [0, 0, 0, 0] # initβ(32) @test m.γ == [1, 1, 1, 1] # initγ(32) - y = trainmode(m, x) + y = evalwgrad(m, x) #julia> x #[:, :, 1] = @@ -273,7 +276,7 @@ if VERSION >= v"1.1" # check that μ, σ², and the output are the correct size for higher rank tensors let m = GroupNorm(4,2), sizes = (5, 5, 3, 4, 4, 6), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - y = trainmode(m, x) + y = evalwgrad(m, x) @test size(m.μ) == (m.G,1) @test size(m.σ²) == (m.G,1) @test size(y) == sizes From 924b8f49ec9a438d35159e4e8ad5fbd75f0654ba Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Fri, 21 Feb 2020 15:10:28 -0600 Subject: [PATCH 19/46] Updated to place function definitions in the appropriate places. 
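One behavioural detail from this reorganisation: the `mode` argument now defaults to `true`, so a bare `testmode!(m)` forces inference behaviour. A minimal sketch with an illustrative model:

```julia
using Flux

m = Chain(Dense(3, 2), Dropout(0.3))
testmode!(m)          # same as testmode!(m, true): force inference behaviour
testmode!(m, :auto)   # hand control back to automatic training/testing detection
```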
--- src/functor.jl | 13 +++++++++++++ src/layers/basic.jl | 2 ++ src/layers/normalise.jl | 25 +++++-------------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index a36b5765..4edfbd98 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -39,6 +39,19 @@ end trainable(m) = functor(m)[1] +""" + testmode!(m, mode = true) + +Set a layer or model's test mode (see below). +Using `:auto` mode will treat any gradient computation as training. + +Possible values include: +- `false` for training +- `true` for testing +- `:auto` or `nothing` for Flux to detect the mode automatically +""" +testmode!(m, mode) = nothing + params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x) function params!(p::Params, x, seen = IdSet()) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 2a465208..6788f761 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -33,6 +33,8 @@ applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) +testmode!(m::Chain, mode = true) = map(x -> testmode!(x, mode), m.layers) + function Base.show(io::IO, c::Chain) print(io, "Chain(") join(io, c.layers, ", ") diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index ee6b6fdd..7b438bc2 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -3,21 +3,6 @@ istraining() = false @adjoint istraining() = true, _ -> nothing _isactive(m) = isnothing(m.active) ? istraining() : m.active -# @adjoint _isactive(m) = _isactive(m), Δ -> nothing - -""" - testmode!(m, mode = :auto) - -Set a layer or model's test mode (see below). -Using `:auto` mode will treat any gradient computation as training. - -Possible values include: -- `false` for training -- `true` for testing -- `:auto` or `nothing` for Flux to detect the mode automatically -""" -testmode!(m, mode) = nothing -testmode!(m::Chain, mode = :auto) = map(x -> testmode!(x, mode), m.layers) _dropout_shape(s, ::Colon) = size(s) _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...) @@ -58,7 +43,7 @@ function (a::Dropout)(x) return dropout(x, a.p; dims = a.dims) end -testmode!(m::Dropout, mode = :auto) = +testmode!(m::Dropout, mode = true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) function Base.show(io::IO, d::Dropout) @@ -97,7 +82,7 @@ function (a::AlphaDropout)(x) return x end -testmode!(m::AlphaDropout, mode = :auto) = +testmode!(m::AlphaDropout, mode = true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) """ @@ -205,7 +190,7 @@ end @functor BatchNorm -testmode!(m::BatchNorm, mode = :auto) = +testmode!(m::BatchNorm, mode = true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) function Base.show(io::IO, l::BatchNorm) @@ -304,7 +289,7 @@ end @functor InstanceNorm -testmode!(m::InstanceNorm, mode = :auto) = +testmode!(m::InstanceNorm, mode = true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) function Base.show(io::IO, l::InstanceNorm) @@ -407,7 +392,7 @@ end @functor GroupNorm -testmode!(m::GroupNorm, mode = :auto) = +testmode!(m::GroupNorm, mode = true) = (m.active = (isnothing(mode) || mode == :auto) ? 
nothing : !mode) function Base.show(io::IO, l::GroupNorm) From 6ced7e1ecff379cf3df3f62f05557317dc56e41f Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Sun, 23 Feb 2020 13:42:11 -0500 Subject: [PATCH 20/46] expand Colors compat --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f76063bd..71282a10 100644 --- a/Project.toml +++ b/Project.toml @@ -27,7 +27,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" AbstractTrees = "0.2, 0.3" Adapt = "1" CodecZlib = "0.5, 0.6" -Colors = "0.8, 0.9" +Colors = "0.8, 0.9, 0.10, 0.11" CuArrays = "1.6" Juno = "0.5, 0.6, 0.7, 0.8" MacroTools = "0.3, 0.4, 0.5" From db4eaf254b5de8902349afbd705243c22d0ec91a Mon Sep 17 00:00:00 2001 From: Bulat Suleymanov Date: Mon, 24 Feb 2020 13:16:51 +0500 Subject: [PATCH 21/46] Edit description of convolutional layer --- src/layers/conv.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index f4de3ffc..829051ae 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -17,7 +17,7 @@ Example: Applying Conv layer to a 1-channel input using a 2x2 window size, out = 16 Conv((2, 2), 1=>16, relu) -Data should be stored in WHCN order (width, height, # channels, # batches). +Data should be stored in WHCN order (width, height, # channels, batch size). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. From ba5259a269f93b0dcf65dfca43b29b219bf81415 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Tue, 25 Feb 2020 13:53:49 -0600 Subject: [PATCH 22/46] Added docs on testmode! --- docs/src/models/layers.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 5f2ab3ce..763fbf8c 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -66,6 +66,14 @@ LayerNorm GroupNorm ``` +### Testmode + +Many normalisation layers behave differently under training and inference (testing). By default, Flux will automatically determine when a layer evaluation is part of training or inference. Still, depending on your use case, it may be helpful to manually specify when these layers should be treated as being trained or not. For this, Flux provides `testmode!`. When called on a model (e.g. a layer or chain of layers), this function will place the model into the mode specified. + +```@docs +testmode! +``` + ## Cost Functions ```@docs mse From 569021a9f1f9910f7f2e9ac6869bb149b9da7023 Mon Sep 17 00:00:00 2001 From: pranjaldatta Date: Wed, 26 Feb 2020 15:05:23 +0530 Subject: [PATCH 23/46] added newlines at end of file --- src/data/housing.jl | 2 +- test/data.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/housing.jl b/src/data/housing.jl index 0d167dc0..61391304 100644 --- a/src/data/housing.jl +++ b/src/data/housing.jl @@ -133,4 +133,4 @@ function features() end -end \ No newline at end of file +end diff --git a/test/data.jl b/test/data.jl index aa913806..6c012a93 100644 --- a/test/data.jl +++ b/test/data.jl @@ -25,4 +25,4 @@ using Test @test size(Housing.features()) == (506, 13) @test Housing.targets() isa Array{Float64} -@test size(Housing.targets()) == (506, 1) \ No newline at end of file +@test size(Housing.targets()) == (506, 1) From 759fe9df2fb0a4665052383fae1b0fd8978a2f52 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Wed, 26 Feb 2020 20:27:39 +0100 Subject: [PATCH 24/46] update docs and export update! 
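A sketch of the training-step pattern the updated documentation describes; the model, loss and data below are placeholders:

```julia
using Flux
using Flux.Optimise: update!, Descent

W, b = rand(2, 5), rand(2)
loss(x, y) = sum((W * x .+ b .- y) .^ 2)

x, y = rand(5), rand(2)
θ  = params(W, b)
gs = gradient(() -> loss(x, y), θ)

opt = Descent(0.1)
for p in θ
  update!(opt, p, gs[p])   # mutates p in place according to the optimiser's rule
end
```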
--- docs/src/training/optimisers.md | 3 ++- src/optimise/Optimise.jl | 2 +- src/optimise/train.jl | 17 +++++++++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 5e8b95de..37288b5d 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -21,7 +21,7 @@ grads = gradient(() -> loss(x, y), θ) We want to update each parameter, using the gradient, in order to improve (reduce) the loss. Here's one way to do that: ```julia -using Flux: update! +using Flux.Optimise: update! η = 0.1 # Learning Rate for p in (W, b) @@ -46,6 +46,7 @@ An optimiser `update!` accepts a parameter and a gradient, and updates the param All optimisers return an object that, when passed to `train!`, will update the parameters passed to it. ```@docs +Flux.Optimise.update! Descent Momentum Nesterov diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index 68c18a6f..28a1849d 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -1,6 +1,6 @@ module Optimise -export train!, +export train!, update!, SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM, InvDecay, ExpDecay, WeightDecay, stop, Optimiser diff --git a/src/optimise/train.jl b/src/optimise/train.jl index ae0f334c..59404a42 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -1,9 +1,22 @@ using Juno import Zygote: Params, gradient + +""" + update!(opt, p, g) + update!(opt, ps::Params, gs) + +Perform an update step of the parameters `ps` (or the single parameter `p`) +according to optimizer `opt` and the gradients `gs` (the gradient `g`). + +As a result, the parameters are mutated and the optimizer's internal state may change. + + update!(x, x̄) + +Update the array `x` according to `x .-= x̄`. 
+""" function update!(x::AbstractArray, x̄) - x .+= x̄ - return x + x .-= x̄ end function update!(opt, x, x̄) From a121742f9c766b954f56a46e631333853e97d5ad Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 27 Feb 2020 13:56:05 +0530 Subject: [PATCH 25/46] pkg up --- Manifest.toml | 68 ++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 12986ccd..55f3e229 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -40,15 +40,15 @@ version = "2.1.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "1fce616fa0806c67c133eb1d2f68f0f1a7504665" +git-tree-sha1 = "5660775f2a3214420add960e1ff2baf46d5297cd" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "5.0.1" +version = "5.1.0" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "6e11d5c2c91fc623952e94c4fb73f9c4db74795a" +git-tree-sha1 = "e0c2805c9a7d338823c0d8f574242e284410fa61" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.7.0" +version = "2.9.1" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -74,6 +74,12 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" version = "0.2.0" +[[CompilerSupportLibraries_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "b57c5d019367c90f234a7bc7e24ff0a84971da5d" +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "0.2.0+1" + [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b" @@ -87,9 +93,9 @@ version = "1.1.0" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "f784254f428fb8fd7ac15982e5862a38a44523d3" +git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.7" +version = "0.17.10" [[Dates]] deps = ["Printf"] @@ -107,9 +113,9 @@ version = "1.0.2" [[DiffRules]] deps = ["NaNMath", "Random", "SpecialFunctions"] -git-tree-sha1 = "10dca52cf6d4a62d82528262921daf63b99704a2" +git-tree-sha1 = "eb0c34204c8410888844ada5359ac8b96292cfd1" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "1.0.0" +version = "1.0.1" [[Distributed]] deps = ["Random", "Serialization", "Sockets"] @@ -123,15 +129,15 @@ version = "1.2.0" [[FFTW_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "05674f209a6e3387dd103a945b0113eeb64b1a58" +git-tree-sha1 = "ddb57f4cf125243b4aa4908c94d73a805f3cbf2c" uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a" -version = "3.3.9+3" +version = "3.3.9+4" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "fec413d4fc547992eb62a5c544cedb6d7853c1f5" +git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.4" +version = "0.8.5" [[FixedPointNumbers]] git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" @@ -140,9 +146,9 @@ version = "0.6.1" [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "840700059391d36e2498d89c2e82c08f261f2a2a" +git-tree-sha1 = "88b082d492be6b63f967b6c96b352e25ced1a34c" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.8" +version 
= "0.10.9" [[GPUArrays]] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] @@ -152,9 +158,9 @@ version = "2.0.1" [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] -git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306" +git-tree-sha1 = "1a4355e4b5b50be2311ebb644f34f3306dbd0410" uuid = "7869d1d1-7146-5819-86e3-90919afe41df" -version = "0.3.0" +version = "0.3.1" [[IntelOpenMP_jll]] deps = ["Libdl", "Pkg"] @@ -192,10 +198,10 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" [[MKL_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "61069ae718b8ab1e325bbfb4e5268902e7ea08e3" +deps = ["IntelOpenMP_jll", "Libdl", "Pkg"] +git-tree-sha1 = "720629cc8cbd12c146ca01b661fd1a6cf66e2ff4" uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7" -version = "2019.0.117+0" +version = "2019.0.117+2" [[MacroTools]] deps = ["DataStructures", "Markdown", "Random"] @@ -234,10 +240,10 @@ uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" version = "0.3.3" [[OpenSpecFun_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" +deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] +git-tree-sha1 = "d110040968b9afe95c6bd9c6233570b0fe8abd22" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.3+1" +version = "0.5.3+2" [[OrderedCollections]] deps = ["Random", "Serialization", "Test"] @@ -273,9 +279,9 @@ version = "0.2.0" [[Requires]] deps = ["UUIDs"] -git-tree-sha1 = "999513b7dea8ac17359ed50ae8ea089e4464e35e" +git-tree-sha1 = "d37400976e98018ee840e0ca4f9d20baa231dc6b" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.0.0" +version = "1.0.1" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -298,9 +304,9 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[SpecialFunctions]] deps = ["OpenSpecFun_jll"] -git-tree-sha1 = "268052ee908b2c086cc0011f528694f02f3e2408" +git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "0.9.0" +version = "0.10.0" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] @@ -349,15 +355,17 @@ version = "0.9.0" [[Zlib_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "5618a43055eb09377edca21d19d0e99bce24a9c3" +git-tree-sha1 = "fd36a6739e256527287c5444960d0266712cd49e" uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.11+7" +version = "1.2.11+8" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "74382bcc4c1e8075e14554da67d75565f8fb7827" +git-tree-sha1 = "ab2683e7670925ed73b7f076b26847683e38db8c" +repo-rev = "master" +repo-url = "https://github.com/FluxML/Zygote.jl.git" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.4.5" +version = "0.4.7" [[ZygoteRules]] deps = ["MacroTools"] From 35f6998be7572bb557948d3cee65797be22c9019 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 27 Feb 2020 22:19:06 +0530 Subject: [PATCH 26/46] pkg up --- Manifest.toml | 60 +++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 55f3e229..693f7ca2 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -8,15 +8,15 @@ version = "0.5.0" [[AbstractTrees]] deps = ["Markdown"] -git-tree-sha1 = "8201f932428d25a2e2903300764515754847d87d" +git-tree-sha1 = "86d092c2599f1f7bb01668bf8eb3412f98d61e47" uuid = 
"1520ce14-60c1-5f80-bbc7-55ef81b5835c" -version = "0.3.0" +version = "0.3.2" [[Adapt]] deps = ["LinearAlgebra"] -git-tree-sha1 = "82dab828020b872fa9efd3abec1152b075bc7cbf" +git-tree-sha1 = "c88cfc7f9c1f9f8633cddf0b56e86302b70f64c5" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "1.0.0" +version = "1.0.1" [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -34,21 +34,21 @@ version = "0.2.0" [[CUDAapi]] deps = ["Libdl", "Logging"] -git-tree-sha1 = "56a813440ac98a1aa64672ab460a1512552211a7" +git-tree-sha1 = "d7ceadd8f821177d05b897c0517e94633db535fe" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" -version = "2.1.0" +version = "3.1.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "5660775f2a3214420add960e1ff2baf46d5297cd" +git-tree-sha1 = "01e90fa34e25776bc7c8661183d4519149ebfe59" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "5.1.0" +version = "6.0.0" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "e0c2805c9a7d338823c0d8f574242e284410fa61" +git-tree-sha1 = "f86269ff60ebe082a2806ecbce51f3cadc68afe9" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.9.1" +version = "2.10.2" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -58,15 +58,15 @@ version = "0.6.0" [[ColorTypes]] deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "7b62b728a5f3dd6ee3b23910303ccf27e82fad5e" +git-tree-sha1 = "b9de8dc6106e09c79f3f776c27c62360d30e5eb8" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.8.1" +version = "0.9.1" [[Colors]] deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"] -git-tree-sha1 = "c9c1845d6bf22e34738bee65c357a69f416ed5d1" +git-tree-sha1 = "177d8b959d3c103a6d57574c38ee79c81059c31b" uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" -version = "0.9.6" +version = "0.11.2" [[CommonSubexpressions]] deps = ["Test"] @@ -82,9 +82,9 @@ version = "0.2.0+1" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b" +git-tree-sha1 = "7c20c5a45bb245cf248f454d26966ea70255b271" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.6.0" +version = "1.7.2" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -140,9 +140,9 @@ uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" version = "0.8.5" [[FixedPointNumbers]] -git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" +git-tree-sha1 = "4aaea64dd0c30ad79037084f8ca2b94348e65eaa" uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" -version = "0.6.1" +version = "0.7.1" [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] @@ -173,10 +173,10 @@ deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[Juno]] -deps = ["Base64", "Logging", "Media", "Profile", "Test"] -git-tree-sha1 = "30d94657a422d09cb97b6f86f04f750fa9c50df8" +deps = ["Base64", "Logging", "Media", "Profile"] +git-tree-sha1 = "4f2249fb58cfb140eeb89428e31791e2f8959d8c" uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" -version = "0.7.2" +version = "0.8.0" [[LLVM]] deps = ["CEnum", "Libdl", "Printf", "Unicode"] @@ -205,9 +205,9 @@ version = "2019.0.117+2" [[MacroTools]] deps = ["DataStructures", "Markdown", "Random"] -git-tree-sha1 = 
"e2fc7a55bb2224e203bbd8b59f72b91323233458" +git-tree-sha1 = "07ee65e03e28ca88bc9a338a3726ae0c3efaa94b" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.3" +version = "0.5.4" [[Markdown]] deps = ["Base64"] @@ -230,9 +230,9 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[NNlib]] deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"] -git-tree-sha1 = "135c0de4794d5e214b06f1fb4787af4a72896e61" +git-tree-sha1 = "755c0bab3912ff782167e1b4b774b833f8a0e550" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.6.2" +version = "0.6.4" [[NaNMath]] git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f" @@ -320,9 +320,9 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[StatsBase]] deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "c53e809e63fe5cf5de13632090bc3520649c9950" +git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.32.0" +version = "0.32.1" [[Test]] deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] @@ -349,9 +349,9 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [[ZipFile]] deps = ["Libdl", "Printf", "Zlib_jll"] -git-tree-sha1 = "5de8320a46812da1a8ca98b16a8a4546d44efa62" +git-tree-sha1 = "8748302cfdec02c4ae9c97b112cf10003f7f767f" uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" -version = "0.9.0" +version = "0.9.1" [[Zlib_jll]] deps = ["Libdl", "Pkg"] @@ -361,7 +361,7 @@ version = "1.2.11+8" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "ab2683e7670925ed73b7f076b26847683e38db8c" +git-tree-sha1 = "3c65158c0aa0808cdfff8bca2a36430b038aad00" repo-rev = "master" repo-url = "https://github.com/FluxML/Zygote.jl.git" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" From 425fcdbe6964d581b4d5f6eda1615e883a83b5bd Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 29 Feb 2020 11:14:48 +0100 Subject: [PATCH 27/46] NNlib docs + misc docs improvements --- docs/make.jl | 3 +- docs/src/gpu.md | 4 +- docs/src/models/layers.md | 30 ++++------- docs/src/models/nnlib.md | 37 +++++++++++++ docs/src/models/regularisation.md | 4 +- src/layers/normalise.jl | 16 ++++-- src/layers/stateless.jl | 87 ++++++++++++++++++------------- 7 files changed, 115 insertions(+), 66 deletions(-) create mode 100644 docs/src/models/nnlib.md diff --git a/docs/make.jl b/docs/make.jl index b950e959..fe3544fc 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -13,7 +13,8 @@ makedocs(modules=[Flux, NNlib], ["Basics" => "models/basics.md", "Recurrence" => "models/recurrence.md", "Regularisation" => "models/regularisation.md", - "Model Reference" => "models/layers.md"], + "Model Reference" => "models/layers.md", + "NNlib" => "models/nnlib.md"], "Training Models" => ["Optimisers" => "training/optimisers.md", "Training" => "training/training.md"], diff --git a/docs/src/gpu.md b/docs/src/gpu.md index bb13fdd1..19d0c8c6 100644 --- a/docs/src/gpu.md +++ b/docs/src/gpu.md @@ -30,7 +30,7 @@ If you define a structured model, like a `Dense` layer or `Chain`, you just need ```julia d = Dense(10, 5, σ) d = fmap(cu, d) -d.W # Tracked CuArray +d.W # CuArray d(cu(rand(10))) # CuArray output m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) @@ -53,7 +53,7 @@ julia> x = rand(10) |> gpu 0.511655 julia> m(x) -Tracked 5-element 
CuArray{Float32,1}: +5-element CuArray{Float32,1}: -0.30535 ⋮ -0.618002 diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 5f2ab3ce..41e98f32 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -40,19 +40,6 @@ Maxout SkipConnection ``` -## Activation Functions - -Non-linearities that go between layers of your model. Most of these functions are defined in [NNlib](https://github.com/FluxML/NNlib.jl) but are available by default in Flux. - -Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on. - -```@docs -σ -relu -leakyrelu -elu -swish -``` ## Normalisation & Regularisation @@ -61,6 +48,7 @@ These layers don't affect the structure of the network but may improve training ```@docs BatchNorm Dropout +Flux.dropout AlphaDropout LayerNorm GroupNorm @@ -68,12 +56,12 @@ GroupNorm ## Cost Functions ```@docs -mse -crossentropy -logitcrossentropy -binarycrossentropy -logitbinarycrossentropy -kldivergence -poisson -hinge +Flux.mse +Flux.crossentropy +Flux.logitcrossentropy +Flux.binarycrossentropy +Flux.logitbinarycrossentropy +Flux.kldivergence +Flux.poisson +Flux.hinge ``` diff --git a/docs/src/models/nnlib.md b/docs/src/models/nnlib.md new file mode 100644 index 00000000..f5732574 --- /dev/null +++ b/docs/src/models/nnlib.md @@ -0,0 +1,37 @@ +## NNlib +Flux re-exports all of the functions exported by the [NNlib](https://github.com/FluxML/NNlib.jl) package. + +## Activation Functions +Non-linearities that go between layers of your model. Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on. + +```@docs +NNlib.elu +NNlib.gelu +NNlib.leakyrelu +NNlib.logcosh +NNlib.logsigmoid +NNlib.sigmoid +NNlib.relu +NNlib.selu +NNlib.softplus +NNlib.softsign +NNlib.swish +``` + +## Softmax +```@docs +NNlib.softmax +NNlib.logsoftmax +``` + +## Pooling +```@docs +NNlib.maxpool +NNlib.meanpool +``` + +## Convolution +```@docs +NNlib.conv +NNlib.depthwiseconv +``` \ No newline at end of file diff --git a/docs/src/models/regularisation.md b/docs/src/models/regularisation.md index e1d88d77..02aa3da8 100644 --- a/docs/src/models/regularisation.md +++ b/docs/src/models/regularisation.md @@ -31,7 +31,7 @@ julia> params(m) param([0.0, 0.0, 0.0, 0.0, 0.0]) julia> sum(norm, params(m)) -26.01749952921026 (tracked) +26.01749952921026 ``` Here's a larger example with a multi-layer perceptron. @@ -52,7 +52,7 @@ One can also easily add per-layer regularisation via the `activations` function: ```julia julia> using Flux: activations -julia> c = Chain(Dense(10,5,σ),Dense(5,2),softmax) +julia> c = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) Chain(Dense(10, 5, σ), Dense(5, 2), softmax) julia> activations(c, rand(10)) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index b421d3e7..2268fdc0 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -7,6 +7,16 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(s _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0) +""" + dropout(p, dims = :) + +Dropout function. For each input, either sets that input to `0` (with probability +`p`) or scales it by `1/(1-p)`. The `dims` argument is to specify the unbroadcasted +dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is +used as a regularisation, i.e. it reduces overfitting during training. + +See also [`Dropout`](@ref). 
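+
+A minimal usage sketch (assuming the function is called as `Flux.dropout`; note that
+the method defined just below takes the input as its first argument,
+`dropout(x, p; dims = :)`, and outside of a gradient call it simply returns `x`):
+
+```julia
+x = rand(Float32, 3, 4)
+y = Flux.dropout(x, 0.5; dims = 1)  # same size as `x`; the random mask is only
+                                    # applied during gradient computation
+```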
+""" dropout(x, p; dims = :) = x @adjoint function dropout(x, p; dims = :) @@ -18,10 +28,7 @@ end """ Dropout(p, dims = :) -A Dropout layer. For each input, either sets that input to `0` (with probability -`p`) or scales it by `1/(1-p)`. The `dims` argument is to specified the unbroadcasted - dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is - used as a regularisation, i.e. it reduces overfitting during training. see also [`dropout`](@ref). +A Dropout layer. In the forward pass, applies the [`dropout`](@ref) function on the input. """ mutable struct Dropout{F,D} p::F @@ -43,6 +50,7 @@ end """ AlphaDropout(p) + A dropout layer. It is used in Self-Normalizing Neural Networks. (https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf) The AlphaDropout layer ensures that mean and variance of activations remains the same as before. diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 159a8385..5de5842b 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -1,10 +1,12 @@ -using CuArrays -using NNlib: logsoftmax, logσ - # Cost functions +""" + mse(ŷ, y) +Return the mean squared error `sum((ŷ .- y).^2) / length(y)`. +""" mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) + function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing) return -sum(y .* log.(ŷ)) * 1 // size(y, 2) end @@ -17,10 +19,26 @@ function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Abstr return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2) end +""" + crossentropy(ŷ, y; weight=1) + +Return the crossentropy computed as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`. + +See also [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref). +""" crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight) -function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) - return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2) +""" + logitcrossentropy(ŷ, y; weight=1) + +Return the crossentropy computed after a [softmax](@ref) operation: + + -sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2) + +See also [`crossentropy`](@ref), [`binarycrossentropy`](@ref). +""" +function logitcrossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) + return -sum(y .* logsoftmax(ŷ) .* weight) * 1 // size(y, 2) end """ @@ -28,11 +46,7 @@ end Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerical stability. - julia> binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0.]) - 3-element Array{Float64,1}: - 1.4244 - 0.352317 - 0.86167 +Typically, the prediction `ŷ` is given by the output of a [`sigmoid`](@ref) activation. """ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) @@ -40,44 +54,42 @@ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) """ - logitbinarycrossentropy(logŷ, y) + logitbinarycrossentropy(ŷ, y) -`logitbinarycrossentropy(logŷ, y)` is mathematically equivalent to `binarycrossentropy(σ(logŷ), y)` +`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to `binarycrossentropy(σ(ŷ), y)` but it is more numerically stable. - julia> logitbinarycrossentropy.([-1.1491, 0.8619, 0.3127], [1, 1, 0.]) - 3-element Array{Float64,1}: - 1.4244 - 0.352317 - 0.86167 +See also [`binarycrossentropy`](@ref), [`sigmoid`](@ref), [`logsigmoid`](@ref). 
""" -logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) +logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ) # Re-definition to fix interaction with CuArrays. -CuArrays.@cufunc logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) +CuArrays.@cufunc logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ) """ - normalise(x::AbstractArray; dims=1) + normalise(x; dims=1) Normalises `x` to mean 0 and standard deviation 1, across the dimensions given by `dims`. Defaults to normalising over columns. - julia> a = reshape(collect(1:9), 3, 3) - 3×3 Array{Int64,2}: - 1 4 7 - 2 5 8 - 3 6 9 +```julia-repl +julia> a = reshape(collect(1:9), 3, 3) +3×3 Array{Int64,2}: + 1 4 7 + 2 5 8 + 3 6 9 - julia> normalise(a) - 3×3 Array{Float64,2}: - -1.22474 -1.22474 -1.22474 - 0.0 0.0 0.0 - 1.22474 1.22474 1.22474 +julia> normalise(a) +3×3 Array{Float64,2}: + -1.22474 -1.22474 -1.22474 + 0.0 0.0 0.0 + 1.22474 1.22474 1.22474 - julia> normalise(a, dims=2) - 3×3 Array{Float64,2}: - -1.22474 0.0 1.22474 - -1.22474 0.0 1.22474 - -1.22474 0.0 1.22474 +julia> normalise(a, dims=2) +3×3 Array{Float64,2}: + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 +``` """ function normalise(x::AbstractArray; dims=1) μ′ = mean(x, dims = dims) @@ -87,6 +99,7 @@ end """ kldivergence(ŷ, y) + KLDivergence is a measure of how much one probability distribution is different from the other. It is always non-negative and zero only when both the distributions are equal everywhere. [KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence). @@ -99,6 +112,7 @@ end """ poisson(ŷ, y) + Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. [Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). """ @@ -106,7 +120,8 @@ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ hinge(ŷ, y) -Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). + +Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1). [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss). """ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) From 169ed6eb25e3867f23c80af972830f6e8a1361b6 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sat, 29 Feb 2020 13:43:03 +0100 Subject: [PATCH 28/46] add ecosystem --- docs/src/ecosystem.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 docs/src/ecosystem.md diff --git a/docs/src/ecosystem.md b/docs/src/ecosystem.md new file mode 100644 index 00000000..e315244d --- /dev/null +++ b/docs/src/ecosystem.md @@ -0,0 +1,18 @@ +# The Julia Ecosystem + +One of the main strengths of Julia lies in an ecosystem of packages +globally providing a rich and consistent user experience. + +This is a non-exhaustive list of Julia packages, nicely complementing `Flux` in typical +machine learning and deep learning workflows: + +- [ArgParse.jl](https://github.com/carlobaldassi/ArgParse.jl): package for parsing command-line arguments to Julia programs. +- [Augmentor.jl](https://github.com/Evizero/Augmentor.jl): a fast image augmentation library in Julia for machine learning. 
+- [BSON.jl](https://github.com/JuliaIO/BSON.jl): package for working with the Binary JSON serialisation format +- [DataFrames.jl](https://github.com/joshday/OnlineStats.jl): in-memory tabular data in Julia +- [DrWatson.jl](https://github.com/JuliaDynamics/DrWatson.jl): a scientific project assistant software +- [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl): utility package for accessing common machine learning datasets +- [OnlineStats.jl](https://github.com/joshday/OnlineStats.jl): single-pass algorithms for statistics +- [Parameters.jl](https://github.com/mauro3/Parameters.jl): types with default field values, keyword constructors and (un-)pack macros +- [ProgressMeters.jl](https://github.com/timholy/ProgressMeter.jl): progress meters for long-running computations +- [TensorBoardLogger.jl](https://github.com/PhilipVinc/TensorBoardLogger.jl): easy peasy logging to [tensorboard](https://www.tensorflow.org/tensorboard) in Julia From 4109f2e0d76bf88448b08bc45c6c4630ca25c1e7 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sat, 29 Feb 2020 13:45:17 +0100 Subject: [PATCH 29/46] cleanup --- docs/make.jl | 1 + docs/src/models/nnlib.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index fe3544fc..7f73808a 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -21,6 +21,7 @@ makedocs(modules=[Flux, NNlib], "One-Hot Encoding" => "data/onehot.md", "GPU Support" => "gpu.md", "Saving & Loading" => "saving.md", + "The Julia Ecosystem" => "ecosystem.md", "Performance Tips" => "performance.md", "Community" => "community.md"], format = Documenter.HTML(assets = ["assets/flux.css"], diff --git a/docs/src/models/nnlib.md b/docs/src/models/nnlib.md index f5732574..9e570cb3 100644 --- a/docs/src/models/nnlib.md +++ b/docs/src/models/nnlib.md @@ -1,4 +1,4 @@ -## NNlib +# NNlib Flux re-exports all of the functions exported by the [NNlib](https://github.com/FluxML/NNlib.jl) package. ## Activation Functions From 4f693e02cb210535aa19d16b4f04adf840b018c8 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sat, 29 Feb 2020 13:50:23 +0100 Subject: [PATCH 30/46] add model zoo reference --- docs/src/ecosystem.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/src/ecosystem.md b/docs/src/ecosystem.md index e315244d..0672ffe6 100644 --- a/docs/src/ecosystem.md +++ b/docs/src/ecosystem.md @@ -16,3 +16,6 @@ machine learning and deep learning workflows: - [Parameters.jl](https://github.com/mauro3/Parameters.jl): types with default field values, keyword constructors and (un-)pack macros - [ProgressMeters.jl](https://github.com/timholy/ProgressMeter.jl): progress meters for long-running computations - [TensorBoardLogger.jl](https://github.com/PhilipVinc/TensorBoardLogger.jl): easy peasy logging to [tensorboard](https://www.tensorflow.org/tensorboard) in Julia + + +This tight integration among Julia pakages is shown in some of the examples in the [model-zoo](https://github.com/FluxML/model-zoo) repository. \ No newline at end of file From b6c79b38b4bf54aba0ee096b38afd1180ad1ee55 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Wed, 26 Feb 2020 13:48:27 +0100 Subject: [PATCH 31/46] add DataLoader special case train! 
for the unsupervised data iterator --- Manifest.toml | 2 +- Project.toml | 5 +- docs/make.jl | 4 +- docs/src/data/dataloader.md | 6 +++ docs/src/training/training.md | 19 +++++-- src/Flux.jl | 1 + src/data/Data.jl | 10 ++++ src/data/dataloader.jl | 88 +++++++++++++++++++++++++++++++++ src/optimise/train.jl | 19 ++++--- test/data.jl | 93 ++++++++++++++++++++++++++++------- test/runtests.jl | 59 ++++++++++++++-------- 11 files changed, 253 insertions(+), 53 deletions(-) create mode 100644 docs/src/data/dataloader.md create mode 100644 src/data/dataloader.jl diff --git a/Manifest.toml b/Manifest.toml index 693f7ca2..788e5354 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -252,7 +252,7 @@ uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.1.0" [[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] diff --git a/Project.toml b/Project.toml index 71282a10..bd105730 100644 --- a/Project.toml +++ b/Project.toml @@ -40,7 +40,10 @@ julia = "1" [extras] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + [targets] -test = ["Test", "Documenter"] +test = ["Test", "Documenter", "IterTools", "LinearAlgebra"] diff --git a/docs/make.jl b/docs/make.jl index fe3544fc..0d597500 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,10 +15,12 @@ makedocs(modules=[Flux, NNlib], "Regularisation" => "models/regularisation.md", "Model Reference" => "models/layers.md", "NNlib" => "models/nnlib.md"], + "Handling Data" => + ["One-Hot Encoding" => "data/onehot.md", + "DataLoader" => "data/dataloader.md"], "Training Models" => ["Optimisers" => "training/optimisers.md", "Training" => "training/training.md"], - "One-Hot Encoding" => "data/onehot.md", "GPU Support" => "gpu.md", "Saving & Loading" => "saving.md", "Performance Tips" => "performance.md", diff --git a/docs/src/data/dataloader.md b/docs/src/data/dataloader.md new file mode 100644 index 00000000..70a883c9 --- /dev/null +++ b/docs/src/data/dataloader.md @@ -0,0 +1,6 @@ +# DataLoader +Flux provides the `DataLoader` type in the `Flux.Data` module to handle iteration over mini-batches of data. + +```@docs +Flux.Data.DataLoader +``` \ No newline at end of file diff --git a/docs/src/training/training.md b/docs/src/training/training.md index b42db7c9..64b2b5e8 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -7,10 +7,10 @@ To actually train a model we need four things: * A collection of data points that will be provided to the objective function. * An [optimiser](optimisers.md) that will update the model parameters appropriately. -With these we can call `Flux.train!`: +With these we can call `train!`: -```julia -Flux.train!(objective, params, data, opt) +```@docs +Flux.Optimise.train! ``` There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). @@ -56,7 +56,8 @@ data = [(x, y)] ```julia data = [(x, y), (x, y), (x, y)] # Or equivalently -data = Iterators.repeated((x, y), 3) +using IterTools: ncycle +data = ncycle([(x, y)], 3) ``` It's common to load the `x`s and `y`s separately. 
In this case you can use `zip`: @@ -67,6 +68,14 @@ ys = [rand( 10), rand( 10), rand( 10)] data = zip(xs, ys) ``` +Training data can be conveniently partitioned for mini-batch training using the [`Flux.Data.DataLoader`](@ref) type: + +```julia +X = rand(28, 28, 60000) +Y = rand(0:9, 60000) +data = DataLoader(X, Y, batchsize=128) +``` + Note that, by default, `train!` only loops over the data once (a single "epoch"). A convenient way to run multiple epochs from the REPL is provided by `@epochs`. @@ -120,7 +129,7 @@ An example follows that works similar to the default `Flux.train` but with no ca You don't need callbacks if you just code the calls to your functions directly into the loop. E.g. in the places marked with comments. -``` +```julia function my_custom_train!(loss, ps, data, opt) ps = Params(ps) for d in data diff --git a/src/Flux.jl b/src/Flux.jl index 9969b323..c99e41a1 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -7,6 +7,7 @@ using Zygote, MacroTools, Juno, Reexport, Statistics, Random using MacroTools: @forward @reexport using NNlib using Zygote: Params, @adjoint, gradient, pullback, @nograd + export gradient export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, diff --git a/src/data/Data.jl b/src/data/Data.jl index 88af9549..940b7ea7 100644 --- a/src/data/Data.jl +++ b/src/data/Data.jl @@ -3,6 +3,9 @@ module Data import ..Flux import SHA +using Random: shuffle! +using Base: @propagate_inbounds + export CMUDict, cmudict deps(path...) = joinpath(@__DIR__, "..", "..", "deps", path...) @@ -26,6 +29,9 @@ function __init__() mkpath(deps()) end +include("dataloader.jl") +export DataLoader + include("mnist.jl") export MNIST @@ -42,7 +48,11 @@ using .Sentiment include("iris.jl") export Iris +<<<<<<< HEAD include("housing.jl") export Housing end +======= +end #module +>>>>>>> af20a785... add DataLoader diff --git a/src/data/dataloader.jl b/src/data/dataloader.jl new file mode 100644 index 00000000..baf32a83 --- /dev/null +++ b/src/data/dataloader.jl @@ -0,0 +1,88 @@ +# Adapted from Knet's src/data.jl (author: Deniz Yuret) + +struct DataLoader + data + batchsize::Int + nobs::Int + partial::Bool + imax::Int + indices::Vector{Int} + shuffle::Bool +end + +""" + DataLoader(data...; batchsize=1, shuffle=false, partial=true) + +An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations +(except possibly the last one). + +Takes as input one or more data tensors, e.g. X in unsupervised learning, X and Y in +supervised learning. The last dimension in each tensor is considered to be the observation +dimension. + +If `shuffle=true`, shuffles the observations each time iterations are re-started. +If `partial=false`, drops the last mini-batch if it is smaller than the batchsize. + +Example usage: + + Xtrain = rand(10, 100) + dtrain = DataLoader(Xtrain, batchsize=2) + # iterate over 50 mini-batches + for x in dtrain: + @assert size(x) == (10, 2) + ... + end + + Xtrain = rand(10, 100) + Ytrain = rand(100) + dtrain = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true) + for epoch in 1:100 + for (x, y) in dtrain: + @assert size(x) == (10, 2) + @assert size(y) == (2,) + ... 
+ end + end + + # train for 10 epochs + using IterTools: ncycle + Flux.train!(loss, ps, ncycle(dtrain, 10), opt) +""" +function DataLoader(data...; batchsize=1, shuffle=false, partial=true) + length(data) > 0 || throw(ArgumentError("Need at least one data input")) + batchsize > 0 || throw(ArgumentError("Need positive batchsize")) + + nx = size(data[1])[end] + for i=2:length(data) + nx != size(data[i])[end] && throw(DimensionMismatch("All data should contain same number of observations")) + end + if nx < batchsize + @warn "Number of data points less than batchsize, decreasing the batchsize to $nx" + batchsize = nx + end + imax = partial ? nx : nx - batchsize + 1 + ids = 1:min(nx, batchsize) + DataLoader(data, batchsize, nx, partial, imax, [1:nx;], shuffle) +end + +getdata(x::AbstractArray, ids) = x[(Base.Colon() for _=1:ndims(x)-1)..., ids] + +@propagate_inbounds function Base.iterate(d::DataLoader, i=0) # returns data in d.indices[i+1:i+batchsize] + i >= d.imax && return nothing + if d.shuffle && i == 0 + shuffle!(d.indices) + end + nexti = min(i + d.batchsize, d.nobs) + ids = d.indices[i+1:nexti] + if length(d.data) == 1 + batch = getdata(d.data[1], ids) + else + batch = ((getdata(x, ids) for x in d.data)...,) + end + return (batch, nexti) +end + +function Base.length(d::DataLoader) + n = d.nobs / d.batchsize + d.partial ? ceil(Int,n) : floor(Int,n) +end \ No newline at end of file diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 59404a42..34a98394 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -61,13 +61,14 @@ end For each datapoint `d` in `data` computes the gradient of `loss(d...)` through backpropagation and calls the optimizer `opt`. +In case datapoints `d` are of array type, assumes no splatting is needed +and computes the gradient of `loss(d)`. + Takes a callback as keyword argument `cb`. For example, this will print "training" every 10 seconds: -```julia -Flux.train!(loss, params, data, opt, - cb = throttle(() -> println("training"), 10)) -``` + train!(loss, params, data, opt, + cb = throttle(() -> println("training"), 10)) The callback can call `Flux.stop()` to interrupt the training loop. @@ -78,8 +79,14 @@ function train!(loss, ps, data, opt; cb = () -> ()) cb = runall(cb) @progress for d in data try - gs = gradient(ps) do - loss(d...) + if d isa AbstractArray + gs = gradient(ps) do + loss(d) + end + else + gs = gradient(ps) do + loss(d...) 
+ end end update!(opt, ps, gs) cb() diff --git a/test/data.jl b/test/data.jl index 6c012a93..1a090174 100644 --- a/test/data.jl +++ b/test/data.jl @@ -1,28 +1,85 @@ -using Flux.Data -using Test +@testset "DataLoader" begin + X = reshape([1:10;], (2, 5)) + Y = [1:5;] -@test cmudict()["CATASTROPHE"] == :[K,AH0,T,AE1,S,T,R,AH0,F,IY0].args + d = DataLoader(X, batchsize=2) + batches = collect(d) + @test length(batches) == 3 + @test batches[1] == X[:,1:2] + @test batches[2] == X[:,3:4] + @test batches[3] == X[:,5:5] -@test length(CMUDict.phones()) == 39 + d = DataLoader(X, batchsize=2, partial=false) + batches = collect(d) + @test length(batches) == 2 + @test batches[1] == X[:,1:2] + @test batches[2] == X[:,3:4] -@test length(CMUDict.symbols()) == 84 + d = DataLoader(X, Y, batchsize=2) + batches = collect(d) + @test length(batches) == 3 + @test length(batches[1]) == 2 + @test length(batches[2]) == 2 + @test length(batches[3]) == 2 + @test batches[1][1] == X[:,1:2] + @test batches[1][2] == Y[1:2] + @test batches[2][1] == X[:,3:4] + @test batches[2][2] == Y[3:4] + @test batches[3][1] == X[:,5:5] + @test batches[3][2] == Y[5:5] -@test MNIST.images()[1] isa Matrix -@test MNIST.labels() isa Vector{Int64} + # test interaction with `train!` + θ = ones(2) + X = zeros(2, 10) + loss(x) = sum((x .- θ).^2) + d = DataLoader(X) + Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1)) + @test norm(θ) < 1e-4 -@test FashionMNIST.images()[1] isa Matrix -@test FashionMNIST.labels() isa Vector{Int64} + # test interaction with `train!` + θ = zeros(2) + X = ones(2, 10) + Y = fill(2, 10) + loss(x, y) = sum((y - x'*θ).^2) + d = DataLoader(X, Y) + Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1)) + @test norm(θ .- 1) < 1e-10 +end -@test Data.Sentiment.train() isa Vector{Data.Tree{Any}} +@testset "CMUDict" begin + @test cmudict()["CATASTROPHE"] == :[K,AH0,T,AE1,S,T,R,AH0,F,IY0].args -@test Iris.features() isa Matrix -@test size(Iris.features()) == (4,150) + @test length(CMUDict.phones()) == 39 -@test Iris.labels() isa Vector{String} -@test size(Iris.labels()) == (150,) + @test length(CMUDict.symbols()) == 84 +end -@test Housing.features() isa Matrix -@test size(Housing.features()) == (506, 13) +@testset "MNIST" begin + @test MNIST.images()[1] isa Matrix + @test MNIST.labels() isa Vector{Int64} +end -@test Housing.targets() isa Array{Float64} -@test size(Housing.targets()) == (506, 1) +@testset "FashionMNIST" begin + @test FashionMNIST.images()[1] isa Matrix + @test FashionMNIST.labels() isa Vector{Int64} +end + +@testset "Sentiment" begin + @test Data.Sentiment.train() isa Vector{Data.Tree{Any}} +end + +@testset "Iris" begin + @test Iris.features() isa Matrix + @test size(Iris.features()) == (4,150) + + @test Iris.labels() isa Vector{String} + @test size(Iris.labels()) == (150,) +end + +@testest "Housing" begin + @test Housing.features() isa Matrix + @test size(Housing.features()) == (506, 13) + + @test Housing.targets() isa Array{Float64} + @test size(Housing.targets()) == (506, 1) +end diff --git a/test/runtests.jl b/test/runtests.jl index 1505e96a..81182f0d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,32 +1,49 @@ -using Flux, Test, Random, Statistics, Documenter -using Random +using Flux +using Flux.Data +using Test +using Random, Statistics, LinearAlgebra +using Documenter +using IterTools: ncycle Random.seed!(0) @testset "Flux" begin -@info "Testing Basics" + @testset "Utils" begin + include("utils.jl") + end -include("utils.jl") -include("onehot.jl") -include("optimise.jl") -include("data.jl") + 
@testset "Onehot" begin + include("onehot.jl") + end -@info "Testing Layers" + @testset "Optimise" begin + include("optimise.jl") + end -include("layers/basic.jl") -include("layers/normalisation.jl") -include("layers/stateless.jl") -include("layers/conv.jl") + @testset "Data" begin + include("data.jl") + end -if Flux.use_cuda[] - include("cuda/cuda.jl") -else - @warn "CUDA unavailable, not testing GPU support" -end + @testset "Layers" begin + include("layers/basic.jl") + include("layers/normalisation.jl") + include("layers/stateless.jl") + include("layers/conv.jl") + end -if VERSION >= v"1.2" - doctest(Flux) -end + @testset "CUDA" begin + if Flux.use_cuda[] + include("cuda/cuda.jl") + else + @warn "CUDA unavailable, not testing GPU support" + end + end -end + @testset "Docs" begin + if VERSION >= v"1.2" + doctest(Flux) + end + end + +end # testset Flux From 487002878ed530303cf9527e7cca0ea57b34d5b2 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Thu, 27 Feb 2020 20:49:05 +0100 Subject: [PATCH 32/46] restrict train! special casing --- src/optimise/train.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 34a98394..54b7f53a 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -79,7 +79,7 @@ function train!(loss, ps, data, opt; cb = () -> ()) cb = runall(cb) @progress for d in data try - if d isa AbstractArray + if d isa AbstractArray{<:Number} gs = gradient(ps) do loss(d) end From 97141e8c98fc94feadbe287f45a32b58bd3d515c Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Thu, 27 Feb 2020 20:49:55 +0100 Subject: [PATCH 33/46] improve docstring --- src/optimise/train.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 54b7f53a..79ebcc06 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -61,7 +61,7 @@ end For each datapoint `d` in `data` computes the gradient of `loss(d...)` through backpropagation and calls the optimizer `opt`. -In case datapoints `d` are of array type, assumes no splatting is needed +In case datapoints `d` are of numeric array type, assumes no splatting is needed and computes the gradient of `loss(d)`. Takes a callback as keyword argument `cb`. For example, this will print "training" From a72258ea2a428ce4b12e711395856091f17f9fcc Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sat, 29 Feb 2020 18:55:49 +0100 Subject: [PATCH 34/46] fix rebase --- src/data/Data.jl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/data/Data.jl b/src/data/Data.jl index 940b7ea7..16a025a7 100644 --- a/src/data/Data.jl +++ b/src/data/Data.jl @@ -48,11 +48,7 @@ using .Sentiment include("iris.jl") export Iris -<<<<<<< HEAD include("housing.jl") export Housing end -======= -end #module ->>>>>>> af20a785... add DataLoader From a1efc434c21d2e4026e5d4f8764854451bac88c5 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sat, 29 Feb 2020 19:40:44 +0100 Subject: [PATCH 35/46] fix typo --- test/data.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data.jl b/test/data.jl index 1a090174..c7a8fdfd 100644 --- a/test/data.jl +++ b/test/data.jl @@ -76,7 +76,7 @@ end @test size(Iris.labels()) == (150,) end -@testest "Housing" begin +@testset "Housing" begin @test Housing.features() isa Matrix @test size(Housing.features()) == (506, 13) From 5cbd2cecf29cf58a4e4bd97e637515c299a522d8 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 29 Feb 2020 16:09:59 -0600 Subject: [PATCH 36/46] Changed testmode! 
to return model --- src/functor.jl | 2 +- src/layers/basic.jl | 2 +- src/layers/normalise.jl | 10 +++++----- test/layers/normalisation.jl | 16 ++++++++-------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index 4edfbd98..ee384b98 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -50,7 +50,7 @@ Possible values include: - `true` for testing - `:auto` or `nothing` for Flux to detect the mode automatically """ -testmode!(m, mode) = nothing +testmode!(m, mode = true) = m params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 6788f761..10d1f07b 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -33,7 +33,7 @@ applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) -testmode!(m::Chain, mode = true) = map(x -> testmode!(x, mode), m.layers) +testmode!(m::Chain, mode = true) = (map(x -> testmode!(x, mode), m.layers); m) function Base.show(io::IO, c::Chain) print(io, "Chain(") diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7b438bc2..36c6d2bd 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -44,7 +44,7 @@ function (a::Dropout)(x) end testmode!(m::Dropout, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) function Base.show(io::IO, d::Dropout) print(io, "Dropout(", d.p) @@ -83,7 +83,7 @@ function (a::AlphaDropout)(x) end testmode!(m::AlphaDropout, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) """ LayerNorm(h::Integer) @@ -191,7 +191,7 @@ end @functor BatchNorm testmode!(m::BatchNorm, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) function Base.show(io::IO, l::BatchNorm) print(io, "BatchNorm($(join(size(l.β), ", "))") @@ -290,7 +290,7 @@ end @functor InstanceNorm testmode!(m::InstanceNorm, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) function Base.show(io::IO, l::InstanceNorm) print(io, "InstanceNorm($(join(size(l.β), ", "))") @@ -393,7 +393,7 @@ end @functor GroupNorm testmode!(m::GroupNorm, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode) + (m.active = (isnothing(mode) || mode == :auto) ? 
nothing : !mode; m) function Base.show(io::IO, l::GroupNorm) print(io, "GroupNorm($(join(size(l.β), ", "))") diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 594fb586..79bd9c77 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -85,19 +85,19 @@ end @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7) end - let m = trainmode(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1) + let m = testmode!(BatchNorm(2), false), x = reshape(Float32.(1:6), 3, 2, 1) y = reshape(permutedims(x, [2, 1, 3]), 2, :) y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3]) @test m(x) == y end - let m = trainmode(BatchNorm(2)), x = reshape(Float32.(1:12), 2, 3, 2, 1) + let m = testmode!(BatchNorm(2), false), x = reshape(Float32.(1:12), 2, 3, 2, 1) y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :) y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4]) @test m(x) == y end - let m = trainmode(BatchNorm(2)), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1) + let m = testmode!(BatchNorm(2), false), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1) y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :) y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5]) @test m(x) == y @@ -165,7 +165,7 @@ end @test isapprox(y, sigmoid.((x .- expand_inst(m.μ, affine_shape)) ./ sqrt.(expand_inst(m.σ², affine_shape) .+ m.ϵ)), atol = 1.0e-7) end - let m = trainmode(InstanceNorm(2)), sizes = (2, 4, 1, 2, 3), + let m = testmode!(InstanceNorm(2), false), sizes = (2, 4, 1, 2, 3), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) y = reshape(m(y), sizes...) @@ -182,7 +182,7 @@ end end # show that instance norm is equal to batch norm when channel and batch dims are squashed - let m_inorm = trainmode(InstanceNorm(2)), m_bnorm = trainmode(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6), + let m_inorm = testmode!(InstanceNorm(2), false), m_bnorm = testmode!(BatchNorm(12), false), sizes = (5, 5, 3, 4, 2, 6), x = reshape(Float32.(collect(1:prod(sizes))), sizes) @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:end - 2]..., :, 1))), sizes) end @@ -266,7 +266,7 @@ if VERSION >= v"1.1" @test isapprox(y, out, atol = 1.0e-7) end - let m = trainmode(GroupNorm(2,2)), sizes = (2, 4, 1, 2, 3), + let m = testmode!(GroupNorm(2,2), false), sizes = (2, 4, 1, 2, 3), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) y = reshape(m(y), sizes...) 
@@ -283,13 +283,13 @@ if VERSION >= v"1.1" end # show that group norm is the same as instance norm when the group size is the same as the number of channels - let IN = trainmode(InstanceNorm(4)), GN = trainmode(GroupNorm(4,4)), sizes = (2,2,3,4,5), + let IN = testmode!(InstanceNorm(4), false), GN = testmode!(GroupNorm(4,4), false), sizes = (2,2,3,4,5), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test IN(x) ≈ GN(x) end # show that group norm is the same as batch norm for a group of size 1 and batch of size 1 - let BN = trainmode(BatchNorm(4)), GN = trainmode(GroupNorm(4,4)), sizes = (2,2,3,4,1), + let BN = testmode!(BatchNorm(4), false), GN = testmode!(GroupNorm(4,4), false), sizes = (2,2,3,4,1), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test BN(x) ≈ GN(x) end From 568ecb1c979a6b05e379d13c2ed2d6ed45f2a71b Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 29 Feb 2020 16:25:18 -0600 Subject: [PATCH 37/46] Removed trainmode from tests --- test/layers/normalisation.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 79bd9c77..f9d4849a 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -2,7 +2,6 @@ using Flux, Test, Statistics using Zygote: pullback evalwgrad(f, x...) = pullback(f, x...)[1] -trainmode(f) = (testmode!(f, false); f) @testset "Dropout" begin x = [1.,2.,3.] From 6076847a454027c2599b9e8588df824f734a087e Mon Sep 17 00:00:00 2001 From: Martijn Visser Date: Sun, 1 Mar 2020 15:07:12 +0100 Subject: [PATCH 38/46] fix a few typos in docstrings --- docs/src/training/optimisers.md | 8 ++++---- src/optimise/optimisers.jl | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 37288b5d..1ee526b3 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -62,7 +62,7 @@ ADAMW ## Optimiser Interface -Flux's optimsers are built around a `struct` that holds all the optimiser parameters along with a definition of how to apply the update rule associated with it. We do this via the `apply!` function which takes the optimiser as the first argument followed by the parameter and its corresponding gradient. +Flux's optimisers are built around a `struct` that holds all the optimiser parameters along with a definition of how to apply the update rule associated with it. We do this via the `apply!` function which takes the optimiser as the first argument followed by the parameter and its corresponding gradient. In this manner Flux also allows one to create custom optimisers to be used seamlessly. Let's work this with a simple example. @@ -100,15 +100,15 @@ Flux internally calls on this function via the `update!` function. It shares the ## Composing Optimisers -Flux defines a special kind of optimiser called simply as `Optimiser` which takes in a arbitrary optimisers as input. Its behaviour is similar to the usual optimisers, but differs in that it acts by calling the optimisers listed in it sequentially. Each optimiser produces a modified gradient +Flux defines a special kind of optimiser simply called `Optimiser` which takes in arbitrary optimisers as input. Its behaviour is similar to the usual optimisers, but differs in that it acts by calling the optimisers listed in it sequentially. Each optimiser produces a modified gradient that will be fed into the next, and the resultant update will be applied to the parameter as usual. 
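+
+As a short sketch of the chaining itself (both constructors are existing Flux
+optimisers; combining two full optimisers like this is mainly illustrative, and the
+decay combinations discussed next are the more typical use):
+
+```julia
+opt = Optimiser(Momentum(0.01), ADAM(0.001))
+# each gradient is transformed by `Momentum` first, and that result is then
+# passed to `ADAM`; the final output is what gets applied to the parameter
+```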
A classic use case is where adding decays is desirable. Flux defines some basic decays including `ExpDecay`, `InvDecay` etc. ```julia opt = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), Descent()) ``` -Here we apply exponential decay to the `Descent` optimser. The defaults of `ExpDecay` say that its learning rate will be decayed every 1000 steps. -It is then applied like any optimser. +Here we apply exponential decay to the `Descent` optimiser. The defaults of `ExpDecay` say that its learning rate will be decayed every 1000 steps. +It is then applied like any optimiser. ```julia w = randn(10, 10) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index cf4496f4..212b876e 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -77,7 +77,7 @@ Gradient descent with learning rate `η` and Nesterov momentum `ρ`. ## Parameters - Learning Rate (η): Amount by which the gradients are dicsounted berfore updating the weights. Defaults to `0.001`. - - Nesterov Momentum (ρ): Paramters controlling the amount of nesterov momentum to be applied. Defaults to `0.9`. + - Nesterov Momentum (ρ): Parameters controlling the amount of nesterov momentum to be applied. Defaults to `0.9`. ## Examples ```julia @@ -105,7 +105,7 @@ end """ RMSProp(η, ρ) -Implements the RMSProp algortihm. Often a good choice for recurrent networks. Paramters other than learning rate generally don't need tuning. +Implements the RMSProp algortihm. Often a good choice for recurrent networks. Parameters other than learning rate generally don't need tuning. ## Parameters - Learning Rate (η): Defaults to `0.001`. From 32e0aa9fcb2812b1aca279d5466a2d3c8a6264f4 Mon Sep 17 00:00:00 2001 From: Martijn Visser Date: Sun, 1 Mar 2020 15:15:39 +0100 Subject: [PATCH 39/46] docstring ensure signature code formatting by using a four space indent instead of two --- src/data/dataloader.jl | 2 +- src/data/iris.jl | 2 -- src/optimise/optimisers.jl | 8 ++++---- src/optimise/train.jl | 4 ++-- src/utils.jl | 2 +- 5 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/data/dataloader.jl b/src/data/dataloader.jl index baf32a83..8868a9b0 100644 --- a/src/data/dataloader.jl +++ b/src/data/dataloader.jl @@ -11,7 +11,7 @@ struct DataLoader end """ - DataLoader(data...; batchsize=1, shuffle=false, partial=true) + DataLoader(data...; batchsize=1, shuffle=false, partial=true) An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations (except possibly the last one). diff --git a/src/data/iris.jl b/src/data/iris.jl index d78606d8..f74e0709 100644 --- a/src/data/iris.jl +++ b/src/data/iris.jl @@ -28,7 +28,6 @@ function load() end """ - labels() Get the labels of the iris dataset, a 150 element array of strings listing the @@ -53,7 +52,6 @@ function labels() end """ - features() Get the features of the iris dataset. This is a 4x150 matrix of Float64 diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index cf4496f4..75ba8618 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -6,7 +6,7 @@ const ϵ = 1e-8 # TODO: should use weak refs """ - Descent(η) + Descent(η) Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` @@ -441,7 +441,7 @@ function apply!(o::Optimiser, x, Δ) end """ - InvDecay(γ) + InvDecay(γ) Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. 
The wrapped optimiser's step size is not modified. ``` @@ -470,7 +470,7 @@ function apply!(o::InvDecay, x, Δ) end """ - ExpDecay(eta, decay, decay_step, clip) + ExpDecay(eta, decay, decay_step, clip) Discount the learning rate `eta` by a multiplicative factor `decay` every `decay_step` till a minimum of `clip`. @@ -509,7 +509,7 @@ function apply!(o::ExpDecay, x, Δ) end """ - WeightDecay(wd) + WeightDecay(wd) Decays the weight by `wd` diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 79ebcc06..e12ab27b 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -3,8 +3,8 @@ import Zygote: Params, gradient """ - update!(opt, p, g) - update!(opt, ps::Params, gs) + update!(opt, p, g) + update!(opt, ps::Params, gs) Perform an update step of the parameters `ps` (or the single parameter `p`) according to optimizer `opt` and the gradients `gs` (the gradient `g`). diff --git a/src/utils.jl b/src/utils.jl index 2dba21c7..f483c5d9 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -60,7 +60,7 @@ head(x::Tuple) = reverse(Base.tail(reverse(x))) squeezebatch(x) = reshape(x, head(size(x))) """ - batch(xs) + batch(xs) Batch the arrays in `xs` into a single array. From f4365dab94e6cc2f46e7604f5ba1de311617db28 Mon Sep 17 00:00:00 2001 From: Martijn Visser Date: Sun, 1 Mar 2020 15:19:22 +0100 Subject: [PATCH 40/46] fix docstring example indentation as well --- src/optimise/optimisers.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 75ba8618..c8e00126 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -451,7 +451,7 @@ Applies inverse time decay to an optimiser, i.e., the effective step size at ite ## Example ```julia - Optimiser(InvDecay(..), Opt(..)) +Optimiser(InvDecay(..), Opt(..)) ``` """ mutable struct InvDecay @@ -483,9 +483,8 @@ Discount the learning rate `eta` by a multiplicative factor `decay` every `decay ## Example To apply exponential decay to an optimiser: ```julia - Optimiser(ExpDecay(..), Opt(..)) - - opt = Optimiser(ExpDecay(), ADAM()) +Optimiser(ExpDecay(..), Opt(..)) +opt = Optimiser(ExpDecay(), ADAM()) ``` """ mutable struct ExpDecay From d67a2e40b3039830c68253e973d292257e00537a Mon Sep 17 00:00:00 2001 From: Martijn Visser Date: Sun, 1 Mar 2020 15:20:40 +0100 Subject: [PATCH 41/46] remove stray code block start from docstring --- src/optimise/optimisers.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index c8e00126..f853ac23 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -444,7 +444,6 @@ end InvDecay(γ) Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified. -``` ## Parameters - gamma (γ): Defaults to `0.001` From c001d0f3c5cf8613cac2be67821cc6d0561280a4 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 1 Mar 2020 12:30:41 -0600 Subject: [PATCH 42/46] Added trainmode! 
and updated docs with warning --- docs/src/models/layers.md | 1 + src/Flux.jl | 2 +- src/functor.jl | 21 ++++++++++++++++++++- test/layers/normalisation.jl | 16 ++++++++-------- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 763fbf8c..100cee4d 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -72,6 +72,7 @@ Many normalisation layers behave differently under training and inference (testi ```@docs testmode! +trainmode! ``` ## Cost Functions diff --git a/src/Flux.jl b/src/Flux.jl index 5f9878f3..163fcdf2 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -11,7 +11,7 @@ export gradient export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, - SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode! + SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode! include("optimise/Optimise.jl") using .Optimise diff --git a/src/functor.jl b/src/functor.jl index ee384b98..fce730b1 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -40,11 +40,14 @@ end trainable(m) = functor(m)[1] """ - testmode!(m, mode = true) + testmode!(m, mode = true) Set a layer or model's test mode (see below). Using `:auto` mode will treat any gradient computation as training. +_Note_: if you manually set a model into test mode, you need to manually place +it back into train mode. + Possible values include: - `false` for training - `true` for testing @@ -52,6 +55,22 @@ Possible values include: """ testmode!(m, mode = true) = m +""" + trainmode!(m, mode = true) + +Set a layer of model's train mode (see below). +Symmetric to [`testmode`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)). + +_Note_: if you manually set a model into train mode, you need to manually place +it into test mode. + +Possible values include: +- `true` for training +- `false` for testing +- `:auto` or `nothing` for Flux to detect the mode automatically +""" +trainmode!(m, mode = true) = mode isa Bool ? 
testmode!(m, !mode) : testmode!(m, mode) + params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x) function params!(p::Params, x, seen = IdSet()) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index f9d4849a..ed2879b0 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -84,19 +84,19 @@ end @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7) end - let m = testmode!(BatchNorm(2), false), x = reshape(Float32.(1:6), 3, 2, 1) + let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1) y = reshape(permutedims(x, [2, 1, 3]), 2, :) y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3]) @test m(x) == y end - let m = testmode!(BatchNorm(2), false), x = reshape(Float32.(1:12), 2, 3, 2, 1) + let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:12), 2, 3, 2, 1) y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :) y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4]) @test m(x) == y end - let m = testmode!(BatchNorm(2), false), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1) + let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1) y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :) y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5]) @test m(x) == y @@ -164,7 +164,7 @@ end @test isapprox(y, sigmoid.((x .- expand_inst(m.μ, affine_shape)) ./ sqrt.(expand_inst(m.σ², affine_shape) .+ m.ϵ)), atol = 1.0e-7) end - let m = testmode!(InstanceNorm(2), false), sizes = (2, 4, 1, 2, 3), + let m = trainmode!(InstanceNorm(2)), sizes = (2, 4, 1, 2, 3), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) y = reshape(m(y), sizes...) @@ -181,7 +181,7 @@ end end # show that instance norm is equal to batch norm when channel and batch dims are squashed - let m_inorm = testmode!(InstanceNorm(2), false), m_bnorm = testmode!(BatchNorm(12), false), sizes = (5, 5, 3, 4, 2, 6), + let m_inorm = trainmode!(InstanceNorm(2)), m_bnorm = trainmode!(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6), x = reshape(Float32.(collect(1:prod(sizes))), sizes) @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:end - 2]..., :, 1))), sizes) end @@ -265,7 +265,7 @@ if VERSION >= v"1.1" @test isapprox(y, out, atol = 1.0e-7) end - let m = testmode!(GroupNorm(2,2), false), sizes = (2, 4, 1, 2, 3), + let m = trainmode!(GroupNorm(2,2)), sizes = (2, 4, 1, 2, 3), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) y = reshape(m(y), sizes...) 
@@ -282,13 +282,13 @@ if VERSION >= v"1.1" end # show that group norm is the same as instance norm when the group size is the same as the number of channels - let IN = testmode!(InstanceNorm(4), false), GN = testmode!(GroupNorm(4,4), false), sizes = (2,2,3,4,5), + let IN = trainmode!(InstanceNorm(4)), GN = trainmode!(GroupNorm(4,4)), sizes = (2,2,3,4,5), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test IN(x) ≈ GN(x) end # show that group norm is the same as batch norm for a group of size 1 and batch of size 1 - let BN = testmode!(BatchNorm(4), false), GN = testmode!(GroupNorm(4,4), false), sizes = (2,2,3,4,1), + let BN = trainmode!(BatchNorm(4)), GN = trainmode!(GroupNorm(4,4)), sizes = (2,2,3,4,1), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test BN(x) ≈ GN(x) end From 35e460b044d47433999c5719111ff1b14138fef2 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 1 Mar 2020 12:44:36 -0600 Subject: [PATCH 43/46] Fixed broken @ref in docstring --- src/functor.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/functor.jl b/src/functor.jl index fce730b1..ba8c9212 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -59,7 +59,7 @@ testmode!(m, mode = true) = m trainmode!(m, mode = true) Set a layer of model's train mode (see below). -Symmetric to [`testmode`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)). +Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)). _Note_: if you manually set a model into train mode, you need to manually place it into test mode. From 23f791e32b6176500d0a48af1afe90b4f8a7958c Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 1 Mar 2020 12:49:30 -0600 Subject: [PATCH 44/46] Add "during X phase" phrasing to testmode!/trainmode! docstring. --- src/functor.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index ba8c9212..0d7c55f1 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -46,7 +46,7 @@ Set a layer or model's test mode (see below). Using `:auto` mode will treat any gradient computation as training. _Note_: if you manually set a model into test mode, you need to manually place -it back into train mode. +it back into train mode during training phase. Possible values include: - `false` for training @@ -62,7 +62,7 @@ Set a layer of model's train mode (see below). Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)). _Note_: if you manually set a model into train mode, you need to manually place -it into test mode. +it into test mode during testing phase. 
Possible values include: - `true` for training From 88cad1c5e7fb1d16702bff72444a3b91c7bb9469 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 1 Mar 2020 12:50:49 -0600 Subject: [PATCH 45/46] Bump minor version to v0.10.3 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index bd105730..f88d2451 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.10.2" +version = "0.10.3" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" From e49d9c4537714441730f4023b12b168916246137 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 1 Mar 2020 13:11:07 -0600 Subject: [PATCH 46/46] Debump version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f88d2451..bd105730 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.10.3" +version = "0.10.2" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"