From b4ed16ad9cd52905a94ea18b70148724998742ab Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Tue, 3 Dec 2019 22:48:48 -0600 Subject: [PATCH 01/23] Added outdims for some basic layers --- src/layers/basic.jl | 35 +++++++++++++++++++++++++++++++ src/layers/conv.jl | 51 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 2a465208..f2e7645d 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -39,6 +39,17 @@ function Base.show(io::IO, c::Chain) print(io, ")") end +""" + outdims(c::Chain, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) +outdims(m, (10, 10)) == (6, 6) +``` +""" +outdims(c::Chain, isize::Tuple) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers)) # This is a temporary and naive implementation # it might be replaced in the future for better performance @@ -116,6 +127,19 @@ end (a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::Dense, isize) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Dense(10, 5) +outdims(m, (5, 2)) == (5,) +outdims(m, (10,)) == (5,) +``` +""" +outdims(l::Dense, isize) = (size(l.W)[2],) + """ Diagonal(in::Integer) @@ -145,6 +169,17 @@ function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", length(l.α), ")") end +""" + outdims(l::Diagonal, isize) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Diagonal(10) +outdims(m, (10,)) == (10,) +``` +""" +outdims(l::Diagonal, isize) = (length(l.α),) """ Maxout(over) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index f4de3ffc..eeeea82b 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,5 +1,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv +_convoutdims(isize, ksize, ssize, pad) = Int.(floor.((isize .- ksize .+ 2 .* pad) ./ ssize .+ 1)) + expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) """ @@ -68,6 +70,18 @@ end (a::Conv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::Conv, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Conv((3, 3), 3 => 16) +outdims(m, (10, 10)) == (8, 8) +``` +""" +outdims(l::Conv{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) + """ ConvTranspose(size, in=>out) ConvTranspose(size, in=>out, relu) @@ -140,6 +154,7 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) + """ DepthwiseConv(size, in=>out) DepthwiseConv(size, in=>out, relu) @@ -204,6 +219,18 @@ end (a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::DepthwiseConv, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. 
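+
+As a sketch, `_convoutdims` applies the usual convolution output-size
+arithmetic to each spatial dimension (the one-dimensional helper below is
+hypothetical, for illustration only):
+
+```julia
+convdim(i, k, s, p) = floor(Int, (i - k + 2p) / s) + 1
+convdim(10, 3, 1, 0) == 8
+```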
+ +```julia +m = DepthwiseConv((3, 3), 3 => 16) +outdims(m, (10, 10)) == (8, 8) +``` +""" +outdims(l::DepthwiseConv{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) + """ CrossCor(size, in=>out) CrossCor(size, in=>out, relu) @@ -304,6 +331,18 @@ function Base.show(io::IO, m::MaxPool) print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end +""" + outdims(l::MaxPool, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = MaxPool((2, 2)) +outdims(m, (10, 10)) == (5, 5) +``` +""" +outdims(l::MaxPool{N}, isize) where N = _convoutdims(isize, l.weight, l.stride, l.pad[1:N]) + """ MeanPool(k) @@ -331,3 +370,15 @@ end function Base.show(io::IO, m::MeanPool) print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end + +""" + outdims(l::MeanPool, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = MeanPool((2, 2)) +outdims(m, (10, 10)) == (5, 5) +``` +""" +outdims(l::MeanPool{N}, isize) where N = _convoutdims(isize, l.weight, l.stride, l.pad[1:N]) \ No newline at end of file From 31dda0ce6cd8c264d083d453823f4f13fa755da5 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Thu, 5 Dec 2019 21:57:10 -0600 Subject: [PATCH 02/23] Updated with all basic and conv layers outdims --- src/layers/basic.jl | 16 ++++++++++++++-- src/layers/conv.jl | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index f2e7645d..8794b58c 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -40,7 +40,7 @@ function Base.show(io::IO, c::Chain) end """ - outdims(c::Chain, isize::Tuple) + outdims(c::Chain, isize) Calculate the output dimensions given the input dimensions, `isize`. @@ -49,7 +49,7 @@ m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) outdims(m, (10, 10)) == (6, 6) ``` """ -outdims(c::Chain, isize::Tuple) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers)) +outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers)) # This is a temporary and naive implementation # it might be replaced in the future for better performance @@ -228,6 +228,18 @@ function (mo::Maxout)(input::AbstractArray) mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over) end +""" + outdims(c::Maxout, isize) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Maxout(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) +outdims(m, (10, 10)) == (8, 8) +``` +""" +outdims(l::Maxout, isize) = outdims(first(l.over)) + """ SkipConnection(layers, connection) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index eeeea82b..2e3e87d7 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,6 +1,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv _convoutdims(isize, ksize, ssize, pad) = Int.(floor.((isize .- ksize .+ 2 .* pad) ./ ssize .+ 1)) +_convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad)) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -155,6 +156,18 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::ConvTranspose, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. 
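+
+As a sketch, `_convtransoutdims` inverts the forward convolution arithmetic
+(hypothetical helper name, for illustration only):
+
+```julia
+convtransdim(i, k, s, p) = s * (i - 1) + k - 2p
+convtransdim(8, 3, 1, 0) == 10
+```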
+ +```julia +m = ConvTranspose((3, 3), 3 => 16) +outdims(m, (8, 8)) == (10, 10) +``` +""" +outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) + """ DepthwiseConv(size, in=>out) DepthwiseConv(size, in=>out, relu) @@ -302,6 +315,18 @@ end (a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::CrossCor, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = CrossCor((3, 3), 3 => 16) +outdims(m, (10, 10)) == (8, 8) +``` +""" +outdims(l::CrossCor{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) + """ MaxPool(k) From 6265b1fa39c5d7d289ccd5a00c94ae9f448377fc Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Thu, 5 Dec 2019 22:54:25 -0600 Subject: [PATCH 03/23] Added tests for outdims --- src/layers/basic.jl | 8 ++++---- src/layers/conv.jl | 8 ++++---- test/layers/basic.jl | 15 +++++++++++++++ test/layers/conv.jl | 20 ++++++++++++++++++++ 4 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 8794b58c..b62d8bb9 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -49,7 +49,7 @@ m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) outdims(m, (10, 10)) == (6, 6) ``` """ -outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers)) +outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers))(isize) # This is a temporary and naive implementation # it might be replaced in the future for better performance @@ -138,7 +138,7 @@ outdims(m, (5, 2)) == (5,) outdims(m, (10,)) == (5,) ``` """ -outdims(l::Dense, isize) = (size(l.W)[2],) +outdims(l::Dense, isize) = (size(l.W)[1],) """ Diagonal(in::Integer) @@ -234,11 +234,11 @@ end Calculate the output dimensions given the input dimensions, `isize`. ```julia -m = Maxout(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) +m = Maxout(() -> Conv((3, 3), 3 => 16), 2) outdims(m, (10, 10)) == (8, 8) ``` """ -outdims(l::Maxout, isize) = outdims(first(l.over)) +outdims(l::Maxout, isize) = outdims(first(l.over), isize) """ SkipConnection(layers, connection) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 2e3e87d7..6ce9bcbf 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,7 +1,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv _convoutdims(isize, ksize, ssize, pad) = Int.(floor.((isize .- ksize .+ 2 .* pad) ./ ssize .+ 1)) -_convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad)) +_convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -238,7 +238,7 @@ end Calculate the output dimensions given the input dimensions, `isize`. 
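+Note that `DepthwiseConv` requires the output channels to be an integer
+multiple of the input channels, so `3 => 6` is valid while `3 => 16` is not.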
```julia -m = DepthwiseConv((3, 3), 3 => 16) +m = DepthwiseConv((3, 3), 3 => 6) outdims(m, (10, 10)) == (8, 8) ``` """ @@ -366,7 +366,7 @@ m = MaxPool((2, 2)) outdims(m, (10, 10)) == (5, 5) ``` """ -outdims(l::MaxPool{N}, isize) where N = _convoutdims(isize, l.weight, l.stride, l.pad[1:N]) +outdims(l::MaxPool{N}, isize) where N = _convoutdims(isize, l.k, l.stride, l.pad[1:N]) """ MeanPool(k) @@ -406,4 +406,4 @@ m = MeanPool((2, 2)) outdims(m, (10, 10)) == (5, 5) ``` """ -outdims(l::MeanPool{N}, isize) where N = _convoutdims(isize, l.weight, l.stride, l.pad[1:N]) \ No newline at end of file +outdims(l::MeanPool{N}, isize) where N = _convoutdims(isize, l.k, l.stride, l.pad[1:N]) \ No newline at end of file diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 0ff1776d..421c7721 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -92,4 +92,19 @@ import Flux: activations @test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4) end end + + @testset "output dimensions" begin + m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) + @test Flux.outdims(m, (10, 10)) == (6, 6) + + m = Dense(10, 5) + @test Flux.outdims(m, (5, 2)) == (5,) + @test Flux.outdims(m, (10,)) == (5,) + + m = Flux.Diagonal(10) + @test Flux.outdims(m, (10,)) == (10,) + + m = Maxout(() -> Conv((3, 3), 3 => 16), 2) + @test Flux.outdims(m, (10, 10)) == (8, 8) + end end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index b4136062..5701df80 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -107,3 +107,23 @@ end true end end + +@testset "conv output dimensions" begin + m = Conv((3, 3), 3 => 16) + @test Flux.outdims(m, (10, 10)) == (8, 8) + + m = ConvTranspose((3, 3), 3 => 16) + @test Flux.outdims(m, (8, 8)) == (10, 10) + + m = DepthwiseConv((3, 3), 3 => 6) + @test Flux.outdims(m, (10, 10)) == (8, 8) + + m = CrossCor((3, 3), 3 => 16) + @test Flux.outdims(m, (10, 10)) == (8, 8) + + m = MaxPool((2, 2)) + @test Flux.outdims(m, (10, 10)) == (5, 5) + + m = MeanPool((2, 2)) + @test Flux.outdims(m, (10, 10)) == (5, 5) +end \ No newline at end of file From a64378b11272444f8803ec0155262d47ab0cef71 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 7 Dec 2019 13:21:26 -0600 Subject: [PATCH 04/23] Switched to using NNlib for conv.jl outdims. --- src/layers/basic.jl | 20 ------------- src/layers/conv.jl | 73 ++++++++++----------------------------------- 2 files changed, 15 insertions(+), 78 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index b62d8bb9..6f056429 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -169,16 +169,6 @@ function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", length(l.α), ")") end -""" - outdims(l::Diagonal, isize) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = Diagonal(10) -outdims(m, (10,)) == (10,) -``` -""" outdims(l::Diagonal, isize) = (length(l.α),) """ @@ -228,16 +218,6 @@ function (mo::Maxout)(input::AbstractArray) mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over) end -""" - outdims(c::Maxout, isize) - -Calculate the output dimensions given the input dimensions, `isize`. 
- -```julia -m = Maxout(() -> Conv((3, 3), 3 => 16), 2) -outdims(m, (10, 10)) == (8, 8) -``` -""" outdims(l::Maxout, isize) = outdims(first(l.over), isize) """ diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 6ce9bcbf..7b32f999 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,6 +1,8 @@ -using NNlib: conv, ∇conv_data, depthwiseconv +using NNlib: conv, ∇conv_data, depthwiseconv, output_size + +# pad dims of x with dims of y until ndims(x) == ndims(y) +_paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...) -_convoutdims(isize, ksize, ssize, pad) = Int.(floor.((isize .- ksize .+ 2 .* pad) ./ ssize .+ 1)) _convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad) expand(N, i::Tuple) = i @@ -75,13 +77,16 @@ end outdims(l::Conv, isize::Tuple) Calculate the output dimensions given the input dimensions, `isize`. +Batch size and channel size are ignored as per `NNlib.jl`. ```julia m = Conv((3, 3), 3 => 16) outdims(m, (10, 10)) == (8, 8) +outdims(m, (10, 10, 1, 3)) == (8, 8) ``` """ -outdims(l::Conv{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::Conv, isize) = + output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ ConvTranspose(size, in=>out) @@ -156,17 +161,7 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -""" - outdims(l::ConvTranspose, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = ConvTranspose((3, 3), 3 => 16) -outdims(m, (8, 8)) == (10, 10) -``` -""" -outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.pad[1:N]) """ DepthwiseConv(size, in=>out) @@ -232,17 +227,8 @@ end (a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -""" - outdims(l::DepthwiseConv, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = DepthwiseConv((3, 3), 3 => 6) -outdims(m, (10, 10)) == (8, 8) -``` -""" -outdims(l::DepthwiseConv{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::DepthwiseConv, isize) = + output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ CrossCor(size, in=>out) @@ -315,17 +301,8 @@ end (a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -""" - outdims(l::CrossCor, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. 
- -```julia -m = CrossCor((3, 3), 3 => 16) -outdims(m, (10, 10)) == (8, 8) -``` -""" -outdims(l::CrossCor{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::CrossCor, isize) = + output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ MaxPool(k) @@ -356,17 +333,7 @@ function Base.show(io::IO, m::MaxPool) print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end -""" - outdims(l::MaxPool, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = MaxPool((2, 2)) -outdims(m, (10, 10)) == (5, 5) -``` -""" -outdims(l::MaxPool{N}, isize) where N = _convoutdims(isize, l.k, l.stride, l.pad[1:N]) +outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) """ MeanPool(k) @@ -396,14 +363,4 @@ function Base.show(io::IO, m::MeanPool) print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end -""" - outdims(l::MeanPool, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = MeanPool((2, 2)) -outdims(m, (10, 10)) == (5, 5) -``` -""" -outdims(l::MeanPool{N}, isize) where N = _convoutdims(isize, l.k, l.stride, l.pad[1:N]) \ No newline at end of file +outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) \ No newline at end of file From 0cdd11c0dc8e8e82a90467cc66e3b8330ad57682 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 7 Dec 2019 14:05:50 -0600 Subject: [PATCH 05/23] Added tests for varying padding, stride, and dilation with outdims. --- src/layers/conv.jl | 4 ++-- test/layers/conv.jl | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 7b32f999..03de438a 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -3,7 +3,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv, output_size # pad dims of x with dims of y until ndims(x) == ndims(y) _paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...) 
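+# For illustration: _paddims extends x with the trailing dims of y, e.g.
+# _paddims((10, 10), (3, 3, 3, 16)) == (10, 10, 3, 16)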
-_convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad) +_convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+ (ksize .- 1).*dsize .- (pad[1:2:end] .+ pad[2:2:end]) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -161,7 +161,7 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad) """ DepthwiseConv(size, in=>out) diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 5701df80..1a22b385 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -111,19 +111,51 @@ end @testset "conv output dimensions" begin m = Conv((3, 3), 3 => 16) @test Flux.outdims(m, (10, 10)) == (8, 8) + m = Conv((3, 3), 3 => 16; stride = 2) + @test Flux.outdims(m, (5, 5)) == (2, 2) + m = Conv((3, 3), 3 => 16; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = Conv((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (5, 5)) == (4, 4) m = ConvTranspose((3, 3), 3 => 16) @test Flux.outdims(m, (8, 8)) == (10, 10) + m = ConvTranspose((3, 3), 3 => 16; stride = 2) + @test Flux.outdims(m, (2, 2)) == (5, 5) + m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (4, 4)) == (5, 5) m = DepthwiseConv((3, 3), 3 => 6) @test Flux.outdims(m, (10, 10)) == (8, 8) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2) + @test Flux.outdims(m, (5, 5)) == (2, 2) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (5, 5)) == (4, 4) m = CrossCor((3, 3), 3 => 16) @test Flux.outdims(m, (10, 10)) == (8, 8) + m = CrossCor((3, 3), 3 => 16; stride = 2) + @test Flux.outdims(m, (5, 5)) == (2, 2) + m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (5, 5)) == (4, 4) m = MaxPool((2, 2)) @test Flux.outdims(m, (10, 10)) == (5, 5) + m = MaxPool((2, 2); stride = 1) + @test Flux.outdims(m, (5, 5)) == (4, 4) + m = MaxPool((2, 2); stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) m = MeanPool((2, 2)) @test Flux.outdims(m, (10, 10)) == (5, 5) + m = MeanPool((2, 2); stride = 1) + @test Flux.outdims(m, (5, 5)) == (4, 4) + m = MeanPool((2, 2); stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) end \ No newline at end of file From 04991d3261f006f134beb6333f504ad27e11a706 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 7 Dec 2019 14:06:11 -0600 Subject: [PATCH 06/23] Added entry to docs for outdims --- docs/src/models/basics.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index d83fc462..c6dc4e19 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -219,3 +219,13 @@ Flux.@functor Affine ``` This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md). 
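+
+For example (a sketch, reusing the `Affine` layer defined earlier on this page):
+
+```julia
+a = Affine(10, 5)
+length(Flux.params(a)) == 2  # W and b are now collected
+```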
+ +## Utility functions + +Flux provides some utility functions to help you generate models in an automated fashion. + +`outdims` enables you to calculate the spatial output dimensions of layers like `Conv` when applied to input images of a given size. + +```@docs +outdims +``` From 2f854bdfc0d7064f4e28988d6418d9b09324c11e Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Tue, 10 Dec 2019 09:57:08 -0600 Subject: [PATCH 07/23] Recommitting to trigger new build From 9803826a368fa3f527e9c2682876f168e11f75fc Mon Sep 17 00:00:00 2001 From: Chris Rackauckas Date: Mon, 20 Jan 2020 13:53:28 -0500 Subject: [PATCH 08/23] test restructure on the GPU Requires https://github.com/FluxML/Zygote.jl/pull/474 --- test/cuda/cuda.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 1576d88f..911eef93 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -58,6 +58,13 @@ end @test y[3,:] isa CuArray end +@testset "restructure gpu" begin + dudt = Dense(1,1) |> gpu + p,re = Flux.destructure(dudt) + foo(x) = sum(re(p)(x)) + @test gradient(foo, cu(rand(1)))[1] isa CuArray +end + if CuArrays.has_cudnn() @info "Testing Flux/CUDNN" include("cudnn.jl") From 197a1a70c09deba9f4d5ae1bf74bc12a86314288 Mon Sep 17 00:00:00 2001 From: pranjaldatta Date: Fri, 7 Feb 2020 03:47:19 +0530 Subject: [PATCH 09/23] added BostonHousing dataset and testing --- src/data/Data.jl | 3 + src/data/housing.jl | 136 ++++++++++++++++++++++++++++++++++++++++++++ test/data.jl | 8 ++- 3 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 src/data/housing.jl diff --git a/src/data/Data.jl b/src/data/Data.jl index d7cd0303..88af9549 100644 --- a/src/data/Data.jl +++ b/src/data/Data.jl @@ -42,4 +42,7 @@ using .Sentiment include("iris.jl") export Iris +include("housing.jl") +export Housing + end diff --git a/src/data/housing.jl b/src/data/housing.jl new file mode 100644 index 00000000..0d167dc0 --- /dev/null +++ b/src/data/housing.jl @@ -0,0 +1,136 @@ +""" +1. Title: Boston Housing Data + +2. Sources: + (a) Origin: This dataset was taken from the StatLib library which is + maintained at Carnegie Mellon University. + (b) Creator: Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the + demand for clean air', J. Environ. Economics & Management, + vol.5, 81-102, 1978. + (c) Date: July 7, 1993 + +3. Number of Instances: 506 + +4. Number of Attributes: 13 continuous attributes (including "class" + attribute "MEDV"), 1 binary-valued attribute. + +5. Attribute Information: + + 1. CRIM per capita crime rate by town + 2. ZN proportion of residential land zoned for lots over + 25,000 sq.ft. + 3. INDUS proportion of non-retail business acres per town + 4. CHAS Charles River dummy variable (= 1 if tract bounds + river; 0 otherwise) + 5. NOX nitric oxides concentration (parts per 10 million) + 6. RM average number of rooms per dwelling + 7. AGE proportion of owner-occupied units built prior to 1940 + 8. DIS weighted distances to five Boston employment centres + 9. RAD index of accessibility to radial highways + 10. TAX full-value property-tax rate per 10,000 dollars + 11. PTRATIO pupil-teacher ratio by town + 12. B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks + by town + 13. LSTAT % lower status of the population + 14. 
MEDV Median value of owner-occupied homes in 1000's of dollars + + Downloaded From: https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data + +""" +module Housing + +using DelimitedFiles +using ..Data: deps, download_and_verify + +#Uncomment if package exists +#const cache_prefix = "https://cache.julialang.org/" +const cache_prefix = "" + +function load() + isfile(deps("housing.data")) && return + + @info "Downloading the Boston housing Dataset" + download_and_verify("$(cache_prefix)https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data", + deps("housing.data"), + "baadf72995725d76efe787b664e1f083388c79ba21ef9a7990d87f774184735a") + + #@info "Download complete. Working on the files" + path = deps() + isfile(deps("housing.data")) && touch(joinpath(path, "tempfile.data")) + open(joinpath(path, "tempfile.data"), "a") do fout + open(deps("housing.data"), "r") do fin + for line in eachline(fin) + line = replace(lstrip(line), r" +" => s",") + println(fout, line) + end + end + end + mv(joinpath(path, "tempfile.data"), deps("housing.data"), force=true) +end + +""" +Gets the targets for the Boston housing dataset, a 506 element array listing the targets for each example + +```jldoctest +julia> using Flux + +julia> target = Flux.Data.Housing.targets() + +julia> summary(target) +506×1 Array{Float64,2} + +julia> target[1] +24.0 + +""" +function targets() + load() + housing = readdlm(deps("housing.data"), ',') + reshape(Vector{Float64}(housing[1:end,end]), (506, 1)) +end + + +""" +Gets the names of the features provided in the dataset + +""" +function feature_names() + ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"] +end + + +""" +Gets the features of the Boston Housing Dataset. This is a 506x13 Matrix of Float64 datatypes. +The values are in the order ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"]. +It has 506 examples. 
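+Pair it with `targets()` for supervised training (a sketch):
+
+```julia
+x, y = Flux.Data.Housing.features(), Flux.Data.Housing.targets()
+size(x) == (506, 13) && size(y) == (506, 1)
+```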
+ +```jldoctest +julia> using Flux + +julia> features = Flux.Data.Housing.features() + +julia> summary(features) +506×13 Array{Float64,2} + +julia> features[1, :] +13-element Array{Float64,1}: +0.00632 +18.0 +2.31 +0.0 +0.538 + ⋮ +296.0 +15.3 +396.9 +4.98 + +""" +function features() + load() + housing = readdlm(deps("housing.data"), ',') + Matrix{Float64}(housing[1:end, 1:13]) +end + + +end \ No newline at end of file diff --git a/test/data.jl b/test/data.jl index 6b777873..aa913806 100644 --- a/test/data.jl +++ b/test/data.jl @@ -16,7 +16,13 @@ using Test @test Data.Sentiment.train() isa Vector{Data.Tree{Any}} @test Iris.features() isa Matrix -@test size(Iris.features()) == (4,150) +@test size(Iris.features()) == (4,150) @test Iris.labels() isa Vector{String} @test size(Iris.labels()) == (150,) + +@test Housing.features() isa Matrix +@test size(Housing.features()) == (506, 13) + +@test Housing.targets() isa Array{Float64} +@test size(Housing.targets()) == (506, 1) \ No newline at end of file From c37fc3cfa63a82deec33d40f837b880341440c7a Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 9 Feb 2020 19:45:04 -0600 Subject: [PATCH 10/23] Recommitting to trigger build From f5b9cf659cb14f0b05ab98b2fef70f705adfc8c3 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Thu, 20 Feb 2020 23:38:56 -0600 Subject: [PATCH 11/23] Updated docs to specify exactly what layers support outdims --- docs/src/models/basics.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index c6dc4e19..6e8d0b76 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -225,6 +225,17 @@ This enables a useful extra set of functionality for our `Affine` layer, such as Flux provides some utility functions to help you generate models in an automated fashion. `outdims` enables you to calculate the spatial output dimensions of layers like `Conv` when applied to input images of a given size. +Currently limited to the following layers: +- `Chain` +- `Dense` +- `Conv` +- `Diagonal` +- `Maxout` +- `ConvTranspose` +- `DepthwiseConv` +- `CrossCor` +- `MaxPool` +- `MeanPool` ```@docs outdims From 6ced7e1ecff379cf3df3f62f05557317dc56e41f Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Sun, 23 Feb 2020 13:42:11 -0500 Subject: [PATCH 12/23] expand Colors compat --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f76063bd..71282a10 100644 --- a/Project.toml +++ b/Project.toml @@ -27,7 +27,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" AbstractTrees = "0.2, 0.3" Adapt = "1" CodecZlib = "0.5, 0.6" -Colors = "0.8, 0.9" +Colors = "0.8, 0.9, 0.10, 0.11" CuArrays = "1.6" Juno = "0.5, 0.6, 0.7, 0.8" MacroTools = "0.3, 0.4, 0.5" From db4eaf254b5de8902349afbd705243c22d0ec91a Mon Sep 17 00:00:00 2001 From: Bulat Suleymanov Date: Mon, 24 Feb 2020 13:16:51 +0500 Subject: [PATCH 13/23] Edit description of convolutional layer --- src/layers/conv.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index f4de3ffc..829051ae 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -17,7 +17,7 @@ Example: Applying Conv layer to a 1-channel input using a 2x2 window size, out = 16 Conv((2, 2), 1=>16, relu) -Data should be stored in WHCN order (width, height, # channels, # batches). +Data should be stored in WHCN order (width, height, # channels, batch size). 
In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. From 569021a9f1f9910f7f2e9ac6869bb149b9da7023 Mon Sep 17 00:00:00 2001 From: pranjaldatta Date: Wed, 26 Feb 2020 15:05:23 +0530 Subject: [PATCH 14/23] added newlines at end of file --- src/data/housing.jl | 2 +- test/data.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/housing.jl b/src/data/housing.jl index 0d167dc0..61391304 100644 --- a/src/data/housing.jl +++ b/src/data/housing.jl @@ -133,4 +133,4 @@ function features() end -end \ No newline at end of file +end diff --git a/test/data.jl b/test/data.jl index aa913806..6c012a93 100644 --- a/test/data.jl +++ b/test/data.jl @@ -25,4 +25,4 @@ using Test @test size(Housing.features()) == (506, 13) @test Housing.targets() isa Array{Float64} -@test size(Housing.targets()) == (506, 1) \ No newline at end of file +@test size(Housing.targets()) == (506, 1) From 759fe9df2fb0a4665052383fae1b0fd8978a2f52 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Wed, 26 Feb 2020 20:27:39 +0100 Subject: [PATCH 15/23] update docs and export update! --- docs/src/training/optimisers.md | 3 ++- src/optimise/Optimise.jl | 2 +- src/optimise/train.jl | 17 +++++++++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 5e8b95de..37288b5d 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -21,7 +21,7 @@ grads = gradient(() -> loss(x, y), θ) We want to update each parameter, using the gradient, in order to improve (reduce) the loss. Here's one way to do that: ```julia -using Flux: update! +using Flux.Optimise: update! η = 0.1 # Learning Rate for p in (W, b) @@ -46,6 +46,7 @@ An optimiser `update!` accepts a parameter and a gradient, and updates the param All optimisers return an object that, when passed to `train!`, will update the parameters passed to it. ```@docs +Flux.Optimise.update! Descent Momentum Nesterov diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index 68c18a6f..28a1849d 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -1,6 +1,6 @@ module Optimise -export train!, +export train!, update!, SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM, InvDecay, ExpDecay, WeightDecay, stop, Optimiser diff --git a/src/optimise/train.jl b/src/optimise/train.jl index ae0f334c..59404a42 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -1,9 +1,22 @@ using Juno import Zygote: Params, gradient + +""" + update!(opt, p, g) + update!(opt, ps::Params, gs) + +Perform an update step of the parameters `ps` (or the single parameter `p`) +according to optimizer `opt` and the gradients `gs` (the gradient `g`). + +As a result, the parameters are mutated and the optimizer's internal state may change. + + update!(x, x̄) + +Update the array `x` according to `x .-= x̄`. 
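+
+For example (a sketch; assumes a `model`, data `x`, `y` and a `loss` function):
+
+```julia
+opt = Descent(0.1)
+θ = params(model)
+gs = gradient(() -> loss(x, y), θ)
+update!(opt, θ, gs)
+```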
+""" function update!(x::AbstractArray, x̄) - x .+= x̄ - return x + x .-= x̄ end function update!(opt, x, x̄) From a121742f9c766b954f56a46e631333853e97d5ad Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 27 Feb 2020 13:56:05 +0530 Subject: [PATCH 16/23] pkg up --- Manifest.toml | 68 ++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 12986ccd..55f3e229 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -40,15 +40,15 @@ version = "2.1.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "1fce616fa0806c67c133eb1d2f68f0f1a7504665" +git-tree-sha1 = "5660775f2a3214420add960e1ff2baf46d5297cd" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "5.0.1" +version = "5.1.0" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "6e11d5c2c91fc623952e94c4fb73f9c4db74795a" +git-tree-sha1 = "e0c2805c9a7d338823c0d8f574242e284410fa61" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.7.0" +version = "2.9.1" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -74,6 +74,12 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" version = "0.2.0" +[[CompilerSupportLibraries_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "b57c5d019367c90f234a7bc7e24ff0a84971da5d" +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "0.2.0+1" + [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b" @@ -87,9 +93,9 @@ version = "1.1.0" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "f784254f428fb8fd7ac15982e5862a38a44523d3" +git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.7" +version = "0.17.10" [[Dates]] deps = ["Printf"] @@ -107,9 +113,9 @@ version = "1.0.2" [[DiffRules]] deps = ["NaNMath", "Random", "SpecialFunctions"] -git-tree-sha1 = "10dca52cf6d4a62d82528262921daf63b99704a2" +git-tree-sha1 = "eb0c34204c8410888844ada5359ac8b96292cfd1" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "1.0.0" +version = "1.0.1" [[Distributed]] deps = ["Random", "Serialization", "Sockets"] @@ -123,15 +129,15 @@ version = "1.2.0" [[FFTW_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "05674f209a6e3387dd103a945b0113eeb64b1a58" +git-tree-sha1 = "ddb57f4cf125243b4aa4908c94d73a805f3cbf2c" uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a" -version = "3.3.9+3" +version = "3.3.9+4" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "fec413d4fc547992eb62a5c544cedb6d7853c1f5" +git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.4" +version = "0.8.5" [[FixedPointNumbers]] git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" @@ -140,9 +146,9 @@ version = "0.6.1" [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "840700059391d36e2498d89c2e82c08f261f2a2a" +git-tree-sha1 = "88b082d492be6b63f967b6c96b352e25ced1a34c" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.8" +version 
= "0.10.9" [[GPUArrays]] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] @@ -152,9 +158,9 @@ version = "2.0.1" [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] -git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306" +git-tree-sha1 = "1a4355e4b5b50be2311ebb644f34f3306dbd0410" uuid = "7869d1d1-7146-5819-86e3-90919afe41df" -version = "0.3.0" +version = "0.3.1" [[IntelOpenMP_jll]] deps = ["Libdl", "Pkg"] @@ -192,10 +198,10 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" [[MKL_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "61069ae718b8ab1e325bbfb4e5268902e7ea08e3" +deps = ["IntelOpenMP_jll", "Libdl", "Pkg"] +git-tree-sha1 = "720629cc8cbd12c146ca01b661fd1a6cf66e2ff4" uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7" -version = "2019.0.117+0" +version = "2019.0.117+2" [[MacroTools]] deps = ["DataStructures", "Markdown", "Random"] @@ -234,10 +240,10 @@ uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" version = "0.3.3" [[OpenSpecFun_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" +deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] +git-tree-sha1 = "d110040968b9afe95c6bd9c6233570b0fe8abd22" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.3+1" +version = "0.5.3+2" [[OrderedCollections]] deps = ["Random", "Serialization", "Test"] @@ -273,9 +279,9 @@ version = "0.2.0" [[Requires]] deps = ["UUIDs"] -git-tree-sha1 = "999513b7dea8ac17359ed50ae8ea089e4464e35e" +git-tree-sha1 = "d37400976e98018ee840e0ca4f9d20baa231dc6b" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.0.0" +version = "1.0.1" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -298,9 +304,9 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[SpecialFunctions]] deps = ["OpenSpecFun_jll"] -git-tree-sha1 = "268052ee908b2c086cc0011f528694f02f3e2408" +git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "0.9.0" +version = "0.10.0" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] @@ -349,15 +355,17 @@ version = "0.9.0" [[Zlib_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "5618a43055eb09377edca21d19d0e99bce24a9c3" +git-tree-sha1 = "fd36a6739e256527287c5444960d0266712cd49e" uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.11+7" +version = "1.2.11+8" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "74382bcc4c1e8075e14554da67d75565f8fb7827" +git-tree-sha1 = "ab2683e7670925ed73b7f076b26847683e38db8c" +repo-rev = "master" +repo-url = "https://github.com/FluxML/Zygote.jl.git" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.4.5" +version = "0.4.7" [[ZygoteRules]] deps = ["MacroTools"] From 35f6998be7572bb557948d3cee65797be22c9019 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 27 Feb 2020 22:19:06 +0530 Subject: [PATCH 17/23] pkg up --- Manifest.toml | 60 +++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 55f3e229..693f7ca2 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -8,15 +8,15 @@ version = "0.5.0" [[AbstractTrees]] deps = ["Markdown"] -git-tree-sha1 = "8201f932428d25a2e2903300764515754847d87d" +git-tree-sha1 = "86d092c2599f1f7bb01668bf8eb3412f98d61e47" uuid = 
"1520ce14-60c1-5f80-bbc7-55ef81b5835c" -version = "0.3.0" +version = "0.3.2" [[Adapt]] deps = ["LinearAlgebra"] -git-tree-sha1 = "82dab828020b872fa9efd3abec1152b075bc7cbf" +git-tree-sha1 = "c88cfc7f9c1f9f8633cddf0b56e86302b70f64c5" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "1.0.0" +version = "1.0.1" [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -34,21 +34,21 @@ version = "0.2.0" [[CUDAapi]] deps = ["Libdl", "Logging"] -git-tree-sha1 = "56a813440ac98a1aa64672ab460a1512552211a7" +git-tree-sha1 = "d7ceadd8f821177d05b897c0517e94633db535fe" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" -version = "2.1.0" +version = "3.1.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "5660775f2a3214420add960e1ff2baf46d5297cd" +git-tree-sha1 = "01e90fa34e25776bc7c8661183d4519149ebfe59" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "5.1.0" +version = "6.0.0" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "e0c2805c9a7d338823c0d8f574242e284410fa61" +git-tree-sha1 = "f86269ff60ebe082a2806ecbce51f3cadc68afe9" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.9.1" +version = "2.10.2" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -58,15 +58,15 @@ version = "0.6.0" [[ColorTypes]] deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "7b62b728a5f3dd6ee3b23910303ccf27e82fad5e" +git-tree-sha1 = "b9de8dc6106e09c79f3f776c27c62360d30e5eb8" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.8.1" +version = "0.9.1" [[Colors]] deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"] -git-tree-sha1 = "c9c1845d6bf22e34738bee65c357a69f416ed5d1" +git-tree-sha1 = "177d8b959d3c103a6d57574c38ee79c81059c31b" uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" -version = "0.9.6" +version = "0.11.2" [[CommonSubexpressions]] deps = ["Test"] @@ -82,9 +82,9 @@ version = "0.2.0+1" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b" +git-tree-sha1 = "7c20c5a45bb245cf248f454d26966ea70255b271" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.6.0" +version = "1.7.2" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -140,9 +140,9 @@ uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" version = "0.8.5" [[FixedPointNumbers]] -git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" +git-tree-sha1 = "4aaea64dd0c30ad79037084f8ca2b94348e65eaa" uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" -version = "0.6.1" +version = "0.7.1" [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] @@ -173,10 +173,10 @@ deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[Juno]] -deps = ["Base64", "Logging", "Media", "Profile", "Test"] -git-tree-sha1 = "30d94657a422d09cb97b6f86f04f750fa9c50df8" +deps = ["Base64", "Logging", "Media", "Profile"] +git-tree-sha1 = "4f2249fb58cfb140eeb89428e31791e2f8959d8c" uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" -version = "0.7.2" +version = "0.8.0" [[LLVM]] deps = ["CEnum", "Libdl", "Printf", "Unicode"] @@ -205,9 +205,9 @@ version = "2019.0.117+2" [[MacroTools]] deps = ["DataStructures", "Markdown", "Random"] -git-tree-sha1 = 
"e2fc7a55bb2224e203bbd8b59f72b91323233458" +git-tree-sha1 = "07ee65e03e28ca88bc9a338a3726ae0c3efaa94b" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.3" +version = "0.5.4" [[Markdown]] deps = ["Base64"] @@ -230,9 +230,9 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[NNlib]] deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"] -git-tree-sha1 = "135c0de4794d5e214b06f1fb4787af4a72896e61" +git-tree-sha1 = "755c0bab3912ff782167e1b4b774b833f8a0e550" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.6.2" +version = "0.6.4" [[NaNMath]] git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f" @@ -320,9 +320,9 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[StatsBase]] deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "c53e809e63fe5cf5de13632090bc3520649c9950" +git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.32.0" +version = "0.32.1" [[Test]] deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] @@ -349,9 +349,9 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [[ZipFile]] deps = ["Libdl", "Printf", "Zlib_jll"] -git-tree-sha1 = "5de8320a46812da1a8ca98b16a8a4546d44efa62" +git-tree-sha1 = "8748302cfdec02c4ae9c97b112cf10003f7f767f" uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" -version = "0.9.0" +version = "0.9.1" [[Zlib_jll]] deps = ["Libdl", "Pkg"] @@ -361,7 +361,7 @@ version = "1.2.11+8" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "ab2683e7670925ed73b7f076b26847683e38db8c" +git-tree-sha1 = "3c65158c0aa0808cdfff8bca2a36430b038aad00" repo-rev = "master" repo-url = "https://github.com/FluxML/Zygote.jl.git" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" From 425fcdbe6964d581b4d5f6eda1615e883a83b5bd Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 29 Feb 2020 11:14:48 +0100 Subject: [PATCH 18/23] NNlib docs + misc docs improvements --- docs/make.jl | 3 +- docs/src/gpu.md | 4 +- docs/src/models/layers.md | 30 ++++------- docs/src/models/nnlib.md | 37 +++++++++++++ docs/src/models/regularisation.md | 4 +- src/layers/normalise.jl | 16 ++++-- src/layers/stateless.jl | 87 ++++++++++++++++++------------- 7 files changed, 115 insertions(+), 66 deletions(-) create mode 100644 docs/src/models/nnlib.md diff --git a/docs/make.jl b/docs/make.jl index b950e959..fe3544fc 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -13,7 +13,8 @@ makedocs(modules=[Flux, NNlib], ["Basics" => "models/basics.md", "Recurrence" => "models/recurrence.md", "Regularisation" => "models/regularisation.md", - "Model Reference" => "models/layers.md"], + "Model Reference" => "models/layers.md", + "NNlib" => "models/nnlib.md"], "Training Models" => ["Optimisers" => "training/optimisers.md", "Training" => "training/training.md"], diff --git a/docs/src/gpu.md b/docs/src/gpu.md index bb13fdd1..19d0c8c6 100644 --- a/docs/src/gpu.md +++ b/docs/src/gpu.md @@ -30,7 +30,7 @@ If you define a structured model, like a `Dense` layer or `Chain`, you just need ```julia d = Dense(10, 5, σ) d = fmap(cu, d) -d.W # Tracked CuArray +d.W # CuArray d(cu(rand(10))) # CuArray output m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) @@ -53,7 +53,7 @@ julia> x = rand(10) |> gpu 0.511655 julia> m(x) -Tracked 5-element 
CuArray{Float32,1}: +5-element CuArray{Float32,1}: -0.30535 ⋮ -0.618002 diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 5f2ab3ce..41e98f32 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -40,19 +40,6 @@ Maxout SkipConnection ``` -## Activation Functions - -Non-linearities that go between layers of your model. Most of these functions are defined in [NNlib](https://github.com/FluxML/NNlib.jl) but are available by default in Flux. - -Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on. - -```@docs -σ -relu -leakyrelu -elu -swish -``` ## Normalisation & Regularisation @@ -61,6 +48,7 @@ These layers don't affect the structure of the network but may improve training ```@docs BatchNorm Dropout +Flux.dropout AlphaDropout LayerNorm GroupNorm @@ -68,12 +56,12 @@ GroupNorm ## Cost Functions ```@docs -mse -crossentropy -logitcrossentropy -binarycrossentropy -logitbinarycrossentropy -kldivergence -poisson -hinge +Flux.mse +Flux.crossentropy +Flux.logitcrossentropy +Flux.binarycrossentropy +Flux.logitbinarycrossentropy +Flux.kldivergence +Flux.poisson +Flux.hinge ``` diff --git a/docs/src/models/nnlib.md b/docs/src/models/nnlib.md new file mode 100644 index 00000000..f5732574 --- /dev/null +++ b/docs/src/models/nnlib.md @@ -0,0 +1,37 @@ +## NNlib +Flux re-exports all of the functions exported by the [NNlib](https://github.com/FluxML/NNlib.jl) package. + +## Activation Functions +Non-linearities that go between layers of your model. Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on. + +```@docs +NNlib.elu +NNlib.gelu +NNlib.leakyrelu +NNlib.logcosh +NNlib.logsigmoid +NNlib.sigmoid +NNlib.relu +NNlib.selu +NNlib.softplus +NNlib.softsign +NNlib.swish +``` + +## Softmax +```@docs +NNlib.softmax +NNlib.logsoftmax +``` + +## Pooling +```@docs +NNlib.maxpool +NNlib.meanpool +``` + +## Convolution +```@docs +NNlib.conv +NNlib.depthwiseconv +``` \ No newline at end of file diff --git a/docs/src/models/regularisation.md b/docs/src/models/regularisation.md index e1d88d77..02aa3da8 100644 --- a/docs/src/models/regularisation.md +++ b/docs/src/models/regularisation.md @@ -31,7 +31,7 @@ julia> params(m) param([0.0, 0.0, 0.0, 0.0, 0.0]) julia> sum(norm, params(m)) -26.01749952921026 (tracked) +26.01749952921026 ``` Here's a larger example with a multi-layer perceptron. @@ -52,7 +52,7 @@ One can also easily add per-layer regularisation via the `activations` function: ```julia julia> using Flux: activations -julia> c = Chain(Dense(10,5,σ),Dense(5,2),softmax) +julia> c = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) Chain(Dense(10, 5, σ), Dense(5, 2), softmax) julia> activations(c, rand(10)) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index b421d3e7..2268fdc0 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -7,6 +7,16 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(s _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0) +""" + dropout(p, dims = :) + +Dropout function. For each input, either sets that input to `0` (with probability +`p`) or scales it by `1/(1-p)`. The `dims` argument is to specify the unbroadcasted +dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is +used as a regularisation, i.e. it reduces overfitting during training. + +See also [`Dropout`](@ref). 
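+
+Note that the forward definition below is the identity; the random mask is
+only applied inside the `@adjoint`, i.e. while differentiating (a sketch):
+
+```julia
+x = ones(Float32, 3, 4)
+dropout(x, 0.5) == x  # a no-op outside of a gradient context
+```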
+""" dropout(x, p; dims = :) = x @adjoint function dropout(x, p; dims = :) @@ -18,10 +28,7 @@ end """ Dropout(p, dims = :) -A Dropout layer. For each input, either sets that input to `0` (with probability -`p`) or scales it by `1/(1-p)`. The `dims` argument is to specified the unbroadcasted - dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is - used as a regularisation, i.e. it reduces overfitting during training. see also [`dropout`](@ref). +A Dropout layer. In the forward pass, applies the [`dropout`](@ref) function on the input. """ mutable struct Dropout{F,D} p::F @@ -43,6 +50,7 @@ end """ AlphaDropout(p) + A dropout layer. It is used in Self-Normalizing Neural Networks. (https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf) The AlphaDropout layer ensures that mean and variance of activations remains the same as before. diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 159a8385..5de5842b 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -1,10 +1,12 @@ -using CuArrays -using NNlib: logsoftmax, logσ - # Cost functions +""" + mse(ŷ, y) +Return the mean squared error `sum((ŷ .- y).^2) / length(y)`. +""" mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) + function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing) return -sum(y .* log.(ŷ)) * 1 // size(y, 2) end @@ -17,10 +19,26 @@ function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Abstr return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2) end +""" + crossentropy(ŷ, y; weight=1) + +Return the crossentropy computed as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`. + +See also [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref). +""" crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight) -function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) - return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2) +""" + logitcrossentropy(ŷ, y; weight=1) + +Return the crossentropy computed after a [softmax](@ref) operation: + + -sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2) + +See also [`crossentropy`](@ref), [`binarycrossentropy`](@ref). +""" +function logitcrossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) + return -sum(y .* logsoftmax(ŷ) .* weight) * 1 // size(y, 2) end """ @@ -28,11 +46,7 @@ end Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerical stability. - julia> binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0.]) - 3-element Array{Float64,1}: - 1.4244 - 0.352317 - 0.86167 +Typically, the prediction `ŷ` is given by the output of a [`sigmoid`](@ref) activation. """ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) @@ -40,44 +54,42 @@ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) """ - logitbinarycrossentropy(logŷ, y) + logitbinarycrossentropy(ŷ, y) -`logitbinarycrossentropy(logŷ, y)` is mathematically equivalent to `binarycrossentropy(σ(logŷ), y)` +`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to `binarycrossentropy(σ(ŷ), y)` but it is more numerically stable. - julia> logitbinarycrossentropy.([-1.1491, 0.8619, 0.3127], [1, 1, 0.]) - 3-element Array{Float64,1}: - 1.4244 - 0.352317 - 0.86167 +See also [`binarycrossentropy`](@ref), [`sigmoid`](@ref), [`logsigmoid`](@ref). 
""" -logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) +logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ) # Re-definition to fix interaction with CuArrays. -CuArrays.@cufunc logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) +CuArrays.@cufunc logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ) """ - normalise(x::AbstractArray; dims=1) + normalise(x; dims=1) Normalises `x` to mean 0 and standard deviation 1, across the dimensions given by `dims`. Defaults to normalising over columns. - julia> a = reshape(collect(1:9), 3, 3) - 3×3 Array{Int64,2}: - 1 4 7 - 2 5 8 - 3 6 9 +```julia-repl +julia> a = reshape(collect(1:9), 3, 3) +3×3 Array{Int64,2}: + 1 4 7 + 2 5 8 + 3 6 9 - julia> normalise(a) - 3×3 Array{Float64,2}: - -1.22474 -1.22474 -1.22474 - 0.0 0.0 0.0 - 1.22474 1.22474 1.22474 +julia> normalise(a) +3×3 Array{Float64,2}: + -1.22474 -1.22474 -1.22474 + 0.0 0.0 0.0 + 1.22474 1.22474 1.22474 - julia> normalise(a, dims=2) - 3×3 Array{Float64,2}: - -1.22474 0.0 1.22474 - -1.22474 0.0 1.22474 - -1.22474 0.0 1.22474 +julia> normalise(a, dims=2) +3×3 Array{Float64,2}: + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 +``` """ function normalise(x::AbstractArray; dims=1) μ′ = mean(x, dims = dims) @@ -87,6 +99,7 @@ end """ kldivergence(ŷ, y) + KLDivergence is a measure of how much one probability distribution is different from the other. It is always non-negative and zero only when both the distributions are equal everywhere. [KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence). @@ -99,6 +112,7 @@ end """ poisson(ŷ, y) + Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. [Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). """ @@ -106,7 +120,8 @@ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ hinge(ŷ, y) -Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). + +Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1). [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss). """ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) From b6c79b38b4bf54aba0ee096b38afd1180ad1ee55 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Wed, 26 Feb 2020 13:48:27 +0100 Subject: [PATCH 19/23] add DataLoader special case train! 
for the unsupervised data iterator --- Manifest.toml | 2 +- Project.toml | 5 +- docs/make.jl | 4 +- docs/src/data/dataloader.md | 6 +++ docs/src/training/training.md | 19 +++++-- src/Flux.jl | 1 + src/data/Data.jl | 10 ++++ src/data/dataloader.jl | 88 +++++++++++++++++++++++++++++++++ src/optimise/train.jl | 19 ++++--- test/data.jl | 93 ++++++++++++++++++++++++++++------- test/runtests.jl | 59 ++++++++++++++-------- 11 files changed, 253 insertions(+), 53 deletions(-) create mode 100644 docs/src/data/dataloader.md create mode 100644 src/data/dataloader.jl diff --git a/Manifest.toml b/Manifest.toml index 693f7ca2..788e5354 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -252,7 +252,7 @@ uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.1.0" [[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] diff --git a/Project.toml b/Project.toml index 71282a10..bd105730 100644 --- a/Project.toml +++ b/Project.toml @@ -40,7 +40,10 @@ julia = "1" [extras] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + [targets] -test = ["Test", "Documenter"] +test = ["Test", "Documenter", "IterTools", "LinearAlgebra"] diff --git a/docs/make.jl b/docs/make.jl index fe3544fc..0d597500 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,10 +15,12 @@ makedocs(modules=[Flux, NNlib], "Regularisation" => "models/regularisation.md", "Model Reference" => "models/layers.md", "NNlib" => "models/nnlib.md"], + "Handling Data" => + ["One-Hot Encoding" => "data/onehot.md", + "DataLoader" => "data/dataloader.md"], "Training Models" => ["Optimisers" => "training/optimisers.md", "Training" => "training/training.md"], - "One-Hot Encoding" => "data/onehot.md", "GPU Support" => "gpu.md", "Saving & Loading" => "saving.md", "Performance Tips" => "performance.md", diff --git a/docs/src/data/dataloader.md b/docs/src/data/dataloader.md new file mode 100644 index 00000000..70a883c9 --- /dev/null +++ b/docs/src/data/dataloader.md @@ -0,0 +1,6 @@ +# DataLoader +Flux provides the `DataLoader` type in the `Flux.Data` module to handle iteration over mini-batches of data. + +```@docs +Flux.Data.DataLoader +``` \ No newline at end of file diff --git a/docs/src/training/training.md b/docs/src/training/training.md index b42db7c9..64b2b5e8 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -7,10 +7,10 @@ To actually train a model we need four things: * A collection of data points that will be provided to the objective function. * An [optimiser](optimisers.md) that will update the model parameters appropriately. -With these we can call `Flux.train!`: +With these we can call `train!`: -```julia -Flux.train!(objective, params, data, opt) +```@docs +Flux.Optimise.train! ``` There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). @@ -56,7 +56,8 @@ data = [(x, y)] ```julia data = [(x, y), (x, y), (x, y)] # Or equivalently -data = Iterators.repeated((x, y), 3) +using IterTools: ncycle +data = ncycle([(x, y)], 3) ``` It's common to load the `x`s and `y`s separately. 
 In this case you can use `zip`:
@@ -67,6 +68,14 @@ ys = [rand( 10), rand( 10), rand( 10)]
 data = zip(xs, ys)
 ```
 
+Training data can be conveniently partitioned for mini-batch training using the [`Flux.Data.DataLoader`](@ref) type:
+
+```julia
+X = rand(28, 28, 60000)
+Y = rand(0:9, 60000)
+data = DataLoader(X, Y, batchsize=128)
+```
+
 Note that, by default, `train!` only loops over the data once (a single "epoch").
 A convenient way to run multiple epochs from the REPL is provided by `@epochs`.
 
@@ -120,7 +129,7 @@ An example follows that works similar to the default `Flux.train` but with no ca
 You don't need callbacks if you just code the calls to your functions directly into the loop.
 E.g. in the places marked with comments.
 
-```
+```julia
 function my_custom_train!(loss, ps, data, opt)
   ps = Params(ps)
   for d in data
diff --git a/src/Flux.jl b/src/Flux.jl
index 9969b323..c99e41a1 100644
--- a/src/Flux.jl
+++ b/src/Flux.jl
@@ -7,6 +7,7 @@ using Zygote, MacroTools, Juno, Reexport, Statistics, Random
 using MacroTools: @forward
 @reexport using NNlib
 using Zygote: Params, @adjoint, gradient, pullback, @nograd
+
 export gradient
 
 export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool,
diff --git a/src/data/Data.jl b/src/data/Data.jl
index 88af9549..940b7ea7 100644
--- a/src/data/Data.jl
+++ b/src/data/Data.jl
@@ -3,6 +3,9 @@ module Data
 import ..Flux
 import SHA
 
+using Random: shuffle!
+using Base: @propagate_inbounds
+
 export CMUDict, cmudict
 
 deps(path...) = joinpath(@__DIR__, "..", "..", "deps", path...)
@@ -26,6 +29,9 @@ function __init__()
   mkpath(deps())
 end
 
+include("dataloader.jl")
+export DataLoader
+
 include("mnist.jl")
 export MNIST
 
@@ -42,7 +48,11 @@ using .Sentiment
 include("iris.jl")
 export Iris
 
+<<<<<<< HEAD
 include("housing.jl")
 export Housing
 
 end
+=======
+end #module
+>>>>>>> af20a785... add DataLoader
diff --git a/src/data/dataloader.jl b/src/data/dataloader.jl
new file mode 100644
index 00000000..baf32a83
--- /dev/null
+++ b/src/data/dataloader.jl
@@ -0,0 +1,88 @@
+# Adapted from Knet's src/data.jl (author: Deniz Yuret)
+
+struct DataLoader
+    data
+    batchsize::Int
+    nobs::Int
+    partial::Bool
+    imax::Int
+    indices::Vector{Int}
+    shuffle::Bool
+end
+
+"""
+    DataLoader(data...; batchsize=1, shuffle=false, partial=true)
+
+An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations
+(except possibly the last one).
+
+Takes as input one or more data tensors, e.g. X in unsupervised learning, X and Y in
+supervised learning. The last dimension in each tensor is considered to be the observation
+dimension.
+
+If `shuffle=true`, shuffles the observations each time iterations are re-started.
+If `partial=false`, drops the last mini-batch if it is smaller than the batchsize.
+
+Example usage:
+
+    Xtrain = rand(10, 100)
+    dtrain = DataLoader(Xtrain, batchsize=2)
+    # iterate over 50 mini-batches
+    for x in dtrain
+        @assert size(x) == (10, 2)
+        ...
+    end
+
+    Xtrain = rand(10, 100)
+    Ytrain = rand(100)
+    dtrain = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true)
+    for epoch in 1:100
+        for (x, y) in dtrain
+            @assert size(x) == (10, 2)
+            @assert size(y) == (2,)
+            ...
+ end + end + + # train for 10 epochs + using IterTools: ncycle + Flux.train!(loss, ps, ncycle(dtrain, 10), opt) +""" +function DataLoader(data...; batchsize=1, shuffle=false, partial=true) + length(data) > 0 || throw(ArgumentError("Need at least one data input")) + batchsize > 0 || throw(ArgumentError("Need positive batchsize")) + + nx = size(data[1])[end] + for i=2:length(data) + nx != size(data[i])[end] && throw(DimensionMismatch("All data should contain same number of observations")) + end + if nx < batchsize + @warn "Number of data points less than batchsize, decreasing the batchsize to $nx" + batchsize = nx + end + imax = partial ? nx : nx - batchsize + 1 + ids = 1:min(nx, batchsize) + DataLoader(data, batchsize, nx, partial, imax, [1:nx;], shuffle) +end + +getdata(x::AbstractArray, ids) = x[(Base.Colon() for _=1:ndims(x)-1)..., ids] + +@propagate_inbounds function Base.iterate(d::DataLoader, i=0) # returns data in d.indices[i+1:i+batchsize] + i >= d.imax && return nothing + if d.shuffle && i == 0 + shuffle!(d.indices) + end + nexti = min(i + d.batchsize, d.nobs) + ids = d.indices[i+1:nexti] + if length(d.data) == 1 + batch = getdata(d.data[1], ids) + else + batch = ((getdata(x, ids) for x in d.data)...,) + end + return (batch, nexti) +end + +function Base.length(d::DataLoader) + n = d.nobs / d.batchsize + d.partial ? ceil(Int,n) : floor(Int,n) +end \ No newline at end of file diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 59404a42..34a98394 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -61,13 +61,14 @@ end For each datapoint `d` in `data` computes the gradient of `loss(d...)` through backpropagation and calls the optimizer `opt`. +In case datapoints `d` are of array type, assumes no splatting is needed +and computes the gradient of `loss(d)`. + Takes a callback as keyword argument `cb`. For example, this will print "training" every 10 seconds: -```julia -Flux.train!(loss, params, data, opt, - cb = throttle(() -> println("training"), 10)) -``` + train!(loss, params, data, opt, + cb = throttle(() -> println("training"), 10)) The callback can call `Flux.stop()` to interrupt the training loop. @@ -78,8 +79,14 @@ function train!(loss, ps, data, opt; cb = () -> ()) cb = runall(cb) @progress for d in data try - gs = gradient(ps) do - loss(d...) + if d isa AbstractArray + gs = gradient(ps) do + loss(d) + end + else + gs = gradient(ps) do + loss(d...) 
+ end end update!(opt, ps, gs) cb() diff --git a/test/data.jl b/test/data.jl index 6c012a93..1a090174 100644 --- a/test/data.jl +++ b/test/data.jl @@ -1,28 +1,85 @@ -using Flux.Data -using Test +@testset "DataLoader" begin + X = reshape([1:10;], (2, 5)) + Y = [1:5;] -@test cmudict()["CATASTROPHE"] == :[K,AH0,T,AE1,S,T,R,AH0,F,IY0].args + d = DataLoader(X, batchsize=2) + batches = collect(d) + @test length(batches) == 3 + @test batches[1] == X[:,1:2] + @test batches[2] == X[:,3:4] + @test batches[3] == X[:,5:5] -@test length(CMUDict.phones()) == 39 + d = DataLoader(X, batchsize=2, partial=false) + batches = collect(d) + @test length(batches) == 2 + @test batches[1] == X[:,1:2] + @test batches[2] == X[:,3:4] -@test length(CMUDict.symbols()) == 84 + d = DataLoader(X, Y, batchsize=2) + batches = collect(d) + @test length(batches) == 3 + @test length(batches[1]) == 2 + @test length(batches[2]) == 2 + @test length(batches[3]) == 2 + @test batches[1][1] == X[:,1:2] + @test batches[1][2] == Y[1:2] + @test batches[2][1] == X[:,3:4] + @test batches[2][2] == Y[3:4] + @test batches[3][1] == X[:,5:5] + @test batches[3][2] == Y[5:5] -@test MNIST.images()[1] isa Matrix -@test MNIST.labels() isa Vector{Int64} + # test interaction with `train!` + θ = ones(2) + X = zeros(2, 10) + loss(x) = sum((x .- θ).^2) + d = DataLoader(X) + Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1)) + @test norm(θ) < 1e-4 -@test FashionMNIST.images()[1] isa Matrix -@test FashionMNIST.labels() isa Vector{Int64} + # test interaction with `train!` + θ = zeros(2) + X = ones(2, 10) + Y = fill(2, 10) + loss(x, y) = sum((y - x'*θ).^2) + d = DataLoader(X, Y) + Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1)) + @test norm(θ .- 1) < 1e-10 +end -@test Data.Sentiment.train() isa Vector{Data.Tree{Any}} +@testset "CMUDict" begin + @test cmudict()["CATASTROPHE"] == :[K,AH0,T,AE1,S,T,R,AH0,F,IY0].args -@test Iris.features() isa Matrix -@test size(Iris.features()) == (4,150) + @test length(CMUDict.phones()) == 39 -@test Iris.labels() isa Vector{String} -@test size(Iris.labels()) == (150,) + @test length(CMUDict.symbols()) == 84 +end -@test Housing.features() isa Matrix -@test size(Housing.features()) == (506, 13) +@testset "MNIST" begin + @test MNIST.images()[1] isa Matrix + @test MNIST.labels() isa Vector{Int64} +end -@test Housing.targets() isa Array{Float64} -@test size(Housing.targets()) == (506, 1) +@testset "FashionMNIST" begin + @test FashionMNIST.images()[1] isa Matrix + @test FashionMNIST.labels() isa Vector{Int64} +end + +@testset "Sentiment" begin + @test Data.Sentiment.train() isa Vector{Data.Tree{Any}} +end + +@testset "Iris" begin + @test Iris.features() isa Matrix + @test size(Iris.features()) == (4,150) + + @test Iris.labels() isa Vector{String} + @test size(Iris.labels()) == (150,) +end + +@testest "Housing" begin + @test Housing.features() isa Matrix + @test size(Housing.features()) == (506, 13) + + @test Housing.targets() isa Array{Float64} + @test size(Housing.targets()) == (506, 1) +end diff --git a/test/runtests.jl b/test/runtests.jl index 1505e96a..81182f0d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,32 +1,49 @@ -using Flux, Test, Random, Statistics, Documenter -using Random +using Flux +using Flux.Data +using Test +using Random, Statistics, LinearAlgebra +using Documenter +using IterTools: ncycle Random.seed!(0) @testset "Flux" begin -@info "Testing Basics" + @testset "Utils" begin + include("utils.jl") + end -include("utils.jl") -include("onehot.jl") -include("optimise.jl") -include("data.jl") + 
@testset "Onehot" begin + include("onehot.jl") + end -@info "Testing Layers" + @testset "Optimise" begin + include("optimise.jl") + end -include("layers/basic.jl") -include("layers/normalisation.jl") -include("layers/stateless.jl") -include("layers/conv.jl") + @testset "Data" begin + include("data.jl") + end -if Flux.use_cuda[] - include("cuda/cuda.jl") -else - @warn "CUDA unavailable, not testing GPU support" -end + @testset "Layers" begin + include("layers/basic.jl") + include("layers/normalisation.jl") + include("layers/stateless.jl") + include("layers/conv.jl") + end -if VERSION >= v"1.2" - doctest(Flux) -end + @testset "CUDA" begin + if Flux.use_cuda[] + include("cuda/cuda.jl") + else + @warn "CUDA unavailable, not testing GPU support" + end + end -end + @testset "Docs" begin + if VERSION >= v"1.2" + doctest(Flux) + end + end + +end # testset Flux From 487002878ed530303cf9527e7cca0ea57b34d5b2 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Thu, 27 Feb 2020 20:49:05 +0100 Subject: [PATCH 20/23] restrict train! special casing --- src/optimise/train.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 34a98394..54b7f53a 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -79,7 +79,7 @@ function train!(loss, ps, data, opt; cb = () -> ()) cb = runall(cb) @progress for d in data try - if d isa AbstractArray + if d isa AbstractArray{<:Number} gs = gradient(ps) do loss(d) end From 97141e8c98fc94feadbe287f45a32b58bd3d515c Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Thu, 27 Feb 2020 20:49:55 +0100 Subject: [PATCH 21/23] improve docstring --- src/optimise/train.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 54b7f53a..79ebcc06 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -61,7 +61,7 @@ end For each datapoint `d` in `data` computes the gradient of `loss(d...)` through backpropagation and calls the optimizer `opt`. -In case datapoints `d` are of array type, assumes no splatting is needed +In case datapoints `d` are of numeric array type, assumes no splatting is needed and computes the gradient of `loss(d)`. Takes a callback as keyword argument `cb`. For example, this will print "training" From a72258ea2a428ce4b12e711395856091f17f9fcc Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sat, 29 Feb 2020 18:55:49 +0100 Subject: [PATCH 22/23] fix rebase --- src/data/Data.jl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/data/Data.jl b/src/data/Data.jl index 940b7ea7..16a025a7 100644 --- a/src/data/Data.jl +++ b/src/data/Data.jl @@ -48,11 +48,7 @@ using .Sentiment include("iris.jl") export Iris -<<<<<<< HEAD include("housing.jl") export Housing end -======= -end #module ->>>>>>> af20a785... add DataLoader From a1efc434c21d2e4026e5d4f8764854451bac88c5 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Sat, 29 Feb 2020 19:40:44 +0100 Subject: [PATCH 23/23] fix typo --- test/data.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data.jl b/test/data.jl index 1a090174..c7a8fdfd 100644 --- a/test/data.jl +++ b/test/data.jl @@ -76,7 +76,7 @@ end @test size(Iris.labels()) == (150,) end -@testest "Housing" begin +@testset "Housing" begin @test Housing.features() isa Matrix @test size(Housing.features()) == (506, 13)