From 633f0df01fc0e200e4a03cb7f3e93f868a7d1b72 Mon Sep 17 00:00:00 2001 From: Manjunath Bhat Date: Tue, 12 Mar 2019 02:31:42 +0530 Subject: [PATCH 001/139] Added new loss functions. --- src/layers/stateless.jl | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 23fd1651..3bb48f1f 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -54,3 +54,31 @@ function normalise(x::AbstractArray, dims) Base.depwarn("`normalise(x::AbstractArray, dims)` is deprecated, use `normalise(a, dims=dims)` instead.", :normalise) normalise(x, dims = dims) end + +""" + Kullback Leibler Divergence(KL Divergence) +KLDivergence is a measure of how much one probability distribution is different from the other. +It is always non-negative and zero only when both the distributions are equal everywhere. + +""" +function KLDivergence(ŷ, y) + entropy = sum(y .* log.(y)) *1 //size(y,2) + cross_entropy = crossentropy(ŷ, y) + return entropy + cross_entropy +end + +""" + Poisson Loss function +Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. + +""" +Poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) + +""" + Logcosh Loss function +""" + +logcosh(ŷ, y) = sum(log.(cosh.(ŷ .- y))) + +Hinge(ŷ, y) = sum(max.(0.0, 1 .- ŷ .* y)) *1 // size(y,2) + From 61386c04f8ac8a6badcf8ca889169eb623b5327b Mon Sep 17 00:00:00 2001 From: Manjunath Bhat Date: Tue, 12 Mar 2019 02:36:37 +0530 Subject: [PATCH 002/139] Tests added. --- test/layers/stateless.jl | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 34abb8cb..336adc12 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -49,7 +49,31 @@ const ϵ = 1e-7 @testset "logitbinarycrossentropy" begin @test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0) end + + y = [1 2 3] + y1 = [4.0 5.0 6.0] + @testset "KLDivergence" begin + @test Flux.KLDivergence(y, y1) ≈ 4.761838062403337 + @test Flux.KLDivergence(y, y) ≈ 0 + end + @testset "Hinge" begin + @test Flux.Hinge(y, y1) ≈ 0 + @test Flux.Hinge(y, 0.2 .* y) ≈ 0.33333 + end + + y = [0.1 0.2 0.3] + y1 = [0.4 0.5 0.6] + @testset "Poisson" begin + @test Flux.Poisson(y, y1) ≈ 1.0160455586700767 + @test Flux.Poisson(y, y) ≈ 0.5044459776946685 + end + + @testset "logcosh" begin + @test Flux.logcosh(y, y1) ≈ 0.13302230977782092 + @test Flux.logcosh(y, y) ≈ 0 + end + @testset "no spurious promotions" begin for T in (Float16, Float32, Float64) y = rand(T, 2) From 57a52e33750c9f8afcf7a8937abbbee766419121 Mon Sep 17 00:00:00 2001 From: Manjunath Bhat Date: Tue, 12 Mar 2019 02:58:32 +0530 Subject: [PATCH 003/139] Error of recurrent decimals fixed. 
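This patch only swaps the test data for `hinge`; the underlying problem, sketched below (illustrative, not part of the commit), is that the old expectation compared a recurring decimal against a truncated literal:

```julia
# With the old data the hinge loss comes out as a recurring decimal (1/3 up to
# rounding), so the truncated literal 0.33333 sits far outside isapprox's
# default tolerance and the ≈ test fails.
y = [1 2 3]
l = sum(max.(0.0, 1 .- y .* (0.2 .* y))) * 1 // size(y, 2)  # ≈ 1/3
l ≈ 0.33333                                                  # false
# The replacement data (y = [1 2 3 4], ŷ = 0.5 .* y) gives exactly 0.125 instead.
```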
--- test/layers/stateless.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 336adc12..f961ed2f 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -56,10 +56,12 @@ const ϵ = 1e-7 @test Flux.KLDivergence(y, y1) ≈ 4.761838062403337 @test Flux.KLDivergence(y, y) ≈ 0 end - + + y = [1 2 3 4] + y1 = [5.0 6.0 7.0 8.0] @testset "Hinge" begin @test Flux.Hinge(y, y1) ≈ 0 - @test Flux.Hinge(y, 0.2 .* y) ≈ 0.33333 + @test Flux.Hinge(y, 0.5 .* y) ≈ 0.125 end y = [0.1 0.2 0.3] From c4d12e57fe6a3ea0473e5fa6145d1d55789c9358 Mon Sep 17 00:00:00 2001 From: thebhatman Date: Tue, 26 Mar 2019 03:09:48 +0530 Subject: [PATCH 004/139] Loss function names in lowercase --- src/layers/stateless.jl | 9 +++------ test/layers/stateless.jl | 18 +++++++++--------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 3bb48f1f..424db1df 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -59,9 +59,8 @@ end Kullback Leibler Divergence(KL Divergence) KLDivergence is a measure of how much one probability distribution is different from the other. It is always non-negative and zero only when both the distributions are equal everywhere. - """ -function KLDivergence(ŷ, y) +function kldivergence(ŷ, y) entropy = sum(y .* log.(y)) *1 //size(y,2) cross_entropy = crossentropy(ŷ, y) return entropy + cross_entropy @@ -70,15 +69,13 @@ end """ Poisson Loss function Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. - """ -Poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) +poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ Logcosh Loss function """ - logcosh(ŷ, y) = sum(log.(cosh.(ŷ .- y))) -Hinge(ŷ, y) = sum(max.(0.0, 1 .- ŷ .* y)) *1 // size(y,2) +hinge(ŷ, y) = sum(max.(0.0, 1 .- ŷ .* y)) *1 // size(y,2) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index f961ed2f..97bfea10 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -52,23 +52,23 @@ const ϵ = 1e-7 y = [1 2 3] y1 = [4.0 5.0 6.0] - @testset "KLDivergence" begin - @test Flux.KLDivergence(y, y1) ≈ 4.761838062403337 - @test Flux.KLDivergence(y, y) ≈ 0 + @testset "kldivergence" begin + @test Flux.kldivergence(y, y1) ≈ 4.761838062403337 + @test Flux.kldivergence(y, y) ≈ 0 end y = [1 2 3 4] y1 = [5.0 6.0 7.0 8.0] - @testset "Hinge" begin - @test Flux.Hinge(y, y1) ≈ 0 - @test Flux.Hinge(y, 0.5 .* y) ≈ 0.125 + @testset "hinge" begin + @test Flux.hinge(y, y1) ≈ 0 + @test Flux.hinge(y, 0.5 .* y) ≈ 0.125 end y = [0.1 0.2 0.3] y1 = [0.4 0.5 0.6] - @testset "Poisson" begin - @test Flux.Poisson(y, y1) ≈ 1.0160455586700767 - @test Flux.Poisson(y, y) ≈ 0.5044459776946685 + @testset "poisson" begin + @test Flux.poisson(y, y1) ≈ 1.0160455586700767 + @test Flux.poisson(y, y) ≈ 0.5044459776946685 end @testset "logcosh" begin From 6f078857beda49e7f1d565cc7e4dded6c55db3d0 Mon Sep 17 00:00:00 2001 From: thebhatman Date: Tue, 26 Mar 2019 03:15:28 +0530 Subject: [PATCH 005/139] Added reference links to loss functions --- src/layers/stateless.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 424db1df..aaefcee9 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -59,6 +59,7 @@ end Kullback Leibler Divergence(KL Divergence) KLDivergence is a measure of how much one probability distribution is different from the other. 
It is always non-negative and zero only when both the distributions are equal everywhere. +https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence """ function kldivergence(ŷ, y) entropy = sum(y .* log.(y)) *1 //size(y,2) @@ -69,6 +70,7 @@ end """ Poisson Loss function Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. +https://isaacchanghau.github.io/post/loss_functions/ """ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) From 930adb122dc5443f205ced401b5275ddbeeb67ca Mon Sep 17 00:00:00 2001 From: Manjunath Bhat Date: Mon, 25 Mar 2019 23:43:06 +0530 Subject: [PATCH 006/139] Avoided promotion to Float64 in hinge. --- src/layers/stateless.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index aaefcee9..3221ddff 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -79,5 +79,5 @@ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ logcosh(ŷ, y) = sum(log.(cosh.(ŷ .- y))) -hinge(ŷ, y) = sum(max.(0.0, 1 .- ŷ .* y)) *1 // size(y,2) +hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) From 4efcc69ba5de4f68f5e0e0dc474b44ddf9388615 Mon Sep 17 00:00:00 2001 From: thebhatman Date: Tue, 26 Mar 2019 23:23:02 +0530 Subject: [PATCH 007/139] logcosh averaged --- src/layers/stateless.jl | 2 +- test/layers/stateless.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 3221ddff..6b6abb5e 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -77,7 +77,7 @@ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ Logcosh Loss function """ -logcosh(ŷ, y) = sum(log.(cosh.(ŷ .- y))) +logcosh(ŷ, y) = sum(log.(cosh.(ŷ .- y))) *1 // size(y,2) hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 97bfea10..e8d881fb 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -72,7 +72,7 @@ const ϵ = 1e-7 end @testset "logcosh" begin - @test Flux.logcosh(y, y1) ≈ 0.13302230977782092 + @test Flux.logcosh(y, y1) ≈ 0.044340769925940306 @test Flux.logcosh(y, y) ≈ 0 end From b84ab7ac95aa1eca3ec302bc7f997518b3e71612 Mon Sep 17 00:00:00 2001 From: thebhatman Date: Fri, 5 Apr 2019 03:16:54 +0530 Subject: [PATCH 008/139] Removed logcosh --- src/layers/stateless.jl | 5 ----- test/layers/stateless.jl | 5 ----- 2 files changed, 10 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 6b6abb5e..3444f0f4 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -74,10 +74,5 @@ https://isaacchanghau.github.io/post/loss_functions/ """ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) -""" - Logcosh Loss function -""" -logcosh(ŷ, y) = sum(log.(cosh.(ŷ .- y))) *1 // size(y,2) - hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index e8d881fb..d912a5fe 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -70,11 +70,6 @@ const ϵ = 1e-7 @test Flux.poisson(y, y1) ≈ 1.0160455586700767 @test Flux.poisson(y, y) ≈ 0.5044459776946685 end - - @testset "logcosh" begin - @test Flux.logcosh(y, y1) ≈ 0.044340769925940306 - @test Flux.logcosh(y, y) ≈ 0 - end @testset "no spurious promotions" begin for T in (Float16, Float32, Float64) From 710084ffbfca78805d8c0fe41be8e9dbb58b3c4f Mon Sep 17 00:00:00 2001 From: thebhatman Date: Fri, 5 Apr 2019 23:50:16 +0530 Subject: [PATCH 
009/139] Loss functions added to docs --- docs/src/training/training.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index ae483783..76c099eb 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -32,6 +32,18 @@ Flux.train!(loss, ps, data, opt) The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want. +In-built loss functions: +```@docs +mse +crossentropy +logitcrossentropy +binarycrossentropy +logitbinarycrossentropy +kldivergence +poisson +hinge +``` + ## Datasets The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point: From 540b7366ec0edd711953223ef44bf342d691127f Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 00:54:49 -0700 Subject: [PATCH 010/139] make activations zygote friendly --- src/layers/basic.jl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 83eeee21..b4b869c5 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -45,17 +45,15 @@ end # it might be replaced in the future for better performance # see issue https://github.com/FluxML/Flux.jl/issues/702 # Johnny Chen -- @johnnychen94 +# only slightly changed to better handle interaction with Zygote @dsweber2 """ activations(c::Chain, input) Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) - rst = [] - for l in c - x = get(rst, length(rst), input) - push!(rst, l(x)) - end - return rst + buffed = accumulate!((x,y)->y(x), Zygote.Buffer([], length(c)), + [l for l in c], dims=1, init=input) + return copy(buffed) end From 38790dd4db5520e6e587783804d1144a3b75ac9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Sun, 8 Sep 2019 16:15:35 +0100 Subject: [PATCH 011/139] Restore purity --- .gitattributes | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitattributes b/.gitattributes index e02ed0b7..4992eb2c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,2 @@ paper/* linguist-documentation +CITATION.bib linguist-detectable=false From 82261b5bb7e6783d6a273c8e7803c4fbb28a3dd8 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 00:54:49 -0700 Subject: [PATCH 012/139] make activations zygote friendly --- src/layers/basic.jl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 13d56472..fd187d8c 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -45,17 +45,15 @@ end # it might be replaced in the future for better performance # see issue https://github.com/FluxML/Flux.jl/issues/702 # Johnny Chen -- @johnnychen94 +# only slightly changed to better handle interaction with Zygote @dsweber2 """ activations(c::Chain, input) Calculate the forward results of each layers in Chain `c` with `input` as model input. 
""" function activations(c::Chain, input) - rst = [] - for l in c - x = get(rst, length(rst), input) - push!(rst, l(x)) - end - return rst + buffed = accumulate!((x,y)->y(x), Zygote.Buffer([], length(c)), + [l for l in c], dims=1, init=input) + return copy(buffed) end From 1bb25dc1f9c54666d73b516629e0c89033e1c0e2 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 01:34:12 -0700 Subject: [PATCH 013/139] adding the extra commits broke the accumulate version --- src/layers/basic.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index fd187d8c..e1e9ab45 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -51,9 +51,12 @@ end Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) - buffed = accumulate!((x,y)->y(x), Zygote.Buffer([], length(c)), - [l for l in c], dims=1, init=input) - return copy(buffed) + res = Zygote.Buffer([], length(c)) + res[1] = c[1](input) + for (i,l) in enumerate(c[2:end]) + res[i+1] = l(res[i]) + end + return copy(res) end From f41219133e8a233c8e0056972641378c4e83c427 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 10:46:56 -0700 Subject: [PATCH 014/139] deal with empty Chain --- src/layers/basic.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index e1e9ab45..9ef6f195 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -52,9 +52,11 @@ Calculate the forward results of each layers in Chain `c` with `input` as model """ function activations(c::Chain, input) res = Zygote.Buffer([], length(c)) - res[1] = c[1](input) - for (i,l) in enumerate(c[2:end]) - res[i+1] = l(res[i]) + if length(c) > 0 + res[1] = c[1](input) + for (i,l) in enumerate(c[2:end]) + res[i+1] = l(res[i]) + end end return copy(res) end From 46abfbbd5cd4579e66912996c5ff4b568a01d1ea Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 11 Sep 2019 17:36:37 -0700 Subject: [PATCH 015/139] recursive way of doing activations --- src/layers/basic.jl | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 9ef6f195..e2e3e56a 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -51,16 +51,17 @@ end Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) - res = Zygote.Buffer([], length(c)) - if length(c) > 0 - res[1] = c[1](input) - for (i,l) in enumerate(c[2:end]) - res[i+1] = l(res[i]) - end - end - return copy(res) + extraChain(c.layers, input) end +function extraChain(fs::Tuple, x) + res = first(fs)(x) + return (res, extraChain(Base.tail(fs), res)...) +end + +extraChain(::Tuple{}, x) = [] + + """ Dense(in::Integer, out::Integer, σ = identity) From 04fce70019ee59a9ae8050ec8d683670f12e5942 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Aug 2019 16:34:35 +0200 Subject: [PATCH 016/139] Move low-level CUDNN wrappers to CuArrays. 
--- src/cuda/cuda.jl | 1 + src/cuda/cudnn.jl | 80 +++++++++------------------------------------ src/cuda/curnn.jl | 83 ++++++++++++----------------------------------- 3 files changed, 36 insertions(+), 128 deletions(-) diff --git a/src/cuda/cuda.jl b/src/cuda/cuda.jl index 028a0f8b..00f0d0f2 100644 --- a/src/cuda/cuda.jl +++ b/src/cuda/cuda.jl @@ -3,6 +3,7 @@ module CUDA using ..CuArrays if CuArrays.libcudnn !== nothing # TODO: use CuArrays.has_cudnn() + using CuArrays: CUDNN include("curnn.jl") include("cudnn.jl") else diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl index 448ea140..aa16f926 100644 --- a/src/cuda/cudnn.jl +++ b/src/cuda/cudnn.jl @@ -1,7 +1,6 @@ -using CuArrays: libcudnn -using CuArrays.CUDNN: @check, handle, cudnnStatus_t, cudnnTensorDescriptor_t, - cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc -import CuArrays.CUDAdrv: CuPtr, CU_NULL +using CuArrays.CUDNN: handle, TensorDesc, FilterDesc + +import CuArrays.CUDAdrv: CU_NULL using LinearAlgebra @@ -15,22 +14,17 @@ Base.unsafe_convert(::Type{Ptr{Nothing}}, dd::DropoutDesc) = dd.ptr function DropoutDesc(ρ::Real; seed::Integer=0) d = [C_NULL] s = Csize_t[0] - @check ccall((:cudnnCreateDropoutDescriptor,libcudnn), cudnnStatus_t, (Ptr{Ptr{Nothing}},), d) - @check ccall((:cudnnDropoutGetStatesSize,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Csize_t}),handle(),s) + CUDNN.cudnnCreateDropoutDescriptor(d) + CUDNN.cudnnDropoutGetStatesSize(handle(), s) states = CuArray{UInt8}(undef, s[]) # TODO: can we drop this when ρ=0? desc = DropoutDesc(d[], states) - @check ccall((:cudnnSetDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Nothing},Cfloat,CuPtr{Nothing},Csize_t,Culonglong), - desc,handle(),ρ,states,length(states),seed) + CUDNN.cudnnSetDropoutDescriptor(desc, handle(), ρ, states, length(states), seed) finalizer(desc) do x - @check ccall((:cudnnDestroyDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x) + CUDNN.cudnnDestroyDropoutDescriptor(x) end return desc end -const BATCHNORM_SPATIAL = 1 -const BATCHNORM_ACTIVATION = 0 -const BATCHNORM_MIN_EPS = 1e-5 - @inline _wsize(y) = (map(_ -> 1, size(y)[1:end-2])..., size(y)[end-1], 1) @inline _reddims(y) = (collect(1:ndims(y)-2)..., ndims(y)) @@ -67,9 +61,9 @@ function cudnnBNForward!(y::CuArray{T}, g::CuArray{T}, b::CuArray{T}, x::CuArray alpha = T(1), beta = T(0), eps = T(1e-5), training = true) where T<:Union{Float32, Float64} dims = _wsize(x) - if eps < BATCHNORM_MIN_EPS - # warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", BATCHNORM_MIN_EPS) - eps = BATCHNORM_MIN_EPS + if eps < CUDNN.CUDNN_BN_MIN_EPSILON + # warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", CUDNN.CUDNN_BN_MIN_EPSILON) + eps = CUDNN.CUDNN_BN_MIN_EPSILON end xd = TensorDesc(x) yd = TensorDesc(y) @@ -85,42 +79,14 @@ function cudnnBNForward!(y::CuArray{T}, g::CuArray{T}, b::CuArray{T}, x::CuArray ivar = CU_NULL end - @check ccall((:cudnnBatchNormalizationForwardTraining, libcudnn), cudnnStatus_t, - (cudnnHandle_t,cudnnBatchNormMode_t, - Ptr{T}, Ptr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, CuPtr{T}, - Cdouble, CuPtr{T}, CuPtr{T}, - Cdouble, CuPtr{T}, CuPtr{T}), - handle(), BATCHNORM_SPATIAL, - Ref(T(alpha)), Ref(T(beta)), - xd, x, - yd, y, - gd, g, b, - momentum, running_mean, running_var, - eps, mean, ivar) + CUDNN.cudnnBatchNormalizationForwardTraining(handle(), CUDNN.CUDNN_BATCHNORM_SPATIAL, Ref(T(alpha)), Ref(T(beta)), xd, x, yd, y, gd, g, b, momentum, running_mean, 
running_var, eps, mean, ivar) if cache !== nothing cache.mean = mean cache.ivar = ivar end else - @check ccall((:cudnnBatchNormalizationForwardInference, libcudnn), cudnnStatus_t, - (Ptr{cudnnHandle_t},cudnnBatchNormMode_t, - Ptr{T}, Ptr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, CuPtr{T}, - CuPtr{T}, CuPtr{T}, - Cdouble), - handle(), BATCHNORM_SPATIAL, - Ref(T(alpha)), Ref(T(beta)), - xd, x, - yd, y, - gd, g, b, - running_mean, running_var, - eps) + CUDNN.cudnnBatchNormalizationForwardInference(handle(), CUDNN.CUDNN_BATCHNORM_SPATIAL, Ref(T(alpha)), Ref(T(beta)), xd, x, yd, y, gd, g, b, running_mean, running_var, eps) end end @@ -164,27 +130,11 @@ function cudnnBNBackward!(dg::CuArray{T}, g::CuArray{T}, db::CuArray{T}, mean, ivar = CU_NULL, CU_NULL end - if eps < BATCHNORM_MIN_EPS - eps = BATCHNORM_MIN_EPS + if eps < CUDNN.CUDNN_BN_MIN_EPSILON + eps = CUDNN.CUDNN_BN_MIN_EPSILON end - @check ccall((:cudnnBatchNormalizationBackward, libcudnn), cudnnStatus_t, - (cudnnHandle_t,cudnnBatchNormMode_t, - Ptr{T}, Ptr{T}, - Ptr{T}, Ptr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, CuPtr{T}, CuPtr{T}, - Cdouble, CuPtr{T}, CuPtr{T}), - handle(), BATCHNORM_SPATIAL, - Ref(T(alpha)), Ref(T(beta)), - Ref(T(dalpha)), Ref(T(dbeta)), - xd, x, - dyd, dy, - dxd, dx, - gd, g, dg, db, - eps, mean, ivar) + CUDNN.cudnnBatchNormalizationBackward(handle(), CUDNN.CUDNN_BATCHNORM_SPATIAL, Ref(T(alpha)), Ref(T(beta)), Ref(T(dalpha)), Ref(T(dbeta)), xd, x, dyd, dy, dxd, dx, gd, g, dg, db, eps, mean, ivar) else ivar = 1 ./ sqrt.(reshape(running_var, _wsize(x)) .+ eps) dx .= dy .* reshape(g, _wsize(x)) .* ivar diff --git a/src/cuda/curnn.jl b/src/cuda/curnn.jl index ca8b5140..c37d031c 100644 --- a/src/cuda/curnn.jl +++ b/src/cuda/curnn.jl @@ -1,8 +1,6 @@ -using CuArrays: libcudnn -using CuArrays.CUDNN: @check, cudnnStatus_t, cudnnTensorDescriptor_t, - cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc +using CuArrays.CUDNN: handle, cudnnDataType, TensorDesc, FilterDesc -import CuArrays.CUDAdrv: CuPtr, CU_NULL +import CuArrays.CUDAdrv: CU_NULL using LinearAlgebra @@ -48,8 +46,7 @@ Base.unsafe_convert(::Type{Ptr{Nothing}}, d::RNNDesc) = d.ptr function rnnParamSize(T, r, input) size = Csize_t[0] - @check ccall((:cudnnGetRNNParamsSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Ptr{Nothing},Ptr{Csize_t},Cint), - handle(), r, TensorDesc(T, (1,input,1)), size, cudnnDataType(T)) + CUDNN.cudnnGetRNNParamsSize(handle(), r, TensorDesc(T, (1,input,1)), size, cudnnDataType(T)) return Int(size[])÷sizeof(T) end @@ -58,28 +55,26 @@ ngates(r::RNNDesc) = ngates(r.mode) function RNNDesc{T}(mode::Int, input::Int, hidden::Int; layers = 1) where T d = [C_NULL] - @check ccall((:cudnnCreateRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Ptr{Nothing}},),d) + CUDNN.cudnnCreateRNNDescriptor(d) dropoutDesc = DropoutDesc(0) inputMode = LINEAR_INPUT direction = UNIDIRECTIONAL algo = RNN_ALGO_STANDARD - @check ccall((:cudnnSetRNNDescriptor_v6,libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Cint,Ptr{Nothing},Cint,Cint,Cint,Cint,Cint), - handle(),d[],hidden,layers,dropoutDesc,inputMode,direction,mode,algo,cudnnDataType(T)) + CUDNN.cudnnSetRNNDescriptor_v6(handle(),d[],hidden,layers,dropoutDesc,CUDNN.cudnnRNNInputMode_t(inputMode),CUDNN.cudnnDirectionMode_t(direction),CUDNN.cudnnRNNMode_t(mode),CUDNN.cudnnRNNAlgo_t(algo),cudnnDataType(T)) - w = CuArrays.zeros(T, rnnParamSize(T, d[], input)) + w =CuArrays.zeros(T, 
rnnParamSize(T, d[], input)) # TODO: avoid reserve allocation here rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[]) finalizer(rd) do x - @check ccall((:cudnnDestroyRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x) + CUDNN.cudnnDestroyRNNDescriptor(x) end return rd end function rnnWorkspaceSize(r::RNNDesc, seqlen, xdesc) size = Csize_t[0] - @check ccall((:cudnnGetRNNWorkspaceSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Ptr{Ptr{Nothing}},Ptr{Csize_t}), - handle(), r, seqlen, xdesc, size) + CUDNN.cudnnGetRNNWorkspaceSize(handle(), r, seqlen, xdesc, size) return Int(size[]) end @@ -95,31 +90,18 @@ getworkspace(r::RNNDesc, seqlen, xdesc) = function rnnTrainingReserveSize(r::RNNDesc, seqlen, xdesc) size = Csize_t[0] - @check ccall((:cudnnGetRNNTrainingReserveSize,libcudnn), cudnnStatus_t, (Ptr{Nothing}, Ptr{Nothing}, Cint, Ptr{Ptr{Nothing}}, Ptr{Csize_t}), - handle(), r, seqlen, xdesc, size) + CUDNN.cudnnGetRNNTrainingReserveSize(handle(), r, seqlen, xdesc, size) return Int(size[]) end function cudnnRNNForward(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, workspace, reserve=nothing) where T if reserve == nothing - @check ccall((:cudnnRNNForwardInference, libcudnn), cudnnStatus_t, - (Ptr{Nothing}, Ptr{Nothing}, Cint, - Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, - CuPtr{Nothing}, Csize_t), - handle(), rnn, seqlen, - xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, - workspace, length(workspace)) + CUDNN.cudnnRNNForwardInference(handle(), rnn, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, + hod, ho, cod, co, workspace, length(workspace)) else - @check ccall((:cudnnRNNForwardTraining, libcudnn), cudnnStatus_t, - (Ptr{Nothing}, Ptr{Nothing}, Cint, - Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, - CuPtr{Nothing}, Csize_t, CuPtr{Nothing}, Csize_t), - handle(), rnn, seqlen, - xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, - workspace, length(workspace), reserve, length(reserve)) + CUDNN.cudnnRNNForwardTraining(handle(), rnn, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, + hod, ho, cod, co, workspace, length(workspace), reserve, length(reserve)) end end @@ -134,8 +116,8 @@ end # TODO: can we just manipulate strides here? # TODO: should use repmat, but this isn't implemented. hBatch(x::AbstractVector, h::CuVector) = h -hBatch(x::AbstractMatrix, h::CuVector) = h .* CuArrays.ones(1, size(x, 2)) -hBatch(x::AbstractMatrix, h::CuMatrix) = h .* CuArrays.ones(1, size(h,2) == 1 ? size(x,2) : 1) +hBatch(x::AbstractMatrix, h::CuVector) = h .*CuArrays.ones(1, size(x, 2)) +hBatch(x::AbstractMatrix, h::CuMatrix) = h .*CuArrays.ones(1, size(h,2) == 1 ? 
size(x,2) : 1) function forward(rnn::RNNDesc{T}, x::CuArray{T}, h_::CuArray{T}, c_ = nothing, train = Val{false}) where T h = hBatch(x, h_) @@ -169,18 +151,6 @@ end forwardTrain(rnn::RNNDesc{T}, x::CuArray{T}, h::CuArray{T}, c = nothing) where T = forward(rnn, x, h, c, Val{true}) -function cudnnRNNBackwardData(rnn::RNNDesc{T}, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco, - wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, rs) where T - @check ccall((:cudnnRNNBackwardData,libcudnn),cudnnStatus_t, - (Ptr{Nothing}, Ptr{Nothing}, Cint, - Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, - CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, - CuPtr{Nothing}, Csize_t, CuPtr{Nothing}, Csize_t), - handle(), rnn, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco, - wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, length(ws), rs, length(rs)) -end - function backwardData(rnn::RNNDesc{T}, y, dy_, dho, dco, h, c, reserve) where T # Same as above, any more efficient way? dy = dy_ isa Integer ? zero(y) : dy_ @@ -188,37 +158,24 @@ function backwardData(rnn::RNNDesc{T}, y, dy_, dho, dco, h, c, reserve) where T dx = y isa AbstractVector ? similar(dy, rnn.input) : similar(dy, rnn.input, size(dy, 2)) dh = similar(h) dc = c == nothing ? nothing : similar(c) - cudnnRNNBackwardData(rnn, 1, + CUDNN.cudnnRNNBackwardData(handle(), rnn, 1, yd, y, yd, dy, hDesc(dho)..., hDesc(dco)..., FilterDesc(T, (1, 1, length(rnn.params))), rnn.params, hDesc(h)..., hDesc(c)..., xDesc(dx), dx, hDesc(dh)..., hDesc(dc)..., - workspace[], reserve) + workspace[], length(workspace[]), reserve, length(reserve)) return c == nothing ? (dx, dh) : (dx, dh, dc) end backwardData(rnn, y, dy, dho, hx, reserve) = backwardData(rnn, y, dy, dho, nothing, hx, nothing, reserve) -function cudnnRNNBackwardWeights(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, yd, y, dwd, dw, - workspace, reserve) where T - @check ccall((:cudnnRNNBackwardWeights,libcudnn), cudnnStatus_t, - (Ptr{Nothing}, Ptr{Nothing}, Cint, # handle, rnnDesc, seqLength - Ptr{Ptr{Nothing}}, CuPtr{T}, #x - Ptr{Nothing}, CuPtr{T}, #hx - Ptr{Ptr{Nothing}}, CuPtr{T}, #y - CuPtr{Nothing}, Csize_t, #ws - Ptr{Nothing}, CuPtr{T}, #dw - CuPtr{Nothing}, Csize_t), #rs - handle(), rnn, seqlen, xd, x, hd, h, yd, y, - workspace, length(workspace), dwd, dw, reserve, length(reserve)) -end - function backwardWeights(rnn::RNNDesc{T}, x, h, y, reserve) where T dw = zero(rnn.params) - cudnnRNNBackwardWeights(rnn, 1, + CUDNN.cudnnRNNBackwardWeights(handle(), rnn, 1, xDesc(x), x, hDesc(h)..., xDesc(y), y, + workspace[], length(workspace[]), FilterDesc(T, (1, 1, length(dw))), dw, - workspace[], reserve) + reserve, length(reserve)) return params(dw, rnn.input, rnn.hidden, ngates(rnn)) end From 1e7ff4f65ddb6ee1eada1f9e960ade56593e89d9 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Aug 2019 17:26:10 +0200 Subject: [PATCH 017/139] Query the worksize. --- src/cuda/curnn.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cuda/curnn.jl b/src/cuda/curnn.jl index c37d031c..bbd4e122 100644 --- a/src/cuda/curnn.jl +++ b/src/cuda/curnn.jl @@ -158,11 +158,12 @@ function backwardData(rnn::RNNDesc{T}, y, dy_, dho, dco, h, c, reserve) where T dx = y isa AbstractVector ? similar(dy, rnn.input) : similar(dy, rnn.input, size(dy, 2)) dh = similar(h) dc = c == nothing ? 
nothing : similar(c) + workspace = getworkspace(rnn, 1, yd) CUDNN.cudnnRNNBackwardData(handle(), rnn, 1, yd, y, yd, dy, hDesc(dho)..., hDesc(dco)..., FilterDesc(T, (1, 1, length(rnn.params))), rnn.params, hDesc(h)..., hDesc(c)..., xDesc(dx), dx, hDesc(dh)..., hDesc(dc)..., - workspace[], length(workspace[]), reserve, length(reserve)) + workspace, length(workspace), reserve, length(reserve)) return c == nothing ? (dx, dh) : (dx, dh, dc) end From 4942d7fcfd405b7790c038e3e557015da38d8152 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 30 Aug 2019 08:39:51 +0200 Subject: [PATCH 018/139] Move functionality over to CuArrays. --- src/cuda/cudnn.jl | 148 +------------------------------ src/cuda/curnn.jl | 221 ++++------------------------------------------ 2 files changed, 21 insertions(+), 348 deletions(-) diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl index aa16f926..d394182e 100644 --- a/src/cuda/cudnn.jl +++ b/src/cuda/cudnn.jl @@ -1,149 +1,5 @@ -using CuArrays.CUDNN: handle, TensorDesc, FilterDesc - -import CuArrays.CUDAdrv: CU_NULL - -using LinearAlgebra - -mutable struct DropoutDesc - ptr::Ptr{Nothing} - states::CuVector{UInt8} -end - -Base.unsafe_convert(::Type{Ptr{Nothing}}, dd::DropoutDesc) = dd.ptr - -function DropoutDesc(ρ::Real; seed::Integer=0) - d = [C_NULL] - s = Csize_t[0] - CUDNN.cudnnCreateDropoutDescriptor(d) - CUDNN.cudnnDropoutGetStatesSize(handle(), s) - states = CuArray{UInt8}(undef, s[]) # TODO: can we drop this when ρ=0? - desc = DropoutDesc(d[], states) - CUDNN.cudnnSetDropoutDescriptor(desc, handle(), ρ, states, length(states), seed) - finalizer(desc) do x - CUDNN.cudnnDestroyDropoutDescriptor(x) - end - return desc -end - -@inline _wsize(y) = (map(_ -> 1, size(y)[1:end-2])..., size(y)[end-1], 1) - -@inline _reddims(y) = (collect(1:ndims(y)-2)..., ndims(y)) - -mutable struct BNCache - mean - ivar -end - -BNCache() = BNCache(nothing, nothing) - -# NOTE: CuDNN supports only 4D and 5D Tensors for BatchNorm Operations -# so reshape a 2D Tensor into 4D -batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2}, - running_mean::CuArray{T}, running_var::CuArray{T}, momentum; - cache = nothing, alpha = T(1), beta = T(0), - eps = T(1e-5), training = true) where T<:Union{Float32, Float64} = - dropdims(batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), running_mean, running_var, momentum, - cache = cache, alpha = alpha, beta = beta, eps = eps, training = training), dims = (1, 2)) - -function batchnorm(g::CuArray{T}, b::CuArray{T}, x::Union{CuArray{T, 4},CuArray{T,5}}, - running_mean::CuArray{T}, running_var::CuArray{T}, momentum; - cache = nothing, alpha = T(1), beta = T(0), - eps = T(1e-5), training = true) where T<:Union{Float32, Float64} - y = similar(x) - cudnnBNForward!(y, g, b, x, running_mean, running_var, momentum, cache = cache, - alpha = alpha, beta = beta, eps = eps, training = training) - y -end - -function cudnnBNForward!(y::CuArray{T}, g::CuArray{T}, b::CuArray{T}, x::CuArray{T}, - running_mean::CuArray{T}, running_var::CuArray{T}, - momentum; cache = nothing, - alpha = T(1), beta = T(0), - eps = T(1e-5), training = true) where T<:Union{Float32, Float64} - dims = _wsize(x) - if eps < CUDNN.CUDNN_BN_MIN_EPSILON - # warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", CUDNN.CUDNN_BN_MIN_EPSILON) - eps = CUDNN.CUDNN_BN_MIN_EPSILON - end - xd = TensorDesc(x) - yd = TensorDesc(y) - gd = TensorDesc(T, dims) - - if training - - if cache !== nothing - mean = zeros(CuArray{T}, dims...) - ivar = ones(CuArray{T}, dims...) 
- else - mean = CU_NULL - ivar = CU_NULL - end - - CUDNN.cudnnBatchNormalizationForwardTraining(handle(), CUDNN.CUDNN_BATCHNORM_SPATIAL, Ref(T(alpha)), Ref(T(beta)), xd, x, yd, y, gd, g, b, momentum, running_mean, running_var, eps, mean, ivar) - - if cache !== nothing - cache.mean = mean - cache.ivar = ivar - end - else - CUDNN.cudnnBatchNormalizationForwardInference(handle(), CUDNN.CUDNN_BATCHNORM_SPATIAL, Ref(T(alpha)), Ref(T(beta)), xd, x, yd, y, gd, g, b, running_mean, running_var, eps) - end -end - -function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2}, dy::CuArray{T, 2}, - running_mean::CuArray{T}, running_var::CuArray{T}, momentum; - cache = nothing, eps = T(1e-5), alpha = T(1), - beta = T(0), training = true) where T<:Union{Float32, Float64} - dg, db, dx = ∇batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), reshape(dy, 1, 1, size(dy, 1), - size(dy, 2)), running_mean, running_var, momentum, cache = cache, eps = eps, - alpha = alpha, beta = beta, training = training) - (dg, db, dropdims(dx, dims = (1, 2))) -end - -function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T}, dy::CuArray{T}, - running_mean::CuArray{T}, running_var::CuArray{T}, momentum; - cache = nothing, eps = T(1e-5), alpha = T(1), - beta = T(0), training = true) where T<:Union{Float32, Float64} - dg = similar(g) - db = similar(b) - dx = similar(x) - cudnnBNBackward!(dg, g, db, dx, x, dy, running_mean, running_var, T(momentum), - training = training, cache = cache, eps = eps, alpha = alpha, beta = beta) - (dg, db, dx) -end - -function cudnnBNBackward!(dg::CuArray{T}, g::CuArray{T}, db::CuArray{T}, - dx::CuArray{T}, x::CuArray{T}, dy::CuArray{T}, - running_mean::CuArray{T}, running_var::CuArray{T}, - momentum; cache = nothing, eps = T(1e-5), - alpha = T(1), beta = T(0), - dalpha = T(1), dbeta = T(0), training = true) where T<:Union{Float32, Float64} - if training - xd = TensorDesc(x) - dyd = TensorDesc(dy) - dxd = TensorDesc(dx) - gd = TensorDesc(T, _wsize(x)) - if cache !== nothing - mean, ivar = cache.mean, cache.ivar - info("mean and ivar are fetched from the cache") - else - mean, ivar = CU_NULL, CU_NULL - end - - if eps < CUDNN.CUDNN_BN_MIN_EPSILON - eps = CUDNN.CUDNN_BN_MIN_EPSILON - end - - CUDNN.cudnnBatchNormalizationBackward(handle(), CUDNN.CUDNN_BATCHNORM_SPATIAL, Ref(T(alpha)), Ref(T(beta)), Ref(T(dalpha)), Ref(T(dbeta)), xd, x, dyd, dy, dxd, dx, gd, g, dg, db, eps, mean, ivar) - else - ivar = 1 ./ sqrt.(reshape(running_var, _wsize(x)) .+ eps) - dx .= dy .* reshape(g, _wsize(x)) .* ivar - dg .= squeeze(sum(dy .* (x .- reshape(running_mean, _wsize(x))) .* ivar, _reddims(dy)), dims = (1,2,4)) - db .= squeeze(sum(dy, _reddims(dy)), dims = (1,2,4)) - end -end - -# Flux Interface +import ..Flux: data +import CuArrays.CUDNN: batchnorm, ∇batchnorm (BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, cache = nothing) where T<:Union{Float32, Float64} = BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, training = Flux.istraining())) diff --git a/src/cuda/curnn.jl b/src/cuda/curnn.jl index bbd4e122..edbf58c5 100644 --- a/src/cuda/curnn.jl +++ b/src/cuda/curnn.jl @@ -1,190 +1,7 @@ -using CuArrays.CUDNN: handle, cudnnDataType, TensorDesc, FilterDesc - -import CuArrays.CUDAdrv: CU_NULL - -using LinearAlgebra - -const RNN_RELU = 0 # Stock RNN with ReLu activation -const RNN_TANH = 1 # Stock RNN with tanh activation -const LSTM = 2 # LSTM with no peephole connections -const GRU = 3 # Using h' = tanh(r * Uh(t-1) + Wx) and h = 
(1 - z) * h' + z * h(t-1) - -const LINEAR_INPUT = 0 -const SKIP_INPUT = 1 - -const UNIDIRECTIONAL = 0 -const BIDIRECTIONAL = 1 - -const RNN_ALGO_STANDARD = 0 -const RNN_ALGO_PERSIST_STATIC = 1 -const RNN_ALGO_PERSIST_DYNAMIC = 2 - -# param layout: -# RNN: [weight, bias] × [input, hidden] -# GRU: [weight, bias] × [input, hidden] × [reset, update, newmem] -# LSTM: [weight, bias] × [input, hidden] × [input, forget, newmem, output] - -function params(w::CuVector, input, hidden, n = 1) - slice(offset, shape) = reshape(view(w, offset.+(1:prod(shape))), shape) - wx = slice(0, (input, hidden*n)) - wh = slice(length(wx), (hidden, hidden*n)) - bias = view(w, length(wx)+length(wh) .+ (1:hidden*n)) - (wx, wh), bias -end - -mutable struct RNNDesc{T} - mode::Int - input::Int - hidden::Int - params::CuVector{T} - weights::NTuple{2,CuMatrix{T}} - bias::CuVector{T} - ptr::Ptr{Nothing} -end - -Base.unsafe_convert(::Type{Ptr{Nothing}}, d::RNNDesc) = d.ptr - -function rnnParamSize(T, r, input) - size = Csize_t[0] - CUDNN.cudnnGetRNNParamsSize(handle(), r, TensorDesc(T, (1,input,1)), size, cudnnDataType(T)) - return Int(size[])÷sizeof(T) -end - -ngates(mode) = [1, 1, 4, 3][mode+1] -ngates(r::RNNDesc) = ngates(r.mode) - -function RNNDesc{T}(mode::Int, input::Int, hidden::Int; layers = 1) where T - d = [C_NULL] - CUDNN.cudnnCreateRNNDescriptor(d) - - dropoutDesc = DropoutDesc(0) - inputMode = LINEAR_INPUT - direction = UNIDIRECTIONAL - algo = RNN_ALGO_STANDARD - CUDNN.cudnnSetRNNDescriptor_v6(handle(),d[],hidden,layers,dropoutDesc,CUDNN.cudnnRNNInputMode_t(inputMode),CUDNN.cudnnDirectionMode_t(direction),CUDNN.cudnnRNNMode_t(mode),CUDNN.cudnnRNNAlgo_t(algo),cudnnDataType(T)) - - w =CuArrays.zeros(T, rnnParamSize(T, d[], input)) - # TODO: avoid reserve allocation here - rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[]) - finalizer(rd) do x - CUDNN.cudnnDestroyRNNDescriptor(x) - end - return rd -end - -function rnnWorkspaceSize(r::RNNDesc, seqlen, xdesc) - size = Csize_t[0] - CUDNN.cudnnGetRNNWorkspaceSize(handle(), r, seqlen, xdesc, size) - return Int(size[]) -end - -const workspace = [CuVector{UInt8}(undef, 1)] - -getworkspace(bytes) = - length(workspace[]) ≥ bytes ? - workspace[] : - (workspace[] = CuVector{UInt8}(undef, bytes)) - -getworkspace(r::RNNDesc, seqlen, xdesc) = - getworkspace(rnnWorkspaceSize(r, seqlen, xdesc)) - -function rnnTrainingReserveSize(r::RNNDesc, seqlen, xdesc) - size = Csize_t[0] - CUDNN.cudnnGetRNNTrainingReserveSize(handle(), r, seqlen, xdesc, size) - return Int(size[]) -end - -function cudnnRNNForward(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, - workspace, reserve=nothing) where T - if reserve == nothing - CUDNN.cudnnRNNForwardInference(handle(), rnn, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, - hod, ho, cod, co, workspace, length(workspace)) - else - CUDNN.cudnnRNNForwardTraining(handle(), rnn, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, - hod, ho, cod, co, workspace, length(workspace), reserve, length(reserve)) - end -end - -xDesc(x) = [TensorDesc(eltype(x), (1, size(x, 1), size(x, 2)))] - -hDesc(h::Nothing) = C_NULL, CU_NULL -hDesc(x::Integer) = (@assert x == 0; hDesc(nothing)) -function hDesc(h::CuArray) - TensorDesc(eltype(h), (size(h, 1), size(h, 2), 1)), h -end - -# TODO: can we just manipulate strides here? -# TODO: should use repmat, but this isn't implemented. 
-hBatch(x::AbstractVector, h::CuVector) = h -hBatch(x::AbstractMatrix, h::CuVector) = h .*CuArrays.ones(1, size(x, 2)) -hBatch(x::AbstractMatrix, h::CuMatrix) = h .*CuArrays.ones(1, size(h,2) == 1 ? size(x,2) : 1) - -function forward(rnn::RNNDesc{T}, x::CuArray{T}, h_::CuArray{T}, c_ = nothing, train = Val{false}) where T - h = hBatch(x, h_) - c = c_ == nothing ? nothing : hBatch(x, c_) - @assert size(x, 1) == rnn.input - @assert size(h, 1) == rnn.hidden - @assert size(x, 2) == size(h, 2) - seqLength = 1 - xdesc = xDesc(x) - y = x isa AbstractVector ? similar(x, rnn.hidden) : similar(x, rnn.hidden, size(x, 2)) - ho = similar(h) - ydesc = xDesc(y) - workspace = getworkspace(rnn, seqLength, xdesc) - reserve = train == Val{true} ? - CuVector{UInt8}(undef, rnnTrainingReserveSize(rnn, seqLength, xdesc)) : - nothing - co = c == nothing ? c : similar(c) - cudnnRNNForward(rnn, seqLength, - xdesc, x, - hDesc(h)..., - hDesc(c)..., - FilterDesc(T, (1, 1, length(rnn.params))), rnn.params, - ydesc, y, - hDesc(ho)..., - hDesc(co)..., - workspace, reserve) - result = c == nothing ? (y, ho) : (y, ho, co) - return train == Val{true} ? (reserve, result) : result -end - -forwardTrain(rnn::RNNDesc{T}, x::CuArray{T}, h::CuArray{T}, c = nothing) where T = - forward(rnn, x, h, c, Val{true}) - -function backwardData(rnn::RNNDesc{T}, y, dy_, dho, dco, h, c, reserve) where T - # Same as above, any more efficient way? - dy = dy_ isa Integer ? zero(y) : dy_ - yd = xDesc(y) - dx = y isa AbstractVector ? similar(dy, rnn.input) : similar(dy, rnn.input, size(dy, 2)) - dh = similar(h) - dc = c == nothing ? nothing : similar(c) - workspace = getworkspace(rnn, 1, yd) - CUDNN.cudnnRNNBackwardData(handle(), rnn, 1, - yd, y, yd, dy, hDesc(dho)..., hDesc(dco)..., - FilterDesc(T, (1, 1, length(rnn.params))), rnn.params, - hDesc(h)..., hDesc(c)..., xDesc(dx), dx, hDesc(dh)..., hDesc(dc)..., - workspace, length(workspace), reserve, length(reserve)) - return c == nothing ? (dx, dh) : (dx, dh, dc) -end - -backwardData(rnn, y, dy, dho, hx, reserve) = - backwardData(rnn, y, dy, dho, nothing, hx, nothing, reserve) - -function backwardWeights(rnn::RNNDesc{T}, x, h, y, reserve) where T - dw = zero(rnn.params) - CUDNN.cudnnRNNBackwardWeights(handle(), rnn, 1, - xDesc(x), x, hDesc(h)..., xDesc(y), y, - workspace[], length(workspace[]), - FilterDesc(T, (1, 1, length(dw))), dw, - reserve, length(reserve)) - return params(dw, rnn.input, rnn.hidden, ngates(rnn)) -end - -# Interface - import ..Flux: Flux, relu using CuArrays.CUDAnative using CuArrays: @cuindex, cudims +using LinearAlgebra function LinearAlgebra.copy_transpose!(dst::CuArray, src::CuArray) function kernel(dst, src) @@ -202,7 +19,7 @@ CuGRU{T} = Flux.GRUCell{<:CuArray{T,2},<:CuArray{T,1}} CuLSTM{T} = Flux.LSTMCell{<:CuArray{T,2},<:CuArray{T,1}} CuRNNs{T} = Union{CuRNN{T},CuGRU{T},CuLSTM{T}} -function copyparams!(m::CuRNNs, d::RNNDesc) +function copyparams!(m::CuRNNs, d::CUDNN.RNNDesc) Wi, Wh = d.weights copy_transpose!(Wi, m.Wi) copy_transpose!(Wh, m.Wh) @@ -210,19 +27,19 @@ function copyparams!(m::CuRNNs, d::RNNDesc) return end -function RNNDesc(m::CuRNNs{T}) where T +function CUDNN.RNNDesc(m::CuRNNs{T}) where T h, i = length(m.h), size(m.Wi, 2) mode = m isa CuRNN ? - (m.σ == tanh ? RNN_TANH : RNN_RELU) : - m isa CuGRU ? GRU : LSTM - r = RNNDesc{T}(mode, i, h) + (m.σ == tanh ? CUDNN.CUDNN_RNN_TANH : CUDNN.CUDNN_RNN_RELU) : + m isa CuGRU ? 
CUDNN.CUDNN_GRU : CUDNN.CUDNN_LSTM + r = CUDNN.RNNDesc{T}(mode, i, h) return r end const descs = WeakKeyDict() function desc(rnn) - d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = RNNDesc(rnn)) + d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = CUDNN.RNNDesc(rnn)) copyparams!(rnn, d) return d end @@ -230,17 +47,17 @@ end using ..Flux: @adjoint function (m::CuRNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} - y, h′ = forward(desc(m), x, h) + y, h′ = CUDNN.forward(desc(m), x, h) return h′, y end function (m::CuGRU{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} - y, h′ = forward(desc(m), x, h) + y, h′ = CUDNN.forward(desc(m), x, h) return h′, y end function (m::CuLSTM{T})(h::NTuple{2,CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64} - y, h′, c′ = forward(desc(m), x, h[1], h[2]) + y, h′, c′ = CUDNN.forward(desc(m), x, h[1], h[2]) return (h′, c′), y end @@ -257,12 +74,12 @@ unbroadcast(x::AbstractArray, Δ) = for RNN in (CuRNN, CuGRU) @eval @adjoint function (m::$RNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} - reserve, (y, ho) = forwardTrain(desc(m), x, h) + reserve, (y, ho) = CUDNN.forwardTrain(desc(m), x, h) (ho, y), function (Δ) dho, dy = Δ - h_ = hBatch(x, h) - dx, dh = backwardData(descs[m], y, dy, dho, h_, reserve) - (dWi, dWh), db = backwardWeights(descs[m], x, h_, y, reserve) + h_ = CUDNN.hBatch(x, h) + dx, dh = CUDNN.backwardData(descs[m], y, dy, dho, h_, reserve) + (dWi, dWh), db = CUDNN.backwardWeights(descs[m], x, h_, y, reserve) dm = Ref{Any}((σ=nothing,Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing)) (dm, unbroadcast(h, dh), dx) end @@ -270,14 +87,14 @@ for RNN in (CuRNN, CuGRU) end @adjoint function (m::CuLSTM)((h, c)::Tuple{CuArray{T},CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64} - reserve, (y, ho, co) = forwardTrain(desc(m), x, h, c) + reserve, (y, ho, co) = CUDNN.forwardTrain(desc(m), x, h, c) ((ho, co), y), function (Δ) dhc, dy = Δ dho, dco = dhc === nothing ? (nothing, nothing) : dhc - h_ = hBatch(x, h) - c_ = hBatch(x, c) - dx, dh, dc = backwardData(descs[m], y, dy, dho, dco, h_, c_, reserve) - (dWi, dWh), db = backwardWeights(descs[m], x, h_, y, reserve) + h_ = CUDNN.hBatch(x, h) + c_ = CUDNN.hBatch(x, c) + dx, dh, dc = CUDNN.backwardData(descs[m], y, dy, dho, dco, h_, c_, reserve) + (dWi, dWh), db = CUDNN.backwardWeights(descs[m], x, h_, y, reserve) dm = Ref{Any}((Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing,c=nothing)) (dm, (unbroadcast(h, dh), unbroadcast(c, dc)), dx) end From 6ea2557c468090c64ced5e831a8cdd990ecb5281 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 30 Aug 2019 13:41:15 +0200 Subject: [PATCH 019/139] Use correct CuArrays branch for CI. 
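The Manifest change below pins a development branch of CuArrays; roughly the equivalent Pkg invocation (illustrative — the commit edits Manifest.toml directly) would be:

```julia
using Pkg

# Track the CuArrays branch recorded in the Manifest (repo-rev = "tb/flux").
Pkg.add(PackageSpec(url = "https://github.com/JuliaGPU/CuArrays.jl.git", rev = "tb/flux"))
```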
--- Manifest.toml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 17eb544e..e54c4a92 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -105,8 +105,10 @@ uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" version = "4.0.0" [[CuArrays]] -deps = ["AbstractFFTs", "Adapt", "CUDAapi", "CUDAdrv", "CUDAnative", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "46b48742a84bb839e74215b7e468a4a1c6ba30f9" +deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] +git-tree-sha1 = "8189fcb50b24998bad7518e52443fdb542403093" +repo-rev = "tb/flux" +repo-url = "https://github.com/JuliaGPU/CuArrays.jl.git" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" version = "1.2.1" @@ -264,7 +266,7 @@ uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" version = "0.3.7" [[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] From c5e56b7e04fcc24d240c3ca8711e3174fb29c82f Mon Sep 17 00:00:00 2001 From: Mike Innes Date: Tue, 17 Sep 2019 17:22:35 +0100 Subject: [PATCH 020/139] move setweights and copy_transpose --- Manifest.toml | 2 +- Project.toml | 1 - src/cuda/curnn.jl | 22 +--------------------- test/cuda/curnn.jl | 4 ++-- 4 files changed, 4 insertions(+), 25 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index e5c84399..299a40b5 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -106,7 +106,7 @@ version = "4.0.0" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "155349d2c40568a23cbc4599f0e17e2fdf1bbbcc" +git-tree-sha1 = "63b4a10d3a4f22ef215d0970483b18296717d1fb" repo-rev = "tb/flux" repo-url = "https://github.com/JuliaGPU/CuArrays.jl.git" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" diff --git a/Project.toml b/Project.toml index 2fcdc943..7cd78984 100644 --- a/Project.toml +++ b/Project.toml @@ -11,7 +11,6 @@ Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" diff --git a/src/cuda/curnn.jl b/src/cuda/curnn.jl index 19f6e9df..86422d03 100644 --- a/src/cuda/curnn.jl +++ b/src/cuda/curnn.jl @@ -1,32 +1,12 @@ import ..Flux: Flux, relu using CuArrays.CUDAnative using CuArrays: @cuindex, cudims -using LinearAlgebra - -function LinearAlgebra.copy_transpose!(dst::CuArray, src::CuArray) - function kernel(dst, src) - I = @cuindex dst - dst[I...] = src[reverse(I)...] 
- return - end - blk, thr = cudims(dst) - @cuda blocks=blk threads=thr kernel(dst, src) - return dst -end CuRNN{T} = Flux.RNNCell{<:Union{typeof(tanh),typeof(relu)},<:CuArray{T,2},<:CuArray{T,1}} CuGRU{T} = Flux.GRUCell{<:CuArray{T,2},<:CuArray{T,1}} CuLSTM{T} = Flux.LSTMCell{<:CuArray{T,2},<:CuArray{T,1}} CuRNNs{T} = Union{CuRNN{T},CuGRU{T},CuLSTM{T}} -function copyparams!(m::CuRNNs, d::CUDNN.RNNDesc) - Wi, Wh = d.weights - copy_transpose!(Wi, m.Wi) - copy_transpose!(Wh, m.Wh) - copy_transpose!(d.bias, m.b) - return -end - function CUDNN.RNNDesc(m::CuRNNs{T}) where T h, i = length(m.h), size(m.Wi, 2) mode = m isa CuRNN ? @@ -40,7 +20,7 @@ const descs = WeakKeyDict() function desc(rnn) d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = CUDNN.RNNDesc(rnn)) - copyparams!(rnn, d) + CUDNN.setweights!(d, rnn.Wi, rnn.Wh, rnn.b) return d end diff --git a/test/cuda/curnn.jl b/test/cuda/curnn.jl index 1e834d14..e417ea58 100644 --- a/test/cuda/curnn.jl +++ b/test/cuda/curnn.jl @@ -22,8 +22,8 @@ end rand(10, batch_size) cux = gpu(x) - y, back = forward((r, x) -> (r(x)), rnn, x) - cuy, cuback = forward((r, x) -> (r(x)), curnn, cux) + y, back = forward((r, x) -> r(x), rnn, x) + cuy, cuback = forward((r, x) -> r(x), curnn, cux) @test y ≈ collect(cuy) @test haskey(Flux.CUDA.descs, curnn.cell) From 46bc8e5e648b5f5fe2811b8c21912367437cbb47 Mon Sep 17 00:00:00 2001 From: Mike Innes Date: Thu, 26 Sep 2019 17:14:18 +0100 Subject: [PATCH 021/139] move pullbacks to CuArrays --- Manifest.toml | 12 ++++++------ src/cuda/curnn.jl | 27 +++++++++++---------------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 299a40b5..d10fc71b 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -46,9 +46,9 @@ version = "0.6.2" [[CUDAapi]] deps = ["Libdl", "Logging"] -git-tree-sha1 = "9b2b4b71d6b7f946c9689bb4dea03ff92e3c7091" +git-tree-sha1 = "e063efb91cfefd7e6afd92c435d01398107a500b" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" -version = "1.1.0" +version = "1.2.0" [[CUDAdrv]] deps = ["CUDAapi", "Libdl", "Printf"] @@ -105,8 +105,8 @@ uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" version = "4.0.0" [[CuArrays]] -deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "63b4a10d3a4f22ef215d0970483b18296717d1fb" +deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] +git-tree-sha1 = "4e638627673078c58b6e6bb789937822d83350ff" repo-rev = "tb/flux" repo-url = "https://github.com/JuliaGPU/CuArrays.jl.git" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" @@ -172,9 +172,9 @@ version = "0.10.3" [[GPUArrays]] deps = ["Adapt", "FFTW", "FillArrays", "LinearAlgebra", "Printf", "Random", "Serialization", "StaticArrays", "Test"] -git-tree-sha1 = "b5009ac44b141ded5e6f04c4db83807970f56e91" +git-tree-sha1 = "77e27264276fe97a7e7fb928bf8999a145abc018" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "1.0.2" +version = "1.0.3" [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] diff --git a/src/cuda/curnn.jl b/src/cuda/curnn.jl index 86422d03..fb454729 100644 --- a/src/cuda/curnn.jl +++ b/src/cuda/curnn.jl @@ -56,7 +56,7 @@ unbroadcast(x::AbstractArray, Δ) = coerce_cuda(x::Union{CuArray,Nothing}) = x coerce_cuda(x::Tuple) = coerce_cuda.(x) -coerce_cuda(x) = x .+ CuArrays.fill(0) 
+coerce_cuda(x::AbstractArray) = x .+ CuArrays.fill(0) function struct_grad!(cx::Zygote.Context, x, x̄) for f in fieldnames(typeof(x)) @@ -69,28 +69,23 @@ end for RNN in (CuRNN, CuGRU) @eval @adjoint function (m::$RNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} - reserve, (y, ho) = CUDNN.forwardTrain(desc(m), x, h) + (y, ho), back = CUDNN.pullback(desc(m), x, h) (ho, y), function (Δ) - dho, dy = coerce_cuda(Δ) - h_ = CUDNN.hBatch(x, h) - dx, dh = CUDNN.backwardData(descs[m], y, dy, dho, h_, reserve) - (dWi, dWh), db = CUDNN.backwardWeights(descs[m], x, h_, y, reserve) - dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing)) - (dm, unbroadcast(h, dh), dx) + dho, dy = coerce_cuda(Δ) # Support FillArrays etc. + m̄ = back(dy, dho) + dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(m̄.Wi),Wh=transpose(m̄.Wh),b=m̄.b,h=nothing)) + (dm, unbroadcast(h, m̄.h), m̄.x) end end end @adjoint function (m::CuLSTM)((h, c)::Tuple{CuArray{T},CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64} - reserve, (y, ho, co) = CUDNN.forwardTrain(desc(m), x, h, c) + (y, ho, co), back = CUDNN.pullback(desc(m), x, h, c) ((ho, co), y), function (Δ) - dhc, dy = coerce_cuda(Δ) + dhc, dy = coerce_cuda(Δ) # Support FillArrays etc. dho, dco = dhc === nothing ? (nothing, nothing) : dhc - h_ = CUDNN.hBatch(x, h) - c_ = CUDNN.hBatch(x, c) - dx, dh, dc = CUDNN.backwardData(descs[m], y, dy, dho, dco, h_, c_, reserve) - (dWi, dWh), db = CUDNN.backwardWeights(descs[m], x, h_, y, reserve) - dm = struct_grad!(__context__, m, (Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing,c=nothing)) - (dm, (unbroadcast(h, dh), unbroadcast(c, dc)), dx) + m̄ = back(dy, dho, dco) + dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(m̄.Wi),Wh=transpose(m̄.Wh),b=m̄.b,h=nothing,c=nothing)) + (dm, (unbroadcast(h, m̄.h), unbroadcast(c, m̄.c)), m̄.x) end end From 691a29cf32bb01e9ca528ab869d72a17a1dec3a4 Mon Sep 17 00:00:00 2001 From: Mike Innes Date: Fri, 27 Sep 2019 14:15:58 +0100 Subject: [PATCH 022/139] cudnn bug is fixed --- Manifest.toml | 2 +- test/cuda/cuda.jl | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index d10fc71b..9919a94d 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -106,7 +106,7 @@ version = "4.0.0" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "4e638627673078c58b6e6bb789937822d83350ff" +git-tree-sha1 = "cc22ec1abd471b4529883a8174944b513d75ab33" repo-rev = "tb/flux" repo-url = "https://github.com/JuliaGPU/CuArrays.jl.git" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 3508e561..20399ef7 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -51,9 +51,7 @@ end end if CuArrays.libcudnn != nothing - @info "Testing Flux/CUDNN" - include("cudnn.jl") - if !haskey(ENV, "CI_DISABLE_CURNN_TEST") - include("curnn.jl") - end + @info "Testing Flux/CUDNN" + include("cudnn.jl") + include("curnn.jl") end From e287982b7897c2674358e7a753570b3a5235a8f4 Mon Sep 17 00:00:00 2001 From: Mike Innes Date: Fri, 27 Sep 2019 14:55:30 +0100 Subject: [PATCH 023/139] use CuArrays master --- Manifest.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 9919a94d..4d825f17 100644 --- a/Manifest.toml +++ 
b/Manifest.toml @@ -106,8 +106,8 @@ version = "4.0.0" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "cc22ec1abd471b4529883a8174944b513d75ab33" -repo-rev = "tb/flux" +git-tree-sha1 = "45683305171430978c17f496969dc9b6d3094a51" +repo-rev = "master" repo-url = "https://github.com/JuliaGPU/CuArrays.jl.git" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" version = "1.3.0" From a98a1b8bb5e1829c8ad561abe8f92071c63ba5a2 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Fri, 27 Sep 2019 21:43:39 +0530 Subject: [PATCH 024/139] fixes --- docs/src/saving.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/saving.md b/docs/src/saving.md index f71c4350..8e795298 100644 --- a/docs/src/saving.md +++ b/docs/src/saving.md @@ -113,6 +113,6 @@ You can even store optimiser state alongside the model, to resume training exactly where you left off. ```julia -opt = ADAM(params(model)) +opt = ADAM() @save "model-$(now()).bson" model opt ``` From 32ac71734de3903af021b30b96dda4e492070e8c Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Fri, 27 Sep 2019 21:43:59 +0530 Subject: [PATCH 025/139] optimiser interface docs --- docs/src/training/optimisers.md | 75 +++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 9eb659c4..47f2e9e6 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -58,3 +58,78 @@ AMSGrad NADAM ADAMW ``` + +## Optimiser Interface + +Flux's optimsers are built around a `struct` that holds all the optimiser parameters along with a definition of how to apply the update rule associated with it. We do this via the `apply!` function which takes the optimiser as the first argument followed by the parameter and its corresponding gradient. + +In this manner Flux also allows one to create custom optimisers to be used seamlessly. Let's work this with a simple example. + +```julia +mutable struct Momentum{T,S,D} + eta::T + rho::S + velocity::D +end +``` + +The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state. **Note that this behaviour is set to change in consequent versions of Flux**. We can now define the rule applied when this optimiser is invoked. + +```julia +function apply!(o::Momentum, x, Δ) + η, ρ = o.eta, o.rho + v = get!(o.velocity, x, zero(x))::typeof(x) + @. v = ρ * v - η * Δ + @. Δ = -v +end +``` + +This is the basic definition of a Momentum update rule given by: +$v = ρ * v - η * Δ$ +$w = w - v$ + +The `apply!` defines the update rules for an optimsier `opt`, given the parameters and gradients. It returns the updated gradients usually. Here, every parameter `x` is retrieved from the running state `v` and subsequently updates the state of the optimiser. + +Flux internally calls on this function via the `update!` function. It shares the API with `apply!` but ensures that multiple parameters are handled gracefully. In the future, it will also be delegating immutable update operations. + +## Composing Optimisers + +Flux defines a special kind of optimiser called simply as `Optimiser` which takes in a arbitrary optimisers as input. 
Its behaviour is similar to the usual optimisers, but differs in that it acts by calling the optimsers listed in it sequentially. Each optimiser produces a modified gradient +that will be fed into the next, and the resultant update will be applied to the parameter as usual. A classic use case is where adding decays is desirable. Flux defines some basic decays including `ExpDecay`, `InvDecay` etc. + +``julia +opt = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), Descent()) +``` + +Here we apply exponential decay to the `Descent` optimser. The defaults of `ExpDecay` say that its learning rate will be decayed every 1000 steps. +It is then applied like any optimser. + +```julia +w = randn(10, 10) +w1 = randn(10,10) +ps = Params([w, w1]) + +loss(x) = Flux.mse(w * x, w1 * x) + +loss(rand(10)) # around 9 + +for t = 1:10^5 + θ = Params([w, w1]) + θ̄ = gradient(() -> loss(rand(10)), θ) + Flux.Optimise.update!(opt, θ, θ̄) +end + +loss(rand(10)) # around 0.9 +``` + +In this manner it is possible to compose optimisers for some added flexibility. + +## Decays + +Similar to optimisers, Flux also defines some simple decays that can be used in conjunction with other optimisers, or standalone. + +```@docs +ExpDecay +InvDecay +WeightDecay +``` From 8bb0db7d0c17a638c69cd6b8e3eae1c0fab09c2b Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Fri, 27 Sep 2019 22:04:53 +0530 Subject: [PATCH 026/139] opt docstrings --- src/optimise/optimisers.jl | 41 ++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 58cd5ff7..be400457 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -8,6 +8,7 @@ const ϵ = 1e-8 """ Descent(η) + Defaults: η = 0.1 Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`. @@ -23,7 +24,8 @@ function apply!(o::Descent, x, Δ) end """ - Momentum(η = 0.01; ρ = 0.9) + Momentum(η, ρ) + Defaults: η = 0.01, ρ = 0.9 Gradient descent with learning rate `η` and momentum `ρ`. """ @@ -43,7 +45,8 @@ function apply!(o::Momentum, x, Δ) end """ - Nesterov(eta, ρ = 0.9) + Nesterov(η, ρ) + Defaults: η = 0.001, ρ = 0.9 Gradient descent with learning rate `η` and Nesterov momentum `ρ`. """ @@ -64,7 +67,8 @@ function apply!(o::Nesterov, x, Δ) end """ - RMSProp(η = 0.001, ρ = 0.9) + RMSProp(η, ρ) + Defaults: η = 0.001, ρ = 0.9 [RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) optimiser. Parameters other than learning rate don't need tuning. Often a good @@ -86,7 +90,8 @@ function apply!(o::RMSProp, x, Δ) end """ - ADAM(η = 0.001, β = (0.9, 0.999)) + Defaults: η = 0.001, β = (0.9, 0.999) + ADAM() => ADAM(η, β) [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. """ @@ -109,7 +114,8 @@ function apply!(o::ADAM, x, Δ) end """ - RADAM(η = 0.001, β = (0.9, 0.999)) + Defaults: η = 0.001, β = (0.9, 0.999) + RADAM() => RADAM(η, β) [RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM). """ @@ -139,7 +145,8 @@ function apply!(o::RADAM, x, Δ) end """ - AdaMax(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08) + Defaults: η = 0.001, β = (0.9, 0.999) + AdaMax() => AdaMax(η, β) [AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on the ∞-norm. 
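As an additional sketch of composing a decay with an optimiser, weight decay can be chained with `ADAM` in the same way; the `1e-4` decay value here is purely illustrative:

```julia
using Flux
using Flux.Optimise: Optimiser

# Gradients pass through ADAM first and then pick up the decay term;
# this mirrors how Flux defines ADAMW as Optimiser(ADAM(η, β), WeightDecay(decay)).
opt = Optimiser(ADAM(0.001, (0.9, 0.999)), WeightDecay(1e-4))
```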
@@ -163,7 +170,8 @@ function apply!(o::AdaMax, x, Δ) end """ - ADAGrad(η = 0.1; ϵ = 1e-8) + Defaults: η = 0.1 + ADAGrad() => ADAGrad(η) [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. Parameters don't need tuning. @@ -183,7 +191,8 @@ function apply!(o::ADAGrad, x, Δ) end """ - ADADelta(ρ = 0.9, ϵ = 1e-8) + Defaults: ρ = 0.9 + ADADelta() => ADADelta(ρ) [ADADelta](https://arxiv.org/abs/1212.5701) optimiser. Parameters don't need tuning. @@ -205,7 +214,8 @@ function apply!(o::ADADelta, x, Δ) end """ - AMSGrad(η = 0.001, β = (0.9, 0.999)) + Defaults: η = 0.001, β = (0.9, 0.999) + AMSGrad() => AMSGrad(η, β) [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need tuning. @@ -228,7 +238,8 @@ function apply!(o::AMSGrad, x, Δ) end """ - NADAM(η = 0.001, β = (0.9, 0.999)) + Defaults: η = 0.001, β = (0.9, 0.999) + NADAM() => NADAM(η, β) [NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. Parameters don't need tuning. @@ -252,7 +263,8 @@ function apply!(o::NADAM, x, Δ) end """ - ADAMW((η = 0.001, β = (0.9, 0.999), decay = 0) + Defaults: η = 0.001, β = (0.9, 0.999), decay = 0 + ADAMW() => ADAMW(η, β, decay) [ADAMW](https://arxiv.org/abs/1711.05101) fixing weight decay regularization in Adam. """ @@ -287,7 +299,8 @@ function apply!(o::Optimiser, x, Δ) end """ -`InvDecay(γ)` +Defaults: γ = 0.001 +`InvDecay() => InvDecay(γ)` Apply inverse time decay to an optimiser ```julia @@ -311,6 +324,7 @@ end """ `ExpDecay(eta, decay, decay_step, clip)` +Defaults: eta = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4 Schedule the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`. @@ -340,7 +354,8 @@ function apply!(o::ExpDecay, x, Δ) end """ -`WeightDecay(wd)` +`WeightDecay() => WeightDecay(wd)` +Defaults: wd = 0 Decay the weight parameter by `wd` """ From 0175485a80c71690aa6c1a95b562b54478226a2a Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Fri, 27 Sep 2019 22:08:25 +0530 Subject: [PATCH 027/139] fixup --- src/optimise/optimisers.jl | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index be400457..09a86174 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -7,7 +7,7 @@ const ϵ = 1e-8 # TODO: should use weak refs """ - Descent(η) + Descent() => Descent(η) Defaults: η = 0.1 Classic gradient descent optimiser with learning rate `η`. @@ -24,7 +24,7 @@ function apply!(o::Descent, x, Δ) end """ - Momentum(η, ρ) + Momentum() => Momentum(η, ρ) Defaults: η = 0.01, ρ = 0.9 Gradient descent with learning rate `η` and momentum `ρ`. @@ -45,7 +45,7 @@ function apply!(o::Momentum, x, Δ) end """ - Nesterov(η, ρ) + Nesterov() => Nesterov(η, ρ) Defaults: η = 0.001, ρ = 0.9 Gradient descent with learning rate `η` and Nesterov momentum `ρ`. @@ -67,7 +67,7 @@ function apply!(o::Nesterov, x, Δ) end """ - RMSProp(η, ρ) + RMSProp() => RMSProp(η, ρ) Defaults: η = 0.001, ρ = 0.9 [RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) @@ -90,8 +90,8 @@ function apply!(o::RMSProp, x, Δ) end """ - Defaults: η = 0.001, β = (0.9, 0.999) ADAM() => ADAM(η, β) + Defaults: η = 0.001, β = (0.9, 0.999) [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. 
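To make the `Defaults` annotations above concrete, here is a brief sketch of constructing a few of these optimisers, either with their documented defaults or with explicit hyperparameters (the explicit values are only examples):

```julia
using Flux

opt1 = Descent()                  # documented default η = 0.1
opt2 = Descent(0.3)               # explicit learning rate
opt3 = Momentum(0.01, 0.9)        # learning rate and momentum spelled out
opt4 = ADAM()                     # defaults η = 0.001, β = (0.9, 0.999)
opt5 = ADAM(0.002, (0.9, 0.999))  # explicit η and (β1, β2) tuple
```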
""" @@ -114,8 +114,8 @@ function apply!(o::ADAM, x, Δ) end """ - Defaults: η = 0.001, β = (0.9, 0.999) RADAM() => RADAM(η, β) + Defaults: η = 0.001, β = (0.9, 0.999) [RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM). """ @@ -145,8 +145,8 @@ function apply!(o::RADAM, x, Δ) end """ - Defaults: η = 0.001, β = (0.9, 0.999) AdaMax() => AdaMax(η, β) + Defaults: η = 0.001, β = (0.9, 0.999) [AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on the ∞-norm. @@ -170,8 +170,8 @@ function apply!(o::AdaMax, x, Δ) end """ - Defaults: η = 0.1 ADAGrad() => ADAGrad(η) + Defaults: η = 0.1 [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. Parameters don't need tuning. @@ -191,8 +191,8 @@ function apply!(o::ADAGrad, x, Δ) end """ - Defaults: ρ = 0.9 ADADelta() => ADADelta(ρ) + Defaults: ρ = 0.9 [ADADelta](https://arxiv.org/abs/1212.5701) optimiser. Parameters don't need tuning. @@ -214,8 +214,8 @@ function apply!(o::ADADelta, x, Δ) end """ - Defaults: η = 0.001, β = (0.9, 0.999) AMSGrad() => AMSGrad(η, β) + Defaults: η = 0.001, β = (0.9, 0.999) [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need tuning. @@ -238,8 +238,8 @@ function apply!(o::AMSGrad, x, Δ) end """ - Defaults: η = 0.001, β = (0.9, 0.999) NADAM() => NADAM(η, β) + Defaults: η = 0.001, β = (0.9, 0.999) [NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. Parameters don't need tuning. @@ -299,8 +299,8 @@ function apply!(o::Optimiser, x, Δ) end """ +InvDecay() => InvDecay(γ) Defaults: γ = 0.001 -`InvDecay() => InvDecay(γ)` Apply inverse time decay to an optimiser ```julia @@ -323,7 +323,7 @@ function apply!(o::InvDecay, x, Δ) end """ -`ExpDecay(eta, decay, decay_step, clip)` +ExpDecay(eta, decay, decay_step, clip) Defaults: eta = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4 Schedule the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`. @@ -354,7 +354,7 @@ function apply!(o::ExpDecay, x, Δ) end """ -`WeightDecay() => WeightDecay(wd)` +WeightDecay() => WeightDecay(wd) Defaults: wd = 0 Decay the weight parameter by `wd` From 8013c728b112aec15d50c4b6e1470f24758b4c5f Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sat, 28 Sep 2019 16:09:00 +0530 Subject: [PATCH 028/139] clearer optimiser docstrings --- src/optimise/optimisers.jl | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 09a86174..aa5b7203 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -7,7 +7,7 @@ const ϵ = 1e-8 # TODO: should use weak refs """ - Descent() => Descent(η) + Descent(η) Defaults: η = 0.1 Classic gradient descent optimiser with learning rate `η`. @@ -24,7 +24,7 @@ function apply!(o::Descent, x, Δ) end """ - Momentum() => Momentum(η, ρ) + Momentum(η, ρ) Defaults: η = 0.01, ρ = 0.9 Gradient descent with learning rate `η` and momentum `ρ`. @@ -45,7 +45,7 @@ function apply!(o::Momentum, x, Δ) end """ - Nesterov() => Nesterov(η, ρ) + Nesterov(η, ρ) Defaults: η = 0.001, ρ = 0.9 Gradient descent with learning rate `η` and Nesterov momentum `ρ`. 
@@ -67,7 +67,7 @@ function apply!(o::Nesterov, x, Δ) end """ - RMSProp() => RMSProp(η, ρ) + RMSProp(η, ρ) Defaults: η = 0.001, ρ = 0.9 [RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) @@ -90,7 +90,7 @@ function apply!(o::RMSProp, x, Δ) end """ - ADAM() => ADAM(η, β) + ADAM(η, β) Defaults: η = 0.001, β = (0.9, 0.999) [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. @@ -114,7 +114,7 @@ function apply!(o::ADAM, x, Δ) end """ - RADAM() => RADAM(η, β) + RADAM(η, β) Defaults: η = 0.001, β = (0.9, 0.999) [RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM). @@ -145,7 +145,7 @@ function apply!(o::RADAM, x, Δ) end """ - AdaMax() => AdaMax(η, β) + AdaMax(η, β) Defaults: η = 0.001, β = (0.9, 0.999) [AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on @@ -170,7 +170,7 @@ function apply!(o::AdaMax, x, Δ) end """ - ADAGrad() => ADAGrad(η) + ADAGrad(η) Defaults: η = 0.1 [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. @@ -191,7 +191,7 @@ function apply!(o::ADAGrad, x, Δ) end """ - ADADelta() => ADADelta(ρ) + ADADelta(ρ) Defaults: ρ = 0.9 [ADADelta](https://arxiv.org/abs/1212.5701) optimiser. Parameters don't need @@ -214,7 +214,7 @@ function apply!(o::ADADelta, x, Δ) end """ - AMSGrad() => AMSGrad(η, β) + AMSGrad(η, β) Defaults: η = 0.001, β = (0.9, 0.999) [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need @@ -238,7 +238,7 @@ function apply!(o::AMSGrad, x, Δ) end """ - NADAM() => NADAM(η, β) + NADAM(η, β) Defaults: η = 0.001, β = (0.9, 0.999) [NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. Parameters don't need @@ -263,8 +263,8 @@ function apply!(o::NADAM, x, Δ) end """ + ADAMW(η, β, decay) Defaults: η = 0.001, β = (0.9, 0.999), decay = 0 - ADAMW() => ADAMW(η, β, decay) [ADAMW](https://arxiv.org/abs/1711.05101) fixing weight decay regularization in Adam. 
""" @@ -299,7 +299,7 @@ function apply!(o::Optimiser, x, Δ) end """ -InvDecay() => InvDecay(γ) +InvDecay(γ) Defaults: γ = 0.001 Apply inverse time decay to an optimiser @@ -354,7 +354,7 @@ function apply!(o::ExpDecay, x, Δ) end """ -WeightDecay() => WeightDecay(wd) +WeightDecay(wd) Defaults: wd = 0 Decay the weight parameter by `wd` From d91677f651a79bcb04e9c2f31e681ae9e6f85e07 Mon Sep 17 00:00:00 2001 From: Filippo Vicentini Date: Sun, 29 Sep 2019 12:23:41 +0200 Subject: [PATCH 029/139] Fix `params!` to work with complex numbers --- src/functor.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/functor.jl b/src/functor.jl index 73483ab9..f69f4a65 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -40,6 +40,7 @@ end trainable(m) = functor(m)[1] params!(p::Params, x::AbstractArray{<:Real}, seen = IdSet()) = push!(p, x) +params!(p::Params, x::AbstractArray{Complex{<:Real}}, seen = IdSet()) = push!(p, x) function params!(p::Params, x, seen = IdSet()) x in seen && return From 14e94c291e09846b222d8ea24e465e7219122b50 Mon Sep 17 00:00:00 2001 From: Filippo Vicentini Date: Sun, 29 Sep 2019 12:28:01 +0200 Subject: [PATCH 030/139] Make it actually work --- src/functor.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/functor.jl b/src/functor.jl index f69f4a65..8fb23089 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -40,7 +40,7 @@ end trainable(m) = functor(m)[1] params!(p::Params, x::AbstractArray{<:Real}, seen = IdSet()) = push!(p, x) -params!(p::Params, x::AbstractArray{Complex{<:Real}}, seen = IdSet()) = push!(p, x) +params!(p::Params, x::AbstractArray{<:Complex{<:Real}}, seen = IdSet()) = push!(p, x) function params!(p::Params, x, seen = IdSet()) x in seen && return From 606fe5885489a5c93fb17ac2a3e8f93f9c71b871 Mon Sep 17 00:00:00 2001 From: Filippo Vicentini Date: Sun, 29 Sep 2019 12:33:02 +0200 Subject: [PATCH 031/139] Use <:Number --- src/functor.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index 8fb23089..1d3e1bb2 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -39,8 +39,7 @@ end trainable(m) = functor(m)[1] -params!(p::Params, x::AbstractArray{<:Real}, seen = IdSet()) = push!(p, x) -params!(p::Params, x::AbstractArray{<:Complex{<:Real}}, seen = IdSet()) = push!(p, x) +params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x) function params!(p::Params, x, seen = IdSet()) x in seen && return From ec35e9cbaa31bcdb37857c5bb39bbbfc22379e4e Mon Sep 17 00:00:00 2001 From: thebhatman Date: Mon, 30 Sep 2019 21:02:13 +0530 Subject: [PATCH 032/139] Loss functions docs added in layers.md --- docs/src/models/layers.md | 12 ++++++++++++ docs/src/training/training.md | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index f2bd8046..c439581c 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -66,3 +66,15 @@ AlphaDropout LayerNorm GroupNorm ``` + +## In-built loss functions: +```@docs +mse +crossentropy +logitcrossentropy +binarycrossentropy +logitbinarycrossentropy +kldivergence +poisson +hinge +``` \ No newline at end of file diff --git a/docs/src/training/training.md b/docs/src/training/training.md index cba1422c..679bbd0b 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -32,18 +32,6 @@ Flux.train!(loss, ps, data, opt) The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` 
from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want. -In-built loss functions: -```@docs -mse -crossentropy -logitcrossentropy -binarycrossentropy -logitbinarycrossentropy -kldivergence -poisson -hinge -``` - ## Datasets The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point: From ec886c8ce864721b4144cb749c458b3410c67946 Mon Sep 17 00:00:00 2001 From: thebhatman Date: Thu, 3 Oct 2019 21:13:09 +0530 Subject: [PATCH 033/139] Added docstring for hinge loss --- src/layers/stateless.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index c3dd22b0..8cdac33d 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -50,11 +50,6 @@ function normalise(x::AbstractArray; dims=1) return (x .- μ′) ./ σ′ end -function normalise(x::AbstractArray, dims) - Base.depwarn("`normalise(x::AbstractArray, dims)` is deprecated, use `normalise(a, dims=dims)` instead.", :normalise) - normalise(x, dims = dims) -end - """ Kullback Leibler Divergence(KL Divergence) KLDivergence is a measure of how much one probability distribution is different from the other. @@ -74,4 +69,8 @@ https://isaacchanghau.github.io/post/loss_functions/ """ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) +""" + Hinge Loss function +Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). This is usually used for measuring whether two inputs are similar or dissimilar +""" hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) From 63d196aa370def3ea9883fb30648f9eccdf98819 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 3 Oct 2019 19:54:23 +0200 Subject: [PATCH 034/139] Check if CUDA availability changed during init. --- src/Flux.jl | 25 ++++++++++++++++--------- src/functor.jl | 2 +- src/onehot.jl | 2 +- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index 0b57f81d..911d2ab5 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -22,15 +22,10 @@ export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, using CUDAapi if has_cuda() - try - using CuArrays - @eval has_cuarrays() = true - catch ex - @warn "CUDA is installed, but CuArrays.jl fails to load" exception=(ex,catch_backtrace()) - @eval has_cuarrays() = false - end + using CuArrays + use_cuda() = true else - has_cuarrays() = false + use_cuda() = false end include("utils.jl") @@ -47,8 +42,20 @@ include("data/Data.jl") include("deprecations.jl") -if has_cuarrays() +if use_cuda() include("cuda/cuda.jl") end +function __init__() + if has_cuda() != use_cuda() + cachefile = if VERSION >= v"1.3-" + Base.compilecache_path(Base.PkgId(Flux)) + else + abspath(DEPOT_PATH[1], Base.cache_file_entry(Base.PkgId(Flux))) + end + rm(cachefile) + error("Your set-up changed, and Flux.jl needs to be reconfigured. 
Please load the package again.") + end +end + end # module diff --git a/src/functor.jl b/src/functor.jl index 73483ab9..a3e053b0 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -73,7 +73,7 @@ end cpu(m) = fmap(x -> adapt(Array, x), m) -const gpu_adaptor = if has_cuarrays() +const gpu_adaptor = if use_cuda() CuArrays.cu else identity diff --git a/src/onehot.jl b/src/onehot.jl index fe93c5c5..9bce5dd8 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -37,7 +37,7 @@ import Adapt: adapt, adapt_structure adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data)) -if has_cuarrays() +if use_cuda() import .CuArrays: CuArray, cudaconvert import Base.Broadcast: BroadcastStyle, ArrayStyle BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}() From 2369b2b3fdc2b6fcd68b67e7f7776621474f28ed Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 3 Oct 2019 21:10:20 +0200 Subject: [PATCH 035/139] Add an environment variable to disable CUDA usage. --- src/Flux.jl | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index 911d2ab5..c0023e2c 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -20,9 +20,18 @@ export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay + +allow_cuda() = parse(Bool, get(ENV, "FLUX_USE_CUDA", "true")) +const consider_cuda = allow_cuda() + using CUDAapi -if has_cuda() - using CuArrays +if consider_cuda && has_cuda() + try + using CuArrays + catch + @error "CUDA is installed, but CuArrays.jl fails to load. Please fix the issue, or load Flux with FLUX_USE_CUDA=false." + rethrow() + end use_cuda() = true else use_cuda() = false @@ -47,7 +56,9 @@ if use_cuda() end function __init__() - if has_cuda() != use_cuda() + # check if the GPU usage conditions that are baked in the precompilation image + # match the current situation, and force a recompilation if not. + if (allow_cuda() != consider_cuda) || (consider_cuda && has_cuda() != use_cuda()) cachefile = if VERSION >= v"1.3-" Base.compilecache_path(Base.PkgId(Flux)) else From 8aea15e6e021e5055104694a87bc8ef6c54a2f48 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 3 Oct 2019 21:28:55 +0200 Subject: [PATCH 036/139] Demote to const variables. --- src/Flux.jl | 10 ++++------ src/functor.jl | 2 +- src/onehot.jl | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index c0023e2c..95bdcd32 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -25,16 +25,14 @@ allow_cuda() = parse(Bool, get(ENV, "FLUX_USE_CUDA", "true")) const consider_cuda = allow_cuda() using CUDAapi -if consider_cuda && has_cuda() +const use_cuda = consider_cuda && has_cuda() +if use_cuda try using CuArrays catch @error "CUDA is installed, but CuArrays.jl fails to load. Please fix the issue, or load Flux with FLUX_USE_CUDA=false." rethrow() end - use_cuda() = true -else - use_cuda() = false end include("utils.jl") @@ -51,14 +49,14 @@ include("data/Data.jl") include("deprecations.jl") -if use_cuda() +if use_cuda include("cuda/cuda.jl") end function __init__() # check if the GPU usage conditions that are baked in the precompilation image # match the current situation, and force a recompilation if not. 
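A short usage sketch for the `FLUX_USE_CUDA` switch introduced in this patch; the variable has to be set before Flux is loaded, and per the `__init__` check above a changed setting triggers a recompilation on the next load:

```julia
# In a fresh Julia session where Flux has not been loaded yet:
ENV["FLUX_USE_CUDA"] = "false"   # opt out of the CUDA code paths
using Flux                       # CuArrays-backed functionality is then left out
```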
- if (allow_cuda() != consider_cuda) || (consider_cuda && has_cuda() != use_cuda()) + if (allow_cuda() != consider_cuda) || (consider_cuda && has_cuda() != use_cuda) cachefile = if VERSION >= v"1.3-" Base.compilecache_path(Base.PkgId(Flux)) else diff --git a/src/functor.jl b/src/functor.jl index a3e053b0..798445b4 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -73,7 +73,7 @@ end cpu(m) = fmap(x -> adapt(Array, x), m) -const gpu_adaptor = if use_cuda() +const gpu_adaptor = if use_cuda CuArrays.cu else identity diff --git a/src/onehot.jl b/src/onehot.jl index 9bce5dd8..84747450 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -37,7 +37,7 @@ import Adapt: adapt, adapt_structure adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data)) -if use_cuda() +if use_cuda import .CuArrays: CuArray, cudaconvert import Base.Broadcast: BroadcastStyle, ArrayStyle BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}() From b503741651c4c89605aa2ffacb0168d47364405c Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Fri, 4 Oct 2019 14:46:03 +0530 Subject: [PATCH 037/139] expanded docstrings --- src/optimise/optimisers.jl | 92 +++++++++++++++++++++++++++----------- 1 file changed, 67 insertions(+), 25 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index aa5b7203..bf2122a5 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -8,7 +8,9 @@ const ϵ = 1e-8 """ Descent(η) - Defaults: η = 0.1 + + Calls to `Descent()` default with: + - learning rate (η): 0.1 Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`. @@ -25,7 +27,10 @@ end """ Momentum(η, ρ) - Defaults: η = 0.01, ρ = 0.9 + + Calls to `Momentum()` default to: + - learning rate (η): 0.01 + - decay (ρ): 0.9 Gradient descent with learning rate `η` and momentum `ρ`. """ @@ -46,7 +51,10 @@ end """ Nesterov(η, ρ) - Defaults: η = 0.001, ρ = 0.9 + + Calls to `Nesterov()` default to: + - learning rate (η): 0.001 + - nesterov momentum (ρ): 0.9 Gradient descent with learning rate `η` and Nesterov momentum `ρ`. """ @@ -68,7 +76,10 @@ end """ RMSProp(η, ρ) - Defaults: η = 0.001, ρ = 0.9 + + Calls to `RMSProp()` default to: + - learning rate (η): 0.001 + - rho (ρ): 0.9 [RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) optimiser. Parameters other than learning rate don't need tuning. Often a good @@ -90,8 +101,11 @@ function apply!(o::RMSProp, x, Δ) end """ - ADAM(η, β) - Defaults: η = 0.001, β = (0.9, 0.999) + ADAM(η, β::Tuple) + + Calls to `ADAM()` default to: + - learning rate (η): 0.001 + - (beta1, beta2) (β): (0.9, 0.999) [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. """ @@ -114,8 +128,11 @@ function apply!(o::ADAM, x, Δ) end """ - RADAM(η, β) - Defaults: η = 0.001, β = (0.9, 0.999) + RADAM(η, β::Tuple) + + Calls to `RADAM()` default to: + - learning rate (η): 0.001 + - (beta1, beta2) (β): (0.9, 0.999) [RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM). """ @@ -145,8 +162,11 @@ function apply!(o::RADAM, x, Δ) end """ - AdaMax(η, β) - Defaults: η = 0.001, β = (0.9, 0.999) + AdaMax(η, β::Tuple) + + Calls to `AdaMax()` default to: + - learning rate (η): 0.001 + - (beta1, beta2) (β): (0.9, 0.999) [AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on the ∞-norm. 
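Extending the `Descent` example given in these docstrings, a minimal end-to-end step with `ADAM` might look like the following sketch (the `Dense` model, the data sizes and the `mse` loss are placeholders chosen for illustration):

```julia
using Flux

model = Dense(10, 2)
x, y  = rand(Float32, 10), rand(Float32, 2)
loss(x, y) = Flux.mse(model(x), y)

opt = ADAM()                # documented defaults: η = 0.001, β = (0.9, 0.999)
ps  = params(model)
gs  = gradient(ps) do
  loss(x, y)
end
Flux.Optimise.update!(opt, ps, gs)   # one optimisation step over every parameter in ps
```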
@@ -171,7 +191,9 @@ end """ ADAGrad(η) - Defaults: η = 0.1 + + Calls to `AdaGrad()` default to: + - learning rate (η): 0.1 [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. Parameters don't need tuning. @@ -192,7 +214,9 @@ end """ ADADelta(ρ) - Defaults: ρ = 0.9 + + Calls to `ADADelta()` default to: + rho (ρ): 0.9 [ADADelta](https://arxiv.org/abs/1212.5701) optimiser. Parameters don't need tuning. @@ -214,8 +238,11 @@ function apply!(o::ADADelta, x, Δ) end """ - AMSGrad(η, β) - Defaults: η = 0.001, β = (0.9, 0.999) + AMSGrad(η, β::Tuple) + + Calls to `AMSGrad()` default to: + - learning rate (η): 0.001 + - (beta1, beta2) (β): (0.9, 0.999) [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need tuning. @@ -238,8 +265,11 @@ function apply!(o::AMSGrad, x, Δ) end """ - NADAM(η, β) - Defaults: η = 0.001, β = (0.9, 0.999) + NADAM(η, β::Tuple) + + Calls to `NADAM()` default to: + - learning rate (η): 0.001 + - (beta1, beta2) (β): (0.9, 0.999) [NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. Parameters don't need tuning. @@ -263,8 +293,11 @@ function apply!(o::NADAM, x, Δ) end """ - ADAMW(η, β, decay) - Defaults: η = 0.001, β = (0.9, 0.999), decay = 0 + ADAMW(η, β::Tuple, decay) + + Calls to `ADAMW()` default to: + - learning rate (η) 0.001 + - (beta1, beta2) (β): (0.9, 0.999) [ADAMW](https://arxiv.org/abs/1711.05101) fixing weight decay regularization in Adam. """ @@ -299,8 +332,10 @@ function apply!(o::Optimiser, x, Δ) end """ -InvDecay(γ) -Defaults: γ = 0.001 + InvDecay(γ) + + Calls to `InvDecay()` default to: + - gamma (γ): 0.001 Apply inverse time decay to an optimiser ```julia @@ -323,10 +358,15 @@ function apply!(o::InvDecay, x, Δ) end """ -ExpDecay(eta, decay, decay_step, clip) -Defaults: eta = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4 + ExpDecay(eta, decay, decay_step, clip) -Schedule the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`. + Calls to `ExpDecay()` default to: + - learning rate (eta): 0.001 + - decay: 0.1 + - decay_step: 1000 + - clip: 1e-4 + +Discount the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`. 
To apply exponential decay to an optimiser: ```julia @@ -354,8 +394,10 @@ function apply!(o::ExpDecay, x, Δ) end """ -WeightDecay(wd) -Defaults: wd = 0 + WeightDecay(wd) + + Calls to `WeightDecay()` default to: + - weight decay (wd): 0 Decay the weight parameter by `wd` """ From 3b7b780d398bef91f2e793e2293f140d8c3b9241 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 8 Oct 2019 23:04:31 -0700 Subject: [PATCH 038/139] super simple test --- test/layers/basic.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/layers/basic.jl b/test/layers/basic.jl index cbe250fc..4edfecc7 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -19,6 +19,12 @@ import Flux: activations # numeric test should be put into testset of corresponding layer end + @testset "Activations" begin + c = Chain(Dense(3,5,relu), Dense(5,1,relu)) + X = Float32.([1.0; 1.0; 1.0]) + @test_nowarn gradient(()->Flux.activations(c, X)[2][1], params(c)) + end + @testset "Dense" begin @test length(Dense(10, 5)(randn(10))) == 5 @test_throws DimensionMismatch Dense(10, 5)(randn(1)) From 96a23c295c88454770dd5d5a961fec4d1898dcb0 Mon Sep 17 00:00:00 2001 From: thebhatman Date: Wed, 9 Oct 2019 14:53:03 +0530 Subject: [PATCH 039/139] Changes to docs --- docs/src/models/layers.md | 2 +- src/layers/stateless.jl | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index c75c77b7..0007853a 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -66,7 +66,7 @@ LayerNorm GroupNorm ``` -## In-built loss functions: +## Loss functions: ```@docs mse crossentropy diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 8cdac33d..4e142f07 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -51,10 +51,10 @@ function normalise(x::AbstractArray; dims=1) end """ - Kullback Leibler Divergence(KL Divergence) + kldivergence(ŷ, y) KLDivergence is a measure of how much one probability distribution is different from the other. It is always non-negative and zero only when both the distributions are equal everywhere. -https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence +[KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence). """ function kldivergence(ŷ, y) entropy = sum(y .* log.(y)) *1 //size(y,2) @@ -63,14 +63,15 @@ function kldivergence(ŷ, y) end """ - Poisson Loss function + poisson(ŷ, y) Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. -https://isaacchanghau.github.io/post/loss_functions/ +[Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). """ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ - Hinge Loss function -Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). This is usually used for measuring whether two inputs are similar or dissimilar + hinge(ŷ, y) +Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). +[Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss). 
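A tiny worked example of the ±1 label convention described here, with made-up scores (values chosen only so the arithmetic is easy to follow):

```julia
using Flux

ŷ = [0.6 -1.5 0.1]   # raw scores from some model
y = [1 -1 -1]        # true labels encoded as +1 / -1

# The per-sample terms 1 .- ŷ .* y are [0.4, -0.5, 1.1]; negative terms are
# clamped to zero, so the result is (0.4 + 0 + 1.1) / 3 == 0.5.
Flux.hinge(ŷ, y)
```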
""" hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) From fe52689cfe9b2b3a85e7172f5417a65b6a718d66 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 9 Oct 2019 16:16:11 +0530 Subject: [PATCH 040/139] in depth docstrings --- src/optimise/optimisers.jl | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index bf2122a5..14cc3fec 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -7,13 +7,32 @@ const ϵ = 1e-8 # TODO: should use weak refs """ - Descent(η) - - Calls to `Descent()` default with: - - learning rate (η): 0.1 +# Descent +## Description Classic gradient descent optimiser with learning rate `η`. -For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`. +For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` + +## Constructors + - `Descent()`: Use the default learning rate (η), as described in the parameters section. + + - `Descent(η)`: Provide a custom learning rate (η) to the Descent optimiser. + +## Parameters + - Learning rate (η): The amount by which the gradients are discounted before updating the weights. Defaults to `0.1`. + +## Example +```julia-repl +opt = Descent() + +ps = params(model) + +gs = gradient(ps) do + loss(x, y) +end + +Flux.Optimise.update(opt, ps, gs) +``` """ mutable struct Descent eta::Float64 From d591b2b59eba2ec360a0836184632d9da8f8dc8f Mon Sep 17 00:00:00 2001 From: thebhatman Date: Wed, 9 Oct 2019 21:36:40 +0530 Subject: [PATCH 041/139] Removed colon and capitalised --- docs/src/models/layers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 0007853a..227abe31 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -66,7 +66,7 @@ LayerNorm GroupNorm ``` -## Loss functions: +## Loss Functions ```@docs mse crossentropy From f19066ee29afaf064579f3b3cb330dc00812324a Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 10 Oct 2019 16:48:12 +0530 Subject: [PATCH 042/139] more docstrings --- src/optimise/optimisers.jl | 225 ++++++++++++++++++++++++++----------- 1 file changed, 161 insertions(+), 64 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 14cc3fec..64eee42a 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -7,23 +7,20 @@ const ϵ = 1e-8 # TODO: should use weak refs """ -# Descent + Descent(η) ## Description Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` -## Constructors - - `Descent()`: Use the default learning rate (η), as described in the parameters section. - - - `Descent(η)`: Provide a custom learning rate (η) to the Descent optimiser. - ## Parameters - - Learning rate (η): The amount by which the gradients are discounted before updating the weights. Defaults to `0.1`. + - Learning Rate (η): The amount by which the gradients are discounted before updating the weights. Defaults to `0.1`. ## Example ```julia-repl -opt = Descent() +opt = Descent() # uses default η (0.1) + +opt = Descent(0.3) # use provided η ps = params(model) @@ -47,11 +44,18 @@ end """ Momentum(η, ρ) - Calls to `Momentum()` default to: - - learning rate (η): 0.01 - - decay (ρ): 0.9 - Gradient descent with learning rate `η` and momentum `ρ`. + +## Parameters + - Learning Rate (`η`): Amount by which gradients are discounted before updating the weights. Defaults to `0.01`. 
+ - Momentum (`ρ`): Parameter that accelerates descent in the relevant direction and dampens oscillations. Defaults to `0.9`. + +## Examples +```julia +opt = Momentum() # uses defaults of η = 0.01 and ρ = 0.9 + +opt = Momentum(0.01, 0.99) +``` """ mutable struct Momentum eta::Float64 @@ -71,11 +75,18 @@ end """ Nesterov(η, ρ) - Calls to `Nesterov()` default to: - - learning rate (η): 0.001 - - nesterov momentum (ρ): 0.9 - Gradient descent with learning rate `η` and Nesterov momentum `ρ`. + +## Parameters + - Learning Rate (η): Amount by which the gradients are dicsounted berfore updating the weights. Defaults to `0.001`. + - Nesterov Momentum (ρ): Paramters controlling the amount of nesterov momentum to be applied. Defaults to `0.9`. + +## Examples +```julia +opt = Nesterov() # uses defaults η = 0.001 and ρ = 0.9 + +opt = Nesterov(0.003, 0.95) +``` """ mutable struct Nesterov eta::Float64 @@ -96,13 +107,21 @@ end """ RMSProp(η, ρ) - Calls to `RMSProp()` default to: - - learning rate (η): 0.001 - - rho (ρ): 0.9 +Implements the RMSProp algortihm. Often a good choice for recurrent networks. Paramters other than learning rate generally don't need tuning. +## Parameters + - Learning Rate (η): Defaults to `0.001`. + - Rho (ρ): Defaults to `0.9`. + +## Examples +```julia +opt = RMSProp() # uses default η = 0.001 and ρ = 0.9 + +opt = RMSProp(0.002, 0.95) +``` + +## References [RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) -optimiser. Parameters other than learning rate don't need tuning. Often a good -choice for recurrent networks. """ mutable struct RMSProp eta::Float64 @@ -122,10 +141,20 @@ end """ ADAM(η, β::Tuple) - Calls to `ADAM()` default to: - - learning rate (η): 0.001 - - (beta1, beta2) (β): (0.9, 0.999) +Implements the ADAM optimiser. +## Paramters + - Learning Rate (`η`): Defaults to `0.001`. + - Beta (`β::Tuple`): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. + +## Examples + +```julia +opt = ADAM() # uses the default η = 0.001 and β = (0.9, 0.999) + +opt = ADAM(0.001, (0.9, 0.8)) +``` +## References [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. """ mutable struct ADAM @@ -149,10 +178,21 @@ end """ RADAM(η, β::Tuple) - Calls to `RADAM()` default to: - - learning rate (η): 0.001 - - (beta1, beta2) (β): (0.9, 0.999) +Implements the rectified ADAM optimizer. +## Parameters + - Learning Rate (η): Defaults to `0.001` + - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. + +## Examples + +```julia +opt = RADAM() # uses the default η = 0.001 and β = (0.9, 0.999) + +opt = RADAM(0.001, (0.9, 0.8)) +``` + +## References [RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM). """ mutable struct RADAM @@ -183,12 +223,20 @@ end """ AdaMax(η, β::Tuple) - Calls to `AdaMax()` default to: - - learning rate (η): 0.001 - - (beta1, beta2) (β): (0.9, 0.999) +Variant of ADAM based on ∞-norm. -[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on -the ∞-norm. +## Parameters + - Learning Rate (η): Defaults to `0.001` + - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. + +## Examples +```julia +opt = AdaMax() # uses default η and β + +opt = AdaMax(0.001, (0.9, 0.995)) +``` +## References +[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. 
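Building on the constructor examples in these docstrings, the sketch below feeds one of the optimisers to `Flux.train!`; the toy `Chain`, the random data and the one-hot labels are invented purely for illustration:

```julia
using Flux

model = Chain(Dense(4, 8, relu), Dense(8, 2), softmax)
loss(x, y) = Flux.crossentropy(model(x), y)

x    = rand(Float32, 4, 16)                  # 16 samples with 4 features each
y    = Flux.onehotbatch(rand(1:2, 16), 1:2)  # one-hot targets for 2 classes
data = [(x, y)]                              # a single batch

opt = AdaMax()                               # documented defaults: η = 0.001, β = (0.9, 0.999)
Flux.train!(loss, params(model), data, opt)
```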
""" mutable struct AdaMax eta::Float64 @@ -211,9 +259,19 @@ end """ ADAGrad(η) - Calls to `AdaGrad()` default to: - - learning rate (η): 0.1 +Implements AdaGrad. It has parameter specific learning rates based on how frequently it is updated. +## Parameters + - Learning Rate (η): Defaults to `0.1` + +## Examples +```julia +opt = ADAGrad() # uses default η = 0.1 + +opt = ADAGrad(0.001) +``` + +## References [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. Parameters don't need tuning. """ @@ -234,11 +292,19 @@ end """ ADADelta(ρ) - Calls to `ADADelta()` default to: - rho (ρ): 0.9 +Version of ADAGrad that adapts learning rate based on a window of past gradient updates. Parameters don't need tuning. -[ADADelta](https://arxiv.org/abs/1212.5701) optimiser. Parameters don't need -tuning. +## Parameters + - Rho (ρ): Factor by which gradient is decayed at each time step. Defaults to `0.9`. + +## Examples +```julia +opt = ADADelta() # uses default ρ = 0.9 +opt = ADADelta(0.89) +``` + +## References +[ADADelta](https://arxiv.org/abs/1212.5701) optimiser. """ mutable struct ADADelta rho::Float64 @@ -259,12 +325,20 @@ end """ AMSGrad(η, β::Tuple) - Calls to `AMSGrad()` default to: - - learning rate (η): 0.001 - - (beta1, beta2) (β): (0.9, 0.999) +Implements AMSGrad version of the ADAM optimiser. Parameters don't need tuning. -[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need -tuning. +## Parameters + - Learning Rate (η): Defaults to `0.001`. + - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. + +## Examples +```julia +opt = AMSGrad() # uses default η and β +opt = AMSGrad(0.001, (0.89, 0.995)) +``` + +## References +[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. """ mutable struct AMSGrad eta::Float64 @@ -286,12 +360,20 @@ end """ NADAM(η, β::Tuple) - Calls to `NADAM()` default to: - - learning rate (η): 0.001 - - (beta1, beta2) (β): (0.9, 0.999) +Nesterov variant of ADAM. Parameters don't need tuning. -[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. Parameters don't need -tuning. +## Parameters + - Learning Rate (η): Defaults to `0.001`. + - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. + +## Examples +```julia +opt = NADAM() # uses default η and β +opt = NADAM(0.002, (0.89, 0.995)) +``` + +## References +[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. """ mutable struct NADAM eta::Float64 @@ -314,11 +396,21 @@ end """ ADAMW(η, β::Tuple, decay) - Calls to `ADAMW()` default to: - - learning rate (η) 0.001 - - (beta1, beta2) (β): (0.9, 0.999) +Variant of ADAM defined by fixing weight decay regularization. -[ADAMW](https://arxiv.org/abs/1711.05101) fixing weight decay regularization in Adam. +## Parameters + - Learning Rate (η): Defaults to `0.001`. + - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to (0.9, 0.999). + - decay: Decay applied to weights during optimisation. Defaults to 0. 
+ +## Examples +```julia +opt = ADAMW() # uses default η, β and decay +opt = ADAMW(0.001, (0.89, 0.995), 0.1) +``` + +## References +[ADAMW](https://arxiv.org/abs/1711.05101) """ ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) = Optimiser(ADAM(η, β), WeightDecay(decay)) @@ -353,10 +445,12 @@ end """ InvDecay(γ) - Calls to `InvDecay()` default to: - - gamma (γ): 0.001 +Applies inverse time decay to an optimiser -Apply inverse time decay to an optimiser +## Parameters + - gamma (γ): Defaults to `0.001` + +## Example ```julia Optimiser(InvDecay(..), Opt(..)) ``` @@ -379,17 +473,20 @@ end """ ExpDecay(eta, decay, decay_step, clip) - Calls to `ExpDecay()` default to: - - learning rate (eta): 0.001 - - decay: 0.1 - - decay_step: 1000 - - clip: 1e-4 - Discount the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`. +## Parameters + - Learning Rate (eta): Defaults to `0.001`. + - decay: Factor by which the learning rate is discounted. Defaults to `0.1`. + - decay_step: Schedules decay operations by setting number of steps between two decay operations. Defaults to `1000`. + - clip: Minimum value of learning rate. Defaults to `1e-4`. + +## Example To apply exponential decay to an optimiser: ```julia Optimiser(ExpDecay(..), Opt(..)) + + opt = Optimiser(ExpDecay(), ADAM()) ``` """ mutable struct ExpDecay @@ -415,10 +512,10 @@ end """ WeightDecay(wd) - Calls to `WeightDecay()` default to: - - weight decay (wd): 0 +Decays the weight by `wd` -Decay the weight parameter by `wd` +## Parameters + - weight decay (wd): 0 """ mutable struct WeightDecay wd::Real From 623ee2c29c40ddd59c69fd2b55a6eb1f7f0b2afa Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 10 Oct 2019 20:16:00 +0530 Subject: [PATCH 043/139] typo Co-Authored-By: Mike J Innes --- docs/src/training/optimisers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 47f2e9e6..2d195191 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -88,7 +88,7 @@ This is the basic definition of a Momentum update rule given by: $v = ρ * v - η * Δ$ $w = w - v$ -The `apply!` defines the update rules for an optimsier `opt`, given the parameters and gradients. It returns the updated gradients usually. Here, every parameter `x` is retrieved from the running state `v` and subsequently updates the state of the optimiser. +The `apply!` defines the update rules for an optimiser `opt`, given the parameters and gradients. It returns the updated gradients usually. Here, every parameter `x` is retrieved from the running state `v` and subsequently updates the state of the optimiser. Flux internally calls on this function via the `update!` function. It shares the API with `apply!` but ensures that multiple parameters are handled gracefully. In the future, it will also be delegating immutable update operations. From a55878453c9dfb499411872f4313facbe0b613cd Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 10 Oct 2019 20:16:29 +0530 Subject: [PATCH 044/139] typo Co-Authored-By: Mike J Innes --- docs/src/training/optimisers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 2d195191..e3178504 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -94,7 +94,7 @@ Flux internally calls on this function via the `update!` function. 
It shares the ## Composing Optimisers -Flux defines a special kind of optimiser called simply as `Optimiser` which takes in a arbitrary optimisers as input. Its behaviour is similar to the usual optimisers, but differs in that it acts by calling the optimsers listed in it sequentially. Each optimiser produces a modified gradient +Flux defines a special kind of optimiser called simply as `Optimiser` which takes in a arbitrary optimisers as input. Its behaviour is similar to the usual optimisers, but differs in that it acts by calling the optimisers listed in it sequentially. Each optimiser produces a modified gradient that will be fed into the next, and the resultant update will be applied to the parameter as usual. A classic use case is where adding decays is desirable. Flux defines some basic decays including `ExpDecay`, `InvDecay` etc. ``julia From 4477dd8d544c53c1f74f3d2e638e90df8895f8a6 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 10 Oct 2019 20:27:11 +0530 Subject: [PATCH 045/139] reviews --- docs/src/training/optimisers.md | 23 ++++++++++++++--------- src/optimise/optimisers.jl | 1 - 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index e3178504..c5f44a95 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -66,14 +66,16 @@ Flux's optimsers are built around a `struct` that holds all the optimiser parame In this manner Flux also allows one to create custom optimisers to be used seamlessly. Let's work this with a simple example. ```julia -mutable struct Momentum{T,S,D} - eta::T - rho::S - velocity::D +mutable struct Momentum + eta + rho + velocity end + +Momentum(eta::Real, rho::Real) = Momentum(eta, rho, IdDict()) ``` -The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state. **Note that this behaviour is set to change in consequent versions of Flux**. We can now define the rule applied when this optimiser is invoked. +The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked. ```julia function apply!(o::Momentum, x, Δ) @@ -85,19 +87,22 @@ end ``` This is the basic definition of a Momentum update rule given by: -$v = ρ * v - η * Δ$ -$w = w - v$ + +```math +v = ρ * v - η * Δ +w = w - v +``` The `apply!` defines the update rules for an optimiser `opt`, given the parameters and gradients. It returns the updated gradients usually. Here, every parameter `x` is retrieved from the running state `v` and subsequently updates the state of the optimiser. -Flux internally calls on this function via the `update!` function. It shares the API with `apply!` but ensures that multiple parameters are handled gracefully. In the future, it will also be delegating immutable update operations. +Flux internally calls on this function via the `update!` function. It shares the API with `apply!` but ensures that multiple parameters are handled gracefully. ## Composing Optimisers Flux defines a special kind of optimiser called simply as `Optimiser` which takes in a arbitrary optimisers as input. Its behaviour is similar to the usual optimisers, but differs in that it acts by calling the optimisers listed in it sequentially. 
Each optimiser produces a modified gradient that will be fed into the next, and the resultant update will be applied to the parameter as usual. A classic use case is where adding decays is desirable. Flux defines some basic decays including `ExpDecay`, `InvDecay` etc. -``julia +```julia opt = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), Descent()) ``` diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 64eee42a..8567c7da 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -9,7 +9,6 @@ const ϵ = 1e-8 """ Descent(η) -## Description Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` From 776023ddad9ffa45d5de0838a4fbad9b9a43c390 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 10 Oct 2019 20:35:28 +0530 Subject: [PATCH 046/139] fixes --- docs/src/training/optimisers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index c5f44a95..5e8b95de 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -93,7 +93,7 @@ v = ρ * v - η * Δ w = w - v ``` -The `apply!` defines the update rules for an optimiser `opt`, given the parameters and gradients. It returns the updated gradients usually. Here, every parameter `x` is retrieved from the running state `v` and subsequently updates the state of the optimiser. +The `apply!` defines the update rules for an optimiser `opt`, given the parameters and gradients. It returns the updated gradients. Here, every parameter `x` is retrieved from the running state `v` and subsequently updates the state of the optimiser. Flux internally calls on this function via the `update!` function. It shares the API with `apply!` but ensures that multiple parameters are handled gracefully. From b8b4bc48b94b9faeddc3afef8c2d2b057079bb97 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Mon, 21 Oct 2019 10:31:44 -0400 Subject: [PATCH 047/139] Backticks and examples for normalise --- src/layers/stateless.jl | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 4c216672..ff1cbc39 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -42,7 +42,25 @@ logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) """ normalise(x::AbstractArray; dims=1) - Normalises x to mean 0 and standard deviation 1, across the dimensions given by dims. Defaults to normalising over columns. +Normalises `x` to mean 0 and standard deviation 1, across the dimensions given by `dims`. Defaults to normalising over columns. + + julia> a = reshape(collect(1:9), 3, 3) + 3×3 Array{Int64,2}: + 1 4 7 + 2 5 8 + 3 6 9 + + julia> normalise(a) + 3×3 Array{Float64,2}: + -1.22474 -1.22474 -1.22474 + 0.0 0.0 0.0 + 1.22474 1.22474 1.22474 + + julia> normalise(a, dims=2) + 3×3 Array{Float64,2}: + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 """ function normalise(x::AbstractArray; dims=1) μ′ = mean(x, dims = dims) From a9955fec8ae2ae699b4d08ff6ccbf50cc5824e2b Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 22 Oct 2019 16:25:55 +0530 Subject: [PATCH 048/139] correct train! 
syntax --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e06793e..d8af28ae 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ model = Chain( loss(x, y) = crossentropy(model(x), y) -Flux.train!(loss, data, ADAM(...)) +Flux.train!(loss, params(model), data, ADAM(...)) ``` Yet you can easily strip away the layers, and directly write the mathematics for your problem. Flux will seamlessly take gradients of any Julia code, so your model looks just like the paper. From 7ead2d6c7b4054d862e4919c2e8c8e9159d2839f Mon Sep 17 00:00:00 2001 From: Mike Innes Date: Tue, 22 Oct 2019 13:36:39 +0100 Subject: [PATCH 049/139] typo --- src/optimise/optimisers.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 8567c7da..ea2ef067 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -27,7 +27,7 @@ gs = gradient(ps) do loss(x, y) end -Flux.Optimise.update(opt, ps, gs) +Flux.Optimise.update!(opt, ps, gs) ``` """ mutable struct Descent @@ -230,7 +230,7 @@ Variant of ADAM based on ∞-norm. ## Examples ```julia -opt = AdaMax() # uses default η and β +opt = AdaMax() # uses default η and β opt = AdaMax(0.001, (0.9, 0.995)) ``` @@ -405,7 +405,7 @@ Variant of ADAM defined by fixing weight decay regularization. ## Examples ```julia opt = ADAMW() # uses default η, β and decay -opt = ADAMW(0.001, (0.89, 0.995), 0.1) +opt = ADAMW(0.001, (0.89, 0.995), 0.1) ``` ## References From e0c1c0e057dd9bf030f7289ad16283536f3313f4 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Thu, 17 Oct 2019 11:01:28 -0400 Subject: [PATCH 050/139] Fix problem in crossentropy breaking GPU compilation --- src/layers/stateless.jl | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 4c216672..6d710c6b 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -4,10 +4,20 @@ using NNlib: logsoftmax, logσ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) -function crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) - -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2) +function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing) + return -sum(y .* log.(ŷ)) * 1 // size(y, 2) end +function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number) + return -sum(y .* log.(ŷ)) .* weight * 1 // size(y, 2) +end + +function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector) + return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2) +end + +crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight) + function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2) end From f7ce717aaa3387393a0acf2e84b9e69faacb7f94 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Wed, 23 Oct 2019 09:22:22 -0400 Subject: [PATCH 051/139] Add tests --- test/cuda/cuda.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 59bc7f50..68820476 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -28,6 +28,8 @@ cm = gpu(m) x = [1,2,3] cx = gpu(x) @test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx) +@test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0) +@test_broken Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, 
weight=[1.0;2.0;3.0]) xs = rand(5, 5) ys = Flux.onehotbatch(1:5,1:5) From 8913c9c7413b3715f7720fc6606ea00f8dcd4c9d Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Wed, 23 Oct 2019 09:53:09 -0400 Subject: [PATCH 052/139] Make the vector of weights test pass on GPU --- test/cuda/cuda.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 68820476..9bafe44a 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -29,7 +29,7 @@ x = [1,2,3] cx = gpu(x) @test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx) @test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0) -@test_broken Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=[1.0;2.0;3.0]) +@test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0])) xs = rand(5, 5) ys = Flux.onehotbatch(1:5,1:5) From 7b41bc4ab5b9539b6f084934bfa88080bea2e76b Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 24 Oct 2019 12:40:38 +0200 Subject: [PATCH 053/139] Change `gate` function to `view` instead of copy Only for vector input as copying a matrix may be more efficient due to caching. A matrix is sliced per row, meaning the view will not be aligned. --- src/layers/recurrent.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index f2344af8..499a21ab 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -1,5 +1,5 @@ gate(h, n) = (1:h) .+ h*(n-1) -gate(x::AbstractVector, h, n) = x[gate(h,n)] +gate(x::AbstractVector, h, n) = @view x[gate(h,n)] gate(x::AbstractMatrix, h, n) = x[gate(h,n),:] # Stateful recurrence From 39ab740fb7d08fda4a9e3cbdd569ecb684990582 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sat, 2 Nov 2019 11:18:06 +0100 Subject: [PATCH 054/139] Check for CUDA availability at run time. --- Project.toml | 2 +- src/Flux.jl | 40 ++++++++++++++-------------------------- src/cuda/cuda.jl | 10 +++------- src/functor.jl | 8 +------- src/onehot.jl | 10 ++++------ test/cuda/cuda.jl | 8 +++----- 6 files changed, 26 insertions(+), 52 deletions(-) diff --git a/Project.toml b/Project.toml index 5e357c59..aa055223 100644 --- a/Project.toml +++ b/Project.toml @@ -5,7 +5,7 @@ version = "0.9.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" +CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" diff --git a/src/Flux.jl b/src/Flux.jl index 95bdcd32..61939fac 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -21,19 +21,9 @@ export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay -allow_cuda() = parse(Bool, get(ENV, "FLUX_USE_CUDA", "true")) -const consider_cuda = allow_cuda() - -using CUDAapi -const use_cuda = consider_cuda && has_cuda() -if use_cuda - try - using CuArrays - catch - @error "CUDA is installed, but CuArrays.jl fails to load. Please fix the issue, or load Flux with FLUX_USE_CUDA=false." 
- rethrow() - end -end +ENV["CUDA_INIT_SILENT"] = true +using CUDAdrv, CuArrays +const use_cuda = Ref(false) include("utils.jl") include("onehot.jl") @@ -49,21 +39,19 @@ include("data/Data.jl") include("deprecations.jl") -if use_cuda - include("cuda/cuda.jl") -end +include("cuda/cuda.jl") function __init__() - # check if the GPU usage conditions that are baked in the precompilation image - # match the current situation, and force a recompilation if not. - if (allow_cuda() != consider_cuda) || (consider_cuda && has_cuda() != use_cuda) - cachefile = if VERSION >= v"1.3-" - Base.compilecache_path(Base.PkgId(Flux)) - else - abspath(DEPOT_PATH[1], Base.cache_file_entry(Base.PkgId(Flux))) - end - rm(cachefile) - error("Your set-up changed, and Flux.jl needs to be reconfigured. Please load the package again.") + if !CUDAdrv.functional() + @warn "CUDA available, but CUDAdrv.jl failed to load" + elseif length(devices()) == 0 + @warn "CUDA available, but no GPU detected" + elseif !CuArrays.functional() + @warn "CUDA GPU available, but CuArrays.jl failed to load" + elseif !CuArrays.has_cudnn() + @warn "CUDA GPU available, but CuArrays.jl did not find libcudnn" + else + use_cuda[] = true end end diff --git a/src/cuda/cuda.jl b/src/cuda/cuda.jl index 00f0d0f2..20aae69c 100644 --- a/src/cuda/cuda.jl +++ b/src/cuda/cuda.jl @@ -2,12 +2,8 @@ module CUDA using ..CuArrays -if CuArrays.libcudnn !== nothing # TODO: use CuArrays.has_cudnn() - using CuArrays: CUDNN - include("curnn.jl") - include("cudnn.jl") -else - @warn "CUDNN is not installed, some functionality will not be available." -end +using CuArrays: CUDNN +include("curnn.jl") +include("cudnn.jl") end diff --git a/src/functor.jl b/src/functor.jl index b96d21c8..a36b5765 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -73,13 +73,7 @@ end cpu(m) = fmap(x -> adapt(Array, x), m) -const gpu_adaptor = if use_cuda - CuArrays.cu -else - identity -end - -gpu(x) = fmap(gpu_adaptor, x) +gpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) : x # Precision diff --git a/src/onehot.jl b/src/onehot.jl index 84747450..754d0607 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -37,12 +37,10 @@ import Adapt: adapt, adapt_structure adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data)) -if use_cuda - import .CuArrays: CuArray, cudaconvert - import Base.Broadcast: BroadcastStyle, ArrayStyle - BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}() - cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data)) -end +import .CuArrays: CuArray, cudaconvert +import Base.Broadcast: BroadcastStyle, ArrayStyle +BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}() +cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data)) """ onehot(l, labels[, unk]) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 9bafe44a..ebceee82 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -53,8 +53,6 @@ end @test y[3,:] isa CuArray end -if CuArrays.libcudnn != nothing - @info "Testing Flux/CUDNN" - include("cudnn.jl") - include("curnn.jl") -end +@info "Testing Flux/CUDNN" +include("cudnn.jl") +include("curnn.jl") From a82b76cf24d3ff9b8f9065f9f83694f2625c295c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 4 Nov 2019 15:27:11 +0100 Subject: [PATCH 055/139] Conditionally include the CUDNN glue code. 
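The net effect is that the CUDNN glue code is now pulled in from `__init__` at run time, and only when CuArrays reports a working CUDNN. A condensed sketch of the pattern (the exact guards, the `use_cuda[]` flag, and the warning text are in the diff below):

```julia
function __init__()
    if CuArrays.functional() && CuArrays.has_cudnn()
        # Load the CUDNN-backed glue code only when libcudnn is actually present.
        include(joinpath(@__DIR__, "cuda/cuda.jl"))
    end
end
```
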
--- src/Flux.jl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index 61939fac..694bd10f 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -39,8 +39,6 @@ include("data/Data.jl") include("deprecations.jl") -include("cuda/cuda.jl") - function __init__() if !CUDAdrv.functional() @warn "CUDA available, but CUDAdrv.jl failed to load" @@ -48,10 +46,16 @@ function __init__() @warn "CUDA available, but no GPU detected" elseif !CuArrays.functional() @warn "CUDA GPU available, but CuArrays.jl failed to load" - elseif !CuArrays.has_cudnn() - @warn "CUDA GPU available, but CuArrays.jl did not find libcudnn" else use_cuda[] = true + + # FIXME: this functionality should be conditional at run time by checking `use_cuda` + # (or even better, get moved to CuArrays.jl as much as possible) + if CuArrays.has_cudnn() + include(joinpath(@__DIR__, "cuda/cuda.jl")) + else + @warn "CUDA GPU available, but CuArrays.jl did not find libcudnn. Some functionality will not be available." + end end end From dbcdf4d1bd5c12a1c38cdf58ebe193df02f620fe Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 4 Nov 2019 15:30:03 +0100 Subject: [PATCH 056/139] Bump GPU packages. --- Manifest.toml | 30 ++++++++++++++---------------- Project.toml | 4 ++-- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 87f5075f..6ac817a6 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -51,16 +51,16 @@ uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" version = "1.2.0" [[CUDAdrv]] -deps = ["CUDAapi", "Libdl", "Printf"] -git-tree-sha1 = "9ce99b5732c70e06ed97c042187baed876fb1698" +deps = ["CEnum", "Printf"] +git-tree-sha1 = "90fa52c4acb2fadf7be48b0d73d9865c16ab9908" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "3.1.0" +version = "4.0.1" [[CUDAnative]] -deps = ["Adapt", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Printf", "TimerOutputs"] -git-tree-sha1 = "52ae1ce10ebfa686e227655c47b19add89308623" +deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] +git-tree-sha1 = "5afb86987488ce2f31f9e5426f551d2480d17666" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.3.1" +version = "2.5.0" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -105,12 +105,10 @@ uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" version = "4.0.0" [[CuArrays]] -deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "45683305171430978c17f496969dc9b6d3094a51" -repo-rev = "master" -repo-url = "https://github.com/JuliaGPU/CuArrays.jl.git" +deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] +git-tree-sha1 = "5eaae49796a3fec88cb2ad5f3f206f4bbb6598bc" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.3.0" +version = "1.4.0" [[DataAPI]] git-tree-sha1 = "8903f0219d3472543fc4b2f5ebaf675a07f817c0" @@ -171,10 +169,10 @@ uuid = "f6369f11-7733-5829-9624-2563aa707210" version = "0.10.3" [[GPUArrays]] -deps = ["Adapt", "FFTW", "FillArrays", "LinearAlgebra", "Printf", "Random", "Serialization", "StaticArrays", "Test"] -git-tree-sha1 = "77e27264276fe97a7e7fb928bf8999a145abc018" +deps = 
["Adapt", "FFTW", "FillArrays", "LinearAlgebra", "Printf", "Random", "Serialization", "Test"] +git-tree-sha1 = "8d74ced24448c52b539a23d107bd2424ee139c0f" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "1.0.3" +version = "1.0.4" [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] @@ -200,9 +198,9 @@ version = "0.7.2" [[LLVM]] deps = ["CEnum", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "4a05f742837779a00bd8c9a18da6817367c4245d" +git-tree-sha1 = "3680605a77f20bec59eea00389eb7aafe973abbb" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "1.3.0" +version = "1.3.1" [[LibGit2]] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" diff --git a/Project.toml b/Project.toml index aa055223..8a2d3148 100644 --- a/Project.toml +++ b/Project.toml @@ -26,8 +26,8 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" ZygoteRules = "700de1a5-db45-46bc-99cf-38207098b444" [compat] -CUDAapi = "1.1" -CuArrays = "1.2" +CUDAdrv = "4.0.1" +CuArrays = "1.4" NNlib = "0.6" Zygote = "0.3" julia = "1" From 33d276cdb7c163bd04083de0e0c7176d20253848 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 4 Nov 2019 15:37:25 +0100 Subject: [PATCH 057/139] Fix GPU-less tests. --- test/cuda/cuda.jl | 10 +++++++--- test/runtests.jl | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index ebceee82..d2907995 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -53,6 +53,10 @@ end @test y[3,:] isa CuArray end -@info "Testing Flux/CUDNN" -include("cudnn.jl") -include("curnn.jl") +if CuArrays.has_cudnn() + @info "Testing Flux/CUDNN" + include("cudnn.jl") + include("curnn.jl") +else + @warn "CUDNN unavailable, not testing GPU DNN support" +end diff --git a/test/runtests.jl b/test/runtests.jl index 61def2b1..1505e96a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -19,7 +19,7 @@ include("layers/normalisation.jl") include("layers/stateless.jl") include("layers/conv.jl") -if isdefined(Flux, :CUDA) +if Flux.use_cuda[] include("cuda/cuda.jl") else @warn "CUDA unavailable, not testing GPU support" From 916d3dabbd23d4d1400e84d1529f4bff7b7e2ff7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 4 Nov 2019 15:38:42 +0100 Subject: [PATCH 058/139] Bump Julia version. 
--- .gitlab-ci.yml | 10 ---------- .travis.yml | 5 +++-- Project.toml | 2 +- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9af14c6a..ca44819a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,16 +18,6 @@ include: Pkg.build(); Pkg.test(; coverage=true);' -test:v1.0: - extends: .flux - variables: - CI_VERSION_TAG: 'v1.0' - -test:v1.1: - extends: .flux - variables: - CI_VERSION_TAG: 'v1.1' - test:v1.2: extends: .flux variables: diff --git a/.travis.yml b/.travis.yml index a9cd86ea..4f8acced 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,8 @@ os: # - osx julia: - - 1.1 + - 1.2 + - 1.3 - nightly matrix: @@ -16,7 +17,7 @@ matrix: jobs: include: - stage: "Documentation" - julia: 1.0 + julia: 1.2 os: linux script: - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); diff --git a/Project.toml b/Project.toml index 8a2d3148..8e986d73 100644 --- a/Project.toml +++ b/Project.toml @@ -30,7 +30,7 @@ CUDAdrv = "4.0.1" CuArrays = "1.4" NNlib = "0.6" Zygote = "0.3" -julia = "1" +julia = "1.2" [extras] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" From 6e8f8c1f46fc17ca612bf68dba287ad699d16b2c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 4 Nov 2019 15:40:28 +0100 Subject: [PATCH 059/139] Use latest GPU CI templates. --- .gitlab-ci.yml | 54 +++++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ca44819a..3b87749f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,41 +1,27 @@ -before_script: - - export CI_DISABLE_CURNN_TEST=true - -variables: - CI_IMAGE_TAG: 'cuda' - include: - - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/common.yml' + - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v6.yml' -.flux: - extends: .test - script: - - julia -e 'using InteractiveUtils; - versioninfo()' - - mkdir $JULIA_DEPOT_PATH # Pkg3.jl#325 - - julia --project -e 'using Pkg; - Pkg.instantiate(); - Pkg.build(); - Pkg.test(; coverage=true);' +image: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 -test:v1.2: - extends: .flux - variables: - CI_VERSION_TAG: 'v1.2' -test:v1.3: - extends: .flux - variables: - CI_VERSION_TAG: 'v1.3' +julia:1.2: + extends: + - .julia:1.2 + - .test + tags: + - nvidia -test:v1.0: - extends: .flux - variables: - CI_VERSION_TAG: 'v1.0' - -test:dev: - extends: .flux - variables: - CI_VERSION_TAG: 'dev' +julia:1.3: + extends: + - .julia:1.3 + - .test + tags: + - nvidia +julia:nightly: + extends: + - .julia:nightly + - .test + tags: + - nvidia allow_failure: true From c9f369de86aac520f24c60160d09268ef129053e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 6 Nov 2019 07:53:20 +0100 Subject: [PATCH 060/139] Update packages. 
--- Manifest.toml | 63 ++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 6ac817a6..e0ad5716 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -28,10 +28,10 @@ uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee" version = "0.8.10" [[BinaryProvider]] -deps = ["Libdl", "Logging", "SHA"] -git-tree-sha1 = "c7361ce8a2129f20b0e05a89f7070820cfed6648" +deps = ["Libdl", "SHA"] +git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.6" +version = "0.5.8" [[CEnum]] git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb" @@ -40,9 +40,9 @@ version = "0.2.0" [[CSTParser]] deps = ["Tokenize"] -git-tree-sha1 = "c69698c3d4a7255bc1b4bc2afc09f59db910243b" +git-tree-sha1 = "99dda94f5af21a4565dc2b97edf6a95485f116c3" uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f" -version = "0.6.2" +version = "1.0.0" [[CUDAapi]] deps = ["Libdl", "Logging"] @@ -52,15 +52,15 @@ version = "1.2.0" [[CUDAdrv]] deps = ["CEnum", "Printf"] -git-tree-sha1 = "90fa52c4acb2fadf7be48b0d73d9865c16ab9908" +git-tree-sha1 = "96eabc95ebb83e361311330ffb574a3e2df73251" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "4.0.1" +version = "4.0.2" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "5afb86987488ce2f31f9e5426f551d2480d17666" +git-tree-sha1 = "861a1a9e9741cc55c973a4688079f467a72337a7" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.5.0" +version = "2.5.1" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -88,9 +88,9 @@ version = "0.2.0" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "84aa74986c5b9b898b0d1acaf3258741ee64754f" +git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "2.1.0" +version = "2.2.0" [[Conda]] deps = ["JSON", "VersionParsing"] @@ -106,20 +106,20 @@ version = "4.0.0" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "5eaae49796a3fec88cb2ad5f3f206f4bbb6598bc" +git-tree-sha1 = "0d22d5a55e30e98617f258bb23688f141bfeae36" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.4.0" +version = "1.4.1" [[DataAPI]] -git-tree-sha1 = "8903f0219d3472543fc4b2f5ebaf675a07f817c0" +git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.0.1" +version = "1.1.0" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "0809951a1774dc724da22d26e4289bbaab77809a" +git-tree-sha1 = "1fe8fad5fc84686dcbc674aa255bc867a64f8132" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.0" +version = "0.17.5" [[Dates]] deps = ["Printf"] @@ -153,9 +153,9 @@ version = "1.0.1" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "8fba6ddaf66b45dec830233cea0aae43eb1261ad" +git-tree-sha1 = "de38b0253ade98340fabaf220f368f6144541938" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.6.4" 
+version = "0.7.4" [[FixedPointNumbers]] git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" @@ -163,10 +163,10 @@ uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" version = "0.6.1" [[ForwardDiff]] -deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"] -git-tree-sha1 = "4c4d727f1b7e0092134fabfab6396b8945c1ea5b" +deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "adf88d6da1f0294058f38295becf8807986bb7d0" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.3" +version = "0.10.5" [[GPUArrays]] deps = ["Adapt", "FFTW", "FillArrays", "LinearAlgebra", "Printf", "Random", "Serialization", "Test"] @@ -198,9 +198,9 @@ version = "0.7.2" [[LLVM]] deps = ["CEnum", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "3680605a77f20bec59eea00389eb7aafe973abbb" +git-tree-sha1 = "74fe444b8b6d1ac01d639b2f9eaf395bcc2e24fc" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "1.3.1" +version = "1.3.2" [[LibGit2]] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" @@ -232,9 +232,10 @@ uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27" version = "0.5.0" [[Missings]] -git-tree-sha1 = "29858ce6c8ae629cf2d733bffa329619a1c843d0" +deps = ["DataAPI"] +git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "0.4.2" +version = "0.4.3" [[Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" @@ -259,12 +260,12 @@ version = "1.1.0" [[Parsers]] deps = ["Dates", "Test"] -git-tree-sha1 = "ef0af6c8601db18c282d092ccbd2f01f3f0cd70b" +git-tree-sha1 = "c56ecb484f286639f161e712b8311f5ab77e8d32" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "0.3.7" +version = "0.3.8" [[Pkg]] -deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] @@ -326,9 +327,9 @@ version = "0.8.0" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "db23bbf50064c582b6f2b9b043c8e7e98ea8c0c6" +git-tree-sha1 = "1e9c5d89cba8047d518f1ffef432906ef1a3e8bd" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "0.11.0" +version = "0.12.0" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] From 61078f3ef0fb0eba1fdfaa450ef6df911c12300d Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 6 Nov 2019 12:23:12 +0000 Subject: [PATCH 061/139] use release versions of packages --- Manifest.toml | 18 +++++++----------- Project.toml | 3 +-- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index e0ad5716..53a9501a 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -58,9 +58,9 @@ version = "4.0.2" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "861a1a9e9741cc55c973a4688079f467a72337a7" +git-tree-sha1 = "f4a95ba943507f1586c29208957141fc49d9d718" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.5.1" +version = "2.5.2" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -176,9 +176,9 @@ version = "1.0.4" [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] -git-tree-sha1 = "e23faa71b8f54c3fdc99b230b9c2906cafdddca5" +git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306" uuid = 
"7869d1d1-7146-5819-86e3-90919afe41df" -version = "0.2.3" +version = "0.3.0" [[InteractiveUtils]] deps = ["Markdown"] @@ -389,16 +389,12 @@ version = "0.8.3" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "38241b40ebd8748bcacad5e6c7ba3ab3cc7a15c9" -repo-rev = "master" -repo-url = "https://github.com/FluxML/Zygote.jl.git" +git-tree-sha1 = "b2e42a21dc3d1ecd3cbe8c83a454ca56fbf423c4" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.3.4" +version = "0.4.0" [[ZygoteRules]] deps = ["MacroTools"] -git-tree-sha1 = "c4c29b30b8ff3be13d4244e78be7df2a42bc54d0" -repo-rev = "master" -repo-url = "https://github.com/FluxML/ZygoteRules.jl.git" +git-tree-sha1 = "b3b4882cc9accf6731a08cc39543fbc6b669dca8" uuid = "700de1a5-db45-46bc-99cf-38207098b444" version = "0.2.0" diff --git a/Project.toml b/Project.toml index 8e986d73..76f7169a 100644 --- a/Project.toml +++ b/Project.toml @@ -23,13 +23,12 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" -ZygoteRules = "700de1a5-db45-46bc-99cf-38207098b444" [compat] CUDAdrv = "4.0.1" CuArrays = "1.4" NNlib = "0.6" -Zygote = "0.3" +Zygote = "0.4" julia = "1.2" [extras] From 8a0745faab0f61a9fb2ecd1770b75f4b2e165ec9 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 6 Nov 2019 18:40:51 +0100 Subject: [PATCH 062/139] Restore Julia 1.0 compatibility. --- .gitlab-ci.yml | 14 ++++++++++++++ Manifest.toml | 18 +++++++++--------- Project.toml | 4 ++-- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3b87749f..b55f4618 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,6 +4,20 @@ include: image: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 +julia:1.0: + extends: + - .julia:1.0 + - .test + tags: + - nvidia + +julia:1.1: + extends: + - .julia:1.1 + - .test + tags: + - nvidia + julia:1.2: extends: - .julia:1.2 diff --git a/Manifest.toml b/Manifest.toml index 53a9501a..f5a589fd 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -58,9 +58,9 @@ version = "4.0.2" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "f4a95ba943507f1586c29208957141fc49d9d718" +git-tree-sha1 = "dd642afe5fd6633663a8c3d42f3b7638f2210b79" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.5.2" +version = "2.5.3" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -106,9 +106,9 @@ version = "4.0.0" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "0d22d5a55e30e98617f258bb23688f141bfeae36" +git-tree-sha1 = "bc94d6cb335d418088f12641751aab63ff56509d" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.4.1" +version = "1.4.2" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -153,9 +153,9 @@ version = "1.0.1" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "de38b0253ade98340fabaf220f368f6144541938" +git-tree-sha1 = "6827a8f73ff12707f209c920d204238a16892b55" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = 
"0.7.4" +version = "0.8.0" [[FixedPointNumbers]] git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" @@ -169,10 +169,10 @@ uuid = "f6369f11-7733-5829-9624-2563aa707210" version = "0.10.5" [[GPUArrays]] -deps = ["Adapt", "FFTW", "FillArrays", "LinearAlgebra", "Printf", "Random", "Serialization", "Test"] -git-tree-sha1 = "8d74ced24448c52b539a23d107bd2424ee139c0f" +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] +git-tree-sha1 = "a0a3b927b1a06e63fb8b91950cc7df340b7d912c" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "1.0.4" +version = "2.0.0" [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] diff --git a/Project.toml b/Project.toml index 76f7169a..587a459b 100644 --- a/Project.toml +++ b/Project.toml @@ -26,10 +26,10 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] CUDAdrv = "4.0.1" -CuArrays = "1.4" +CuArrays = "1.4.2" NNlib = "0.6" Zygote = "0.4" -julia = "1.2" +julia = "1" [extras] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" From 3dceef427f69418220692b931d819c49e77f0810 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 8 Nov 2019 16:48:11 +0100 Subject: [PATCH 063/139] Fix binarycrossentropy on CuArrays --- src/layers/stateless.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index b8ce3c7d..5f9c1090 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -1,3 +1,4 @@ +using CuArrays using NNlib: logsoftmax, logσ # Cost functions @@ -35,6 +36,9 @@ Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerica """ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) +# Re-definition to fix interaction with CuArrays. +CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) + """ logitbinarycrossentropy(logŷ, y) From a00d8d94ec15080aada5c1cb938ce7cab365d99e Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 8 Nov 2019 17:28:38 +0100 Subject: [PATCH 064/139] Add test for CUDA binarycrossentropy --- test/cuda/cuda.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index d2907995..ddd92e1e 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -31,6 +31,10 @@ cx = gpu(x) @test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0) @test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0])) +x = σ.([-1.1491, 0.8619, 0.3127]) +y = [1, 1, 0.] +@test Flux.binarycrossentropy.(x,y) ≈ Flux.binarycrossentropy.(cu(x),cu(y)) + xs = rand(5, 5) ys = Flux.onehotbatch(1:5,1:5) @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) From 7e1ffd65072246ec634e57619174b55f007a5af3 Mon Sep 17 00:00:00 2001 From: Helios De Rosario Date: Fri, 8 Nov 2019 21:39:00 +0100 Subject: [PATCH 065/139] Extend docs about `train!` Related to #921: explain why it is not needed to pass the model as argument. --- docs/src/training/training.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 679bbd0b..380910c3 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -1,6 +1,6 @@ # Training -To actually train a model we need three things: +To actually train a model we need three things, in addition to the tracked parameters that will be fitted: * A *objective function*, that evaluates how well a model is doing given some input data. 
* A collection of data points that will be provided to the objective function. @@ -11,6 +11,7 @@ With these we can call `Flux.train!`: ```julia Flux.train!(objective, params, data, opt) ``` +At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!`. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately (see below) provides more flexibility, and the possibility of optimizing the calculations. There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). From 074eb47246cffff9c3e4f99706963de42648a1f5 Mon Sep 17 00:00:00 2001 From: Helios De Rosario Date: Tue, 12 Nov 2019 23:29:38 +0100 Subject: [PATCH 066/139] Update training.md --- docs/src/training/training.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 380910c3..350287fc 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -11,7 +11,6 @@ With these we can call `Flux.train!`: ```julia Flux.train!(objective, params, data, opt) ``` -At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!`. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately (see below) provides more flexibility, and the possibility of optimizing the calculations. There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). @@ -33,6 +32,8 @@ Flux.train!(loss, ps, data, opt) The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want. +At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimizing the calculations. + ## Datasets The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). 
For example, here's a dummy data set with only one data point: From ba4e3be0d33f79145a62254a235967206a27b97c Mon Sep 17 00:00:00 2001 From: Helios De Rosario Date: Thu, 14 Nov 2019 16:22:31 +0100 Subject: [PATCH 067/139] explanations about params in `train!` --- docs/src/training/training.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 350287fc..a5474529 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -1,8 +1,9 @@ # Training -To actually train a model we need three things, in addition to the tracked parameters that will be fitted: +To actually train a model we need four things: * A *objective function*, that evaluates how well a model is doing given some input data. +* The parameters of the model. * A collection of data points that will be provided to the objective function. * An [optimiser](optimisers.md) that will update the model parameters appropriately. @@ -34,6 +35,12 @@ The objective will almost always be defined in terms of some *cost function* tha At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimizing the calculations. +## Model parameters + +The model to be trained must have a set of tracked parameters that are used to calculate the gradients of the objective function. In the [basics](../models/basics.md) section it is explained how to create models with such parameters. The second argument of the function `Flux.train!` must be an object containing those parameters, which can be obtained from a model `m` as `params(m)`. + +Such an object contains a reference to the model's parameters, not a copy, such that after their training, the model behaves according to their updated values. + ## Datasets The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point: From cdaaca8cfa880b2f45f30379639f347b3ebfd175 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 00:54:49 -0700 Subject: [PATCH 068/139] make activations zygote friendly --- src/layers/basic.jl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index f42a9619..e8dde1a3 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -44,17 +44,15 @@ end # it might be replaced in the future for better performance # see issue https://github.com/FluxML/Flux.jl/issues/702 # Johnny Chen -- @johnnychen94 +# only slightly changed to better handle interaction with Zygote @dsweber2 """ activations(c::Chain, input) Calculate the forward results of each layers in Chain `c` with `input` as model input. 
""" function activations(c::Chain, input) - rst = [] - for l in c - x = get(rst, length(rst), input) - push!(rst, l(x)) - end - return rst + buffed = accumulate!((x,y)->y(x), Zygote.Buffer([], length(c)), + [l for l in c], dims=1, init=input) + return copy(buffed) end From d0202a2945bf86a7827075c77642405b25c752fe Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 01:34:12 -0700 Subject: [PATCH 069/139] adding the extra commits broke the accumulate version --- src/layers/basic.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index e8dde1a3..2d86da85 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -50,9 +50,12 @@ end Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) - buffed = accumulate!((x,y)->y(x), Zygote.Buffer([], length(c)), - [l for l in c], dims=1, init=input) - return copy(buffed) + res = Zygote.Buffer([], length(c)) + res[1] = c[1](input) + for (i,l) in enumerate(c[2:end]) + res[i+1] = l(res[i]) + end + return copy(res) end From 99679f7e16b2244ace129e9c6288b4ab2159a452 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 10:46:56 -0700 Subject: [PATCH 070/139] deal with empty Chain --- src/layers/basic.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 2d86da85..c3783567 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -51,9 +51,11 @@ Calculate the forward results of each layers in Chain `c` with `input` as model """ function activations(c::Chain, input) res = Zygote.Buffer([], length(c)) - res[1] = c[1](input) - for (i,l) in enumerate(c[2:end]) - res[i+1] = l(res[i]) + if length(c) > 0 + res[1] = c[1](input) + for (i,l) in enumerate(c[2:end]) + res[i+1] = l(res[i]) + end end return copy(res) end From 6475f6a43eba8feab5f34a7dc2cf0f86d1d7c0fc Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 11 Sep 2019 17:36:37 -0700 Subject: [PATCH 071/139] recursive way of doing activations --- src/layers/basic.jl | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index c3783567..b92bc919 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -50,16 +50,17 @@ end Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) - res = Zygote.Buffer([], length(c)) - if length(c) > 0 - res[1] = c[1](input) - for (i,l) in enumerate(c[2:end]) - res[i+1] = l(res[i]) - end - end - return copy(res) + extraChain(c.layers, input) end +function extraChain(fs::Tuple, x) + res = first(fs)(x) + return (res, extraChain(Base.tail(fs), res)...) 
+end + +extraChain(::Tuple{}, x) = [] + + """ Dense(in::Integer, out::Integer, σ = identity) From db92b0e3ce3d5cb06a11b6cf77e74e1e0d56b2f1 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 8 Oct 2019 23:04:31 -0700 Subject: [PATCH 072/139] super simple test --- test/layers/basic.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/layers/basic.jl b/test/layers/basic.jl index cbe250fc..4edfecc7 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -19,6 +19,12 @@ import Flux: activations # numeric test should be put into testset of corresponding layer end + @testset "Activations" begin + c = Chain(Dense(3,5,relu), Dense(5,1,relu)) + X = Float32.([1.0; 1.0; 1.0]) + @test_nowarn gradient(()->Flux.activations(c, X)[2][1], params(c)) + end + @testset "Dense" begin @test length(Dense(10, 5)(randn(10))) == 5 @test_throws DimensionMismatch Dense(10, 5)(randn(1)) From 0fe3ac4e770de17a46d37809238a6deae06f98a3 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 8 Oct 2019 23:05:22 -0700 Subject: [PATCH 073/139] bring activations into function call --- src/layers/basic.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index b92bc919..db491424 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -31,6 +31,8 @@ applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) (c::Chain)(x) = applychain(c.layers, x) +(c::Chain)(x, i) = extraChain(c.layers, x)[i] + Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) function Base.show(io::IO, c::Chain) From 58c794702d030b61a3744f1a180e9ab65113682b Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 14 Nov 2019 14:05:53 -0800 Subject: [PATCH 074/139] simpler test --- src/layers/basic.jl | 4 ++-- test/layers/basic.jl | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index db491424..75f18e3c 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -31,7 +31,7 @@ applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) (c::Chain)(x) = applychain(c.layers, x) -(c::Chain)(x, i) = extraChain(c.layers, x)[i] +(c::Chain)(x) = extraChain(c.layers, x) Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) @@ -60,7 +60,7 @@ function extraChain(fs::Tuple, x) return (res, extraChain(Base.tail(fs), res)...) 
end -extraChain(::Tuple{}, x) = [] +extraChain(::Tuple{}, x) = () diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 4edfecc7..0ff1776d 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -4,11 +4,13 @@ import Flux: activations @testset "basic" begin @testset "helpers" begin @testset "activations" begin - dummy_model = Chain(Dense(10,5,σ),Dense(5,2),softmax) - x = rand(10) - @test activations(Chain(), x) == [] - @test activations(dummy_model, x)[1] == dummy_model[1](x) - @test activations(dummy_model, x)[2] == x |> dummy_model[1] |> dummy_model[2] + dummy_model = Chain(x->x.^2, x->x .- 3, x -> tan.(x)) + x = randn(10) + @test activations(dummy_model, x)[1] == x.^2 + @test activations(dummy_model, x)[2] == (x.^2 .- 3) + @test activations(dummy_model, x)[3] == tan.(x.^2 .- 3) + + @test activations(Chain(), x) == () @test activations(Chain(identity, x->:foo), x)[2] == :foo # results include `Any` type end end From 2471596cdb47f681549fa943e2c7c83662cb2f1e Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Fri, 15 Nov 2019 11:50:13 +0000 Subject: [PATCH 075/139] test on 1.0 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 4f8acced..c2eb9ae0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ os: # - osx julia: + - 1.0 - 1.2 - 1.3 - nightly From 665e4419199c38a1490edba8862f0cb8f2edb8c6 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Fri, 15 Nov 2019 12:12:28 +0000 Subject: [PATCH 076/139] pkg up --- Manifest.toml | 59 ++++++++++++++++++--------------------------------- 1 file changed, 21 insertions(+), 38 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index f5a589fd..653be3dc 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -38,29 +38,23 @@ git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" version = "0.2.0" -[[CSTParser]] -deps = ["Tokenize"] -git-tree-sha1 = "99dda94f5af21a4565dc2b97edf6a95485f116c3" -uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f" -version = "1.0.0" - [[CUDAapi]] deps = ["Libdl", "Logging"] -git-tree-sha1 = "e063efb91cfefd7e6afd92c435d01398107a500b" +git-tree-sha1 = "6eee47385c81ed3b3f716b745697869c712c2df3" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" -version = "1.2.0" +version = "2.0.0" [[CUDAdrv]] -deps = ["CEnum", "Printf"] -git-tree-sha1 = "96eabc95ebb83e361311330ffb574a3e2df73251" +deps = ["CEnum", "CUDAapi", "Printf"] +git-tree-sha1 = "0f39fddace3324707469ace7fbcbc7b28d5cf921" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "4.0.2" +version = "4.0.4" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "dd642afe5fd6633663a8c3d42f3b7638f2210b79" +git-tree-sha1 = "93f6c917ab2a9b5bb54f8f738f4ec1a6693cb716" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.5.3" +version = "2.5.5" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -98,17 +92,11 @@ git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032" uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d" version = "1.3.0" -[[Crayons]] -deps = ["Test"] -git-tree-sha1 = "f621b8ef51fd2004c7cf157ea47f027fdeac5523" -uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" -version = "4.0.0" - [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = 
"bc94d6cb335d418088f12641751aab63ff56509d" +git-tree-sha1 = "6a05c9e40b99a6e9a7973ca93397a38d3e8a7b4b" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.4.2" +version = "1.4.6" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -164,9 +152,9 @@ version = "0.6.1" [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "adf88d6da1f0294058f38295becf8807986bb7d0" +git-tree-sha1 = "4407e7b76999eca2646abdb68203bd4302476168" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.5" +version = "0.10.6" [[GPUArrays]] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] @@ -216,10 +204,10 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" [[MacroTools]] -deps = ["CSTParser", "Compat", "DataStructures", "Test", "Tokenize"] -git-tree-sha1 = "d6e9dedb8c92c3465575442da456aec15a89ff76" +deps = ["Compat", "DataStructures", "Test"] +git-tree-sha1 = "82921f0e3bde6aebb8e524efc20f4042373c0c06" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.1" +version = "0.5.2" [[Markdown]] deps = ["Base64"] @@ -327,9 +315,9 @@ version = "0.8.0" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "1e9c5d89cba8047d518f1ffef432906ef1a3e8bd" +git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "0.12.0" +version = "0.12.1" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] @@ -346,15 +334,10 @@ deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[TimerOutputs]] -deps = ["Crayons", "Printf", "Test", "Unicode"] -git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c" +deps = ["Printf"] +git-tree-sha1 = "8f22dc0c23e1cd4ab8070a01ba32285926f104f1" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.0" - -[[Tokenize]] -git-tree-sha1 = "dfcdbbfb2d0370716c815cbd6f8a364efb6f42cf" -uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624" -version = "0.5.6" +version = "0.5.2" [[TranscodingStreams]] deps = ["Random", "Test"] @@ -389,9 +372,9 @@ version = "0.8.3" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "b2e42a21dc3d1ecd3cbe8c83a454ca56fbf423c4" +git-tree-sha1 = "e4245b9c5362346e154b62842a89a18e0210b92b" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.4.0" +version = "0.4.1" [[ZygoteRules]] deps = ["MacroTools"] From e24215ca982024ec8fe02a2c79fbaeb4e8dcfd91 Mon Sep 17 00:00:00 2001 From: Mike Innes Date: Fri, 15 Nov 2019 15:59:42 +0000 Subject: [PATCH 077/139] guard test on 1.0 --- test/layers/normalisation.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 22a5d283..4399a256 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -191,6 +191,7 @@ end end +if VERSION >= v"1.1" @testset "GroupNorm" begin # begin tests squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions @@ -289,5 +290,5 @@ end x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test BN(x) ≈ GN(x) end - +end end From 20eb840882752228a49130aed0712da389f6db1a Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 15 Nov 2019 12:03:08 
-0800 Subject: [PATCH 078/139] keeping activations separate --- src/layers/basic.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 75f18e3c..2a465208 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -31,8 +31,6 @@ applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) (c::Chain)(x) = applychain(c.layers, x) -(c::Chain)(x) = extraChain(c.layers, x) - Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) function Base.show(io::IO, c::Chain) From a0e3729679376c984de2eb06b9848b12acb89b9f Mon Sep 17 00:00:00 2001 From: Helios De Rosario Date: Fri, 15 Nov 2019 21:17:45 +0100 Subject: [PATCH 079/139] Update docs/src/training/training.md Co-Authored-By: Mike J Innes --- docs/src/training/training.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index a5474529..47bda1f5 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -3,7 +3,7 @@ To actually train a model we need four things: * A *objective function*, that evaluates how well a model is doing given some input data. -* The parameters of the model. +* The trainable parameters of the model. * A collection of data points that will be provided to the objective function. * An [optimiser](optimisers.md) that will update the model parameters appropriately. From 4530ac65c7f23c2cfb5f95f49b5fe4a7dd4f946d Mon Sep 17 00:00:00 2001 From: Troels Arnfred Bojesen Date: Tue, 19 Nov 2019 16:50:40 +0900 Subject: [PATCH 080/139] Fix Glorot initialization, add He initialization Should fix the issue reported at https://github.com/FluxML/Flux.jl/issues/442 . Adds He weight initialization as a bonus :-) --- src/utils.jl | 10 ++++++++-- test/utils.jl | 45 ++++++++++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 246c30d7..d3d01a11 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,6 +1,12 @@ # Arrays -glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0/sum(dims)) -glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0/sum(dims)) +nfan(n_in, n_out) = n_in, n_out #fan-in, fan-out +nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of convolution kernels + +glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...))) +glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...))) + +he_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / first(nfan(dims...))) +he_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / first(nfan(dims...))) ones(T::Type, dims...) = Base.ones(T, dims...) zeros(T::Type, dims...) = Base.zeros(T, dims...) 
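As a quick illustration of why the new `nfan`-based denominator matters for convolution kernels (an aside using the formulas from the hunk above; the kernel shape below is only an example, not taken from the patch):

```julia
using Flux: nfan

# Hypothetical 3×3 convolution kernel with 16 input and 32 output channels.
dims = (3, 3, 16, 32)

# Old denominator: plain sum of the dimensions.
sum(dims)                        # 3 + 3 + 16 + 32 = 54

# New denominator: fan-in + fan-out as computed by nfan.
fan_in, fan_out = nfan(dims...)  # (3*3*16, 3*3*32) = (144, 288)
fan_in + fan_out                 # 432

# The old glorot_uniform therefore sampled from a range about √(432/54) ≈ 2.8× too
# wide for this kernel, i.e. roughly 8× the intended variance of 2/(fan_in + fan_out).
```
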
diff --git a/test/utils.jl b/test/utils.jl index 18a57139..99492d4e 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,6 +1,7 @@ using Flux -using Flux: throttle, glorot_uniform, glorot_normal, stack, unstack -using StatsBase: std +using Flux: throttle, nfan, glorot_uniform, glorot_normal, he_uniform, he_normal, + stack, unstack +using StatsBase: var using Random using Test @@ -56,18 +57,36 @@ end # Set random seed so that these tests don't fail randomly Random.seed!(0) - # glorot_uniform should yield a kernel with stddev ~= sqrt(6/(n_in + n_out)), - # and glorot_normal should yield a kernel with stddev != 2/(n_in _ n_out) - for (n_in, n_out) in [(100, 100), (100, 400)] - v = glorot_uniform(n_in, n_out) - @test minimum(v) > -1.1*sqrt(6/(n_in + n_out)) - @test minimum(v) < -0.9*sqrt(6/(n_in + n_out)) - @test maximum(v) > 0.9*sqrt(6/(n_in + n_out)) - @test maximum(v) < 1.1*sqrt(6/(n_in + n_out)) + @testset "Fan in/out" begin + @test nfan(100, 200) == (100, 200) #For Dense layer + @test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer + @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer + @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D Conv layer + end - v = glorot_normal(n_in, n_out) - @test std(v) > 0.9*sqrt(2/(n_in + n_out)) - @test std(v) < 1.1*sqrt(2/(n_in + n_out)) + @testset "glorot" begin + # glorot_uniform and glorot_normal should both yield a kernel with + # variance ≈ 2/(fan_in + fan_out) + for dims ∈ [(100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] + for init ∈ [glorot_uniform, glorot_normal] + v = init(dims...) + fan_in, fan_out = nfan(dims...) + σ2 = 2 / (fan_in + fan_out) + @test 0.9σ2 < var(v) < 1.1σ2 + end + end + end + + @testset "he" begin + # he_uniform and he_normal should both yield a kernel with variance ≈ 2/fan_in + for dims ∈ [(100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] + for init ∈ [he_uniform, he_normal] + v = init(dims...) + fan_in, fan_out = nfan(dims...) + σ2 = 2 / fan_in + @test 0.9σ2 < var(v) < 1.1σ2 + end + end end end From df7ffb0ef852579a1348a4b66bf29e7181f2a5c9 Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Tue, 19 Nov 2019 16:27:44 +0800 Subject: [PATCH 081/139] Fix AMSGrad on GPU The previous initialization created a CPU array. Now, the same type of array as `x` is created. --- src/optimise/optimisers.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index ea2ef067..23adc6ec 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -349,10 +349,10 @@ AMSGrad(η = 0.001, β = (0.9, 0.999)) = AMSGrad(η, β, IdDict()) function apply!(o::AMSGrad, x, Δ) η, β = o.eta, o.beta - mt, vt, v̂t = get!(o.state, x, (fill(ϵ, size(x)), fill(ϵ, size(x)), fill(ϵ, size(x)))) + mt, vt, v̂t = get!(o.state, x, (fill!(zero(x), ϵ), fill!(zero(x), ϵ), fill!(zero(x), ϵ))) @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2 - @. v̂t = max.(v̂t, vt) + @. v̂t = max(v̂t, vt) @. 
Δ = η * mt / (√v̂t + ϵ) end From 2da22f31f076ff0a7a1b185a214509c58240ca6a Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Tue, 19 Nov 2019 16:31:04 +0800 Subject: [PATCH 082/139] Avoid unnecessary conversion This initialization works for both cpu and gpu --- src/optimise/optimisers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index ea2ef067..93237048 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -283,7 +283,7 @@ ADAGrad(η = 0.1) = ADAGrad(η, IdDict()) function apply!(o::ADAGrad, x, Δ) η = o.eta - acc = get!(o.acc, x, fill(ϵ, size(x)))::typeof(x) + acc = get!(o.acc, x, fill!(zero(x), ϵ))::typeof(x) @. acc += Δ^2 @. Δ *= η / (√acc + ϵ) end From 2b8057324858d10f96213c40cd596ae54fd0b54a Mon Sep 17 00:00:00 2001 From: Troels Arnfred Bojesen Date: Tue, 19 Nov 2019 18:16:29 +0900 Subject: [PATCH 083/139] Fix Glorot initialization, add He initialization Should fix #442 . Adds He weight initialization as a bonus :-) --- src/utils.jl | 8 +++++--- test/utils.jl | 10 ++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index d3d01a11..b2fe76bf 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,12 +1,14 @@ # Arrays -nfan(n_in, n_out) = n_in, n_out #fan-in, fan-out +nfan() = 1, 1 #fan_in, fan_out +nfan(n) = 1, n #A vector is treated as a n×1 matrix +nfan(n_out, n_in) = n_in, n_out #In case of Dense kernels: arranged as matrices nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of convolution kernels glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...))) glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...))) -he_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / first(nfan(dims...))) -he_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / first(nfan(dims...))) +he_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / last(nfan(dims...))) +he_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / last(nfan(dims...))) ones(T::Type, dims...) = Base.ones(T, dims...) zeros(T::Type, dims...) = Base.zeros(T, dims...) diff --git a/test/utils.jl b/test/utils.jl index 99492d4e..22b8f26a 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -58,7 +58,9 @@ end Random.seed!(0) @testset "Fan in/out" begin - @test nfan(100, 200) == (100, 200) #For Dense layer + @test nfan() == (1, 1) #For a constant + @test nfan(100) == (1, 100) #For vector + @test nfan(100, 200) == (200, 100) #For Dense layer @test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D Conv layer @@ -67,7 +69,7 @@ end @testset "glorot" begin # glorot_uniform and glorot_normal should both yield a kernel with # variance ≈ 2/(fan_in + fan_out) - for dims ∈ [(100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] + for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] for init ∈ [glorot_uniform, glorot_normal] v = init(dims...) fan_in, fan_out = nfan(dims...) @@ -79,11 +81,11 @@ end @testset "he" begin # he_uniform and he_normal should both yield a kernel with variance ≈ 2/fan_in - for dims ∈ [(100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] + for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] for init ∈ [he_uniform, he_normal] v = init(dims...) 
fan_in, fan_out = nfan(dims...) - σ2 = 2 / fan_in + σ2 = 2 / fan_out @test 0.9σ2 < var(v) < 1.1σ2 end end From 69bf84278f348d804d096d1d4c33c49e514780e2 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 7 Nov 2019 13:07:12 +0100 Subject: [PATCH 084/139] Remove wrong warning. --- src/Flux.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Flux.jl b/src/Flux.jl index 694bd10f..a6132a0b 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -41,7 +41,7 @@ include("deprecations.jl") function __init__() if !CUDAdrv.functional() - @warn "CUDA available, but CUDAdrv.jl failed to load" + # nothing to do here, the user doesn't have CUDA elseif length(devices()) == 0 @warn "CUDA available, but no GPU detected" elseif !CuArrays.functional() From bd734ed9571bbbb2afa8205eaafcac91e055419e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Nov 2019 15:55:25 +0100 Subject: [PATCH 085/139] Bump CUDA dependencies. --- Manifest.toml | 12 ++++++------ Project.toml | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 653be3dc..bb488879 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -94,9 +94,9 @@ version = "1.3.0" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "6a05c9e40b99a6e9a7973ca93397a38d3e8a7b4b" +git-tree-sha1 = "4757376a85ffb27d4c4f6cdf9635261e6c3a5fec" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.4.6" +version = "1.4.7" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -141,9 +141,9 @@ version = "1.0.1" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "6827a8f73ff12707f209c920d204238a16892b55" +git-tree-sha1 = "b2cf74f09216cfe3c241e8484178ec0ea941870f" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.0" +version = "0.8.1" [[FixedPointNumbers]] git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" @@ -335,9 +335,9 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[TimerOutputs]] deps = ["Printf"] -git-tree-sha1 = "8f22dc0c23e1cd4ab8070a01ba32285926f104f1" +git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.2" +version = "0.5.3" [[TranscodingStreams]] deps = ["Random", "Test"] diff --git a/Project.toml b/Project.toml index 587a459b..eae220d8 100644 --- a/Project.toml +++ b/Project.toml @@ -25,8 +25,8 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] -CUDAdrv = "4.0.1" -CuArrays = "1.4.2" +CUDAdrv = "4.0.3" +CuArrays = "1.4.3" NNlib = "0.6" Zygote = "0.4" julia = "1" From c45cec4cba587da9461bfb55ffe276758f442031 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Nov 2019 16:05:41 +0100 Subject: [PATCH 086/139] Simplify warning. 
--- Project.toml | 2 -- src/Flux.jl | 13 ++++--------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/Project.toml b/Project.toml index eae220d8..7f4ab464 100644 --- a/Project.toml +++ b/Project.toml @@ -5,7 +5,6 @@ version = "0.9.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" @@ -25,7 +24,6 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] -CUDAdrv = "4.0.3" CuArrays = "1.4.3" NNlib = "0.6" Zygote = "0.4" diff --git a/src/Flux.jl b/src/Flux.jl index a6132a0b..d0e0d5bf 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -21,8 +21,7 @@ export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay -ENV["CUDA_INIT_SILENT"] = true -using CUDAdrv, CuArrays +using CuArrays const use_cuda = Ref(false) include("utils.jl") @@ -40,12 +39,8 @@ include("data/Data.jl") include("deprecations.jl") function __init__() - if !CUDAdrv.functional() - # nothing to do here, the user doesn't have CUDA - elseif length(devices()) == 0 - @warn "CUDA available, but no GPU detected" - elseif !CuArrays.functional() - @warn "CUDA GPU available, but CuArrays.jl failed to load" + if !CuArrays.functional() + # nothing to do here, and either CuArrays or one of its dependencies will have warned else use_cuda[] = true @@ -54,7 +49,7 @@ function __init__() if CuArrays.has_cudnn() include(joinpath(@__DIR__, "cuda/cuda.jl")) else - @warn "CUDA GPU available, but CuArrays.jl did not find libcudnn. Some functionality will not be available." + @warn "CuArrays.jl did not find libcudnn. Some functionality will not be available." end end end From af96a197c1d019ac0ac6cbc2c97c64d688f8aa80 Mon Sep 17 00:00:00 2001 From: Troels Arnfred Bojesen Date: Wed, 20 Nov 2019 13:20:42 +0900 Subject: [PATCH 087/139] Fix Glorot initialization Should fix #442 --- src/utils.jl | 3 --- test/utils.jl | 15 +-------------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index b2fe76bf..324d87c8 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -7,9 +7,6 @@ nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of conv glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...))) glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...))) -he_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / last(nfan(dims...))) -he_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / last(nfan(dims...))) - ones(T::Type, dims...) = Base.ones(T, dims...) zeros(T::Type, dims...) = Base.zeros(T, dims...) diff --git a/test/utils.jl b/test/utils.jl index 22b8f26a..1c275e85 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,6 +1,5 @@ using Flux -using Flux: throttle, nfan, glorot_uniform, glorot_normal, he_uniform, he_normal, - stack, unstack +using Flux: throttle, nfan, glorot_uniform, glorot_normal, stack, unstack using StatsBase: var using Random using Test @@ -78,18 +77,6 @@ end end end end - - @testset "he" begin - # he_uniform and he_normal should both yield a kernel with variance ≈ 2/fan_in - for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] - for init ∈ [he_uniform, he_normal] - v = init(dims...) - fan_in, fan_out = nfan(dims...) 
- σ2 = 2 / fan_out - @test 0.9σ2 < var(v) < 1.1σ2 - end - end - end end @testset "Params" begin From a0314ce682945fe0e582be7cd0d92a07b305407a Mon Sep 17 00:00:00 2001 From: matsueushi Date: Fri, 22 Nov 2019 05:23:24 +0000 Subject: [PATCH 088/139] Fix logitbinarycrossentropy on CuArrays --- src/layers/stateless.jl | 3 +++ test/cuda/cuda.jl | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 5f9c1090..870a6cdf 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -53,6 +53,9 @@ but it is more numerically stable. """ logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) +# Re-definition to fix interaction with CuArrays. +CuArrays.@cufunc logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) + """ normalise(x::AbstractArray; dims=1) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index ddd92e1e..1576d88f 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -31,9 +31,10 @@ cx = gpu(x) @test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0) @test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0])) -x = σ.([-1.1491, 0.8619, 0.3127]) +x = [-1.1491, 0.8619, 0.3127] y = [1, 1, 0.] -@test Flux.binarycrossentropy.(x,y) ≈ Flux.binarycrossentropy.(cu(x),cu(y)) +@test Flux.binarycrossentropy.(σ.(x),y) ≈ Flux.binarycrossentropy.(cu(σ.(x)),cu(y)) +@test Flux.logitbinarycrossentropy.(x,y) ≈ Flux.logitbinarycrossentropy.(cu(x),cu(y)) xs = rand(5, 5) ys = Flux.onehotbatch(1:5,1:5) From 4ece13c6491059eee466e32d8506193c69184880 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Nov 2019 18:03:51 +0100 Subject: [PATCH 089/139] Don't include the CUDA module during precompilation. If we do, we could end up replacing it at runtime. 
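A minimal sketch of the guard this adds to `__init__` (it mirrors the hunk below; `jl_generating_output` returns nonzero while Julia is writing out a precompile image):

```julia
function __init__()
    # Skip run-time-only setup while a precompile image is being generated,
    # so the CUDA module is only ever included at run time.
    precompiling = ccall(:jl_generating_output, Cint, ()) != 0
    precompiling && return
    # ... the usual CuArrays checks and CUDA setup continue here ...
end
```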
--- src/Flux.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Flux.jl b/src/Flux.jl index d0e0d5bf..905cb638 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -39,6 +39,12 @@ include("data/Data.jl") include("deprecations.jl") function __init__() + precompiling = ccall(:jl_generating_output, Cint, ()) != 0 + + # we don't want to include the CUDA module when precompiling, + # or we could end up replacing it at run time (triggering a warning) + precompiling && return + if !CuArrays.functional() # nothing to do here, and either CuArrays or one of its dependencies will have warned else From 5f21238d1a6235940127b30763d05d9998a14cdb Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sun, 24 Nov 2019 13:25:02 +0530 Subject: [PATCH 090/139] no grad dims helper --- src/Flux.jl | 2 +- src/layers/conv.jl | 2 ++ test/layers/conv.jl | 6 ++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Flux.jl b/src/Flux.jl index d0e0d5bf..4c5aa2ab 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -6,7 +6,7 @@ using Base: tail using Zygote, MacroTools, Juno, Reexport, Statistics, Random using MacroTools: @forward @reexport using NNlib -using Zygote: Params, @adjoint, gradient, pullback +using Zygote: Params, @adjoint, gradient, pullback, @nograd export gradient export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 519f129f..d33c8da5 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -118,6 +118,8 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray) ) end +@nograd conv_transpose_dims + function (c::ConvTranspose)(x::AbstractArray) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) diff --git a/test/layers/conv.jl b/test/layers/conv.jl index aa3925f1..4bf80234 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -1,5 +1,6 @@ using Flux, Test using Flux: maxpool, meanpool +using Flux: gradient @testset "Pooling" begin x = randn(Float32, 10, 10, 3, 2) @@ -54,6 +55,11 @@ end y = Conv((3,3), 1 => 1)(x) x_hat = ConvTranspose((3, 3), 1 => 1)(y) @test size(x_hat) == size(x) + m = ConvTranspose((3,3), 2=>1) + x = rand(10,10,2,1) + + # Test that the gradient call does not throw: #900 + @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads end @testset "CrossCor" begin From c031ae1a949fe77b328edc272826650aa7fcce50 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sun, 24 Nov 2019 13:31:31 +0530 Subject: [PATCH 091/139] correct channel value --- test/layers/conv.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 4bf80234..b4136062 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -55,9 +55,8 @@ end y = Conv((3,3), 1 => 1)(x) x_hat = ConvTranspose((3, 3), 1 => 1)(y) @test size(x_hat) == size(x) - m = ConvTranspose((3,3), 2=>1) - x = rand(10,10,2,1) + m = ConvTranspose((3,3), 1=>1) # Test that the gradient call does not throw: #900 @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads end From 59bb0d81b020a33155e56add14f50ef20397ceaa Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 26 Nov 2019 16:23:09 +0530 Subject: [PATCH 092/139] add TODO --- src/layers/conv.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index d33c8da5..f4de3ffc 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -118,6 +118,7 @@ function 
conv_transpose_dims(c::ConvTranspose, x::AbstractArray) ) end +# TODO: Find proper fix for https://github.com/FluxML/Flux.jl/issues/900 @nograd conv_transpose_dims function (c::ConvTranspose)(x::AbstractArray) From 1c0e9acc45bd85c56d95f476ab203e7f72481728 Mon Sep 17 00:00:00 2001 From: Mike Innes Date: Tue, 26 Nov 2019 15:33:41 +0000 Subject: [PATCH 093/139] Update CuArrays to include the workspace fix. --- Manifest.toml | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index bb488879..c0618c8e 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -94,7 +94,9 @@ version = "1.3.0" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "4757376a85ffb27d4c4f6cdf9635261e6c3a5fec" +git-tree-sha1 = "7e00178b18672ee2cf37244ac2a273b6b0701b04" +repo-rev = "master" +repo-url = "https://github.com/JuliaGPU/CuArrays.jl.git" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" version = "1.4.7" @@ -105,9 +107,9 @@ version = "1.1.0" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "1fe8fad5fc84686dcbc674aa255bc867a64f8132" +git-tree-sha1 = "a1b652fb77ae8ca7ea328fa7ba5aa151036e5c10" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.5" +version = "0.17.6" [[Dates]] deps = ["Printf"] @@ -124,10 +126,10 @@ uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" version = "0.0.4" [[DiffRules]] -deps = ["Random", "Test"] -git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7" +deps = ["NaNMath", "Random", "SpecialFunctions"] +git-tree-sha1 = "f734b5f6bc9c909027ef99f6d91d5d9e4b111eed" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "0.0.10" +version = "0.1.0" [[Distributed]] deps = ["Random", "Serialization", "Sockets"] @@ -141,9 +143,9 @@ version = "1.0.1" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "b2cf74f09216cfe3c241e8484178ec0ea941870f" +git-tree-sha1 = "1a9fe4e1323f38de0ba4da49eafd15b25ec62298" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.1" +version = "0.8.2" [[FixedPointNumbers]] git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" @@ -235,10 +237,9 @@ uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" version = "0.6.0" [[NaNMath]] -deps = ["Compat"] -git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2" +git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f" uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" -version = "0.3.2" +version = "0.3.3" [[OrderedCollections]] deps = ["Random", "Serialization", "Test"] @@ -248,9 +249,9 @@ version = "1.1.0" [[Parsers]] deps = ["Dates", "Test"] -git-tree-sha1 = "c56ecb484f286639f161e712b8311f5ab77e8d32" +git-tree-sha1 = "0139ba59ce9bc680e2925aec5b7db79065d60556" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "0.3.8" +version = "0.3.10" [[Pkg]] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] From 99f98ca800ff959ab8a5e9c34758eb2a6f3ad00d Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 28 Nov 2019 16:00:21 +0000 Subject: [PATCH 094/139] Update README.md --- README.md | 88 ++----------------------------------------------------- 1 file changed, 2 insertions(+), 86 deletions(-) diff --git a/README.md b/README.md index d8af28ae..4196b926 100644 --- a/README.md +++ b/README.md @@ -7,93 +7,9 @@ Flux is an elegant approach to 
machine learning. It's a 100% pure-Julia stack, and provides lightweight abstractions on top of Julia's native GPU and AD support. Flux makes the easy things easy while remaining fully hackable. ```julia -julia> Pkg.add("Flux") +] add Flux ``` See the [documentation](https://fluxml.github.io/Flux.jl/) or the [model zoo](https://github.com/FluxML/model-zoo/) for examples. -If you use Flux in research, please cite the following paper: - -``` -@article{innes:2018, - author = {Mike Innes}, - title = {Flux: Elegant Machine Learning with Julia}, - journal = {Journal of Open Source Software}, - year = {2018}, - doi = {10.21105/joss.00602}, -} -``` - -## Features - -Flux has powerful high-level features, and common architectures can be defined in a few lines. - -```julia -model = Chain( - Dense(768, 128, σ), - LSTM(128, 256), - LSTM(256, 128), - Dense(128, 10), - softmax) - -loss(x, y) = crossentropy(model(x), y) - -Flux.train!(loss, params(model), data, ADAM(...)) -``` - -Yet you can easily strip away the layers, and directly write the mathematics for your problem. Flux will seamlessly take gradients of any Julia code, so your model looks just like the paper. - -```julia -W = param(randn(2, 10)) -b = param(randn(2)) - -y(x) = σ.(W * x .+ b) -``` - -If that's *still* not enough, you can go as deep as you want, even writing your own CUDA kernels with [CUDAnative](https://github.com/JuliaGPU/CUDAnative.jl)! All this can be freely mixed-and-matched in a single model or script, and it all runs interactively via Jupyter or Juno. - -```julia -function gpu_add(a, b, c) - i = (blockIdx().x-1) * blockDim().x + threadIdx().x - c[i] = a[i] + b[i] - return nothing -end -``` - -Unusual architectures are no problem in Flux, as you can use all the loops, control flow and even macros that you're used to. Here's a Tree RNN in 4 lines. - -```julia -tree() = rand() < 0.5 ? rand(10) : (tree(), tree()) # dummy data - -shrink = Dense(20, 10) -combine(a, b) = shrink([a; b]) - -model(x) = x -model(x::Tuple) = combine(model(x[1]), model(x[2])) - -model(tree()) # Sample output -``` - -Despite this flexibility, Julia's advanced compiler lets us do some powerful optimisations. For example, this definition of `sigmoid` automatically gets fused into a *single* GPU kernel – so it's really fast. - -```julia -sigmoid(xs) = 1 ./ (1 .+ exp.(.-xs)) -``` - -Similarly, Flux is the first dynamic framework to support [compiling to the browser](https://fluxml.github.io/experiments/) and model import via [formats like ONNX](https://github.com/FluxML/ONNX.jl/), both of which are thinly-veiled compiler problems. - -For more on our philosophy on machine learning, check out our article [On Machine Learning & Programming Languages](https://julialang.org/blog/2017/12/ml&pl). - -## Contributing & Help - -For general questions and help, check out Julia's [community forum](https://discourse.julialang.org/c/domain/ML). - -Flux development is carried out via our [GitHub issues](https://github.com/FluxML/Flux.jl/issues), so feel free to open feature requests or PRs here. - -For more informal discussions we'd love to have you on the [Julia slack](https://slackinvite.julialang.org/), where we hang out on the #machine-learning channel. - -## Related Packages - -Check out [Metalhead.jl](https://github.com/FluxML/Metalhead.jl) for common computer vision datasets and trained models. - -[MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) provides further common datasets. +If you use Flux in research, please see [CITATION.bib] for papers to cite. 
From 75d609ecc87875ebb885f20a2e54d22f6b18cc8b Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 28 Nov 2019 16:00:55 +0000 Subject: [PATCH 095/139] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4196b926..ef090f5b 100644 --- a/README.md +++ b/README.md @@ -12,4 +12,4 @@ Flux is an elegant approach to machine learning. It's a 100% pure-Julia stack, a See the [documentation](https://fluxml.github.io/Flux.jl/) or the [model zoo](https://github.com/FluxML/model-zoo/) for examples. -If you use Flux in research, please see [CITATION.bib] for papers to cite. +If you use Flux in research, please see [our papers](CITATION.bib) for appropriate citations. From 4481c74f50e9b9ce03bd1d21027d0cf99e44b7b7 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 28 Nov 2019 21:45:06 +0530 Subject: [PATCH 096/139] v0.10 changes --- NEWS.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/NEWS.md b/NEWS.md index 26853df3..80239760 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,18 @@ +# v0.10.0 +* The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669) + - The dependency on Tracker.jl has been removed. + - This means Flux now does not depend on using a specialised `TrackedArray` type, and can be used with normal Array implementations directly. + - Tracker compatibility is maintained in most common cases, but Zygote will be the preferred AD backend for Flux from now on. +* The CUDNN wrappers have been [moved from Flux into CuArrays](https://github.com/FluxML/Flux.jl/pull/874), to allow for better supporting the CUDA backend, and improve user experience, not to mention making Flux lean. +* `*crossentropy` functions now [work as expected with CuArrays](https://github.com/FluxML/Flux.jl/pull/926). [PR for binarycrossentropy](https://github.com/FluxML/Flux.jl/pull/940). +* Added a new [RADAM optimiser](https://github.com/FluxML/Flux.jl/pull/842) +* Added [clearer docs](https://github.com/FluxML/Flux.jl/pull/904) around training and the Optimiser interface. +* [Layer initialisations](https://github.com/FluxML/Flux.jl/pull/937) have been improved with a clearer API on how to extend it for other purposes. +* [Better messaging around CUDA availability](https://github.com/FluxML/Flux.jl/pull/924), with hooks to initialize the GPU as default where possible. +* @treelike has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation. +* `testmode!` is deprecated in favour of [istraining](https://github.com/FluxML/Flux.jl/pull/669) + + # v0.9.0 * [Depthwise convolutional layer API changes](https://github.com/FluxML/Flux.jl/pull/756) from `in => mult` channel specification to `in => out` channel specification, and deprecates implicit `out` constructor. * New [SkipConnection](https://github.com/FluxML/Flux.jl/pull/446), which can be used to train residual neural network architectures. From 1ae554d82c8572bafa2287dee249581aad14596e Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 28 Nov 2019 21:47:37 +0530 Subject: [PATCH 097/139] rm new line --- NEWS.md | 1 - 1 file changed, 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 80239760..d4375458 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,7 +12,6 @@ * @treelike has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation. 
* `testmode!` is deprecated in favour of [istraining](https://github.com/FluxML/Flux.jl/pull/669) - # v0.9.0 * [Depthwise convolutional layer API changes](https://github.com/FluxML/Flux.jl/pull/756) from `in => mult` channel specification to `in => out` channel specification, and deprecates implicit `out` constructor. * New [SkipConnection](https://github.com/FluxML/Flux.jl/pull/446), which can be used to train residual neural network architectures. From c17dc34e383c27f4edbe93c30bc6aa092eeba3a0 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 28 Nov 2019 21:49:34 +0530 Subject: [PATCH 098/139] phew Co-Authored-By: Mike J Innes --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index d4375458..7c964956 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,7 +9,7 @@ * Added [clearer docs](https://github.com/FluxML/Flux.jl/pull/904) around training and the Optimiser interface. * [Layer initialisations](https://github.com/FluxML/Flux.jl/pull/937) have been improved with a clearer API on how to extend it for other purposes. * [Better messaging around CUDA availability](https://github.com/FluxML/Flux.jl/pull/924), with hooks to initialize the GPU as default where possible. -* @treelike has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation. +* `@treelike` has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation. * `testmode!` is deprecated in favour of [istraining](https://github.com/FluxML/Flux.jl/pull/669) # v0.9.0 From b65b491e516cb3ff209a4d2c93b551116a6ee2ac Mon Sep 17 00:00:00 2001 From: Mike Innes Date: Thu, 28 Nov 2019 16:23:22 +0000 Subject: [PATCH 099/139] compat, pkg up --- Manifest.toml | 14 ++++++-------- Project.toml | 9 +++++++++ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index c0618c8e..be9bf768 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,5 +1,3 @@ -# This file is machine-generated - editing it directly is not advised - [[AbstractFFTs]] deps = ["LinearAlgebra"] git-tree-sha1 = "380e36c66edfa099cd90116b24c1ce8cafccac40" @@ -132,7 +130,7 @@ uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" version = "0.1.0" [[Distributed]] -deps = ["Random", "Serialization", "Sockets"] +deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" [[FFTW]] @@ -154,9 +152,9 @@ version = "0.6.1" [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "4407e7b76999eca2646abdb68203bd4302476168" +git-tree-sha1 = "da46ac97b17793eba44ff366dc6cb70f1238a738" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.6" +version = "0.10.7" [[GPUArrays]] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] @@ -171,7 +169,7 @@ uuid = "7869d1d1-7146-5819-86e3-90919afe41df" version = "0.3.0" [[InteractiveUtils]] -deps = ["Markdown"] +deps = ["LinearAlgebra", "Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[JSON]] @@ -254,7 +252,7 @@ uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" version = "0.3.10" [[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] @@ -353,7 +351,7 @@ uuid = "30578b45-9adc-5946-b283-645ec420af67" version = 
"0.4.0" [[UUIDs]] -deps = ["Random", "SHA"] +deps = ["Random"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] diff --git a/Project.toml b/Project.toml index 7f4ab464..bc5e9de8 100644 --- a/Project.toml +++ b/Project.toml @@ -24,8 +24,17 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] +AbstractTrees = "0.2" +Adapt = "1" +CodecZlib = "0.5, 0.6" +Colors = "0.8, 0.9" CuArrays = "1.4.3" +Juno = "0.5, 0.6, 0.7" +MacroTools = "0.3, 0.4, 0.5" NNlib = "0.6" +Reexport = "0.2" +StatsBase = "0" +ZipFile = "0.7, 0.8" Zygote = "0.4" julia = "1" From 73d572b1a9e60f61b46390d0050ccb5a347dd7be Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 28 Nov 2019 23:57:01 +0530 Subject: [PATCH 100/139] rm RADAM --- NEWS.md | 1 - 1 file changed, 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index d4375458..faf3fe49 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,6 @@ - Tracker compatibility is maintained in most common cases, but Zygote will be the preferred AD backend for Flux from now on. * The CUDNN wrappers have been [moved from Flux into CuArrays](https://github.com/FluxML/Flux.jl/pull/874), to allow for better supporting the CUDA backend, and improve user experience, not to mention making Flux lean. * `*crossentropy` functions now [work as expected with CuArrays](https://github.com/FluxML/Flux.jl/pull/926). [PR for binarycrossentropy](https://github.com/FluxML/Flux.jl/pull/940). -* Added a new [RADAM optimiser](https://github.com/FluxML/Flux.jl/pull/842) * Added [clearer docs](https://github.com/FluxML/Flux.jl/pull/904) around training and the Optimiser interface. * [Layer initialisations](https://github.com/FluxML/Flux.jl/pull/937) have been improved with a clearer API on how to extend it for other purposes. * [Better messaging around CUDA availability](https://github.com/FluxML/Flux.jl/pull/924), with hooks to initialize the GPU as default where possible. From 4b63e69b656e7f41cd37dec4378e703a7f81ff07 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Fri, 29 Nov 2019 00:02:59 +0530 Subject: [PATCH 101/139] bump version to v0.10 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 7f4ab464..a64f272b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.9.0" +version = "0.10.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" From 6e94e59afd8ff518c4f793b985d66193e0ffdc06 Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Tue, 3 Dec 2019 15:27:44 +0800 Subject: [PATCH 102/139] Improve docs for decay optimisers --- src/optimise/optimisers.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index c9c40764..888d3087 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -444,7 +444,8 @@ end """ InvDecay(γ) -Applies inverse time decay to an optimiser +Applies inverse time decay to an optimiser, i.e., the step effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimisers step size is not modified. +``` ## Parameters - gamma (γ): Defaults to `0.001` @@ -472,7 +473,7 @@ end """ ExpDecay(eta, decay, decay_step, clip) -Discount the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`. +Discount the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`. 
The wrapped optimisers step size is being modified by the outer optimiser. ## Parameters - Learning Rate (eta): Defaults to `0.001`. From e67f09c06d73bc8e0b0702732f63a77eee26e151 Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Tue, 3 Dec 2019 15:32:23 +0800 Subject: [PATCH 103/139] Correct some comments in decay docs --- src/optimise/optimisers.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 888d3087..fb3b9fc5 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -444,7 +444,7 @@ end """ InvDecay(γ) -Applies inverse time decay to an optimiser, i.e., the step effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimisers step size is not modified. +Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified. ``` ## Parameters @@ -473,7 +473,7 @@ end """ ExpDecay(eta, decay, decay_step, clip) -Discount the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`. The wrapped optimisers step size is being modified by the outer optimiser. +Discount the learning rate `eta` by a multiplicative factor `decay` every `decay_step` till a minimum of `clip`. ## Parameters - Learning Rate (eta): Defaults to `0.001`. From b4ed16ad9cd52905a94ea18b70148724998742ab Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Tue, 3 Dec 2019 22:48:48 -0600 Subject: [PATCH 104/139] Added outdims for some basic layers --- src/layers/basic.jl | 35 +++++++++++++++++++++++++++++++ src/layers/conv.jl | 51 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 2a465208..f2e7645d 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -39,6 +39,17 @@ function Base.show(io::IO, c::Chain) print(io, ")") end +""" + outdims(c::Chain, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) +outdims(m, (10, 10)) == (6, 6) +``` +""" +outdims(c::Chain, isize::Tuple) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers)) # This is a temporary and naive implementation # it might be replaced in the future for better performance @@ -116,6 +127,19 @@ end (a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::Dense, isize) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Dense(10, 5) +outdims(m, (5, 2)) == (5,) +outdims(m, (10,)) == (5,) +``` +""" +outdims(l::Dense, isize) = (size(l.W)[2],) + """ Diagonal(in::Integer) @@ -145,6 +169,17 @@ function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", length(l.α), ")") end +""" + outdims(l::Diagonal, isize) + +Calculate the output dimensions given the input dimensions, `isize`. 
+ +```julia +m = Diagonal(10) +outdims(m, (10,)) == (10,) +``` +""" +outdims(l::Diagonal, isize) = (length(l.α),) """ Maxout(over) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index f4de3ffc..eeeea82b 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,5 +1,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv +_convoutdims(isize, ksize, ssize, pad) = Int.(floor.((isize .- ksize .+ 2 .* pad) ./ ssize .+ 1)) + expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) """ @@ -68,6 +70,18 @@ end (a::Conv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::Conv, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Conv((3, 3), 3 => 16) +outdims(m, (10, 10)) == (8, 8) +``` +""" +outdims(l::Conv{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) + """ ConvTranspose(size, in=>out) ConvTranspose(size, in=>out, relu) @@ -140,6 +154,7 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) + """ DepthwiseConv(size, in=>out) DepthwiseConv(size, in=>out, relu) @@ -204,6 +219,18 @@ end (a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::DepthwiseConv, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = DepthwiseConv((3, 3), 3 => 16) +outdims(m, (10, 10)) == (8, 8) +``` +""" +outdims(l::DepthwiseConv{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) + """ CrossCor(size, in=>out) CrossCor(size, in=>out, relu) @@ -304,6 +331,18 @@ function Base.show(io::IO, m::MaxPool) print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end +""" + outdims(l::MaxPool, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = MaxPool((2, 2)) +outdims(m, (10, 10)) == (5, 5) +``` +""" +outdims(l::MaxPool{N}, isize) where N = _convoutdims(isize, l.weight, l.stride, l.pad[1:N]) + """ MeanPool(k) @@ -331,3 +370,15 @@ end function Base.show(io::IO, m::MeanPool) print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end + +""" + outdims(l::MeanPool, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = MeanPool((2, 2)) +outdims(m, (10, 10)) == (5, 5) +``` +""" +outdims(l::MeanPool{N}, isize) where N = _convoutdims(isize, l.weight, l.stride, l.pad[1:N]) \ No newline at end of file From 31dda0ce6cd8c264d083d453823f4f13fa755da5 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Thu, 5 Dec 2019 21:57:10 -0600 Subject: [PATCH 105/139] Updated with all basic and conv layers outdims --- src/layers/basic.jl | 16 ++++++++++++++-- src/layers/conv.jl | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index f2e7645d..8794b58c 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -40,7 +40,7 @@ function Base.show(io::IO, c::Chain) end """ - outdims(c::Chain, isize::Tuple) + outdims(c::Chain, isize) Calculate the output dimensions given the input dimensions, `isize`. 
@@ -49,7 +49,7 @@ m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) outdims(m, (10, 10)) == (6, 6) ``` """ -outdims(c::Chain, isize::Tuple) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers)) +outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers)) # This is a temporary and naive implementation # it might be replaced in the future for better performance @@ -228,6 +228,18 @@ function (mo::Maxout)(input::AbstractArray) mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over) end +""" + outdims(c::Maxout, isize) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Maxout(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) +outdims(m, (10, 10)) == (8, 8) +``` +""" +outdims(l::Maxout, isize) = outdims(first(l.over)) + """ SkipConnection(layers, connection) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index eeeea82b..2e3e87d7 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,6 +1,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv _convoutdims(isize, ksize, ssize, pad) = Int.(floor.((isize .- ksize .+ 2 .* pad) ./ ssize .+ 1)) +_convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad)) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -155,6 +156,18 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::ConvTranspose, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = ConvTranspose((3, 3), 3 => 16) +outdims(m, (8, 8)) == (10, 10) +``` +""" +outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) + """ DepthwiseConv(size, in=>out) DepthwiseConv(size, in=>out, relu) @@ -302,6 +315,18 @@ end (a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::CrossCor, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = CrossCor((3, 3), 3 => 16) +outdims(m, (10, 10)) == (8, 8) +``` +""" +outdims(l::CrossCor{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) + """ MaxPool(k) From 6265b1fa39c5d7d289ccd5a00c94ae9f448377fc Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Thu, 5 Dec 2019 22:54:25 -0600 Subject: [PATCH 106/139] Added tests for outdims --- src/layers/basic.jl | 8 ++++---- src/layers/conv.jl | 8 ++++---- test/layers/basic.jl | 15 +++++++++++++++ test/layers/conv.jl | 20 ++++++++++++++++++++ 4 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 8794b58c..b62d8bb9 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -49,7 +49,7 @@ m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) outdims(m, (10, 10)) == (6, 6) ``` """ -outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers)) +outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers))(isize) # This is a temporary and naive implementation # it might be replaced in the future for better performance @@ -138,7 +138,7 @@ outdims(m, (5, 2)) == (5,) outdims(m, (10,)) == (5,) ``` """ -outdims(l::Dense, isize) = (size(l.W)[2],) +outdims(l::Dense, isize) = (size(l.W)[1],) """ Diagonal(in::Integer) @@ -234,11 +234,11 @@ end Calculate the output dimensions given the input dimensions, `isize`. 
```julia -m = Maxout(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) +m = Maxout(() -> Conv((3, 3), 3 => 16), 2) outdims(m, (10, 10)) == (8, 8) ``` """ -outdims(l::Maxout, isize) = outdims(first(l.over)) +outdims(l::Maxout, isize) = outdims(first(l.over), isize) """ SkipConnection(layers, connection) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 2e3e87d7..6ce9bcbf 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,7 +1,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv _convoutdims(isize, ksize, ssize, pad) = Int.(floor.((isize .- ksize .+ 2 .* pad) ./ ssize .+ 1)) -_convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad)) +_convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -238,7 +238,7 @@ end Calculate the output dimensions given the input dimensions, `isize`. ```julia -m = DepthwiseConv((3, 3), 3 => 16) +m = DepthwiseConv((3, 3), 3 => 6) outdims(m, (10, 10)) == (8, 8) ``` """ @@ -366,7 +366,7 @@ m = MaxPool((2, 2)) outdims(m, (10, 10)) == (5, 5) ``` """ -outdims(l::MaxPool{N}, isize) where N = _convoutdims(isize, l.weight, l.stride, l.pad[1:N]) +outdims(l::MaxPool{N}, isize) where N = _convoutdims(isize, l.k, l.stride, l.pad[1:N]) """ MeanPool(k) @@ -406,4 +406,4 @@ m = MeanPool((2, 2)) outdims(m, (10, 10)) == (5, 5) ``` """ -outdims(l::MeanPool{N}, isize) where N = _convoutdims(isize, l.weight, l.stride, l.pad[1:N]) \ No newline at end of file +outdims(l::MeanPool{N}, isize) where N = _convoutdims(isize, l.k, l.stride, l.pad[1:N]) \ No newline at end of file diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 0ff1776d..421c7721 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -92,4 +92,19 @@ import Flux: activations @test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4) end end + + @testset "output dimensions" begin + m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) + @test Flux.outdims(m, (10, 10)) == (6, 6) + + m = Dense(10, 5) + @test Flux.outdims(m, (5, 2)) == (5,) + @test Flux.outdims(m, (10,)) == (5,) + + m = Flux.Diagonal(10) + @test Flux.outdims(m, (10,)) == (10,) + + m = Maxout(() -> Conv((3, 3), 3 => 16), 2) + @test Flux.outdims(m, (10, 10)) == (8, 8) + end end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index b4136062..5701df80 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -107,3 +107,23 @@ end true end end + +@testset "conv output dimensions" begin + m = Conv((3, 3), 3 => 16) + @test Flux.outdims(m, (10, 10)) == (8, 8) + + m = ConvTranspose((3, 3), 3 => 16) + @test Flux.outdims(m, (8, 8)) == (10, 10) + + m = DepthwiseConv((3, 3), 3 => 6) + @test Flux.outdims(m, (10, 10)) == (8, 8) + + m = CrossCor((3, 3), 3 => 16) + @test Flux.outdims(m, (10, 10)) == (8, 8) + + m = MaxPool((2, 2)) + @test Flux.outdims(m, (10, 10)) == (5, 5) + + m = MeanPool((2, 2)) + @test Flux.outdims(m, (10, 10)) == (5, 5) +end \ No newline at end of file From a64378b11272444f8803ec0155262d47ab0cef71 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 7 Dec 2019 13:21:26 -0600 Subject: [PATCH 107/139] Switched to using NNlib for conv.jl outdims. 
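In rough terms, the convolutional `outdims` methods now pad the given spatial size out to a full `(w, h, c, n)` tuple and delegate to NNlib. A small sketch of that delegation, with illustrative sizes (the constructors are the same ones used in the diff below):

```julia
using NNlib: DenseConvDims, output_size

isize = (10, 10, 3, 1)  # input size padded to (width, height, channels, batch)
wsize = (3, 3, 3, 16)   # size of the weight array of Conv((3, 3), 3 => 16)

# Default stride/padding/dilation, matching Conv((3, 3), 3 => 16):
output_size(DenseConvDims(isize, wsize))  # == (8, 8)
```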
--- src/layers/basic.jl | 20 ------------- src/layers/conv.jl | 73 ++++++++++----------------------------------- 2 files changed, 15 insertions(+), 78 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index b62d8bb9..6f056429 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -169,16 +169,6 @@ function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", length(l.α), ")") end -""" - outdims(l::Diagonal, isize) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = Diagonal(10) -outdims(m, (10,)) == (10,) -``` -""" outdims(l::Diagonal, isize) = (length(l.α),) """ @@ -228,16 +218,6 @@ function (mo::Maxout)(input::AbstractArray) mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over) end -""" - outdims(c::Maxout, isize) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = Maxout(() -> Conv((3, 3), 3 => 16), 2) -outdims(m, (10, 10)) == (8, 8) -``` -""" outdims(l::Maxout, isize) = outdims(first(l.over), isize) """ diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 6ce9bcbf..7b32f999 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,6 +1,8 @@ -using NNlib: conv, ∇conv_data, depthwiseconv +using NNlib: conv, ∇conv_data, depthwiseconv, output_size + +# pad dims of x with dims of y until ndims(x) == ndims(y) +_paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...) -_convoutdims(isize, ksize, ssize, pad) = Int.(floor.((isize .- ksize .+ 2 .* pad) ./ ssize .+ 1)) _convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad) expand(N, i::Tuple) = i @@ -75,13 +77,16 @@ end outdims(l::Conv, isize::Tuple) Calculate the output dimensions given the input dimensions, `isize`. +Batch size and channel size are ignored as per `NNlib.jl`. ```julia m = Conv((3, 3), 3 => 16) outdims(m, (10, 10)) == (8, 8) +outdims(m, (10, 10, 1, 3)) == (8, 8) ``` """ -outdims(l::Conv{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::Conv, isize) = + output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ ConvTranspose(size, in=>out) @@ -156,17 +161,7 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -""" - outdims(l::ConvTranspose, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = ConvTranspose((3, 3), 3 => 16) -outdims(m, (8, 8)) == (10, 10) -``` -""" -outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.pad[1:N]) """ DepthwiseConv(size, in=>out) @@ -232,17 +227,8 @@ end (a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -""" - outdims(l::DepthwiseConv, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. 
- -```julia -m = DepthwiseConv((3, 3), 3 => 6) -outdims(m, (10, 10)) == (8, 8) -``` -""" -outdims(l::DepthwiseConv{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::DepthwiseConv, isize) = + output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ CrossCor(size, in=>out) @@ -315,17 +301,8 @@ end (a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -""" - outdims(l::CrossCor, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = CrossCor((3, 3), 3 => 16) -outdims(m, (10, 10)) == (8, 8) -``` -""" -outdims(l::CrossCor{N}, isize) where N = _convoutdims(isize, size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::CrossCor, isize) = + output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) """ MaxPool(k) @@ -356,17 +333,7 @@ function Base.show(io::IO, m::MaxPool) print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end -""" - outdims(l::MaxPool, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = MaxPool((2, 2)) -outdims(m, (10, 10)) == (5, 5) -``` -""" -outdims(l::MaxPool{N}, isize) where N = _convoutdims(isize, l.k, l.stride, l.pad[1:N]) +outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) """ MeanPool(k) @@ -396,14 +363,4 @@ function Base.show(io::IO, m::MeanPool) print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end -""" - outdims(l::MeanPool, isize::Tuple) - -Calculate the output dimensions given the input dimensions, `isize`. - -```julia -m = MeanPool((2, 2)) -outdims(m, (10, 10)) == (5, 5) -``` -""" -outdims(l::MeanPool{N}, isize) where N = _convoutdims(isize, l.k, l.stride, l.pad[1:N]) \ No newline at end of file +outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) \ No newline at end of file From 0cdd11c0dc8e8e82a90467cc66e3b8330ad57682 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 7 Dec 2019 14:05:50 -0600 Subject: [PATCH 108/139] Added tests for varying padding, stride, and dilation with outdims. --- src/layers/conv.jl | 4 ++-- test/layers/conv.jl | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 7b32f999..03de438a 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -3,7 +3,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv, output_size # pad dims of x with dims of y until ndims(x) == ndims(y) _paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...) 
-_convtransoutdims(isize, ksize, ssize, pad) = Int.(ssize .* (isize .- 1) .+ ksize .- 2 .* pad) +_convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+ (ksize .- 1).*dsize .- (pad[1:2:end] .+ pad[2:2:end]) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -161,7 +161,7 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.pad[1:N]) +outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad) """ DepthwiseConv(size, in=>out) diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 5701df80..1a22b385 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -111,19 +111,51 @@ end @testset "conv output dimensions" begin m = Conv((3, 3), 3 => 16) @test Flux.outdims(m, (10, 10)) == (8, 8) + m = Conv((3, 3), 3 => 16; stride = 2) + @test Flux.outdims(m, (5, 5)) == (2, 2) + m = Conv((3, 3), 3 => 16; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = Conv((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (5, 5)) == (4, 4) m = ConvTranspose((3, 3), 3 => 16) @test Flux.outdims(m, (8, 8)) == (10, 10) + m = ConvTranspose((3, 3), 3 => 16; stride = 2) + @test Flux.outdims(m, (2, 2)) == (5, 5) + m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (4, 4)) == (5, 5) m = DepthwiseConv((3, 3), 3 => 6) @test Flux.outdims(m, (10, 10)) == (8, 8) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2) + @test Flux.outdims(m, (5, 5)) == (2, 2) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (5, 5)) == (4, 4) m = CrossCor((3, 3), 3 => 16) @test Flux.outdims(m, (10, 10)) == (8, 8) + m = CrossCor((3, 3), 3 => 16; stride = 2) + @test Flux.outdims(m, (5, 5)) == (2, 2) + m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (5, 5)) == (4, 4) m = MaxPool((2, 2)) @test Flux.outdims(m, (10, 10)) == (5, 5) + m = MaxPool((2, 2); stride = 1) + @test Flux.outdims(m, (5, 5)) == (4, 4) + m = MaxPool((2, 2); stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) m = MeanPool((2, 2)) @test Flux.outdims(m, (10, 10)) == (5, 5) + m = MeanPool((2, 2); stride = 1) + @test Flux.outdims(m, (5, 5)) == (4, 4) + m = MeanPool((2, 2); stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) end \ No newline at end of file From 04991d3261f006f134beb6333f504ad27e11a706 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sat, 7 Dec 2019 14:06:11 -0600 Subject: [PATCH 109/139] Added entry to docs for outdims --- docs/src/models/basics.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index d83fc462..c6dc4e19 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -219,3 +219,13 @@ Flux.@functor Affine ``` This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md). 
+ +## Utility functions + +Flux provides some utility functions to help you generate models in an automated fashion. + +`outdims` enables you to calculate the spatial output dimensions of layers like `Conv` when applied to input images of a given size. + +```@docs +outdims +``` From 8a93be8c6c9d4686c63284153d9cf8cf07f376a1 Mon Sep 17 00:00:00 2001 From: Manjunath Bhat Date: Mon, 9 Dec 2019 20:39:46 +0530 Subject: [PATCH 110/139] Change loss to cost --- docs/src/models/layers.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 227abe31..5f2ab3ce 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -66,7 +66,7 @@ LayerNorm GroupNorm ``` -## Loss Functions +## Cost Functions ```@docs mse crossentropy @@ -76,4 +76,4 @@ logitbinarycrossentropy kldivergence poisson hinge -``` \ No newline at end of file +``` From 2f854bdfc0d7064f4e28988d6418d9b09324c11e Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Tue, 10 Dec 2019 09:57:08 -0600 Subject: [PATCH 111/139] Recommitting to trigger new build From 8a1e2f19d74ecad494ca4d9cb195cee8b85e26bd Mon Sep 17 00:00:00 2001 From: "Viral B. Shah" Date: Thu, 19 Dec 2019 09:44:17 -0500 Subject: [PATCH 112/139] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ef090f5b..730e2801 100644 --- a/README.md +++ b/README.md @@ -12,4 +12,4 @@ Flux is an elegant approach to machine learning. It's a 100% pure-Julia stack, a See the [documentation](https://fluxml.github.io/Flux.jl/) or the [model zoo](https://github.com/FluxML/model-zoo/) for examples. -If you use Flux in research, please see [our papers](CITATION.bib) for appropriate citations. +If you use Flux in your research, please [cite](CITATION.bib) our work. 
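To round out the `outdims` docs entry added above, a short usage sketch; the sizes simply reuse the docstring and test examples from earlier patches in this series:

```julia
using Flux

m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
Flux.outdims(m, (10, 10))                # == (6, 6)

Flux.outdims(Dense(10, 5), (10,))        # == (5,)
Flux.outdims(MaxPool((2, 2)), (10, 10))  # == (5, 5)
```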
From 0fdcc0092337f80c06f3a578c797ec56a13e9348 Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Mon, 23 Dec 2019 00:41:18 -0800 Subject: [PATCH 113/139] Give `NNPACK` a bit of numerical leeway --- test/layers/conv.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/layers/conv.jl b/test/layers/conv.jl index b4136062..1e7ede01 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -66,7 +66,7 @@ end w = rand(2,2,1,1) y = CrossCor(w, [0.0]) - @test sum(w .* x[1:2, 1:2, :, :]) == y(x)[1, 1, 1, 1] + @test isapprox(sum(w .* x[1:2, 1:2, :, :]), y(x)[1, 1, 1, 1], rtol=1e-7) r = zeros(Float32, 28, 28, 1, 5) m = Chain( @@ -89,17 +89,17 @@ end l = Conv((3,3), 1=>1) expected = zeros(eltype(l.weight),5,5,1,1) expected[2:end-1,2:end-1,1,1] = l.weight - @test expected == l(data) + @test expected ≈ l(data) l = Conv((3,1), 1=>1) expected = zeros(eltype(l.weight),5,7,1,1) expected[2:end-1,4,1,1] = l.weight - @test expected == l(data) + @test expected ≈ l(data) l = Conv((1,3), 1=>1) expected = zeros(eltype(l.weight),7,5,1,1) expected[4,2:end-1,1,1] = l.weight - @test expected == l(data) + @test expected ≈ l(data) @test begin # we test that the next expression does not throw From f00b5325568415aaf32024e7eda0090d2cb0e036 Mon Sep 17 00:00:00 2001 From: aminya Date: Mon, 6 Jan 2020 03:17:25 +0330 Subject: [PATCH 114/139] Adding CompatHelper --- .github/workflows/CompatHelper.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/CompatHelper.yml diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 00000000..9777033d --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,24 @@ +name: CompatHelper + +on: + schedule: + - cron: '00 00 * * *' + +jobs: + CompatHelper: + runs-on: ${{ matrix.os }} + strategy: + matrix: + julia-version: [1.3] + julia-arch: [x64] + os: [ubuntu-latest] + steps: + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.julia-version }} + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: julia -e 'using CompatHelper; CompatHelper.main()' From 17732e702358364ae041840261186fb915c335ea Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Mon, 6 Jan 2020 11:53:47 +0000 Subject: [PATCH 115/139] restructure; closes #747 --- Manifest.toml | 6 ++++-- src/utils.jl | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index be9bf768..1aadc0f0 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -371,9 +371,11 @@ version = "0.8.3" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "e4245b9c5362346e154b62842a89a18e0210b92b" +git-tree-sha1 = "04384d940b67d604dd393688fa60c1f0175e5faf" +repo-rev = "buffer-push" +repo-url = "https://github.com/FluxML/Zygote.jl.git" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.4.1" +version = "0.4.3" [[ZygoteRules]] deps = ["MacroTools"] diff --git a/src/utils.jl b/src/utils.jl index 324d87c8..2dba21c7 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -103,6 +103,48 @@ function batchseq(xs, pad = nothing, n = maximum(length(x) for x in xs)) [batch([xs_[j][i] for j = 1:length(xs_)]) for i = 1:n] end +# 
Flattening models to weight vectors, and back + +function _restructure(m, xs) + i = 0 + fmap(m) do x + x isa AbstractArray || return x + x = reshape(xs[i.+(1:length(x))], size(x)) + i += length(x) + return x + end +end + +""" + destructure(m) + +Flatten a model's parameters into a single weight vector. + + julia> m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) + Chain(Dense(10, 5, σ), Dense(5, 2), softmax) + + julia> θ, re = destructure(m); + + julia> θ + 67-element Array{Float32,1}: + -0.1407104 + ... + +The second return value `re` allows you to reconstruct the original network after making +modifications to the weight vector (for example, with a hypernetwork). + + julia> re(θ .* 2) + Chain(Dense(10, 5, σ), Dense(5, 2), softmax) +""" +function destructure(m) + xs = Zygote.Buffer([]) + fmap(m) do x + x isa AbstractArray && push!(xs, x) + return x + end + return vcat(vec.(copy(xs))...), p -> _restructure(m, p) +end + # Other """ From f96270c2133f560134d789669d23d7761a235df7 Mon Sep 17 00:00:00 2001 From: Mike Innes Date: Thu, 9 Jan 2020 17:16:41 +0000 Subject: [PATCH 116/139] free zygote --- Manifest.toml | 168 ++++++++++++++++++++++---------------------------- 1 file changed, 73 insertions(+), 95 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 1aadc0f0..115fa5f5 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,8 +1,10 @@ +# This file is machine-generated - editing it directly is not advised + [[AbstractFFTs]] deps = ["LinearAlgebra"] -git-tree-sha1 = "380e36c66edfa099cd90116b24c1ce8cafccac40" +git-tree-sha1 = "051c95d6836228d120f5f4b984dd5aba1624f716" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "0.4.1" +version = "0.5.0" [[AbstractTrees]] deps = ["Markdown", "Test"] @@ -19,12 +21,6 @@ version = "1.0.0" [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[BinDeps]] -deps = ["Compat", "Libdl", "SHA", "URIParser"] -git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9" -uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee" -version = "0.8.10" - [[BinaryProvider]] deps = ["Libdl", "SHA"] git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" @@ -38,21 +34,21 @@ version = "0.2.0" [[CUDAapi]] deps = ["Libdl", "Logging"] -git-tree-sha1 = "6eee47385c81ed3b3f716b745697869c712c2df3" +git-tree-sha1 = "56a813440ac98a1aa64672ab460a1512552211a7" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" -version = "2.0.0" +version = "2.1.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "0f39fddace3324707469ace7fbcbc7b28d5cf921" +git-tree-sha1 = "1fce616fa0806c67c133eb1d2f68f0f1a7504665" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "4.0.4" +version = "5.0.1" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "93f6c917ab2a9b5bb54f8f738f4ec1a6693cb716" +git-tree-sha1 = "6e11d5c2c91fc623952e94c4fb73f9c4db74795a" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.5.5" +version = "2.7.0" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -62,9 +58,9 @@ version = "0.6.0" [[ColorTypes]] deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "10050a24b09e8e41b951e9976b109871ce98d965" +git-tree-sha1 = "7b62b728a5f3dd6ee3b23910303ccf27e82fad5e" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.8.0" +version = "0.8.1" [[Colors]] deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"] @@ -78,25 +74,13 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" uuid = 
"bbf7d656-a473-5ed7-a52c-81e309532950" version = "0.2.0" -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "2.2.0" - -[[Conda]] -deps = ["JSON", "VersionParsing"] -git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032" -uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d" -version = "1.3.0" - [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "7e00178b18672ee2cf37244ac2a273b6b0701b04" +git-tree-sha1 = "5203ed37039c74c5eab31e9fcdc40f23c7e943a3" repo-rev = "master" repo-url = "https://github.com/JuliaGPU/CuArrays.jl.git" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.4.7" +version = "1.6.0" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -105,9 +89,9 @@ version = "1.1.0" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "a1b652fb77ae8ca7ea328fa7ba5aa151036e5c10" +git-tree-sha1 = "f784254f428fb8fd7ac15982e5862a38a44523d3" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.6" +version = "0.17.7" [[Dates]] deps = ["Printf"] @@ -118,26 +102,32 @@ deps = ["Mmap"] uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" [[DiffResults]] -deps = ["Compat", "StaticArrays"] -git-tree-sha1 = "34a4a1e8be7bc99bc9c611b895b5baf37a80584c" +deps = ["StaticArrays"] +git-tree-sha1 = "da24935df8e0c6cf28de340b958f6aac88eaa0cc" uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" -version = "0.0.4" +version = "1.0.2" [[DiffRules]] deps = ["NaNMath", "Random", "SpecialFunctions"] -git-tree-sha1 = "f734b5f6bc9c909027ef99f6d91d5d9e4b111eed" +git-tree-sha1 = "10dca52cf6d4a62d82528262921daf63b99704a2" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "0.1.0" +version = "1.0.0" [[Distributed]] -deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"] +deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" [[FFTW]] -deps = ["AbstractFFTs", "BinaryProvider", "Conda", "Libdl", "LinearAlgebra", "Reexport", "Test"] -git-tree-sha1 = "6c5b420da0b8c12098048561b8d58f81adea506f" +deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"] +git-tree-sha1 = "109d82fa4b00429f9afcce873e9f746f11f018d3" uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" -version = "1.0.1" +version = "1.2.0" + +[[FFTW_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "05674f209a6e3387dd103a945b0113eeb64b1a58" +uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a" +version = "3.3.9+3" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] @@ -152,15 +142,15 @@ version = "0.6.1" [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "da46ac97b17793eba44ff366dc6cb70f1238a738" +git-tree-sha1 = "840700059391d36e2498d89c2e82c08f261f2a2a" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.7" +version = "0.10.8" [[GPUArrays]] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] -git-tree-sha1 = 
"a0a3b927b1a06e63fb8b91950cc7df340b7d912c" +git-tree-sha1 = "e756da6cee76a5f1436a05827fa8fdf3badc577f" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "2.0.0" +version = "2.0.1" [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] @@ -168,15 +158,15 @@ git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306" uuid = "7869d1d1-7146-5819-86e3-90919afe41df" version = "0.3.0" -[[InteractiveUtils]] -deps = ["LinearAlgebra", "Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +[[IntelOpenMP_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "fb8e1c7a5594ba56f9011310790e03b5384998d6" +uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0" +version = "2018.0.3+0" -[[JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.0" +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[Juno]] deps = ["Base64", "Logging", "Media", "Profile", "Test"] @@ -186,9 +176,9 @@ version = "0.7.2" [[LLVM]] deps = ["CEnum", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "74fe444b8b6d1ac01d639b2f9eaf395bcc2e24fc" +git-tree-sha1 = "1d08d7e4250f452f6cb20e4574daaebfdbee0ff7" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "1.3.2" +version = "1.3.3" [[LibGit2]] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" @@ -203,11 +193,17 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" +[[MKL_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "61069ae718b8ab1e325bbfb4e5268902e7ea08e3" +uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7" +version = "2019.0.117+0" + [[MacroTools]] -deps = ["Compat", "DataStructures", "Test"] -git-tree-sha1 = "82921f0e3bde6aebb8e524efc20f4042373c0c06" +deps = ["DataStructures", "Markdown", "Random"] +git-tree-sha1 = "e2fc7a55bb2224e203bbd8b59f72b91323233458" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.2" +version = "0.5.3" [[Markdown]] deps = ["Base64"] @@ -229,30 +225,30 @@ version = "0.4.3" uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[NNlib]] -deps = ["Libdl", "LinearAlgebra", "Requires", "Statistics", "TimerOutputs"] -git-tree-sha1 = "0c667371391fc6bb31f7f12f96a56a17098b3de8" +deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"] +git-tree-sha1 = "135c0de4794d5e214b06f1fb4787af4a72896e61" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.6.0" +version = "0.6.2" [[NaNMath]] git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f" uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" version = "0.3.3" +[[OpenSpecFun_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.3+1" + [[OrderedCollections]] deps = ["Random", "Serialization", "Test"] git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.1.0" -[[Parsers]] -deps = ["Dates", "Test"] -git-tree-sha1 = "0139ba59ce9bc680e2925aec5b7db79065d60556" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "0.3.10" - [[Pkg]] -deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] @@ -278,10 +274,10 @@ uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "0.2.0" [[Requires]] -deps = ["Test"] -git-tree-sha1 = 
"f6fbf4ba64d295e146e49e021207993b6b48c7d1" +deps = ["UUIDs"] +git-tree-sha1 = "999513b7dea8ac17359ed50ae8ea089e4464e35e" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "0.5.2" +version = "1.0.0" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -289,10 +285,6 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" @@ -307,10 +299,10 @@ deps = ["LinearAlgebra", "Random"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[SpecialFunctions]] -deps = ["BinDeps", "BinaryProvider", "Libdl"] -git-tree-sha1 = "3bdd374b6fd78faf0119b8c5d538788dbf910c6e" +deps = ["OpenSpecFun_jll"] +git-tree-sha1 = "268052ee908b2c086cc0011f528694f02f3e2408" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "0.8.0" +version = "0.9.0" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] @@ -344,25 +336,13 @@ git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" version = "0.9.5" -[[URIParser]] -deps = ["Test", "Unicode"] -git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69" -uuid = "30578b45-9adc-5946-b283-645ec420af67" -version = "0.4.0" - [[UUIDs]] -deps = ["Random"] +deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[VersionParsing]] -deps = ["Compat"] -git-tree-sha1 = "c9d5aa108588b978bd859554660c8a5c4f2f7669" -uuid = "81def892-9a0e-5fdd-b105-ffc91e053289" -version = "1.1.3" - [[ZipFile]] deps = ["BinaryProvider", "Libdl", "Printf"] git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" @@ -371,11 +351,9 @@ version = "0.8.3" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "04384d940b67d604dd393688fa60c1f0175e5faf" -repo-rev = "buffer-push" -repo-url = "https://github.com/FluxML/Zygote.jl.git" +git-tree-sha1 = "7e293d7bef87c2cf2847e99ed0da4edadb75fe90" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.4.3" +version = "0.4.4" [[ZygoteRules]] deps = ["MacroTools"] From 0411b9a3e8036e7103d770e9276039cca04b6eef Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sun, 12 Jan 2020 17:35:04 +0530 Subject: [PATCH 117/139] rm second slash --- bors.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bors.toml b/bors.toml index 4e6e5d26..65571c49 100644 --- a/bors.toml +++ b/bors.toml @@ -1,4 +1,4 @@ status = [ - "ci/gitlab/%" + "ci/gitlab%" ] timeout-sec = 14400 From 58a7941386603fff8513cae2d90a0c02bc7e5787 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Mon, 13 Jan 2020 11:24:04 +0530 Subject: [PATCH 118/139] reduce bors timeout --- bors.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bors.toml b/bors.toml index 65571c49..19f7c434 100644 --- a/bors.toml +++ b/bors.toml @@ -1,4 +1,4 @@ status = [ "ci/gitlab%" ] -timeout-sec = 14400 +timeout-sec = 3600 From da9f295a8eccb25a8bc42c53d445c0abefd9cef5 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Mon, 13 Jan 2020 13:41:25 +0530 Subject: [PATCH 119/139] bump version to 10.1 --- Project.toml | 2 +- bors.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 
225cc463..3b2a4b8b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.10.0" +version = "0.10.1" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" diff --git a/bors.toml b/bors.toml index 19f7c434..4390d210 100644 --- a/bors.toml +++ b/bors.toml @@ -1,4 +1,4 @@ status = [ "ci/gitlab%" ] -timeout-sec = 3600 +timeout-sec = 7200 From d7953ff57392a6c7d038199429572354fbde03f7 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Mon, 13 Jan 2020 13:45:40 +0530 Subject: [PATCH 120/139] test on julia 1.3+ --- .gitlab-ci.yml | 40 ++++++++++++++++++++-------------------- .travis.yml | 2 -- Project.toml | 2 +- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b55f4618..cf9c6013 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,26 +4,26 @@ include: image: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 -julia:1.0: - extends: - - .julia:1.0 - - .test - tags: - - nvidia - -julia:1.1: - extends: - - .julia:1.1 - - .test - tags: - - nvidia - -julia:1.2: - extends: - - .julia:1.2 - - .test - tags: - - nvidia +# julia:1.0: +# extends: +# - .julia:1.0 +# - .test +# tags: +# - nvidia +# +# julia:1.1: +# extends: +# - .julia:1.1 +# - .test +# tags: +# - nvidia +# +# julia:1.2: +# extends: +# - .julia:1.2 +# - .test +# tags: +# - nvidia julia:1.3: extends: diff --git a/.travis.yml b/.travis.yml index c2eb9ae0..c54639df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,8 +6,6 @@ os: # - osx julia: - - 1.0 - - 1.2 - 1.3 - nightly diff --git a/Project.toml b/Project.toml index 3b2a4b8b..225cc463 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.10.1" +version = "0.10.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" From de40476beb059e35e9d99723e956d534b3ddc8f9 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Mon, 13 Jan 2020 14:10:34 +0530 Subject: [PATCH 121/139] doc tests on julia 1.3 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c54639df..e02f470f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,7 @@ matrix: jobs: include: - stage: "Documentation" - julia: 1.2 + julia: 1.3 os: linux script: - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); From e2a97aec245fb8017a18c601718f11537f506dc9 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Mon, 13 Jan 2020 16:16:24 +0530 Subject: [PATCH 122/139] up cuda+zygote deps --- Manifest.toml | 12 +++++------- Project.toml | 1 - 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 115fa5f5..645b4ac4 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -76,9 +76,7 @@ version = "0.2.0" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "5203ed37039c74c5eab31e9fcdc40f23c7e943a3" -repo-rev = "master" -repo-url = "https://github.com/JuliaGPU/CuArrays.jl.git" +git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" version = "1.6.0" @@ -131,9 +129,9 @@ version = "3.3.9+3" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "1a9fe4e1323f38de0ba4da49eafd15b25ec62298" +git-tree-sha1 = 
"fec413d4fc547992eb62a5c544cedb6d7853c1f5" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.2" +version = "0.8.4" [[FixedPointNumbers]] git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" @@ -351,9 +349,9 @@ version = "0.8.3" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "7e293d7bef87c2cf2847e99ed0da4edadb75fe90" +git-tree-sha1 = "74382bcc4c1e8075e14554da67d75565f8fb7827" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.4.4" +version = "0.4.5" [[ZygoteRules]] deps = ["MacroTools"] diff --git a/Project.toml b/Project.toml index 225cc463..59324a7d 100644 --- a/Project.toml +++ b/Project.toml @@ -28,7 +28,6 @@ AbstractTrees = "0.2" Adapt = "1" CodecZlib = "0.5, 0.6" Colors = "0.8, 0.9" -CuArrays = "1.4.3" Juno = "0.5, 0.6, 0.7" MacroTools = "0.3, 0.4, 0.5" NNlib = "0.6" From e1698e66174d7b0a75b9f6f99149331bcafa9658 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Mon, 13 Jan 2020 16:18:20 +0530 Subject: [PATCH 123/139] up cuarrays --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index 59324a7d..c23df12a 100644 --- a/Project.toml +++ b/Project.toml @@ -28,6 +28,7 @@ AbstractTrees = "0.2" Adapt = "1" CodecZlib = "0.5, 0.6" Colors = "0.8, 0.9" +CuArrays = "1.6" Juno = "0.5, 0.6, 0.7" MacroTools = "0.3, 0.4, 0.5" NNlib = "0.6" From cd4626d5a7a05138a978b5491e87a55959e63ce9 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Mon, 13 Jan 2020 17:38:59 +0530 Subject: [PATCH 124/139] compat bounds for a couple more packages --- Manifest.toml | 18 ++++++++++++------ Project.toml | 4 ++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 645b4ac4..12986ccd 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -7,10 +7,10 @@ uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" version = "0.5.0" [[AbstractTrees]] -deps = ["Markdown", "Test"] -git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b" +deps = ["Markdown"] +git-tree-sha1 = "8201f932428d25a2e2903300764515754847d87d" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -version = "0.2.1" +version = "0.3.0" [[Adapt]] deps = ["LinearAlgebra"] @@ -342,10 +342,16 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [[ZipFile]] -deps = ["BinaryProvider", "Libdl", "Printf"] -git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" +deps = ["Libdl", "Printf", "Zlib_jll"] +git-tree-sha1 = "5de8320a46812da1a8ca98b16a8a4546d44efa62" uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" -version = "0.8.3" +version = "0.9.0" + +[[Zlib_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "5618a43055eb09377edca21d19d0e99bce24a9c3" +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.11+7" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] diff --git a/Project.toml b/Project.toml index c23df12a..e5cb9a09 100644 --- a/Project.toml +++ b/Project.toml @@ -24,7 +24,7 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] -AbstractTrees = "0.2" +AbstractTrees = "0.2, 0.3" Adapt = "1" CodecZlib = "0.5, 0.6" Colors = "0.8, 0.9" @@ -34,7 +34,7 @@ MacroTools = "0.3, 0.4, 0.5" NNlib = "0.6" Reexport = "0.2" 
StatsBase = "0"
-ZipFile = "0.7, 0.8"
+ZipFile = "0.7, 0.8, 0.9"
 Zygote = "0.4"
 julia = "1"

From 048c31f60968a563272d84a182e76a3bc7d834e9 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi
Date: Mon, 13 Jan 2020 18:16:29 +0530
Subject: [PATCH 125/139] bump Flux version to v0.10.1

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index e5cb9a09..96d1d853 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "Flux"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.10.0"
+version = "0.10.1"

 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"

From 747e01ea024134b09fdf64fe83c38fb71fe98536 Mon Sep 17 00:00:00 2001
From: Manjunath Bhat
Date: Mon, 13 Jan 2020 18:33:30 +0530
Subject: [PATCH 126/139] Test to check for spurious promotions

---
 test/layers/stateless.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl
index 87c495f1..7cb8ed2e 100644
--- a/test/layers/stateless.jl
+++ b/test/layers/stateless.jl
@@ -75,7 +75,7 @@ const ϵ = 1e-7
     for T in (Float32, Float64)
       y = rand(T, 2)
       ŷ = rand(T, 2)
-      for f in (mse, crossentropy, logitcrossentropy)
+      for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson)
        fwd, back = Flux.pullback(f, ŷ, y)
        @test fwd isa T
        @test eltype(back(one(T))[1]) == T

From 7797e31b44962cb6dc9cc202821f34fa75eb77d3 Mon Sep 17 00:00:00 2001
From: Lyndon White
Date: Thu, 16 Jan 2020 21:57:59 +0000
Subject: [PATCH 127/139] Add custom training loops to docs

---
 docs/src/training/training.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/docs/src/training/training.md b/docs/src/training/training.md
index 47bda1f5..b42db7c9 100644
--- a/docs/src/training/training.md
+++ b/docs/src/training/training.md
@@ -110,3 +110,30 @@ cb = function ()
   accuracy() > 0.9 && Flux.stop()
 end
 ```
+
+## Custom Training loops
+
+The `Flux.train!` function can be very convenient, especially for simple problems.
+It's also very flexible with the use of callbacks.
+But for some problems it's much cleaner to write your own custom training loop.
+An example follows that works similarly to the default `Flux.train!` but with no callbacks.
+You don't need callbacks if you just code the calls to your functions directly into the loop,
+e.g. in the places marked with comments below.
+
+```
+function my_custom_train!(loss, ps, data, opt)
+  ps = Params(ps)
+  for d in data
+    gs = gradient(ps) do
+      training_loss = loss(d...)
+      # Insert whatever code you want here that needs the training loss, e.g. logging.
+      return training_loss
+    end
+    # Insert whatever code you want here that needs the gradients,
+    # e.g. logging them with TensorBoardLogger.jl as histograms so you can see if they are becoming huge.
+    update!(opt, ps, gs)
+    # Here you might like to check validation set accuracy, and break out to do early stopping.
+  end
+end
+```
+You could simplify this further, for example by hard-coding the loss function.

From d88f63adb495eed0442c872de3da17f3ee789913 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Wed, 29 Jan 2020 12:15:41 +0100
Subject: [PATCH 128/139] Remove unused imports.
--- src/cuda/curnn.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cuda/curnn.jl b/src/cuda/curnn.jl index fb454729..51e26a3e 100644 --- a/src/cuda/curnn.jl +++ b/src/cuda/curnn.jl @@ -1,6 +1,5 @@ import ..Flux: Flux, relu using CuArrays.CUDAnative -using CuArrays: @cuindex, cudims CuRNN{T} = Flux.RNNCell{<:Union{typeof(tanh),typeof(relu)},<:CuArray{T,2},<:CuArray{T,1}} CuGRU{T} = Flux.GRUCell{<:CuArray{T,2},<:CuArray{T,1}} From e66a7f130f065225a5538ca05dcba64bc6d88773 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 31 Jan 2020 08:22:21 +0100 Subject: [PATCH 129/139] Don't compare CPU with GPU arrays. --- test/cuda/cuda.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 1576d88f..1dc13939 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -33,8 +33,8 @@ cx = gpu(x) x = [-1.1491, 0.8619, 0.3127] y = [1, 1, 0.] -@test Flux.binarycrossentropy.(σ.(x),y) ≈ Flux.binarycrossentropy.(cu(σ.(x)),cu(y)) -@test Flux.logitbinarycrossentropy.(x,y) ≈ Flux.logitbinarycrossentropy.(cu(x),cu(y)) +@test Flux.binarycrossentropy.(σ.(x),y) ≈ Array(Flux.binarycrossentropy.(cu(σ.(x)),cu(y))) +@test Flux.logitbinarycrossentropy.(x,y) ≈ Array(Flux.logitbinarycrossentropy.(cu(x),cu(y))) xs = rand(5, 5) ys = Flux.onehotbatch(1:5,1:5) From e2c2ec5575b29501963db7ba681ddabb56b75537 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 31 Jan 2020 08:22:54 +0100 Subject: [PATCH 130/139] Don't invoke GPU crossentropy with integers. Broadcasting log on integers does not work. --- test/cuda/cuda.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 1dc13939..c75cfb4e 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -25,7 +25,7 @@ cm = gpu(m) @test all(p isa CuArray for p in params(cm)) @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} -x = [1,2,3] +x = [1.,2.,3.] cx = gpu(x) @test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx) @test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0) From 6499344af397db698706b3325d2ba6831178ac65 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Thu, 6 Feb 2020 13:42:17 +0100 Subject: [PATCH 131/139] nograd for onecold, onehot, onehotbatch --- src/onehot.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/onehot.jl b/src/onehot.jl index 754d0607..7a3123ec 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -125,6 +125,4 @@ onecold(y::AbstractMatrix, labels...) = onecold(y::OneHotMatrix, labels...) = mapreduce(x -> Flux.onecold(x, labels...), |, y.data, dims = 2, init = 0) -# TODO probably still want this as a custom adjoint Zygote -# onecold(x::TrackedVector, l...) = onecold(data(x), l...) -# onecold(x::TrackedMatrix, l...) = onecold(data(x), l...) 
+@nograd onecold, onehot, onehotbatch From d7b20d1a780d32d111030f4a7a7f62cd62b2eb11 Mon Sep 17 00:00:00 2001 From: Julia TagBot <50554310+JuliaTagBot@users.noreply.github.com> Date: Sat, 8 Feb 2020 20:02:52 +0700 Subject: [PATCH 132/139] Install TagBot as a GitHub Action --- .github/workflows/TagBot.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/workflows/TagBot.yml diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 00000000..d77d3a0c --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,11 @@ +name: TagBot +on: + schedule: + - cron: 0 * * * * +jobs: + TagBot: + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} From c37fc3cfa63a82deec33d40f837b880341440c7a Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Sun, 9 Feb 2020 19:45:04 -0600 Subject: [PATCH 133/139] Recommitting to trigger build From ae0455517a57159fb5d05c9b5a0e2531f78ebc93 Mon Sep 17 00:00:00 2001 From: Marco Date: Mon, 10 Feb 2020 00:03:11 -0800 Subject: [PATCH 134/139] Remove outdated reference to truncate! --- src/layers/recurrent.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 499a21ab..647dda25 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -45,8 +45,7 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")") """ reset!(rnn) -Reset the hidden state of a recurrent layer back to its original value. See also -`truncate!`. +Reset the hidden state of a recurrent layer back to its original value. Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to From 6ea7b95384b34b0b2aacc10bc225480f4a9555a0 Mon Sep 17 00:00:00 2001 From: matsueushi Date: Sat, 15 Feb 2020 20:06:15 -0500 Subject: [PATCH 135/139] Remove unused using --- src/optimise/optimisers.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index fb3b9fc5..cf4496f4 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -1,5 +1,4 @@ using Flux -using Base: @get! 
using MacroTools: @forward const ϵ = 1e-8 From 9bb388d953ce6676860ff82028b9c1f98c88bbfb Mon Sep 17 00:00:00 2001 From: Helios De Rosario Date: Sun, 16 Feb 2020 18:29:18 +0100 Subject: [PATCH 136/139] update Juno compat --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 96d1d853..f76063bd 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.10.1" +version = "0.10.2" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" @@ -29,7 +29,7 @@ Adapt = "1" CodecZlib = "0.5, 0.6" Colors = "0.8, 0.9" CuArrays = "1.6" -Juno = "0.5, 0.6, 0.7" +Juno = "0.5, 0.6, 0.7, 0.8" MacroTools = "0.3, 0.4, 0.5" NNlib = "0.6" Reexport = "0.2" From f5b9cf659cb14f0b05ab98b2fef70f705adfc8c3 Mon Sep 17 00:00:00 2001 From: Kyle Daruwalla Date: Thu, 20 Feb 2020 23:38:56 -0600 Subject: [PATCH 137/139] Updated docs to specify exactly what layers support outdims --- docs/src/models/basics.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index c6dc4e19..6e8d0b76 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -225,6 +225,17 @@ This enables a useful extra set of functionality for our `Affine` layer, such as Flux provides some utility functions to help you generate models in an automated fashion. `outdims` enables you to calculate the spatial output dimensions of layers like `Conv` when applied to input images of a given size. +Currently limited to the following layers: +- `Chain` +- `Dense` +- `Conv` +- `Diagonal` +- `Maxout` +- `ConvTranspose` +- `DepthwiseConv` +- `CrossCor` +- `MaxPool` +- `MeanPool` ```@docs outdims From 6ced7e1ecff379cf3df3f62f05557317dc56e41f Mon Sep 17 00:00:00 2001 From: Ian Butterworth Date: Sun, 23 Feb 2020 13:42:11 -0500 Subject: [PATCH 138/139] expand Colors compat --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f76063bd..71282a10 100644 --- a/Project.toml +++ b/Project.toml @@ -27,7 +27,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" AbstractTrees = "0.2, 0.3" Adapt = "1" CodecZlib = "0.5, 0.6" -Colors = "0.8, 0.9" +Colors = "0.8, 0.9, 0.10, 0.11" CuArrays = "1.6" Juno = "0.5, 0.6, 0.7, 0.8" MacroTools = "0.3, 0.4, 0.5" From db4eaf254b5de8902349afbd705243c22d0ec91a Mon Sep 17 00:00:00 2001 From: Bulat Suleymanov Date: Mon, 24 Feb 2020 13:16:51 +0500 Subject: [PATCH 139/139] Edit description of convolutional layer --- src/layers/conv.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index f4de3ffc..829051ae 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -17,7 +17,7 @@ Example: Applying Conv layer to a 1-channel input using a 2x2 window size, out = 16 Conv((2, 2), 1=>16, relu) -Data should be stored in WHCN order (width, height, # channels, # batches). +Data should be stored in WHCN order (width, height, # channels, batch size). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array.
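
A minimal sketch of the WHCN convention described in the docstring above, assuming a working Flux installation; the `3=>16` channel mapping, the `Float32` element type, and the default stride/padding are illustrative choices rather than anything prescribed by the docstring.

```julia
using Flux

# A batch of 50 RGB "images", each 100×100 pixels, stored WHCN:
# width × height × channels × batch size.
x = rand(Float32, 100, 100, 3, 50)

# A 2×2 convolution mapping 3 input channels to 16 output channels,
# analogous to the `Conv((2, 2), 1=>16, relu)` example in the docstring.
layer = Conv((2, 2), 3=>16, relu)

y = layer(x)
size(y)  # (99, 99, 16, 50): spatial dims shrink by kernel size minus one
         # under the default stride of 1 and no padding
```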