From 540b7366ec0edd711953223ef44bf342d691127f Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 00:54:49 -0700 Subject: [PATCH 01/46] make activations zygote friendly --- src/layers/basic.jl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 83eeee21..b4b869c5 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -45,17 +45,15 @@ end # it might be replaced in the future for better performance # see issue https://github.com/FluxML/Flux.jl/issues/702 # Johnny Chen -- @johnnychen94 +# only slightly changed to better handle interaction with Zygote @dsweber2 """ activations(c::Chain, input) Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) - rst = [] - for l in c - x = get(rst, length(rst), input) - push!(rst, l(x)) - end - return rst + buffed = accumulate!((x,y)->y(x), Zygote.Buffer([], length(c)), + [l for l in c], dims=1, init=input) + return copy(buffed) end From 38790dd4db5520e6e587783804d1144a3b75ac9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Sun, 8 Sep 2019 16:15:35 +0100 Subject: [PATCH 02/46] Restore purity --- .gitattributes | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitattributes b/.gitattributes index e02ed0b7..4992eb2c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,2 @@ paper/* linguist-documentation +CITATION.bib linguist-detectable=false From 82261b5bb7e6783d6a273c8e7803c4fbb28a3dd8 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 00:54:49 -0700 Subject: [PATCH 03/46] make activations zygote friendly --- src/layers/basic.jl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 13d56472..fd187d8c 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -45,17 +45,15 @@ end # it might be replaced in the future for better performance # see issue https://github.com/FluxML/Flux.jl/issues/702 # Johnny Chen -- @johnnychen94 +# only slightly changed to better handle interaction with Zygote @dsweber2 """ activations(c::Chain, input) Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) - rst = [] - for l in c - x = get(rst, length(rst), input) - push!(rst, l(x)) - end - return rst + buffed = accumulate!((x,y)->y(x), Zygote.Buffer([], length(c)), + [l for l in c], dims=1, init=input) + return copy(buffed) end From 1bb25dc1f9c54666d73b516629e0c89033e1c0e2 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 01:34:12 -0700 Subject: [PATCH 04/46] adding the extra commits broke the accumulate version --- src/layers/basic.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index fd187d8c..e1e9ab45 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -51,9 +51,12 @@ end Calculate the forward results of each layers in Chain `c` with `input` as model input. 
""" function activations(c::Chain, input) - buffed = accumulate!((x,y)->y(x), Zygote.Buffer([], length(c)), - [l for l in c], dims=1, init=input) - return copy(buffed) + res = Zygote.Buffer([], length(c)) + res[1] = c[1](input) + for (i,l) in enumerate(c[2:end]) + res[i+1] = l(res[i]) + end + return copy(res) end From f41219133e8a233c8e0056972641378c4e83c427 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 10:46:56 -0700 Subject: [PATCH 05/46] deal with empty Chain --- src/layers/basic.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index e1e9ab45..9ef6f195 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -52,9 +52,11 @@ Calculate the forward results of each layers in Chain `c` with `input` as model """ function activations(c::Chain, input) res = Zygote.Buffer([], length(c)) - res[1] = c[1](input) - for (i,l) in enumerate(c[2:end]) - res[i+1] = l(res[i]) + if length(c) > 0 + res[1] = c[1](input) + for (i,l) in enumerate(c[2:end]) + res[i+1] = l(res[i]) + end end return copy(res) end From 46abfbbd5cd4579e66912996c5ff4b568a01d1ea Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 11 Sep 2019 17:36:37 -0700 Subject: [PATCH 06/46] recursive way of doing activations --- src/layers/basic.jl | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 9ef6f195..e2e3e56a 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -51,16 +51,17 @@ end Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) - res = Zygote.Buffer([], length(c)) - if length(c) > 0 - res[1] = c[1](input) - for (i,l) in enumerate(c[2:end]) - res[i+1] = l(res[i]) - end - end - return copy(res) + extraChain(c.layers, input) end +function extraChain(fs::Tuple, x) + res = first(fs)(x) + return (res, extraChain(Base.tail(fs), res)...) +end + +extraChain(::Tuple{}, x) = [] + + """ Dense(in::Integer, out::Integer, σ = identity) From 3b7b780d398bef91f2e793e2293f140d8c3b9241 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 8 Oct 2019 23:04:31 -0700 Subject: [PATCH 07/46] super simple test --- test/layers/basic.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/layers/basic.jl b/test/layers/basic.jl index cbe250fc..4edfecc7 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -19,6 +19,12 @@ import Flux: activations # numeric test should be put into testset of corresponding layer end + @testset "Activations" begin + c = Chain(Dense(3,5,relu), Dense(5,1,relu)) + X = Float32.([1.0; 1.0; 1.0]) + @test_nowarn gradient(()->Flux.activations(c, X)[2][1], params(c)) + end + @testset "Dense" begin @test length(Dense(10, 5)(randn(10))) == 5 @test_throws DimensionMismatch Dense(10, 5)(randn(1)) From 3dceef427f69418220692b931d819c49e77f0810 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 8 Nov 2019 16:48:11 +0100 Subject: [PATCH 08/46] Fix binarycrossentropy on CuArrays --- src/layers/stateless.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index b8ce3c7d..5f9c1090 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -1,3 +1,4 @@ +using CuArrays using NNlib: logsoftmax, logσ # Cost functions @@ -35,6 +36,9 @@ Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. 
The ϵ term provides numerica """ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) +# Re-definition to fix interaction with CuArrays. +CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) + """ logitbinarycrossentropy(logŷ, y) From a00d8d94ec15080aada5c1cb938ce7cab365d99e Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 8 Nov 2019 17:28:38 +0100 Subject: [PATCH 09/46] Add test for CUDA binarycrossentropy --- test/cuda/cuda.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index d2907995..ddd92e1e 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -31,6 +31,10 @@ cx = gpu(x) @test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0) @test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0])) +x = σ.([-1.1491, 0.8619, 0.3127]) +y = [1, 1, 0.] +@test Flux.binarycrossentropy.(x,y) ≈ Flux.binarycrossentropy.(cu(x),cu(y)) + xs = rand(5, 5) ys = Flux.onehotbatch(1:5,1:5) @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) From 7e1ffd65072246ec634e57619174b55f007a5af3 Mon Sep 17 00:00:00 2001 From: Helios De Rosario Date: Fri, 8 Nov 2019 21:39:00 +0100 Subject: [PATCH 10/46] Extend docs about `train!` Related to #921: explain why it is not needed to pass the model as argument. --- docs/src/training/training.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 679bbd0b..380910c3 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -1,6 +1,6 @@ # Training -To actually train a model we need three things: +To actually train a model we need three things, in addition to the tracked parameters that will be fitted: * A *objective function*, that evaluates how well a model is doing given some input data. * A collection of data points that will be provided to the objective function. @@ -11,6 +11,7 @@ With these we can call `Flux.train!`: ```julia Flux.train!(objective, params, data, opt) ``` +At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!`. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately (see below) provides more flexibility, and the possibility of optimizing the calculations. There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). From 074eb47246cffff9c3e4f99706963de42648a1f5 Mon Sep 17 00:00:00 2001 From: Helios De Rosario Date: Tue, 12 Nov 2019 23:29:38 +0100 Subject: [PATCH 11/46] Update training.md --- docs/src/training/training.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 380910c3..350287fc 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -11,7 +11,6 @@ With these we can call `Flux.train!`: ```julia Flux.train!(objective, params, data, opt) ``` -At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!`. 
However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately (see below) provides more flexibility, and the possibility of optimizing the calculations. There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). @@ -33,6 +32,8 @@ Flux.train!(loss, ps, data, opt) The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want. +At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimizing the calculations. + ## Datasets The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point: From ba4e3be0d33f79145a62254a235967206a27b97c Mon Sep 17 00:00:00 2001 From: Helios De Rosario Date: Thu, 14 Nov 2019 16:22:31 +0100 Subject: [PATCH 12/46] explanations about params in `train!` --- docs/src/training/training.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 350287fc..a5474529 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -1,8 +1,9 @@ # Training -To actually train a model we need three things, in addition to the tracked parameters that will be fitted: +To actually train a model we need four things: * A *objective function*, that evaluates how well a model is doing given some input data. +* The parameters of the model. * A collection of data points that will be provided to the objective function. * An [optimiser](optimisers.md) that will update the model parameters appropriately. @@ -34,6 +35,12 @@ The objective will almost always be defined in terms of some *cost function* tha At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimizing the calculations. +## Model parameters + +The model to be trained must have a set of tracked parameters that are used to calculate the gradients of the objective function. In the [basics](../models/basics.md) section it is explained how to create models with such parameters. 
The second argument of the function `Flux.train!` must be an object containing those parameters, which can be obtained from a model `m` as `params(m)`. + +Such an object contains a reference to the model's parameters, not a copy, such that after their training, the model behaves according to their updated values. + ## Datasets The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point: From cdaaca8cfa880b2f45f30379639f347b3ebfd175 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 00:54:49 -0700 Subject: [PATCH 13/46] make activations zygote friendly --- src/layers/basic.jl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index f42a9619..e8dde1a3 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -44,17 +44,15 @@ end # it might be replaced in the future for better performance # see issue https://github.com/FluxML/Flux.jl/issues/702 # Johnny Chen -- @johnnychen94 +# only slightly changed to better handle interaction with Zygote @dsweber2 """ activations(c::Chain, input) Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) - rst = [] - for l in c - x = get(rst, length(rst), input) - push!(rst, l(x)) - end - return rst + buffed = accumulate!((x,y)->y(x), Zygote.Buffer([], length(c)), + [l for l in c], dims=1, init=input) + return copy(buffed) end From d0202a2945bf86a7827075c77642405b25c752fe Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 01:34:12 -0700 Subject: [PATCH 14/46] adding the extra commits broke the accumulate version --- src/layers/basic.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index e8dde1a3..2d86da85 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -50,9 +50,12 @@ end Calculate the forward results of each layers in Chain `c` with `input` as model input. 
""" function activations(c::Chain, input) - buffed = accumulate!((x,y)->y(x), Zygote.Buffer([], length(c)), - [l for l in c], dims=1, init=input) - return copy(buffed) + res = Zygote.Buffer([], length(c)) + res[1] = c[1](input) + for (i,l) in enumerate(c[2:end]) + res[i+1] = l(res[i]) + end + return copy(res) end From 99679f7e16b2244ace129e9c6288b4ab2159a452 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2019 10:46:56 -0700 Subject: [PATCH 15/46] deal with empty Chain --- src/layers/basic.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 2d86da85..c3783567 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -51,9 +51,11 @@ Calculate the forward results of each layers in Chain `c` with `input` as model """ function activations(c::Chain, input) res = Zygote.Buffer([], length(c)) - res[1] = c[1](input) - for (i,l) in enumerate(c[2:end]) - res[i+1] = l(res[i]) + if length(c) > 0 + res[1] = c[1](input) + for (i,l) in enumerate(c[2:end]) + res[i+1] = l(res[i]) + end end return copy(res) end From 6475f6a43eba8feab5f34a7dc2cf0f86d1d7c0fc Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 11 Sep 2019 17:36:37 -0700 Subject: [PATCH 16/46] recursive way of doing activations --- src/layers/basic.jl | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index c3783567..b92bc919 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -50,16 +50,17 @@ end Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) - res = Zygote.Buffer([], length(c)) - if length(c) > 0 - res[1] = c[1](input) - for (i,l) in enumerate(c[2:end]) - res[i+1] = l(res[i]) - end - end - return copy(res) + extraChain(c.layers, input) end +function extraChain(fs::Tuple, x) + res = first(fs)(x) + return (res, extraChain(Base.tail(fs), res)...) +end + +extraChain(::Tuple{}, x) = [] + + """ Dense(in::Integer, out::Integer, σ = identity) From db92b0e3ce3d5cb06a11b6cf77e74e1e0d56b2f1 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 8 Oct 2019 23:04:31 -0700 Subject: [PATCH 17/46] super simple test --- test/layers/basic.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/layers/basic.jl b/test/layers/basic.jl index cbe250fc..4edfecc7 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -19,6 +19,12 @@ import Flux: activations # numeric test should be put into testset of corresponding layer end + @testset "Activations" begin + c = Chain(Dense(3,5,relu), Dense(5,1,relu)) + X = Float32.([1.0; 1.0; 1.0]) + @test_nowarn gradient(()->Flux.activations(c, X)[2][1], params(c)) + end + @testset "Dense" begin @test length(Dense(10, 5)(randn(10))) == 5 @test_throws DimensionMismatch Dense(10, 5)(randn(1)) From 0fe3ac4e770de17a46d37809238a6deae06f98a3 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 8 Oct 2019 23:05:22 -0700 Subject: [PATCH 18/46] bring activations into function call --- src/layers/basic.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index b92bc919..db491424 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -31,6 +31,8 @@ applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) (c::Chain)(x) = applychain(c.layers, x) +(c::Chain)(x, i) = extraChain(c.layers, x)[i] + Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) 
function Base.show(io::IO, c::Chain) From 58c794702d030b61a3744f1a180e9ab65113682b Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 14 Nov 2019 14:05:53 -0800 Subject: [PATCH 19/46] simpler test --- src/layers/basic.jl | 4 ++-- test/layers/basic.jl | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index db491424..75f18e3c 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -31,7 +31,7 @@ applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) (c::Chain)(x) = applychain(c.layers, x) -(c::Chain)(x, i) = extraChain(c.layers, x)[i] +(c::Chain)(x) = extraChain(c.layers, x) Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) @@ -60,7 +60,7 @@ function extraChain(fs::Tuple, x) return (res, extraChain(Base.tail(fs), res)...) end -extraChain(::Tuple{}, x) = [] +extraChain(::Tuple{}, x) = () diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 4edfecc7..0ff1776d 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -4,11 +4,13 @@ import Flux: activations @testset "basic" begin @testset "helpers" begin @testset "activations" begin - dummy_model = Chain(Dense(10,5,σ),Dense(5,2),softmax) - x = rand(10) - @test activations(Chain(), x) == [] - @test activations(dummy_model, x)[1] == dummy_model[1](x) - @test activations(dummy_model, x)[2] == x |> dummy_model[1] |> dummy_model[2] + dummy_model = Chain(x->x.^2, x->x .- 3, x -> tan.(x)) + x = randn(10) + @test activations(dummy_model, x)[1] == x.^2 + @test activations(dummy_model, x)[2] == (x.^2 .- 3) + @test activations(dummy_model, x)[3] == tan.(x.^2 .- 3) + + @test activations(Chain(), x) == () @test activations(Chain(identity, x->:foo), x)[2] == :foo # results include `Any` type end end From 2471596cdb47f681549fa943e2c7c83662cb2f1e Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Fri, 15 Nov 2019 11:50:13 +0000 Subject: [PATCH 20/46] test on 1.0 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 4f8acced..c2eb9ae0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ os: # - osx julia: + - 1.0 - 1.2 - 1.3 - nightly From 665e4419199c38a1490edba8862f0cb8f2edb8c6 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Fri, 15 Nov 2019 12:12:28 +0000 Subject: [PATCH 21/46] pkg up --- Manifest.toml | 59 ++++++++++++++++++--------------------------------- 1 file changed, 21 insertions(+), 38 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index f5a589fd..653be3dc 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -38,29 +38,23 @@ git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" version = "0.2.0" -[[CSTParser]] -deps = ["Tokenize"] -git-tree-sha1 = "99dda94f5af21a4565dc2b97edf6a95485f116c3" -uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f" -version = "1.0.0" - [[CUDAapi]] deps = ["Libdl", "Logging"] -git-tree-sha1 = "e063efb91cfefd7e6afd92c435d01398107a500b" +git-tree-sha1 = "6eee47385c81ed3b3f716b745697869c712c2df3" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" -version = "1.2.0" +version = "2.0.0" [[CUDAdrv]] -deps = ["CEnum", "Printf"] -git-tree-sha1 = "96eabc95ebb83e361311330ffb574a3e2df73251" +deps = ["CEnum", "CUDAapi", "Printf"] +git-tree-sha1 = "0f39fddace3324707469ace7fbcbc7b28d5cf921" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "4.0.2" +version = "4.0.4" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 
= "dd642afe5fd6633663a8c3d42f3b7638f2210b79" +git-tree-sha1 = "93f6c917ab2a9b5bb54f8f738f4ec1a6693cb716" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.5.3" +version = "2.5.5" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -98,17 +92,11 @@ git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032" uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d" version = "1.3.0" -[[Crayons]] -deps = ["Test"] -git-tree-sha1 = "f621b8ef51fd2004c7cf157ea47f027fdeac5523" -uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" -version = "4.0.0" - [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "bc94d6cb335d418088f12641751aab63ff56509d" +git-tree-sha1 = "6a05c9e40b99a6e9a7973ca93397a38d3e8a7b4b" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.4.2" +version = "1.4.6" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -164,9 +152,9 @@ version = "0.6.1" [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "adf88d6da1f0294058f38295becf8807986bb7d0" +git-tree-sha1 = "4407e7b76999eca2646abdb68203bd4302476168" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.5" +version = "0.10.6" [[GPUArrays]] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] @@ -216,10 +204,10 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" [[MacroTools]] -deps = ["CSTParser", "Compat", "DataStructures", "Test", "Tokenize"] -git-tree-sha1 = "d6e9dedb8c92c3465575442da456aec15a89ff76" +deps = ["Compat", "DataStructures", "Test"] +git-tree-sha1 = "82921f0e3bde6aebb8e524efc20f4042373c0c06" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.1" +version = "0.5.2" [[Markdown]] deps = ["Base64"] @@ -327,9 +315,9 @@ version = "0.8.0" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "1e9c5d89cba8047d518f1ffef432906ef1a3e8bd" +git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "0.12.0" +version = "0.12.1" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] @@ -346,15 +334,10 @@ deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[TimerOutputs]] -deps = ["Crayons", "Printf", "Test", "Unicode"] -git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c" +deps = ["Printf"] +git-tree-sha1 = "8f22dc0c23e1cd4ab8070a01ba32285926f104f1" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.0" - -[[Tokenize]] -git-tree-sha1 = "dfcdbbfb2d0370716c815cbd6f8a364efb6f42cf" -uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624" -version = "0.5.6" +version = "0.5.2" [[TranscodingStreams]] deps = ["Random", "Test"] @@ -389,9 +372,9 @@ version = "0.8.3" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "b2e42a21dc3d1ecd3cbe8c83a454ca56fbf423c4" +git-tree-sha1 = "e4245b9c5362346e154b62842a89a18e0210b92b" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.4.0" +version = "0.4.1" [[ZygoteRules]] deps = ["MacroTools"] From e24215ca982024ec8fe02a2c79fbaeb4e8dcfd91 
Mon Sep 17 00:00:00 2001 From: Mike Innes Date: Fri, 15 Nov 2019 15:59:42 +0000 Subject: [PATCH 22/46] guard test on 1.0 --- test/layers/normalisation.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 22a5d283..4399a256 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -191,6 +191,7 @@ end end +if VERSION >= v"1.1" @testset "GroupNorm" begin # begin tests squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions @@ -289,5 +290,5 @@ end x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test BN(x) ≈ GN(x) end - +end end From 20eb840882752228a49130aed0712da389f6db1a Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 15 Nov 2019 12:03:08 -0800 Subject: [PATCH 23/46] keeping activations separate --- src/layers/basic.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 75f18e3c..2a465208 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -31,8 +31,6 @@ applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) (c::Chain)(x) = applychain(c.layers, x) -(c::Chain)(x) = extraChain(c.layers, x) - Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) function Base.show(io::IO, c::Chain) From a0e3729679376c984de2eb06b9848b12acb89b9f Mon Sep 17 00:00:00 2001 From: Helios De Rosario Date: Fri, 15 Nov 2019 21:17:45 +0100 Subject: [PATCH 24/46] Update docs/src/training/training.md Co-Authored-By: Mike J Innes --- docs/src/training/training.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index a5474529..47bda1f5 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -3,7 +3,7 @@ To actually train a model we need four things: * A *objective function*, that evaluates how well a model is doing given some input data. -* The parameters of the model. +* The trainable parameters of the model. * A collection of data points that will be provided to the objective function. * An [optimiser](optimisers.md) that will update the model parameters appropriately. From 4530ac65c7f23c2cfb5f95f49b5fe4a7dd4f946d Mon Sep 17 00:00:00 2001 From: Troels Arnfred Bojesen Date: Tue, 19 Nov 2019 16:50:40 +0900 Subject: [PATCH 25/46] Fix Glorot initialization, add He initialization Should fix the issue reported at https://github.com/FluxML/Flux.jl/issues/442 . Adds He weight initialization as a bonus :-) --- src/utils.jl | 10 ++++++++-- test/utils.jl | 45 ++++++++++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 246c30d7..d3d01a11 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,6 +1,12 @@ # Arrays -glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0/sum(dims)) -glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0/sum(dims)) +nfan(n_in, n_out) = n_in, n_out #fan-in, fan-out +nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of convolution kernels + +glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...))) +glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...))) + +he_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / first(nfan(dims...))) +he_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / first(nfan(dims...))) ones(T::Type, dims...) 
= Base.ones(T, dims...) zeros(T::Type, dims...) = Base.zeros(T, dims...) diff --git a/test/utils.jl b/test/utils.jl index 18a57139..99492d4e 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,6 +1,7 @@ using Flux -using Flux: throttle, glorot_uniform, glorot_normal, stack, unstack -using StatsBase: std +using Flux: throttle, nfan, glorot_uniform, glorot_normal, he_uniform, he_normal, + stack, unstack +using StatsBase: var using Random using Test @@ -56,18 +57,36 @@ end # Set random seed so that these tests don't fail randomly Random.seed!(0) - # glorot_uniform should yield a kernel with stddev ~= sqrt(6/(n_in + n_out)), - # and glorot_normal should yield a kernel with stddev != 2/(n_in _ n_out) - for (n_in, n_out) in [(100, 100), (100, 400)] - v = glorot_uniform(n_in, n_out) - @test minimum(v) > -1.1*sqrt(6/(n_in + n_out)) - @test minimum(v) < -0.9*sqrt(6/(n_in + n_out)) - @test maximum(v) > 0.9*sqrt(6/(n_in + n_out)) - @test maximum(v) < 1.1*sqrt(6/(n_in + n_out)) + @testset "Fan in/out" begin + @test nfan(100, 200) == (100, 200) #For Dense layer + @test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer + @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer + @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D Conv layer + end - v = glorot_normal(n_in, n_out) - @test std(v) > 0.9*sqrt(2/(n_in + n_out)) - @test std(v) < 1.1*sqrt(2/(n_in + n_out)) + @testset "glorot" begin + # glorot_uniform and glorot_normal should both yield a kernel with + # variance ≈ 2/(fan_in + fan_out) + for dims ∈ [(100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] + for init ∈ [glorot_uniform, glorot_normal] + v = init(dims...) + fan_in, fan_out = nfan(dims...) + σ2 = 2 / (fan_in + fan_out) + @test 0.9σ2 < var(v) < 1.1σ2 + end + end + end + + @testset "he" begin + # he_uniform and he_normal should both yield a kernel with variance ≈ 2/fan_in + for dims ∈ [(100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] + for init ∈ [he_uniform, he_normal] + v = init(dims...) + fan_in, fan_out = nfan(dims...) + σ2 = 2 / fan_in + @test 0.9σ2 < var(v) < 1.1σ2 + end + end end end From df7ffb0ef852579a1348a4b66bf29e7181f2a5c9 Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Tue, 19 Nov 2019 16:27:44 +0800 Subject: [PATCH 26/46] Fix AMSGrad on GPU The previous initialization created a CPU array. Now, the same type of array as `x` is created. --- src/optimise/optimisers.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index ea2ef067..23adc6ec 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -349,10 +349,10 @@ AMSGrad(η = 0.001, β = (0.9, 0.999)) = AMSGrad(η, β, IdDict()) function apply!(o::AMSGrad, x, Δ) η, β = o.eta, o.beta - mt, vt, v̂t = get!(o.state, x, (fill(ϵ, size(x)), fill(ϵ, size(x)), fill(ϵ, size(x)))) + mt, vt, v̂t = get!(o.state, x, (fill!(zero(x), ϵ), fill!(zero(x), ϵ), fill!(zero(x), ϵ))) @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2 - @. v̂t = max.(v̂t, vt) + @. v̂t = max(v̂t, vt) @. 
Δ = η * mt / (√v̂t + ϵ) end From 2da22f31f076ff0a7a1b185a214509c58240ca6a Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Tue, 19 Nov 2019 16:31:04 +0800 Subject: [PATCH 27/46] Avoid unnecessary conversion This initialization works for both cpu and gpu --- src/optimise/optimisers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index ea2ef067..93237048 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -283,7 +283,7 @@ ADAGrad(η = 0.1) = ADAGrad(η, IdDict()) function apply!(o::ADAGrad, x, Δ) η = o.eta - acc = get!(o.acc, x, fill(ϵ, size(x)))::typeof(x) + acc = get!(o.acc, x, fill!(zero(x), ϵ))::typeof(x) @. acc += Δ^2 @. Δ *= η / (√acc + ϵ) end From 2b8057324858d10f96213c40cd596ae54fd0b54a Mon Sep 17 00:00:00 2001 From: Troels Arnfred Bojesen Date: Tue, 19 Nov 2019 18:16:29 +0900 Subject: [PATCH 28/46] Fix Glorot initialization, add He initialization Should fix #442 . Adds He weight initialization as a bonus :-) --- src/utils.jl | 8 +++++--- test/utils.jl | 10 ++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index d3d01a11..b2fe76bf 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,12 +1,14 @@ # Arrays -nfan(n_in, n_out) = n_in, n_out #fan-in, fan-out +nfan() = 1, 1 #fan_in, fan_out +nfan(n) = 1, n #A vector is treated as a n×1 matrix +nfan(n_out, n_in) = n_in, n_out #In case of Dense kernels: arranged as matrices nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of convolution kernels glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...))) glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...))) -he_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / first(nfan(dims...))) -he_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / first(nfan(dims...))) +he_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / last(nfan(dims...))) +he_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / last(nfan(dims...))) ones(T::Type, dims...) = Base.ones(T, dims...) zeros(T::Type, dims...) = Base.zeros(T, dims...) diff --git a/test/utils.jl b/test/utils.jl index 99492d4e..22b8f26a 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -58,7 +58,9 @@ end Random.seed!(0) @testset "Fan in/out" begin - @test nfan(100, 200) == (100, 200) #For Dense layer + @test nfan() == (1, 1) #For a constant + @test nfan(100) == (1, 100) #For vector + @test nfan(100, 200) == (200, 100) #For Dense layer @test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D Conv layer @@ -67,7 +69,7 @@ end @testset "glorot" begin # glorot_uniform and glorot_normal should both yield a kernel with # variance ≈ 2/(fan_in + fan_out) - for dims ∈ [(100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] + for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] for init ∈ [glorot_uniform, glorot_normal] v = init(dims...) fan_in, fan_out = nfan(dims...) @@ -79,11 +81,11 @@ end @testset "he" begin # he_uniform and he_normal should both yield a kernel with variance ≈ 2/fan_in - for dims ∈ [(100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] + for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] for init ∈ [he_uniform, he_normal] v = init(dims...) 
fan_in, fan_out = nfan(dims...) - σ2 = 2 / fan_in + σ2 = 2 / fan_out @test 0.9σ2 < var(v) < 1.1σ2 end end From 69bf84278f348d804d096d1d4c33c49e514780e2 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 7 Nov 2019 13:07:12 +0100 Subject: [PATCH 29/46] Remove wrong warning. --- src/Flux.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Flux.jl b/src/Flux.jl index 694bd10f..a6132a0b 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -41,7 +41,7 @@ include("deprecations.jl") function __init__() if !CUDAdrv.functional() - @warn "CUDA available, but CUDAdrv.jl failed to load" + # nothing to do here, the user doesn't have CUDA elseif length(devices()) == 0 @warn "CUDA available, but no GPU detected" elseif !CuArrays.functional() From bd734ed9571bbbb2afa8205eaafcac91e055419e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Nov 2019 15:55:25 +0100 Subject: [PATCH 30/46] Bump CUDA dependencies. --- Manifest.toml | 12 ++++++------ Project.toml | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 653be3dc..bb488879 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -94,9 +94,9 @@ version = "1.3.0" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "6a05c9e40b99a6e9a7973ca93397a38d3e8a7b4b" +git-tree-sha1 = "4757376a85ffb27d4c4f6cdf9635261e6c3a5fec" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.4.6" +version = "1.4.7" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -141,9 +141,9 @@ version = "1.0.1" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "6827a8f73ff12707f209c920d204238a16892b55" +git-tree-sha1 = "b2cf74f09216cfe3c241e8484178ec0ea941870f" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.0" +version = "0.8.1" [[FixedPointNumbers]] git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" @@ -335,9 +335,9 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[TimerOutputs]] deps = ["Printf"] -git-tree-sha1 = "8f22dc0c23e1cd4ab8070a01ba32285926f104f1" +git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.2" +version = "0.5.3" [[TranscodingStreams]] deps = ["Random", "Test"] diff --git a/Project.toml b/Project.toml index 587a459b..eae220d8 100644 --- a/Project.toml +++ b/Project.toml @@ -25,8 +25,8 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] -CUDAdrv = "4.0.1" -CuArrays = "1.4.2" +CUDAdrv = "4.0.3" +CuArrays = "1.4.3" NNlib = "0.6" Zygote = "0.4" julia = "1" From c45cec4cba587da9461bfb55ffe276758f442031 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Nov 2019 16:05:41 +0100 Subject: [PATCH 31/46] Simplify warning. 
--- Project.toml | 2 -- src/Flux.jl | 13 ++++--------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/Project.toml b/Project.toml index eae220d8..7f4ab464 100644 --- a/Project.toml +++ b/Project.toml @@ -5,7 +5,6 @@ version = "0.9.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" @@ -25,7 +24,6 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] -CUDAdrv = "4.0.3" CuArrays = "1.4.3" NNlib = "0.6" Zygote = "0.4" diff --git a/src/Flux.jl b/src/Flux.jl index a6132a0b..d0e0d5bf 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -21,8 +21,7 @@ export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay -ENV["CUDA_INIT_SILENT"] = true -using CUDAdrv, CuArrays +using CuArrays const use_cuda = Ref(false) include("utils.jl") @@ -40,12 +39,8 @@ include("data/Data.jl") include("deprecations.jl") function __init__() - if !CUDAdrv.functional() - # nothing to do here, the user doesn't have CUDA - elseif length(devices()) == 0 - @warn "CUDA available, but no GPU detected" - elseif !CuArrays.functional() - @warn "CUDA GPU available, but CuArrays.jl failed to load" + if !CuArrays.functional() + # nothing to do here, and either CuArrays or one of its dependencies will have warned else use_cuda[] = true @@ -54,7 +49,7 @@ function __init__() if CuArrays.has_cudnn() include(joinpath(@__DIR__, "cuda/cuda.jl")) else - @warn "CUDA GPU available, but CuArrays.jl did not find libcudnn. Some functionality will not be available." + @warn "CuArrays.jl did not find libcudnn. Some functionality will not be available." end end end From af96a197c1d019ac0ac6cbc2c97c64d688f8aa80 Mon Sep 17 00:00:00 2001 From: Troels Arnfred Bojesen Date: Wed, 20 Nov 2019 13:20:42 +0900 Subject: [PATCH 32/46] Fix Glorot initialization Should fix #442 --- src/utils.jl | 3 --- test/utils.jl | 15 +-------------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index b2fe76bf..324d87c8 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -7,9 +7,6 @@ nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of conv glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...))) glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...))) -he_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / last(nfan(dims...))) -he_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / last(nfan(dims...))) - ones(T::Type, dims...) = Base.ones(T, dims...) zeros(T::Type, dims...) = Base.zeros(T, dims...) diff --git a/test/utils.jl b/test/utils.jl index 22b8f26a..1c275e85 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,6 +1,5 @@ using Flux -using Flux: throttle, nfan, glorot_uniform, glorot_normal, he_uniform, he_normal, - stack, unstack +using Flux: throttle, nfan, glorot_uniform, glorot_normal, stack, unstack using StatsBase: var using Random using Test @@ -78,18 +77,6 @@ end end end end - - @testset "he" begin - # he_uniform and he_normal should both yield a kernel with variance ≈ 2/fan_in - for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] - for init ∈ [he_uniform, he_normal] - v = init(dims...) - fan_in, fan_out = nfan(dims...) 
- σ2 = 2 / fan_out - @test 0.9σ2 < var(v) < 1.1σ2 - end - end - end end @testset "Params" begin From a0314ce682945fe0e582be7cd0d92a07b305407a Mon Sep 17 00:00:00 2001 From: matsueushi Date: Fri, 22 Nov 2019 05:23:24 +0000 Subject: [PATCH 33/46] Fix logitbinarycrossentropy on CuArrays --- src/layers/stateless.jl | 3 +++ test/cuda/cuda.jl | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 5f9c1090..870a6cdf 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -53,6 +53,9 @@ but it is more numerically stable. """ logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) +# Re-definition to fix interaction with CuArrays. +CuArrays.@cufunc logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) + """ normalise(x::AbstractArray; dims=1) diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index ddd92e1e..1576d88f 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -31,9 +31,10 @@ cx = gpu(x) @test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0) @test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0])) -x = σ.([-1.1491, 0.8619, 0.3127]) +x = [-1.1491, 0.8619, 0.3127] y = [1, 1, 0.] -@test Flux.binarycrossentropy.(x,y) ≈ Flux.binarycrossentropy.(cu(x),cu(y)) +@test Flux.binarycrossentropy.(σ.(x),y) ≈ Flux.binarycrossentropy.(cu(σ.(x)),cu(y)) +@test Flux.logitbinarycrossentropy.(x,y) ≈ Flux.logitbinarycrossentropy.(cu(x),cu(y)) xs = rand(5, 5) ys = Flux.onehotbatch(1:5,1:5) From 4ece13c6491059eee466e32d8506193c69184880 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Nov 2019 18:03:51 +0100 Subject: [PATCH 34/46] Don't include the CUDA module during precompilation. If we do, we could end up replacing it at runtime. 
--- src/Flux.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Flux.jl b/src/Flux.jl index d0e0d5bf..905cb638 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -39,6 +39,12 @@ include("data/Data.jl") include("deprecations.jl") function __init__() + precompiling = ccall(:jl_generating_output, Cint, ()) != 0 + + # we don't want to include the CUDA module when precompiling, + # or we could end up replacing it at run time (triggering a warning) + precompiling && return + if !CuArrays.functional() # nothing to do here, and either CuArrays or one of its dependencies will have warned else From 5f21238d1a6235940127b30763d05d9998a14cdb Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sun, 24 Nov 2019 13:25:02 +0530 Subject: [PATCH 35/46] no grad dims helper --- src/Flux.jl | 2 +- src/layers/conv.jl | 2 ++ test/layers/conv.jl | 6 ++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Flux.jl b/src/Flux.jl index d0e0d5bf..4c5aa2ab 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -6,7 +6,7 @@ using Base: tail using Zygote, MacroTools, Juno, Reexport, Statistics, Random using MacroTools: @forward @reexport using NNlib -using Zygote: Params, @adjoint, gradient, pullback +using Zygote: Params, @adjoint, gradient, pullback, @nograd export gradient export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 519f129f..d33c8da5 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -118,6 +118,8 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray) ) end +@nograd conv_transpose_dims + function (c::ConvTranspose)(x::AbstractArray) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) diff --git a/test/layers/conv.jl b/test/layers/conv.jl index aa3925f1..4bf80234 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -1,5 +1,6 @@ using Flux, Test using Flux: maxpool, meanpool +using Flux: gradient @testset "Pooling" begin x = randn(Float32, 10, 10, 3, 2) @@ -54,6 +55,11 @@ end y = Conv((3,3), 1 => 1)(x) x_hat = ConvTranspose((3, 3), 1 => 1)(y) @test size(x_hat) == size(x) + m = ConvTranspose((3,3), 2=>1) + x = rand(10,10,2,1) + + # Test that the gradient call does not throw: #900 + @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads end @testset "CrossCor" begin From c031ae1a949fe77b328edc272826650aa7fcce50 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sun, 24 Nov 2019 13:31:31 +0530 Subject: [PATCH 36/46] correct channel value --- test/layers/conv.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 4bf80234..b4136062 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -55,9 +55,8 @@ end y = Conv((3,3), 1 => 1)(x) x_hat = ConvTranspose((3, 3), 1 => 1)(y) @test size(x_hat) == size(x) - m = ConvTranspose((3,3), 2=>1) - x = rand(10,10,2,1) + m = ConvTranspose((3,3), 1=>1) # Test that the gradient call does not throw: #900 @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads end From 59bb0d81b020a33155e56add14f50ef20397ceaa Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 26 Nov 2019 16:23:09 +0530 Subject: [PATCH 37/46] add TODO --- src/layers/conv.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index d33c8da5..f4de3ffc 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -118,6 +118,7 @@ function 
conv_transpose_dims(c::ConvTranspose, x::AbstractArray) ) end +# TODO: Find proper fix for https://github.com/FluxML/Flux.jl/issues/900 @nograd conv_transpose_dims function (c::ConvTranspose)(x::AbstractArray) From 1c0e9acc45bd85c56d95f476ab203e7f72481728 Mon Sep 17 00:00:00 2001 From: Mike Innes Date: Tue, 26 Nov 2019 15:33:41 +0000 Subject: [PATCH 38/46] Update CuArrays to include the workspace fix. --- Manifest.toml | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index bb488879..c0618c8e 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -94,7 +94,9 @@ version = "1.3.0" [[CuArrays]] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] -git-tree-sha1 = "4757376a85ffb27d4c4f6cdf9635261e6c3a5fec" +git-tree-sha1 = "7e00178b18672ee2cf37244ac2a273b6b0701b04" +repo-rev = "master" +repo-url = "https://github.com/JuliaGPU/CuArrays.jl.git" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" version = "1.4.7" @@ -105,9 +107,9 @@ version = "1.1.0" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "1fe8fad5fc84686dcbc674aa255bc867a64f8132" +git-tree-sha1 = "a1b652fb77ae8ca7ea328fa7ba5aa151036e5c10" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.5" +version = "0.17.6" [[Dates]] deps = ["Printf"] @@ -124,10 +126,10 @@ uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" version = "0.0.4" [[DiffRules]] -deps = ["Random", "Test"] -git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7" +deps = ["NaNMath", "Random", "SpecialFunctions"] +git-tree-sha1 = "f734b5f6bc9c909027ef99f6d91d5d9e4b111eed" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "0.0.10" +version = "0.1.0" [[Distributed]] deps = ["Random", "Serialization", "Sockets"] @@ -141,9 +143,9 @@ version = "1.0.1" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "b2cf74f09216cfe3c241e8484178ec0ea941870f" +git-tree-sha1 = "1a9fe4e1323f38de0ba4da49eafd15b25ec62298" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.1" +version = "0.8.2" [[FixedPointNumbers]] git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" @@ -235,10 +237,9 @@ uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" version = "0.6.0" [[NaNMath]] -deps = ["Compat"] -git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2" +git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f" uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" -version = "0.3.2" +version = "0.3.3" [[OrderedCollections]] deps = ["Random", "Serialization", "Test"] @@ -248,9 +249,9 @@ version = "1.1.0" [[Parsers]] deps = ["Dates", "Test"] -git-tree-sha1 = "c56ecb484f286639f161e712b8311f5ab77e8d32" +git-tree-sha1 = "0139ba59ce9bc680e2925aec5b7db79065d60556" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "0.3.8" +version = "0.3.10" [[Pkg]] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] From 99f98ca800ff959ab8a5e9c34758eb2a6f3ad00d Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 28 Nov 2019 16:00:21 +0000 Subject: [PATCH 39/46] Update README.md --- README.md | 88 ++----------------------------------------------------- 1 file changed, 2 insertions(+), 86 deletions(-) diff --git a/README.md b/README.md index d8af28ae..4196b926 100644 --- a/README.md +++ b/README.md @@ -7,93 +7,9 @@ Flux is an elegant approach to machine 
learning. It's a 100% pure-Julia stack, and provides lightweight abstractions on top of Julia's native GPU and AD support. Flux makes the easy things easy while remaining fully hackable. ```julia -julia> Pkg.add("Flux") +] add Flux ``` See the [documentation](https://fluxml.github.io/Flux.jl/) or the [model zoo](https://github.com/FluxML/model-zoo/) for examples. -If you use Flux in research, please cite the following paper: - -``` -@article{innes:2018, - author = {Mike Innes}, - title = {Flux: Elegant Machine Learning with Julia}, - journal = {Journal of Open Source Software}, - year = {2018}, - doi = {10.21105/joss.00602}, -} -``` - -## Features - -Flux has powerful high-level features, and common architectures can be defined in a few lines. - -```julia -model = Chain( - Dense(768, 128, σ), - LSTM(128, 256), - LSTM(256, 128), - Dense(128, 10), - softmax) - -loss(x, y) = crossentropy(model(x), y) - -Flux.train!(loss, params(model), data, ADAM(...)) -``` - -Yet you can easily strip away the layers, and directly write the mathematics for your problem. Flux will seamlessly take gradients of any Julia code, so your model looks just like the paper. - -```julia -W = param(randn(2, 10)) -b = param(randn(2)) - -y(x) = σ.(W * x .+ b) -``` - -If that's *still* not enough, you can go as deep as you want, even writing your own CUDA kernels with [CUDAnative](https://github.com/JuliaGPU/CUDAnative.jl)! All this can be freely mixed-and-matched in a single model or script, and it all runs interactively via Jupyter or Juno. - -```julia -function gpu_add(a, b, c) - i = (blockIdx().x-1) * blockDim().x + threadIdx().x - c[i] = a[i] + b[i] - return nothing -end -``` - -Unusual architectures are no problem in Flux, as you can use all the loops, control flow and even macros that you're used to. Here's a Tree RNN in 4 lines. - -```julia -tree() = rand() < 0.5 ? rand(10) : (tree(), tree()) # dummy data - -shrink = Dense(20, 10) -combine(a, b) = shrink([a; b]) - -model(x) = x -model(x::Tuple) = combine(model(x[1]), model(x[2])) - -model(tree()) # Sample output -``` - -Despite this flexibility, Julia's advanced compiler lets us do some powerful optimisations. For example, this definition of `sigmoid` automatically gets fused into a *single* GPU kernel – so it's really fast. - -```julia -sigmoid(xs) = 1 ./ (1 .+ exp.(.-xs)) -``` - -Similarly, Flux is the first dynamic framework to support [compiling to the browser](https://fluxml.github.io/experiments/) and model import via [formats like ONNX](https://github.com/FluxML/ONNX.jl/), both of which are thinly-veiled compiler problems. - -For more on our philosophy on machine learning, check out our article [On Machine Learning & Programming Languages](https://julialang.org/blog/2017/12/ml&pl). - -## Contributing & Help - -For general questions and help, check out Julia's [community forum](https://discourse.julialang.org/c/domain/ML). - -Flux development is carried out via our [GitHub issues](https://github.com/FluxML/Flux.jl/issues), so feel free to open feature requests or PRs here. - -For more informal discussions we'd love to have you on the [Julia slack](https://slackinvite.julialang.org/), where we hang out on the #machine-learning channel. - -## Related Packages - -Check out [Metalhead.jl](https://github.com/FluxML/Metalhead.jl) for common computer vision datasets and trained models. - -[MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) provides further common datasets. +If you use Flux in research, please see [CITATION.bib] for papers to cite. 
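
The trimmed README now defers examples to the docs and the model zoo. For quick reference, here is a condensed sketch in the spirit of the inline example removed above — the layer sizes, dummy data, and default `ADAM()` settings are placeholders for illustration, not anything prescribed by these patches:

```julia
using Flux

# A small feed-forward model, loosely following the example the old README carried inline.
model = Chain(
  Dense(768, 128, σ),
  Dense(128, 10),
  softmax)

loss(x, y) = Flux.crossentropy(model(x), y)

# One dummy (input, target) pair; real datasets are linked from the model zoo.
data = [(rand(Float32, 768, 1), Flux.onehotbatch([3], 1:10))]

Flux.train!(loss, params(model), data, ADAM())
```
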
From 75d609ecc87875ebb885f20a2e54d22f6b18cc8b Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 28 Nov 2019 16:00:55 +0000 Subject: [PATCH 40/46] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4196b926..ef090f5b 100644 --- a/README.md +++ b/README.md @@ -12,4 +12,4 @@ Flux is an elegant approach to machine learning. It's a 100% pure-Julia stack, a See the [documentation](https://fluxml.github.io/Flux.jl/) or the [model zoo](https://github.com/FluxML/model-zoo/) for examples. -If you use Flux in research, please see [CITATION.bib] for papers to cite. +If you use Flux in research, please see [our papers](CITATION.bib) for appropriate citations. From 4481c74f50e9b9ce03bd1d21027d0cf99e44b7b7 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 28 Nov 2019 21:45:06 +0530 Subject: [PATCH 41/46] v0.10 changes --- NEWS.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/NEWS.md b/NEWS.md index 26853df3..80239760 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,18 @@ +# v0.10.0 +* The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669) + - The dependency on Tracker.jl has been removed. + - This means Flux now does not depend on using a specialised `TrackedArray` type, and can be used with normal Array implementations directly. + - Tracker compatibility is maintained in most common cases, but Zygote will be the preferred AD backend for Flux from now on. +* The CUDNN wrappers have been [moved from Flux into CuArrays](https://github.com/FluxML/Flux.jl/pull/874), to allow for better supporting the CUDA backend, and improve user experience, not to mention making Flux lean. +* `*crossentropy` functions now [work as expected with CuArrays](https://github.com/FluxML/Flux.jl/pull/926). [PR for binarycrossentropy](https://github.com/FluxML/Flux.jl/pull/940). +* Added a new [RADAM optimiser](https://github.com/FluxML/Flux.jl/pull/842) +* Added [clearer docs](https://github.com/FluxML/Flux.jl/pull/904) around training and the Optimiser interface. +* [Layer initialisations](https://github.com/FluxML/Flux.jl/pull/937) have been improved with a clearer API on how to extend it for other purposes. +* [Better messaging around CUDA availability](https://github.com/FluxML/Flux.jl/pull/924), with hooks to initialize the GPU as default where possible. +* @treelike has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation. +* `testmode!` is deprecated in favour of [istraining](https://github.com/FluxML/Flux.jl/pull/669) + + # v0.9.0 * [Depthwise convolutional layer API changes](https://github.com/FluxML/Flux.jl/pull/756) from `in => mult` channel specification to `in => out` channel specification, and deprecates implicit `out` constructor. * New [SkipConnection](https://github.com/FluxML/Flux.jl/pull/446), which can be used to train residual neural network architectures. From 1ae554d82c8572bafa2287dee249581aad14596e Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 28 Nov 2019 21:47:37 +0530 Subject: [PATCH 42/46] rm new line --- NEWS.md | 1 - 1 file changed, 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 80239760..d4375458 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,7 +12,6 @@ * @treelike has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation. 
 * `testmode!` is deprecated in favour of [istraining](https://github.com/FluxML/Flux.jl/pull/669)
-
 
 # v0.9.0
 * [Depthwise convolutional layer API changes](https://github.com/FluxML/Flux.jl/pull/756) from `in => mult` channel specification to `in => out` channel specification, and deprecates implicit `out` constructor.
 * New [SkipConnection](https://github.com/FluxML/Flux.jl/pull/446), which can be used to train residual neural network architectures.

From c17dc34e383c27f4edbe93c30bc6aa092eeba3a0 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi
Date: Thu, 28 Nov 2019 21:49:34 +0530
Subject: [PATCH 43/46] phew

Co-Authored-By: Mike J Innes
---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index d4375458..7c964956 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,7 +9,7 @@
 * Added [clearer docs](https://github.com/FluxML/Flux.jl/pull/904) around training and the Optimiser interface.
 * [Layer initialisations](https://github.com/FluxML/Flux.jl/pull/937) have been improved with a clearer API on how to extend it for other purposes.
 * [Better messaging around CUDA availability](https://github.com/FluxML/Flux.jl/pull/924), with hooks to initialize the GPU as default where possible.
-* @treelike has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation.
+* `@treelike` has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation.
 * `testmode!` is deprecated in favour of [istraining](https://github.com/FluxML/Flux.jl/pull/669)
 
 # v0.9.0

From b65b491e516cb3ff209a4d2c93b551116a6ee2ac Mon Sep 17 00:00:00 2001
From: Mike Innes
Date: Thu, 28 Nov 2019 16:23:22 +0000
Subject: [PATCH 44/46] compat, pkg up

---
 Manifest.toml | 14 ++++++--------
 Project.toml | 9 +++++++++
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index c0618c8e..be9bf768 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -1,5 +1,3 @@
-# This file is machine-generated - editing it directly is not advised
-
 [[AbstractFFTs]]
 deps = ["LinearAlgebra"]
 git-tree-sha1 = "380e36c66edfa099cd90116b24c1ce8cafccac40"
@@ -132,7 +130,7 @@ uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
 version = "0.1.0"
 
 [[Distributed]]
-deps = ["Random", "Serialization", "Sockets"]
+deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"]
 uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 
 [[FFTW]]
@@ -154,9 +152,9 @@ version = "0.6.1"
 
 [[ForwardDiff]]
 deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"]
-git-tree-sha1 = "4407e7b76999eca2646abdb68203bd4302476168"
+git-tree-sha1 = "da46ac97b17793eba44ff366dc6cb70f1238a738"
 uuid = "f6369f11-7733-5829-9624-2563aa707210"
-version = "0.10.6"
+version = "0.10.7"
 
 [[GPUArrays]]
 deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
@@ -171,7 +169,7 @@ uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
 version = "0.3.0"
 
 [[InteractiveUtils]]
-deps = ["Markdown"]
+deps = ["LinearAlgebra", "Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 
 [[JSON]]
@@ -254,7 +252,7 @@ uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
 version = "0.3.10"
 
 [[Pkg]]
-deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
+deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 
 [[Printf]]
@@ -353,7 +351,7 @@ uuid = "30578b45-9adc-5946-b283-645ec420af67"
 version = "0.4.0"
 
 [[UUIDs]]
-deps = ["Random", "SHA"]
+deps = ["Random"]
 uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
 [[Unicode]]
diff --git a/Project.toml b/Project.toml
index 7f4ab464..bc5e9de8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -24,8 +24,17 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
+AbstractTrees = "0.2"
+Adapt = "1"
+CodecZlib = "0.5, 0.6"
+Colors = "0.8, 0.9"
 CuArrays = "1.4.3"
+Juno = "0.5, 0.6, 0.7"
+MacroTools = "0.3, 0.4, 0.5"
 NNlib = "0.6"
+Reexport = "0.2"
+StatsBase = "0"
+ZipFile = "0.7, 0.8"
 Zygote = "0.4"
 julia = "1"
 

From 73d572b1a9e60f61b46390d0050ccb5a347dd7be Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi
Date: Thu, 28 Nov 2019 23:57:01 +0530
Subject: [PATCH 45/46] rm RADAM

---
 NEWS.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index d4375458..faf3fe49 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -5,7 +5,6 @@
 - Tracker compatibility is maintained in most common cases, but Zygote will be the preferred AD backend for Flux from now on.
 * The CUDNN wrappers have been [moved from Flux into CuArrays](https://github.com/FluxML/Flux.jl/pull/874), to allow for better supporting the CUDA backend, and improve user experience, not to mention making Flux lean.
 * `*crossentropy` functions now [work as expected with CuArrays](https://github.com/FluxML/Flux.jl/pull/926). [PR for binarycrossentropy](https://github.com/FluxML/Flux.jl/pull/940).
-* Added a new [RADAM optimiser](https://github.com/FluxML/Flux.jl/pull/842)
 * Added [clearer docs](https://github.com/FluxML/Flux.jl/pull/904) around training and the Optimiser interface.
 * [Layer initialisations](https://github.com/FluxML/Flux.jl/pull/937) have been improved with a clearer API on how to extend it for other purposes.
 * [Better messaging around CUDA availability](https://github.com/FluxML/Flux.jl/pull/924), with hooks to initialize the GPU as default where possible.

From 4b63e69b656e7f41cd37dec4378e703a7f81ff07 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi
Date: Fri, 29 Nov 2019 00:02:59 +0530
Subject: [PATCH 46/46] bump version to v0.10

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 7f4ab464..a64f272b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "Flux"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.9.0"
+version = "0.10.0"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"