From fc4827c48fce279bf76d4a90a3014135bec95fc1 Mon Sep 17 00:00:00 2001 From: Lyndon White Date: Tue, 7 May 2019 16:38:21 +0100 Subject: [PATCH 01/10] Some cleanup on performance tips --- docs/src/performance.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/src/performance.md b/docs/src/performance.md index 00b94a9d..95a64217 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -14,11 +14,11 @@ Which means allocations occur much faster. And you use less memory. -## Make sure your custom activation functions preserve the type of their inputs -Not only should your activation functions be [type-stable](https://docs.julialang.org/en/v1/manual/performance-tips/#Write-%22type-stable%22-functions-1), +## Make sure your activation and loss functions preserve the type of their inputs +Not only should your activation and loss functions be [type-stable](https://docs.julialang.org/en/v1/manual/performance-tips/#Write-%22type-stable%22-functions-1), they should also preserve the type of their inputs. -A very artificial example using an activatioon function like +A very artificial example using an activation function like ``` my_tanh(x) = Float64(tanh(x)) @@ -26,6 +26,7 @@ A very artificial example using an activatioon function like will result in performance on `Float32` input orders of magnitude slower than the normal `tanh` would, because it results in having to use slow mixed type multiplication in the dense layers. +Similar can occur in the loss function during backpropagation. Which means if you change your data say from `Float64` to `Float32` (which should give a speedup: see above), you will see a large slow-down @@ -60,7 +61,7 @@ end It is much faster to concatenate them into a matrix, as this will hit BLAS matrix-matrix multiplication, which is much faster than the equivalent sequence of matrix-vector multiplications. -Even though this means allocating new memory to store them contiguously. +The improvement is enough that it is worthwild allocating new memory to store them contiguously. ```julia x_batch = reduce(hcat, xs) From fe759ac43ce859f00e79d3ccefc940cfb33ffe32 Mon Sep 17 00:00:00 2001 From: Lyndon White Date: Tue, 28 May 2019 14:19:56 +0100 Subject: [PATCH 02/10] Update docs/src/performance.md Co-Authored-By: Kristoffer Carlsson --- docs/src/performance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/performance.md b/docs/src/performance.md index 95a64217..fc663324 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -26,7 +26,7 @@ A very artificial example using an activation function like will result in performance on `Float32` input orders of magnitude slower than the normal `tanh` would, because it results in having to use slow mixed type multiplication in the dense layers. -Similar can occur in the loss function during backpropagation. +Similar situations can occur in the loss function during backpropagation. 
Which means if you change your data say from `Float64` to `Float32` (which should give a speedup: see above), you will see a large slow-down From b24e05bb20d7b02a1794df86356bce25bc325052 Mon Sep 17 00:00:00 2001 From: Jason Wu Date: Tue, 2 Jul 2019 13:15:54 -0400 Subject: [PATCH 03/10] Fix lack of x --- docs/src/performance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/performance.md b/docs/src/performance.md index 682b7231..7b58316d 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -41,7 +41,7 @@ While one could change your activation function (e.g. to use `0.01f0x`) to avoid the idiomatic (and safe way) is to use `oftype`. ``` - leaky_tanh(x) = oftype(x/1, 0.01) + tanh(x) + leaky_tanh(x) = oftype(x/1, 0.01)x + tanh(x) ``` From 16d5f2bc2430577dd64b49afd5baa09f94152a7a Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Mon, 8 Jul 2019 23:11:35 +0200 Subject: [PATCH 04/10] Add x to seen in prefor to avoid infinite recursion if passed something self-referential --- src/treelike.jl | 1 + test/utils.jl | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/src/treelike.jl b/src/treelike.jl index 443a91e2..ccb0fe81 100644 --- a/src/treelike.jl +++ b/src/treelike.jl @@ -31,6 +31,7 @@ end function prefor(f, x; seen = IdSet()) x ∈ seen && return + push!(seen, x) f(x) foreach(x -> prefor(f, x, seen = seen), children(x)) return diff --git a/test/utils.jl b/test/utils.jl index 7bcf72c3..2453d8b8 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -85,6 +85,17 @@ end @test size.(params(m)) == [(5, 10), (5,)] m = RNN(10, 5) @test size.(params(m)) == [(5, 10), (5, 5), (5,), (5,)] + + # Layer duplicated in same chain, params just once pls. + c = Chain(m, m) + @test size.(params(c)) == [(5, 10), (5, 5), (5,), (5,)] + + # Recursive struct. Just want params, no stack overflow pls. + mutable struct R m;r end + Flux.@treelike R + r = R(m, nothing) + r.r = r + @test size.(params(r)) == [(5, 10), (5, 5), (5,), (5,)] end @testset "Basic Stacking" begin From 9b96a3d69b2f9e40e28a76a09360bcbaf5fe666b Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Tue, 9 Jul 2019 01:15:55 +0200 Subject: [PATCH 05/10] Change to array due to "type definition not allowed inside a local scope" --- test/utils.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/utils.jl b/test/utils.jl index 2453d8b8..366f02b0 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -90,11 +90,10 @@ end c = Chain(m, m) @test size.(params(c)) == [(5, 10), (5, 5), (5,), (5,)] - # Recursive struct. Just want params, no stack overflow pls. - mutable struct R m;r end - Flux.@treelike R - r = R(m, nothing) - r.r = r + # Self-referential array. Just want params, no stack overflow pls. + r = Any[nothing,m] + Flux.children(a::Vector{Any}) = Tuple(a) + r[1] = r @test size.(params(r)) == [(5, 10), (5, 5), (5,), (5,)] end From 27904d349cdbda17ff0b1aa6a4f80dd254036ee0 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 11 Jul 2019 16:11:32 +0100 Subject: [PATCH 06/10] Update performance.md --- docs/src/performance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/performance.md b/docs/src/performance.md index fc663324..903200ee 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -61,7 +61,7 @@ end It is much faster to concatenate them into a matrix, as this will hit BLAS matrix-matrix multiplication, which is much faster than the equivalent sequence of matrix-vector multiplications. 
-The improvement is enough that it is worthwild allocating new memory to store them contiguously. +The improvement is enough that it is worthwhile allocating new memory to store them contiguously. ```julia x_batch = reduce(hcat, xs) From ed12d4e7c04207a44e4c11a96b970228fd3b16e1 Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Wed, 31 Jul 2019 17:56:51 -0400 Subject: [PATCH 07/10] Momentum doesn't need params --- src/optimise/optimisers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 2319cfdb..939a4678 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -23,7 +23,7 @@ function apply!(o::Descent, x, Δ) end """ - Momentum(params, η = 0.01; ρ = 0.9) + Momentum(η = 0.01; ρ = 0.9) Gradient descent with learning rate `η` and momentum `ρ`. """ From 4d00957b36a55647d37fca0a174251445f7c161c Mon Sep 17 00:00:00 2001 From: Moelf Date: Tue, 6 Aug 2019 22:23:21 +0200 Subject: [PATCH 08/10] Fix CuArray zeros deprecation --- src/cuda/curnn.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cuda/curnn.jl b/src/cuda/curnn.jl index 09f6d43c..4990599f 100644 --- a/src/cuda/curnn.jl +++ b/src/cuda/curnn.jl @@ -63,7 +63,7 @@ function RNNDesc{T}(mode::Int, input::Int, hidden::Int; layers = 1) where T @check ccall((:cudnnSetRNNDescriptor_v6,libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Cint,Ptr{Nothing},Cint,Cint,Cint,Cint,Cint), handle(),d[],hidden,layers,dropoutDesc,inputMode,direction,mode,algo,cudnnDataType(T)) - w = cuzeros(T, rnnParamSize(T, d[], input)) + w = CuArrays.zeros(T, rnnParamSize(T, d[], input)) # TODO: avoid reserve allocation here rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[]) finalizer(rd) do x From 7c111e7cdeda91826490ed55912973dd629b6623 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Fri, 9 Aug 2019 13:53:11 +0100 Subject: [PATCH 09/10] fixes #645 fixes #831 --- src/layers/basic.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 12d4e2e3..83eeee21 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -110,7 +110,7 @@ end (a::Dense{<:Any,W})(x::AbstractArray{T}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = invoke(a, Tuple{AbstractArray}, x) -(a::Dense{<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = +(a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) """ From 14affbc91bf290fa69e3b340a23a9584fcf946b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Madrid=20Menc=C3=ADa?= Date: Sun, 11 Aug 2019 13:38:44 +0200 Subject: [PATCH 10/10] Use `CuArrays.ones` instead `cuones` which is deprecated --- src/cuda/curnn.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cuda/curnn.jl b/src/cuda/curnn.jl index 4990599f..c60104d2 100644 --- a/src/cuda/curnn.jl +++ b/src/cuda/curnn.jl @@ -130,8 +130,8 @@ end # TODO: can we just manipulate strides here? # TODO: should use repmat, but this isn't implemented. hBatch(x::AbstractVector, h::CuVector) = h -hBatch(x::AbstractMatrix, h::CuVector) = h .* cuones(1, size(x, 2)) -hBatch(x::AbstractMatrix, h::CuMatrix) = h .* cuones(1, size(h,2) == 1 ? size(x,2) : 1) +hBatch(x::AbstractMatrix, h::CuVector) = h .* CuArrays.ones(1, size(x, 2)) +hBatch(x::AbstractMatrix, h::CuMatrix) = h .* CuArrays.ones(1, size(h,2) == 1 ? 
size(x,2) : 1)
 
 function forward(rnn::RNNDesc{T}, x::CuArray{T}, h_::CuArray{T}, c_ = nothing,
                  train = Val{false}) where T
   h = hBatch(x, h_)
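
The type-preservation tip touched by patches 01, 02 and 03 above is easiest to see in a small, self-contained sketch (plain Julia, no Flux needed; an explicit `*` is used instead of juxtaposition so the snippet parses on its own):

```julia
# Forces Float64 regardless of the input type; downstream dense layers then
# fall into slow mixed-type (Float32 with Float64) multiplication.
my_tanh(x) = Float64(tanh(x))

# Subtler version of the same problem: the literal 0.01 is a Float64,
# so 0.01x promotes a Float32 input to Float64.
leaky_tanh_bad(x) = 0.01x + tanh(x)

# The idiomatic fix: oftype(x/1, 0.01) converts the constant to the
# floating-point type of x, so the input type is preserved.
leaky_tanh(x) = oftype(x/1, 0.01) * x + tanh(x)

x = 0.5f0                    # a Float32 input
typeof(my_tanh(x))           # Float64 -- type not preserved
typeof(leaky_tanh_bad(x))    # Float64 -- type not preserved
typeof(leaky_tanh(x))        # Float32 -- type preserved
```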
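
The batching tip from patches 01 and 06 above, as a rough sketch. The model size, the random data, and the `Dense(10, 5, tanh)` constructor form are illustrative assumptions matching the Flux version these patches target (newer releases spell it `Dense(10 => 5, tanh)`):

```julia
using Flux

# Hypothetical model and data, sized only for illustration.
model = Dense(10, 5, tanh)
xs = [rand(Float32, 10) for _ in 1:1000]    # 1000 input vectors of length 10

# Slow: one matrix-vector multiplication (and one allocation) per input.
ys = [model(x) for x in xs]

# Fast: concatenate once, then a single BLAS matrix-matrix multiplication.
x_batch = reduce(hcat, xs)                  # 10×1000 matrix
y_batch = model(x_batch)                    # 5×1000 matrix; column i corresponds to xs[i]
```

As the doc change in patch 06 says, the extra allocation for `x_batch` is more than paid for by replacing a thousand matrix-vector products with one BLAS matrix-matrix product.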
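
Patches 04 and 05 above fix an infinite recursion in `prefor` by marking a node as seen *before* recursing into its children. A self-contained sketch of that idea, using `Base.IdDict` in place of Flux's internal `IdSet` and a made-up `childnodes` function standing in for Flux's `children`:

```julia
# Stand-in for Flux's `children`: here, only vectors have children.
childnodes(x) = ()
childnodes(x::AbstractVector) = Tuple(x)

function prefor(f, x; seen = IdDict{Any,Nothing}())
  haskey(seen, x) && return      # already visited: stop, even if x refers back to itself
  seen[x] = nothing              # mark BEFORE recursing -- the one-line fix from patch 04
  f(x)
  foreach(c -> prefor(f, c; seen = seen), childnodes(x))
  return
end

# A self-referential container no longer causes a StackOverflowError.
r = Any[nothing, Float32[1, 2, 3]]
r[1] = r
prefor(x -> x isa Array{Float32} && @show(size(x)), r)   # visits the parameter array exactly once
```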