From 427c55af9287783807ad4198ecc445da56872225 Mon Sep 17 00:00:00 2001
From: Yao Lu
Date: Mon, 20 Apr 2020 19:11:57 +0800
Subject: [PATCH 01/11] speedup matmul of CuMatrix and OneHotMatrix

---
 src/onehot.jl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/onehot.jl b/src/onehot.jl
index 4b7e5e36..9d5394ef 100644
--- a/src/onehot.jl
+++ b/src/onehot.jl
@@ -41,6 +41,10 @@ import .CuArrays: CuArray, CuArrayStyle, cudaconvert
 import Base.Broadcast: BroadcastStyle, ArrayStyle
 BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = CuArrayStyle{2}()
 cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
+function Base.:(*)(A::CuArrays.CuMatrix, B::OneHotMatrix{CuArrays.CuArray{OneHotVector,1}})
+  I = CuArrays.CuArray{UInt32, 1}(B.data.buf, 2 .* B.data.dims, offset = B.data.offset)[1:2:end]
+  A[:, Array(I)]
+end

 """
     onehot(l, labels[, unk])

From b33c4b49be2bec0fd0034beee3c3d24d7bec289b Mon Sep 17 00:00:00 2001
From: Yao Lu
Date: Mon, 20 Apr 2020 19:41:10 +0800
Subject: [PATCH 02/11] add ClipValue and ClipNorm

---
 src/Flux.jl                |  6 ++++--
 src/optimise/optimisers.jl | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/src/Flux.jl b/src/Flux.jl
index f973dc4c..0195cc8c 100644
--- a/src/Flux.jl
+++ b/src/Flux.jl
@@ -3,7 +3,8 @@ module Flux
 # Zero Flux Given

 using Base: tail
-using Zygote, MacroTools, Juno, Reexport, Statistics, Random
+using Statistics, Random, LinearAlgebra
+using Zygote, MacroTools, Juno, Reexport, Requires
 using MacroTools: @forward
 @reexport using NNlib
 using Zygote: Params, @adjoint, gradient, pullback, @nograd
@@ -20,7 +21,8 @@ using .Optimise
 using .Optimise: @epochs
 export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
   ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
-  ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay
+  ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay,
+  ClipValue, ClipNorm

 using CuArrays

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index 611edddb..3731e8e3 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -533,3 +533,37 @@ function apply!(o::WeightDecay, x, Δ)
   wd = o.wd
   @. Δ += wd * x
 end
+
+"""
+    ClipValue(thresh)
+
+Clip gradients when their absolute value exceeds `thresh`.
+
+# Parameters
+- Clipping threshold (`thresh`)
+"""
+mutable struct ClipValue{T}
+  thresh::T
+end
+
+apply!(o::ClipValue, x, Δ) = clamp!(Δ, -o.thresh, o.thresh)
+
+"""
+    ClipNorm(thresh)
+
+Clip gradients when their L2 norm exceeds `thresh`.
+
+# Parameters
+- Clipping threshold (`thresh`)
+"""
+mutable struct ClipNorm{T}
+  thresh::T
+end
+
+function apply!(o::ClipNorm, x, Δ)
+  Δnrm = norm(Δ, 2)
+  if Δnrm > o.thresh
+    rmul!(Δ, o.thresh / Δnrm)
+  end
+  return Δ
+end
\ No newline at end of file

From ba0fca5a19dc62c6ed647c57884a21ac8a38b6e3 Mon Sep 17 00:00:00 2001
From: Yao Lu
Date: Mon, 20 Apr 2020 19:45:15 +0800
Subject: [PATCH 03/11] remove onehot

---
 src/onehot.jl | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/onehot.jl b/src/onehot.jl
index 9d5394ef..4b7e5e36 100644
--- a/src/onehot.jl
+++ b/src/onehot.jl
@@ -41,10 +41,6 @@ import .CuArrays: CuArray, CuArrayStyle, cudaconvert
 import Base.Broadcast: BroadcastStyle, ArrayStyle
 BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = CuArrayStyle{2}()
 cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
-function Base.:(*)(A::CuArrays.CuMatrix, B::OneHotMatrix{CuArrays.CuArray{OneHotVector,1}})
-  I = CuArrays.CuArray{UInt32, 1}(B.data.buf, 2 .* B.data.dims, offset = B.data.offset)[1:2:end]
-  A[:, Array(I)]
-end

 """
     onehot(l, labels[, unk])

From 68b84bba368ce0783196aef183dce6bf49343690 Mon Sep 17 00:00:00 2001
From: Yao Lu
Date: Mon, 20 Apr 2020 19:54:44 +0800
Subject: [PATCH 04/11] add LinearAlgebra

---
 Project.toml | 1 +
 src/Flux.jl  | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 1883d974..7ce3909a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,6 +10,7 @@ Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"

diff --git a/src/Flux.jl b/src/Flux.jl
index 0195cc8c..1ec1ad94 100644
--- a/src/Flux.jl
+++ b/src/Flux.jl
@@ -4,7 +4,7 @@ module Flux

 using Base: tail
 using Statistics, Random, LinearAlgebra
-using Zygote, MacroTools, Juno, Reexport, Requires
+using Zygote, MacroTools, Juno, Reexport
 using MacroTools: @forward
 @reexport using NNlib
 using Zygote: Params, @adjoint, gradient, pullback, @nograd
@@ -51,6 +51,7 @@ function __init__()
       @warn "CuArrays.jl found cuda, but did not find libcudnn. Some functionality will not be available."
     end
   end
+  @require MPI="da04e1cc-30fd-572f-bb4f-1f8673147195" include("optimise/mpi.jl")
 end

 end # module

From cc1dcd5590f5ad0edb94364a4a42501a67f9800b Mon Sep 17 00:00:00 2001
From: Yao Lu
Date: Mon, 20 Apr 2020 20:02:29 +0800
Subject: [PATCH 05/11] rm requires

---
 src/Flux.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Flux.jl b/src/Flux.jl
index 1ec1ad94..d8db39e9 100644
--- a/src/Flux.jl
+++ b/src/Flux.jl
@@ -51,7 +51,6 @@ function __init__()
      @warn "CuArrays.jl found cuda, but did not find libcudnn. Some functionality will not be available."
     end
   end
-  @require MPI="da04e1cc-30fd-572f-bb4f-1f8673147195" include("optimise/mpi.jl")
 end

 end # module

From def19b058e37d94846085a6937d8bc6ac7f3131f Mon Sep 17 00:00:00 2001
From: Yao Lu
Date: Tue, 21 Apr 2020 10:56:38 +0800
Subject: [PATCH 06/11] simplify docstrings

---
 src/optimise/optimisers.jl | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index 3731e8e3..15330e2a 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -538,9 +538,6 @@ end
     ClipValue(thresh)

 Clip gradients when their absolute value exceeds `thresh`.
-
-# Parameters
-- Clipping threshold (`thresh`)
 """
 mutable struct ClipValue{T}
   thresh::T
@@ -552,9 +549,6 @@ apply!(o::ClipValue, x, Δ) = clamp!(Δ, -o.thresh, o.thresh)
     ClipNorm(thresh)

 Clip gradients when their L2 norm exceeds `thresh`.
-
-# Parameters
-- Clipping threshold (`thresh`)
 """
 mutable struct ClipNorm{T}
   thresh::T

From 1dfec7f38bac2e0b0def9c6b4c4a2d2430784c33 Mon Sep 17 00:00:00 2001
From: Yao Lu
Date: Wed, 22 Apr 2020 01:22:34 +0800
Subject: [PATCH 07/11] add test

---
 src/optimise/Optimise.jl |  7 +++++--
 test/optimise.jl         | 12 ++++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl
index 28a1849d..184d472c 100644
--- a/src/optimise/Optimise.jl
+++ b/src/optimise/Optimise.jl
@@ -1,9 +1,12 @@
 module Optimise

+using LinearAlgebra
+
 export train!, update!,
   SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
-  ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM,
-  InvDecay, ExpDecay, WeightDecay, stop, Optimiser
+  ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW, RADAM,
+  InvDecay, ExpDecay, WeightDecay, ClipValue, ClipNorm,
+  stop, Optimiser

 include("optimisers.jl")
 include("train.jl")

diff --git a/test/optimise.jl b/test/optimise.jl
index ac131b96..b3a0250c 100644
--- a/test/optimise.jl
+++ b/test/optimise.jl
@@ -89,3 +89,15 @@ end
   @test decay_steps == ground_truth
   @test o.eta == o.clip
 end
+
+@testset "Clipping" begin
+  w = randn(10, 10)
+  loss(x) = sum(w * x)
+  θ = Params([w])
+  x = 1000 * randn(10)
+  w̄ = gradient(() -> loss(x), θ)[w]
+  w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄))
+  @test all(w̄_value .<= 1)
+  w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄))
+  @test norm(w̄_norm) <= 1
+end
\ No newline at end of file

From c4f5e83697a2b6163c29c8ba2e45a394c00d9bdb Mon Sep 17 00:00:00 2001
From: Yao Lu
Date: Wed, 22 Apr 2020 01:24:13 +0800
Subject: [PATCH 08/11] resolve conflict

---
 src/optimise/Optimise.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl
index 184d472c..66be6dce 100644
--- a/src/optimise/Optimise.jl
+++ b/src/optimise/Optimise.jl
@@ -5,8 +5,7 @@ using LinearAlgebra
 export train!, update!,
   SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
   ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW, RADAM,
-  InvDecay, ExpDecay, WeightDecay, ClipValue, ClipNorm,
-  stop, Optimiser
+  InvDecay, ExpDecay, WeightDecay, stop, Optimiser, ClipValue, ClipNorm

 include("optimisers.jl")
 include("train.jl")

From 114f63a214032c7af4ebce308a7d7d9a9e2bc58b Mon Sep 17 00:00:00 2001
From: Yao Lu
Date: Sun, 26 Apr 2020 17:28:07 +0800
Subject: =?UTF-8?q?norm(=CE=94)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/optimise/optimisers.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index 15330e2a..466b7b6d 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -555,7 +555,7 @@ mutable struct ClipNorm{T}
 end

 function apply!(o::ClipNorm, x, Δ)
-  Δnrm = norm(Δ, 2)
+  Δnrm = norm(Δ)
   if Δnrm > o.thresh
     rmul!(Δ, o.thresh / Δnrm)
   end

From d1ad8db625d0a870ef7bd6bcbcbcb7342953e54c Mon Sep 17 00:00:00 2001
From: Yao Lu
Date: Sat, 9 May 2020 16:40:26 +0800
Subject: [PATCH 10/11] add to docs

---
 docs/src/training/optimisers.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md
index 5ed083ee..25cfd3ce 100644
--- a/docs/src/training/optimisers.md
+++ b/docs/src/training/optimisers.md
@@ -140,3 +140,16 @@ ExpDecay
 InvDecay
 WeightDecay
 ```
+
+## Gradient Clipping
+
+Gradient clipping is useful for training recurrent neural networks, which have a tendency to suffer from the exploding gradient problem. An example usage is
+
+```julia
+opt = Optimiser(ClipValue(1e-3), ADAM(1e-3))
+```
+
+```@docs
+ClipValue
+ClipNorm
+```
\ No newline at end of file

From 007586858c4f0451f0be7b65149d497635024884 Mon Sep 17 00:00:00 2001
From: Yao Lu
Date: Thu, 14 May 2020 17:13:35 +0800
Subject: [PATCH 11/11] fix export merge conflict

---
 src/optimise/Optimise.jl | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl
index 4ea5235e..0f5e644f 100644
--- a/src/optimise/Optimise.jl
+++ b/src/optimise/Optimise.jl
@@ -2,10 +2,11 @@ module Optimise

 using LinearAlgebra

-export train!, update!, stop, Optimiser,
-  Descent, ADAM, Momentum, Nesterov, RMSProp,
-  ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW, RADAM,
-  InvDecay, ExpDecay, WeightDecay, ClipValue, ClipNorm
+export train!, update!,
+  Descent, ADAM, Momentum, Nesterov, RMSProp,
+  ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM,
+  InvDecay, ExpDecay, WeightDecay, stop, Optimiser,
+  ClipValue, ClipNorm

 include("optimisers.jl")
 include("train.jl")
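
Usage sketch (not part of the patch series above): the clipping rules added in PATCH 02 are meant to be chained with an ordinary optimiser through `Optimiser`, as the docs hunk in PATCH 10 shows. The snippet below is a minimal, assumed example against the Flux API of these patches; the model, data sizes, and thresholds are placeholders chosen purely for illustration.

```julia
using Flux
using Flux.Optimise: Optimiser, ClipValue, ClipNorm

# Toy model and data; sizes and values are made up for illustration.
m = Dense(10, 2)
loss(x, y) = Flux.mse(m(x), y)
ps = Flux.params(m)
data = [(randn(Float32, 10, 16), randn(Float32, 2, 16))]

# Clamp every gradient entry to [-1e-3, 1e-3] before ADAM applies its update.
opt = Optimiser(ClipValue(1e-3), ADAM(1e-3))
Flux.train!(loss, ps, data, opt)

# Or rescale any gradient whose L2 norm exceeds 1 instead of clamping entries.
opt = Optimiser(ClipNorm(1.0), ADAM(1e-3))
Flux.train!(loss, ps, data, opt)
```

`Optimiser` applies its component rules left to right, so putting the clipping rule first clips the raw gradient before the learning-rate rule sees it; this is the same ordering used in the docs example added by PATCH 10.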