From 13b934c2500b8e39ac24c834079b562057dede5a Mon Sep 17 00:00:00 2001
From: CarloLucibello <carlo.lucibello@gmail.com>
Date: Thu, 12 Oct 2017 10:31:38 +0200
Subject: [PATCH 1/3] improve optimizers

---
 src/data/cmudict.jl        |  3 +-
 src/optimise/interface.jl  | 50 +++++++++++-----------
 src/optimise/optimisers.jl | 85 +++++++++++++++++++++-----------------
 src/tracker/Tracker.jl     |  2 +
 test/optimise.jl           | 19 +++++++++
 test/runtests.jl           |  1 +
 6 files changed, 98 insertions(+), 62 deletions(-)
 create mode 100644 test/optimise.jl

diff --git a/src/data/cmudict.jl b/src/data/cmudict.jl
index 88b9c6c0..a23c6a3d 100644
--- a/src/data/cmudict.jl
+++ b/src/data/cmudict.jl
@@ -33,7 +33,8 @@ function rawdict()
        filter(!isempty, split.(split(readstring(deps("CMUDict", "cmudict")), "\n"))))
 end
 
-validword(s) = ismatch(r"^[\w-\.]+$", s)
+# validword(s) = ismatch(r"^[\w-\.]+$", s)
+validword(s) = ismatch(r"^\[\w-\.\]+$", s)
 
 cmudict() = filter((s, ps) -> validword(s), rawdict())
 
diff --git a/src/optimise/interface.jl b/src/optimise/interface.jl
index 0b2a25ae..47b0f62c 100644
--- a/src/optimise/interface.jl
+++ b/src/optimise/interface.jl
@@ -1,5 +1,7 @@
 call(f, xs...) = f(xs...)
 
+# Note for optimisers: reset p.Δ to zero
+# at the end of the weights update.
 function optimiser(ps, fs...)
   ps = [Param(p) for p in ps]
   fs = map(ps) do p
@@ -10,64 +12,64 @@ function optimiser(ps, fs...)
 end
 
 """
-    SGD(params, η = 1; decay = 0)
+    SGD(params, η = 0.1; decay = 0)
 
-Classic gradient descent optimiser. For each parameter `p` and its
-gradient `δp`, this runs `p -= η*δp`.
+Classic gradient descent optimiser with learning rate `η`.
+For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`.
 
-Supports decayed learning rate decay if the `decay` argument is provided.
+Supports an inverse-decaying learning rate if the `decay` argument is provided.
 """
-SGD(ps, η = 1; decay = 0) =
-  optimiser(ps, p -> invdecay(p, decay), p -> descent(p, η))
+SGD(ps, η = 0.1; decay = 0) =
+  optimiser(ps, p -> invdecay(p, decay), p -> descent(p,η))
 
 """
-    Momentum(params, ρ, decay = 0)
+    Momentum(params, η = 0.01; ρ = 0.9, decay = 0)
 
-SGD with momentum `ρ` and optional learning rate decay.
+SGD with learning rate `η`, momentum `ρ` and optional inverse decay of the learning rate.
 """
-Momentum(ps, ρ; decay = 0) =
-  optimiser(ps, p -> momentum(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1))
+Momentum(ps, η = 0.01; ρ = 0.9, decay = 0) =
+  optimiser(ps, p->invdecay(p,decay), p->momentum(p, ρ, η), p->descent(p,1))
 
 """
-    Nesterov(params, ρ, decay = 0)
+    Nesterov(params, η = 0.01; ρ = 0.9, decay = 0)
 
-SGD with Nesterov momentum `ρ` and optional learning rate decay.
+SGD with learning rate `η`, Nesterov momentum `ρ` and optional inverse decay of the learning rate.
 """
-Nesterov(ps, ρ; decay = 0) =
-  optimiser(ps, p -> nesterov(p, ρ), p -> invdecay(p, decay), p -> descent(p, 1))
+Nesterov(ps, η = 0.01; ρ = 0.9, decay = 0) =
+  optimiser(ps, p->invdecay(p,decay), p->nesterov(p, ρ, η), p->descent(p,1))
 
 """
-    RMSProp(params; η = 0.001, ρ = 0.9, ϵ = 1e-8, decay = 0)
+    RMSProp(params, η = 0.001; ρ = 0.9, ϵ = 1e-8, decay = 0)
 
 [RMSProp](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
 optimiser. Parameters other than learning rate don't need tuning. Often a good
 choice for recurrent networks.
 """
 RMSProp(ps, η = 0.001; ρ = 0.9, ϵ = 1e-8, decay = 0) =
-  optimiser(ps, p -> rmsprop(p; η = η, ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
+  optimiser(ps, p->rmsprop(p; η=η, ρ=ρ, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
 
 """
-    ADAM(params; η = 0.001, β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0)
+    ADAM(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0)
 
 [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.
 """
 ADAM(ps, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) =
-  optimiser(ps, p -> adam(p; η = η, β1 = β1, β2 = β2, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
+  optimiser(ps, p->adam(p; η=η, β1=β1, β2=β2, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
 
 """
-    ADAGrad(params; η = 0.01, ϵ = 1e-8, decay = 0)
+    ADAGrad(params, η = 0.01; ϵ = 1e-8, decay = 0)
 
 [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser.
 Parameters don't need tuning.
 """
-ADAGrad(ps; η = 0.01, ϵ = 1e-8, decay = 0) =
-  optimiser(ps, p -> adagrad(p; η = η, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
+ADAGrad(ps, η = 0.01; ϵ = 1e-8, decay = 0) =
+  optimiser(ps, p->adagrad(p; η=η, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
 
 """
-    ADADelta(params; η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0)
+    ADADelta(params; ρ = 0.9, ϵ = 1e-8, decay = 0)
 
 [ADADelta](http://arxiv.org/abs/1212.5701) optimiser. Parameters don't need
 tuning.
 """
-ADADelta(ps; η = 0.01, ρ = 0.95, ϵ = 1e-8, decay = 0) =
-  optimiser(ps, p -> adadelta(p; ρ = ρ, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1))
+ADADelta(ps; ρ = 0.9, ϵ = 1e-8, decay = 0) =
+  optimiser(ps, p->adadelta(p; ρ=ρ, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1))
diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index abc54090..7cf271b6 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -1,74 +1,85 @@
 function descent(p::Param, η::Real)
   function ()
-    p.x .-= p.Δ .* η
-    p.Δ .= 0
+    @. p.x -= η * p.Δ
+    @. p.Δ = 0
   end
 end
 
-function momentum(p::Param, ρ::Real)
-  mo = zeros(p.x)
-  () -> p.Δ .= mo .= ρ .* mo .+ p.Δ
-end
-
-function nesterov(p::Param, ρ::Real)
-  mo = zeros(p.x)
+function momentum(p::Param, ρ, η)
+  v = zeros(p.x)
   function ()
-    mo  .= ρ .* mo .+ p.Δ
-    p.Δ .= ρ .* mo .+ p.Δ
+    @. v = ρ * v - η * p.Δ
+    @. p.Δ = -v
   end
 end
 
-function clip(p::Param, thresh::Real)
-  () -> clamp!(p.Δ, -thresh, thresh)
-end
-
-function weightdecay(p::Param, γ::Real)
-  () -> p.Δ .+= γ .* p.x
-end
-
-function invdecay(p::Param, γ::Real)
-  n = 0
+# Ref. https://arxiv.org/pdf/1212.0901.pdf
+function nesterov(p::Param, ρ, η)
+  v = zeros(p.x)
   function ()
-    p.Δ .*= 1 / (1 + γ * n)
-    n += 1
+    d = @. ρ^2 * v - (1+ρ) * η * p.Δ
+    @. v = ρ*v - η*p.Δ
+    @. p.Δ = -d
   end
 end
 
 function rmsprop(p::Param; η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = 1e-8)
-  acc  = zeros(p.x) .+ ϵ
+  acc  = zeros(p.x)
   function ()
-    @. acc = ρ * acc + (1 - ρ) * p.Δ ^ 2
-    @. p.Δ *= η / √acc
+    @. acc = ρ * acc + (1 - ρ) * p.Δ^2
+    @. p.Δ *= η / (√acc + ϵ)
   end
 end
 
 function adagrad(p::Param; η::Real = 0.01, ϵ::Real = 1e-8)
   acc = zeros(p.x) .+ ϵ
   function ()
-    @. acc += p.Δ ^ 2
+    @. acc += p.Δ^2
     @. p.Δ *= η / √acc
   end
 end
 
-function adadelta(p::Param; ρ::Real = 0.95, ϵ::Real = 1e-8)
-  acc = zeros(p.x) .+ ϵ
-  Δacc = zeros(p.x) .+ ϵ
+function adadelta(p::Param; ρ::Real = 0.9, ϵ::Real = 1e-8)
+  acc = zeros(p.x)
+  Δacc = zeros(p.x)
   function ()
-    @. acc = ρ * acc + (1 - ρ) * p.Δ ^ 2
-    @. p.Δ *= √Δacc / √acc
-    @. Δacc = ρ * Δacc + (1 - ρ) * p.Δ ^ 2
-  end
+    @. acc = ρ * acc + (1 - ρ) * p.Δ^2
+    @. p.Δ *= √(Δacc + ϵ) / √(acc + ϵ)
+    @. Δacc = ρ * Δacc + (1 - ρ) * p.Δ^2
+   end
 end
 
 function adam(p::Param; η::Real = 0.001, β1::Real = 0.9, β2::Real = 0.999, ϵ::Real = 1e-8)
   mt = zeros(p.x)
-  vt = zeros(p.x) .+ ϵ
+  vt = zeros(p.x)
   β1p, β2p = β1, β2
   function ()
     @. mt = β1 * mt + (1 - β1) * p.Δ
-    @. vt = β2 * vt + (1 - β2) * p.Δ ^ 2
-    @. p.Δ = √(1 - β2p) / √(1 - β1p) * mt / √vt * η
+    @. vt = β2 * vt + (1 - β2) * p.Δ^2
+    @. p.Δ =  mt / (1 - β1p) / (sqrt(vt / (1 - β2p)) + ϵ) * η
     β1p *= β1
     β2p *= β2
   end
 end
+
+clip(p::Param, thresh::Real) = () -> clamp!(p.Δ, -thresh, thresh)
+
+function expdecay(p::Param, γ::Real)
+  if γ != 0
+    return () -> p.Δ .+= γ .* p.x
+  else
+    return () -> nothing
+  end
+end
+
+function invdecay(p::Param, γ::Real)
+  if γ != 0
+    n = 0
+    return () -> begin
+      p.Δ .*= 1 / (1 + γ * n)
+      n += 1
+    end
+  else
+    return () -> nothing
+  end
+end
\ No newline at end of file
diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl
index 3a64fcb7..57bdc447 100644
--- a/src/tracker/Tracker.jl
+++ b/src/tracker/Tracker.jl
@@ -58,6 +58,7 @@ Base.similar(x::TrackedArray, dims::Union{AbstractUnitRange,Integer}...) =
 
 Base.similar(x::TrackedArray, T::Type) = similar(data(x), T)
 
+# TODO: decide whether to keep both `data` and `value`. The problem is TrackedScalar.
 value(x) = x
 value(x::TrackedArray) = data(x)
 value(x::TrackedScalar) = data(x)[]
@@ -69,6 +70,7 @@ Base.:(==)(x::TrackedArray, y::TrackedArray) = value(x) == value(x)
 Base.isless(x::TrackedScalar, y) = isless(value(x), y)
 Base.isless(x, y::TrackedScalar) = isless(x, value(y))
 Base.isless(x::TrackedScalar, y::TrackedScalar) = isless(value(x), value(y))
+Base.isapprox(x::TrackedScalar, y; kws...) = isapprox(value(x), y; kws...)
 
 Base.show(io::IO, ::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} =
   print(io, "TrackedArray{…,$A}")
diff --git a/test/optimise.jl b/test/optimise.jl
new file mode 100644
index 00000000..85fd53f9
--- /dev/null
+++ b/test/optimise.jl
@@ -0,0 +1,19 @@
+using Flux.Optimise
+using Flux.Tracker
+
+@testset "Optimise" begin
+    loss(x) = sum(x.^2)
+    η = 0.1
+    # RMSProp gets stuck
+    for OPT in [SGD, Nesterov, Momentum, ADAM, ADAGrad, ADADelta]
+        x = param(randn(10))
+        opt = OPT == ADADelta ? OPT([x]) : OPT([x], η)
+        for t=1:10000
+            l = loss(x)
+            back!(l)
+            opt()
+            l.data[] < 1e-10 && break
+        end
+        @test loss(x) ≈ 0. atol=1e-7
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index efd1a462..bdd1f2d0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -5,5 +5,6 @@ using Flux, Base.Test
 include("utils.jl")
 include("tracker.jl")
 include("layers/normalisation.jl")
+include("optimise.jl")
 
 end

From 951c21366a54ab60899f2e9955c05bd8ebaedf5b Mon Sep 17 00:00:00 2001
From: Mike J Innes <mike.j.innes@gmail.com>
Date: Fri, 8 Dec 2017 16:42:30 +0000
Subject: [PATCH 2/3] fix regex

---
 src/data/cmudict.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/data/cmudict.jl b/src/data/cmudict.jl
index a23c6a3d..9ec567b4 100644
--- a/src/data/cmudict.jl
+++ b/src/data/cmudict.jl
@@ -33,8 +33,7 @@ function rawdict()
        filter(!isempty, split.(split(readstring(deps("CMUDict", "cmudict")), "\n"))))
 end
 
-# validword(s) = ismatch(r"^[\w-\.]+$", s)
-validword(s) = ismatch(r"^\[\w-\.\]+$", s)
+validword(s) = ismatch(r"^[\w\-\.]+$", s)
 
 cmudict() = filter((s, ps) -> validword(s), rawdict())
 

From 69cc5642b48b685bbbf109af310384f8eae917e4 Mon Sep 17 00:00:00 2001
From: Mike J Innes <mike.j.innes@gmail.com>
Date: Fri, 8 Dec 2017 17:10:29 +0000
Subject: [PATCH 3/3] regression testing

---
 test/optimise.jl | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/test/optimise.jl b/test/optimise.jl
index 85fd53f9..65bb65be 100644
--- a/test/optimise.jl
+++ b/test/optimise.jl
@@ -2,18 +2,16 @@ using Flux.Optimise
 using Flux.Tracker
 
 @testset "Optimise" begin
-    loss(x) = sum(x.^2)
-    η = 0.1
-    # RMSProp gets stuck
-    for OPT in [SGD, Nesterov, Momentum, ADAM, ADAGrad, ADADelta]
-        x = param(randn(10))
-        opt = OPT == ADADelta ? OPT([x]) : OPT([x], η)
-        for t=1:10000
-            l = loss(x)
-            back!(l)
-            opt()
-            l.data[] < 1e-10 && break
-        end
-        @test loss(x) ≈ 0. atol=1e-7
+  w = randn(10, 10)
+  for Opt in [SGD, Nesterov, Momentum, ADAM, RMSProp, ps -> ADAGrad(ps, 0.1), ADADelta]
+    w′ = param(randn(10, 10))
+    loss(x) = Flux.mse(w*x, w′*x)
+    opt = Opt([w′])
+    for t=1:10^5
+      l = loss(rand(10))
+      back!(l)
+      opt()
     end
+    @test Flux.mse(w, w′) < 0.01
+  end
 end