From 8013c728b112aec15d50c4b6e1470f24758b4c5f Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi
Date: Sat, 28 Sep 2019 16:09:00 +0530
Subject: [PATCH] clearer optimiser docstrings

---
 src/optimise/optimisers.jl | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index 09a86174..aa5b7203 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -7,7 +7,7 @@ const ϵ = 1e-8
 # TODO: should use weak refs
 
 """
-    Descent() => Descent(η)
+    Descent(η)
 Defaults: η = 0.1
 
 Classic gradient descent optimiser with learning rate `η`.
@@ -24,7 +24,7 @@ function apply!(o::Descent, x, Δ)
 end
 
 """
-    Momentum() => Momentum(η, ρ)
+    Momentum(η, ρ)
 Defaults: η = 0.01, ρ = 0.9
 
 Gradient descent with learning rate `η` and momentum `ρ`.
@@ -45,7 +45,7 @@ function apply!(o::Momentum, x, Δ)
 end
 
 """
-    Nesterov() => Nesterov(η, ρ)
+    Nesterov(η, ρ)
 Defaults: η = 0.001, ρ = 0.9
 
 Gradient descent with learning rate `η` and Nesterov momentum `ρ`.
@@ -67,7 +67,7 @@ function apply!(o::Nesterov, x, Δ)
 end
 
 """
-    RMSProp() => RMSProp(η, ρ)
+    RMSProp(η, ρ)
 Defaults: η = 0.001, ρ = 0.9
 
 [RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
@@ -90,7 +90,7 @@ function apply!(o::RMSProp, x, Δ)
 end
 
 """
-    ADAM() => ADAM(η, β)
+    ADAM(η, β)
 Defaults: η = 0.001, β = (0.9, 0.999)
 
 [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.
@@ -114,7 +114,7 @@ function apply!(o::ADAM, x, Δ)
 end
 
 """
-    RADAM() => RADAM(η, β)
+    RADAM(η, β)
 Defaults: η = 0.001, β = (0.9, 0.999)
 
 [RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM).
@@ -145,7 +145,7 @@ function apply!(o::RADAM, x, Δ)
 end
 
 """
-    AdaMax() => AdaMax(η, β)
+    AdaMax(η, β)
 Defaults: η = 0.001, β = (0.9, 0.999)
 
 [AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on
@@ -170,7 +170,7 @@ function apply!(o::AdaMax, x, Δ)
 end
 
 """
-    ADAGrad() => ADAGrad(η)
+    ADAGrad(η)
 Defaults: η = 0.1
 
 [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser.
@@ -191,7 +191,7 @@ function apply!(o::ADAGrad, x, Δ)
 end
 
 """
-    ADADelta() => ADADelta(ρ)
+    ADADelta(ρ)
 Defaults: ρ = 0.9
 
 [ADADelta](https://arxiv.org/abs/1212.5701) optimiser. Parameters don't need
@@ -214,7 +214,7 @@ function apply!(o::ADADelta, x, Δ)
 end
 
 """
-    AMSGrad() => AMSGrad(η, β)
+    AMSGrad(η, β)
 Defaults: η = 0.001, β = (0.9, 0.999)
 
 [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need
@@ -238,7 +238,7 @@ function apply!(o::AMSGrad, x, Δ)
 end
 
 """
-    NADAM() => NADAM(η, β)
+    NADAM(η, β)
 Defaults: η = 0.001, β = (0.9, 0.999)
 
 [NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. Parameters don't need
@@ -263,8 +263,8 @@ function apply!(o::NADAM, x, Δ)
 end
 
 """
+    ADAMW(η, β, decay)
 Defaults: η = 0.001, β = (0.9, 0.999), decay = 0
-    ADAMW() => ADAMW(η, β, decay)
 
 [ADAMW](https://arxiv.org/abs/1711.05101) fixing weight decay regularization in Adam.
 """
@@ -299,7 +299,7 @@ function apply!(o::Optimiser, x, Δ)
 end
 
 """
-InvDecay() => InvDecay(γ)
+InvDecay(γ)
 Defaults: γ = 0.001
 
 Apply inverse time decay to an optimiser
@@ -354,7 +354,7 @@ function apply!(o::ExpDecay, x, Δ)
 end
 
 """
-WeightDecay() => WeightDecay(wd)
+WeightDecay(wd)
 Defaults: wd = 0
 
 Decay the weight parameter by `wd`
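
For context, the signature lines introduced by this patch match how the optimisers are constructed in user code. Below is a minimal usage sketch (not part of the patch), assuming Flux's exported optimiser constructors and the non-exported Flux.Optimise.apply! at this commit; the argument values are simply the defaults listed in the docstrings above, and the parameter/gradient arrays are hypothetical.

    using Flux

    # Build optimisers with the documented default hyperparameters.
    opt_sgd  = Descent(0.1)                 # learning rate η = 0.1
    opt_mom  = Momentum(0.01, 0.9)          # learning rate η, momentum ρ
    opt_adam = ADAM(0.001, (0.9, 0.999))    # learning rate η, decay pair β

    # apply!(opt, x, Δ) turns the raw gradient Δ into the update step for x;
    # Flux's update! then subtracts it, i.e. x .-= apply!(opt, x, Δ).
    x = rand(3)   # hypothetical parameter array
    Δ = rand(3)   # hypothetical gradient
    step = Flux.Optimise.apply!(opt_adam, x, Δ)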