From 2e1ed4c3fce1ccb3fad7e558e6eef00936907e89 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Mon, 23 Oct 2017 10:12:53 +0200 Subject: [PATCH 01/32] add dropout --- src/Flux.jl | 4 ++-- src/layers/basic.jl | 42 +++++++++++++++++++++++++++++++++++++++++- test/layers.jl | 23 +++++++++++++++++++++++ test/runtests.jl | 1 + 4 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 test/layers.jl diff --git a/src/Flux.jl b/src/Flux.jl index e4f170f2..daeaa9ac 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -7,9 +7,9 @@ module Flux using Juno, Requires using Lazy: @forward -export Chain, Dense, RNN, LSTM, +export Chain, Dense, RNN, LSTM, Dropout, SGD, ADAM, Momentum, Nesterov, - param, params, mapleaves + param, params, mapleaves, setmode! using NNlib export σ, relu, leakyrelu, elu, swish, softmax diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 9c8b1016..088cf1e1 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -27,7 +27,7 @@ end children(c::Chain) = c.layers mapchildren(f, c::Chain) = Chain(f.(c.layers)...) -(s::Chain)(x) = foldl((x, m) -> m(x), x, s.layers) +(c::Chain)(x) = foldl((x, m) -> m(x), x, c.layers) Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) @@ -78,3 +78,43 @@ function Base.show(io::IO, l::Dense) l.σ == identity || print(io, ", ", l.σ) print(io, ")") end + + +""" + Dropout(p; mode=:train) + +A Dropout layer. In `:train` mode sets input components `x[i]` to zero with +probability `p` and to `x[i]/(1-p)` with probability `(1-p)`. + +In `:eval` mode it doesn't alter the input: `x == Dropout(p; mode=:eval)(x)`. +Change the mode with [`setmode!`](@ref). +""" +mutable struct Dropout{F} + p::F + mode::Symbol +end +Dropout(p::F; mode=:train) where {F} = Dropout{F}(p, mode) + +function (a::Dropout)(x) + if a.mode == :eval + return x + else + if 0 < a.p < 1 + y = similar(x) + rand!(y) + q = 1 - a.p + @inbounds for i=1:length(y) + y[i] = y[i] > a.p ? 1 / q : 0 + end + return y .* x + elseif a.p == 0 + return x + elseif a.p == 1 + return zeros(x) + end + end +end + +setmode!(a, mode::Symbol) = nothing +setmode!(c::Chain, mode::Symbol) = mapchildren(x->setmode!(x, mode), c) +setmode!(a::Dropout, mode::Symbol) = a.mode = mode diff --git a/test/layers.jl b/test/layers.jl new file mode 100644 index 00000000..ead9c343 --- /dev/null +++ b/test/layers.jl @@ -0,0 +1,23 @@ +@testset "dropout" begin + x = [1.,2.,3.] 
+ @test x === Dropout(0.1, mode=:eval)(x) + @test x === Dropout(0, mode=:train)(x) + @test all(zeros(x) .== Dropout(1, mode=:train)(x)) + + x = rand(100) + m = Dropout(0.9) + y = m(x) + @test count(a->a==0, y) > 50 + setmode!(m, :eval) + y = m(x) + @test count(a->a==0, y) == 0 + + x = rand(100) + m = Chain(Dense(100,100), + Dropout(0.9)) + y = m(x) + @test count(a->a.data[] == 0, y) > 50 + setmode!(m, :eval) + y = m(x) + @test count(a->a.data[] == 0, y) == 0 +end diff --git a/test/runtests.jl b/test/runtests.jl index 2ab0e447..b7b838df 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,5 +4,6 @@ using Flux, Base.Test include("utils.jl") include("tracker.jl") +include("layers.jl") end From 86c7c9246ea60fafaae844eddde6e7ef7daa4216 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Mon, 23 Oct 2017 11:41:08 +0200 Subject: [PATCH 02/32] add == and < for tracked arrays --- src/layers/basic.jl | 7 +++++++ src/tracker/Tracker.jl | 9 ++++++++- test/layers.jl | 4 ++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 088cf1e1..0c7e1fd0 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -115,6 +115,13 @@ function (a::Dropout)(x) end end +""" + setmode!(m, mode::Symbol) + +Change the mode of model `m` to `mode`. Possible values for `mode` are +`:train` and `:eval`. +This has an affect only if `m` contains [`Dropout`](@ref) of `BatchNorm` layers. +""" setmode!(a, mode::Symbol) = nothing setmode!(c::Chain, mode::Symbol) = mapchildren(x->setmode!(x, mode), c) setmode!(a::Dropout, mode::Symbol) = a.mode = mode diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index a2a6c745..8f495f82 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -1,5 +1,5 @@ module Tracker - +import Base: <, == export TrackedArray, param, back! data(x) = x @@ -54,6 +54,13 @@ Base.similar(x::TrackedArray, dims::Union{AbstractUnitRange,Integer}...) = Base.similar(x::TrackedArray, T::Type) = similar(data(x), T) +==(x::TrackedArray, y) = data(x) == y +==(y, x::TrackedArray) = y == data(x) +==(x::TrackedScalar, y) = data(x)[] == y +==(y, x::TrackedScalar) = y == data(x)[] +<(x::TrackedScalar, y) = data(x)[] < y +<(x, y::TrackedScalar) = x < data(y)[] + Base.show(io::IO, ::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} = print(io, "TrackedArray{…,$A}") diff --git a/test/layers.jl b/test/layers.jl index ead9c343..d0a5cbe1 100644 --- a/test/layers.jl +++ b/test/layers.jl @@ -16,8 +16,8 @@ m = Chain(Dense(100,100), Dropout(0.9)) y = m(x) - @test count(a->a.data[] == 0, y) > 50 + @test count(a->a == 0, y) > 50 setmode!(m, :eval) y = m(x) - @test count(a->a.data[] == 0, y) == 0 + @test count(a->a == 0, y) == 0 end From 536ab3861dcf40a726272763efd8e489f700d667 Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Mon, 23 Oct 2017 16:23:29 +0200 Subject: [PATCH 03/32] setmode! -> testmode! --- src/Flux.jl | 2 +- src/layers/basic.jl | 25 +++++++++++-------------- src/tracker/Tracker.jl | 1 + test/layers.jl | 13 ++++++++----- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index daeaa9ac..957940c3 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -9,7 +9,7 @@ using Lazy: @forward export Chain, Dense, RNN, LSTM, Dropout, SGD, ADAM, Momentum, Nesterov, - param, params, mapleaves, setmode! + param, params, mapleaves, testmode! 
using NNlib export σ, relu, leakyrelu, elu, swish, softmax diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 0c7e1fd0..841cf094 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -81,22 +81,22 @@ end """ - Dropout(p; mode=:train) + Dropout(p; testmode=false) -A Dropout layer. In `:train` mode sets input components `x[i]` to zero with +A Dropout layer. If `testmode=false` mode sets input components `x[i]` to zero with probability `p` and to `x[i]/(1-p)` with probability `(1-p)`. -In `:eval` mode it doesn't alter the input: `x == Dropout(p; mode=:eval)(x)`. -Change the mode with [`setmode!`](@ref). +In `testmode=true`it doesn't alter the input: `x == Dropout(p; mode=:eval)(x)`. +Change the mode with [`testmode!`](@ref). """ mutable struct Dropout{F} p::F - mode::Symbol + testmode::Bool end -Dropout(p::F; mode=:train) where {F} = Dropout{F}(p, mode) +Dropout(p::F; testmode::Bool=false) where {F} = Dropout{F}(p, testmode) function (a::Dropout)(x) - if a.mode == :eval + if a.testmode return x else if 0 < a.p < 1 @@ -116,12 +116,9 @@ function (a::Dropout)(x) end """ - setmode!(m, mode::Symbol) + testmode!(m, val=true) -Change the mode of model `m` to `mode`. Possible values for `mode` are -`:train` and `:eval`. -This has an affect only if `m` contains [`Dropout`](@ref) of `BatchNorm` layers. +Set model `m` in test mode if `val=true`, and in training mode otherwise. +This has an affect only if `m` contains [`Dropout`](@ref) or `BatchNorm` layers. """ -setmode!(a, mode::Symbol) = nothing -setmode!(c::Chain, mode::Symbol) = mapchildren(x->setmode!(x, mode), c) -setmode!(a::Dropout, mode::Symbol) = a.mode = mode +testmode!(m, val::Bool=true) = prefor(x -> x isa Dropout && (x.testmode = val), m) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 8f495f82..1ab92f7e 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -41,6 +41,7 @@ TrackedArray(x::AbstractArray) = TrackedArray(Call(nothing), x, zeros(x)) param(xs) = TrackedArray(AbstractFloat.(xs)) istracked(x::TrackedArray) = true data(x::TrackedArray) = x.data +# data(x::TrackedScalar) = x.data[] grad(x::TrackedArray) = x.grad # Fallthrough methods diff --git a/test/layers.jl b/test/layers.jl index d0a5cbe1..0d6c3bc6 100644 --- a/test/layers.jl +++ b/test/layers.jl @@ -1,23 +1,26 @@ @testset "dropout" begin x = [1.,2.,3.] - @test x === Dropout(0.1, mode=:eval)(x) - @test x === Dropout(0, mode=:train)(x) - @test all(zeros(x) .== Dropout(1, mode=:train)(x)) + @test x === Dropout(0.1, testmode=true)(x) + @test x === Dropout(0, testmode=false)(x) + @test all(zeros(x) .== Dropout(1, testmode=false)(x)) x = rand(100) m = Dropout(0.9) y = m(x) @test count(a->a==0, y) > 50 - setmode!(m, :eval) + testmode!(m) y = m(x) @test count(a->a==0, y) == 0 + testmode!(m, false) + y = m(x) + @test count(a->a==0, y) > 50 x = rand(100) m = Chain(Dense(100,100), Dropout(0.9)) y = m(x) @test count(a->a == 0, y) > 50 - setmode!(m, :eval) + testmode!(m) y = m(x) @test count(a->a == 0, y) == 0 end From 711ea09d99cc3cc8daf39b172c5a5be065f13d7f Mon Sep 17 00:00:00 2001 From: CarloLucibello Date: Wed, 25 Oct 2017 02:35:27 +0200 Subject: [PATCH 04/32] address comments --- src/layers/basic.jl | 2 +- src/tracker/Tracker.jl | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 841cf094..c15868ab 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -121,4 +121,4 @@ end Set model `m` in test mode if `val=true`, and in training mode otherwise. 
This has an affect only if `m` contains [`Dropout`](@ref) or `BatchNorm` layers. """ -testmode!(m, val::Bool=true) = prefor(x -> x isa Dropout && (x.testmode = val), m) +testmode!(m, val::Bool=true) = prefor(x -> :testmode ∈ fieldnames(x) && (x.testmode = val), m) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 1ab92f7e..90707ea5 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -55,12 +55,17 @@ Base.similar(x::TrackedArray, dims::Union{AbstractUnitRange,Integer}...) = Base.similar(x::TrackedArray, T::Type) = similar(data(x), T) -==(x::TrackedArray, y) = data(x) == y -==(y, x::TrackedArray) = y == data(x) -==(x::TrackedScalar, y) = data(x)[] == y -==(y, x::TrackedScalar) = y == data(x)[] -<(x::TrackedScalar, y) = data(x)[] < y -<(x, y::TrackedScalar) = x < data(y)[] +#to be merged with data in the future +unbox(x::TrackedArray) = data(x) +unbox(x::TrackedScalar) = data(x)[] + +==(x::TrackedArray, y) = unbox(x) == y +==(y, x::TrackedArray) = y == unbox(x) +==(x::TrackedArray, y::TrackedArray) = unbox(x) == unbox(x) + +<(x::TrackedScalar, y) = unbox(x) < y +<(x, y::TrackedScalar) = x < unbox(y) +<(x::TrackedScalar, y::TrackedScalar) = unbox(x) < unbox(y) Base.show(io::IO, ::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} = print(io, "TrackedArray{…,$A}") From 0df300299ffc91487318c3c8d3f483985ea92d9c Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 26 Oct 2017 11:15:14 +0100 Subject: [PATCH 05/32] clearer error message, fixes #93 --- src/tracker/Tracker.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 8e6a584a..f2e52f61 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -70,6 +70,9 @@ function Base.showarray(io::IO, X::TrackedArray, repr::Bool = true; header = tru end end +Base.setindex!(xs::TrackedArray, v, i...) = + error("Can't differentiate `setindex!`") + include("back.jl") include("lib.jl") include("numeric.jl") From cf6b930f639970a4fb2707eedee0a1d4c2287205 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 26 Oct 2017 11:46:12 +0100 Subject: [PATCH 06/32] reorganise --- docs/src/models/layers.md | 8 ++++ src/Flux.jl | 3 +- src/layers/basic.jl | 44 -------------------- src/layers/normalisation.jl | 45 +++++++++++++++++++++ test/{layers.jl => layers/normalisation.jl} | 10 +++-- test/runtests.jl | 2 +- 6 files changed, 62 insertions(+), 50 deletions(-) create mode 100644 src/layers/normalisation.jl rename test/{layers.jl => layers/normalisation.jl} (69%) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 565e3b05..46547ce3 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -30,3 +30,11 @@ leakyrelu elu swish ``` + +## Normalisation & Regularisation + +These layers don't affect the structure of the network but may improve training times or reduce overfitting. + +```@docs +Dropout +``` diff --git a/src/Flux.jl b/src/Flux.jl index 957940c3..e4b6c832 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -9,7 +9,7 @@ using Lazy: @forward export Chain, Dense, RNN, LSTM, Dropout, SGD, ADAM, Momentum, Nesterov, - param, params, mapleaves, testmode! 
+ param, params, mapleaves using NNlib export σ, relu, leakyrelu, elu, swish, softmax @@ -27,5 +27,6 @@ include("tree.jl") include("layers/stateless.jl") include("layers/basic.jl") include("layers/recurrent.jl") +include("layers/normalisation.jl") end # module diff --git a/src/layers/basic.jl b/src/layers/basic.jl index c15868ab..969a261c 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -78,47 +78,3 @@ function Base.show(io::IO, l::Dense) l.σ == identity || print(io, ", ", l.σ) print(io, ")") end - - -""" - Dropout(p; testmode=false) - -A Dropout layer. If `testmode=false` mode sets input components `x[i]` to zero with -probability `p` and to `x[i]/(1-p)` with probability `(1-p)`. - -In `testmode=true`it doesn't alter the input: `x == Dropout(p; mode=:eval)(x)`. -Change the mode with [`testmode!`](@ref). -""" -mutable struct Dropout{F} - p::F - testmode::Bool -end -Dropout(p::F; testmode::Bool=false) where {F} = Dropout{F}(p, testmode) - -function (a::Dropout)(x) - if a.testmode - return x - else - if 0 < a.p < 1 - y = similar(x) - rand!(y) - q = 1 - a.p - @inbounds for i=1:length(y) - y[i] = y[i] > a.p ? 1 / q : 0 - end - return y .* x - elseif a.p == 0 - return x - elseif a.p == 1 - return zeros(x) - end - end -end - -""" - testmode!(m, val=true) - -Set model `m` in test mode if `val=true`, and in training mode otherwise. -This has an affect only if `m` contains [`Dropout`](@ref) or `BatchNorm` layers. -""" -testmode!(m, val::Bool=true) = prefor(x -> :testmode ∈ fieldnames(x) && (x.testmode = val), m) diff --git a/src/layers/normalisation.jl b/src/layers/normalisation.jl new file mode 100644 index 00000000..08c21428 --- /dev/null +++ b/src/layers/normalisation.jl @@ -0,0 +1,45 @@ +""" + testmode!(m) + testmode!(m, false) + +Put layers like [`Dropout`](@ref) and `BatchNorm` into testing mode (or back to +training mode with `false`). +""" +function testmode!(m, val::Bool=true) + prefor(x -> _testmode!(x, val), m) + return m +end + +_testmode!(m, test) = nothing + +""" + Dropout(p) + +A Dropout layer. For each input, either sets that input to `0` (with probability +`p`) or scales it by `1/(1-p)`. This is used as a regularisation, i.e. it +reduces overfitting during training. + +Does nothing to the input once in [`testmode!`](@ref). +""" +mutable struct Dropout{F} + p::F + active::Bool +end + +function Dropout(p) + @assert 0 ≤ p ≤ 1 + Dropout{typeof(p)}(p, true) +end + +function (a::Dropout)(x) + a.active || return x + y = similar(x) + rand!(y) + q = 1 - a.p + @inbounds for i=1:length(y) + y[i] = y[i] > a.p ? 1 / q : 0 + end + return y .* x +end + +_testmode!(a::Dropout, test) = (a.active = !test) diff --git a/test/layers.jl b/test/layers/normalisation.jl similarity index 69% rename from test/layers.jl rename to test/layers/normalisation.jl index 0d6c3bc6..5a302a51 100644 --- a/test/layers.jl +++ b/test/layers/normalisation.jl @@ -1,8 +1,10 @@ -@testset "dropout" begin +using Flux: testmode! + +@testset "Dropout" begin x = [1.,2.,3.] 
- @test x === Dropout(0.1, testmode=true)(x) - @test x === Dropout(0, testmode=false)(x) - @test all(zeros(x) .== Dropout(1, testmode=false)(x)) + @test x == testmode!(Dropout(0.1))(x) + @test x == Dropout(0)(x) + @test zeros(x) == Dropout(1)(x) x = rand(100) m = Dropout(0.9) diff --git a/test/runtests.jl b/test/runtests.jl index b7b838df..efd1a462 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,6 @@ using Flux, Base.Test include("utils.jl") include("tracker.jl") -include("layers.jl") +include("layers/normalisation.jl") end From 84efbbcc8422d1521e2b67f11f5f015b1868e581 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 26 Oct 2017 12:06:29 +0100 Subject: [PATCH 07/32] tracker predicate tweaks --- src/tracker/Tracker.jl | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 90707ea5..8a481970 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -1,5 +1,5 @@ module Tracker -import Base: <, == + export TrackedArray, param, back! data(x) = x @@ -41,7 +41,6 @@ TrackedArray(x::AbstractArray) = TrackedArray(Call(nothing), x, zeros(x)) param(xs) = TrackedArray(AbstractFloat.(xs)) istracked(x::TrackedArray) = true data(x::TrackedArray) = x.data -# data(x::TrackedScalar) = x.data[] grad(x::TrackedArray) = x.grad # Fallthrough methods @@ -55,17 +54,17 @@ Base.similar(x::TrackedArray, dims::Union{AbstractUnitRange,Integer}...) = Base.similar(x::TrackedArray, T::Type) = similar(data(x), T) -#to be merged with data in the future -unbox(x::TrackedArray) = data(x) -unbox(x::TrackedScalar) = data(x)[] +value(x) = x +value(x::TrackedArray) = data(x) +value(x::TrackedScalar) = data(x)[] -==(x::TrackedArray, y) = unbox(x) == y -==(y, x::TrackedArray) = y == unbox(x) -==(x::TrackedArray, y::TrackedArray) = unbox(x) == unbox(x) +Base.:(==)(x::TrackedArray, y) = value(x) == y +Base.:(==)(y, x::TrackedArray) = y == value(x) +Base.:(==)(x::TrackedArray, y::TrackedArray) = value(x) == value(x) -<(x::TrackedScalar, y) = unbox(x) < y -<(x, y::TrackedScalar) = x < unbox(y) -<(x::TrackedScalar, y::TrackedScalar) = unbox(x) < unbox(y) +Base.isless(x::TrackedScalar, y) = isless(value(x), y) +Base.isless(x, y::TrackedScalar) = isless(x, value(y)) +Base.isless(x::TrackedScalar, y::TrackedScalar) = isless(value(x), value(y)) Base.show(io::IO, ::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} = print(io, "TrackedArray{…,$A}") From c43bda019bedb0fda14cd0e747a115dae22f8d54 Mon Sep 17 00:00:00 2001 From: Iblis Lin Date: Mon, 30 Oct 2017 16:21:02 +0800 Subject: [PATCH 08/32] TrackedArray: implement `mean` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ```julia julia> p Tracked 2×3 Array{Float64,2}: 1.0 3.0 5.0 2.0 4.0 6.0 ``` Before ```julia julia> @benchmark Flux.Tracker.back!(sum($p, 2) ./ size($p, 2), ones(2, 1)) BenchmarkTools.Trial: memory estimate: 3.44 KiB allocs estimate: 75 -------------- minimum time: 20.438 μs (0.00% GC) median time: 21.239 μs (0.00% GC) mean time: 22.354 μs (1.68% GC) maximum time: 3.811 ms (98.51% GC) -------------- samples: 10000 evals/sample: 1 ``` After ```julia julia> @benchmark Flux.Tracker.back!(mean($p, 2), ones(2, 1)) BenchmarkTools.Trial: memory estimate: 1008 bytes allocs estimate: 21 -------------- minimum time: 5.973 μs (0.00% GC) median time: 6.310 μs (0.00% GC) mean time: 6.630 μs (1.96% GC) maximum time: 680.709 μs (97.28% GC) -------------- samples: 10000 evals/sample: 6 ``` --- src/tracker/lib.jl | 5 +++++ 
test/tracker.jl | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/tracker/lib.jl b/src/tracker/lib.jl index 254be8dc..9f3adc6b 100644 --- a/src/tracker/lib.jl +++ b/src/tracker/lib.jl @@ -57,6 +57,11 @@ back(::typeof(sum), Δ, xs::TrackedArray, dim...) = back(xs, similar(xs.data) .= Base.maximum(xs::TrackedArray, args...) = maximum(xs.data, args...) Base.findfirst(xs::TrackedArray, args...) = findfirst(xs.data, args...) +Base.mean(xs::TrackedArray, region) = TrackedArray(Call(mean, xs, region)) + +back(::typeof(mean), Δ, xs::TrackedArray, region) = + back(xs, similar(xs.data) .= Δ ./ prod(size(xs.data, region...))) + # BLAS a::TrackedMatrix * b::TrackedMatrix = TrackedArray(Call(*, a, b)) diff --git a/test/tracker.jl b/test/tracker.jl index 2a20338e..52a73a07 100644 --- a/test/tracker.jl +++ b/test/tracker.jl @@ -22,6 +22,16 @@ gradtest(f, dims...) = gradtest(f, rand.(dims)...) @test gradtest(vcat, rand(5), rand(3)) @test gradtest(vcat, rand(2,3), rand(3,3)) +@testset "mean" begin + @test gradtest(mean, rand(2, 3)) + + @test gradtest(x -> mean(x, 1), rand(2, 3)) + @test gradtest(x -> mean(x, 2), rand(2, 3)) + @test gradtest(x -> mean(x, 3), rand(2, 3, 4)) + + @test gradtest(x -> mean(x, [1, 2]), rand(2, 3, 4)) +end + @test gradtest(rand(5)) do x y = x.^2 2y + x From 3d8b7250aeff4e0f600e39af6372e471df8240ac Mon Sep 17 00:00:00 2001 From: Iblis Lin Date: Tue, 31 Oct 2017 10:41:44 +0000 Subject: [PATCH 09/32] add scalar mean --- src/tracker/lib.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tracker/lib.jl b/src/tracker/lib.jl index 9f3adc6b..a90eb932 100644 --- a/src/tracker/lib.jl +++ b/src/tracker/lib.jl @@ -57,8 +57,10 @@ back(::typeof(sum), Δ, xs::TrackedArray, dim...) = back(xs, similar(xs.data) .= Base.maximum(xs::TrackedArray, args...) = maximum(xs.data, args...) Base.findfirst(xs::TrackedArray, args...) = findfirst(xs.data, args...) 
+Base.mean(xs::TrackedArray) = TrackedArray(Call(mean, xs), toarray(xs.data, mean(xs.data))) Base.mean(xs::TrackedArray, region) = TrackedArray(Call(mean, xs, region)) +back(::typeof(mean), Δ, xs::TrackedArray) = back(xs, similar(xs.data) .= Δ ./ length(xs.data)) back(::typeof(mean), Δ, xs::TrackedArray, region) = back(xs, similar(xs.data) .= Δ ./ prod(size(xs.data, region...))) From e943a39ee72037184b9e46d89c0af536c78effef Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 31 Oct 2017 16:37:33 +0000 Subject: [PATCH 10/32] combine special cases --- src/layers/recurrent.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 716bc574..599776ce 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -1,5 +1,7 @@ # TODO: broadcasting cat -combine(x, h) = vcat(x, h .* trues(1, size(x, 2))) +combine(x::AbstractMatrix, h::AbstractVector) = vcat(x, h .* trues(1, size(x, 2))) +combine(x::AbstractVector, h::AbstractVector) = vcat(x, h) +combine(x::AbstractMatrix, h::AbstractMatrix) = vcat(x, h) # Stateful recurrence From 0f8ba87dc68307050abb46f594280c1873ee4d84 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 31 Oct 2017 16:37:41 +0000 Subject: [PATCH 11/32] treelike tuples --- src/tree.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/tree.jl b/src/tree.jl index efdf9101..899fccea 100644 --- a/src/tree.jl +++ b/src/tree.jl @@ -1,6 +1,9 @@ children(x) = () mapchildren(f, x) = x +children(x::Tuple) = x +mapchildren(f, x::Tuple) = map(f, x) + function treelike(T, fs = fieldnames(T)) @eval begin children(x::$T) = ($([:(x.$f) for f in fs]...),) From e7a510da9a42495bd5599aa857b19b96b5b7a442 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 1 Nov 2017 16:01:34 +0000 Subject: [PATCH 12/32] add cmudict dataset --- .gitignore | 2 +- src/Flux.jl | 2 ++ src/data/Data.jl | 14 ++++++++++++++ src/data/cmudict.jl | 42 ++++++++++++++++++++++++++++++++++++++++++ test/data.jl | 3 +++ 5 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 src/data/Data.jl create mode 100644 src/data/cmudict.jl create mode 100644 test/data.jl diff --git a/.gitignore b/.gitignore index 785b9c4e..9d6de240 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,4 @@ docs/build/ docs/site/ docs/flux.css -demos +deps diff --git a/src/Flux.jl b/src/Flux.jl index e4b6c832..242c8b1f 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -29,4 +29,6 @@ include("layers/basic.jl") include("layers/recurrent.jl") include("layers/normalisation.jl") +include("data/Data.jl") + end # module diff --git a/src/data/Data.jl b/src/data/Data.jl new file mode 100644 index 00000000..ffea729c --- /dev/null +++ b/src/data/Data.jl @@ -0,0 +1,14 @@ +module Data + +export CMUDict, cmudict + +deps(path...) = joinpath(@__DIR__, "..", "..", "deps", path...) 
+ +function __init__() + mkpath(deps()) +end + +include("cmudict.jl") +using .CMUDict + +end diff --git a/src/data/cmudict.jl b/src/data/cmudict.jl new file mode 100644 index 00000000..88b9c6c0 --- /dev/null +++ b/src/data/cmudict.jl @@ -0,0 +1,42 @@ +module CMUDict + +export cmudict + +using ..Data: deps + +const version = "0.7b" + +function load() + isdir(deps("cmudict")) && return + mkpath(deps("cmudict")) + for x in ["", ".phones", ".symbols"] + download("http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-$version$x", + deps("cmudict", "cmudict$x")) + end +end + +function phones() + load() + Symbol.(first.(split.(split(readstring(deps("cmudict", "cmudict.phones")), + "\n", keep = false), "\t"))) +end + +function symbols() + load() + Symbol.(split(readstring(deps("CMUDict", "cmudict.symbols")), + "\n", keep = false)) +end + +function rawdict() + load() + Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in + filter(!isempty, split.(split(readstring(deps("CMUDict", "cmudict")), "\n")))) +end + +validword(s) = ismatch(r"^[\w-\.]+$", s) + +cmudict() = filter((s, ps) -> validword(s), rawdict()) + +alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.'] + +end diff --git a/test/data.jl b/test/data.jl new file mode 100644 index 00000000..1b93ab3c --- /dev/null +++ b/test/data.jl @@ -0,0 +1,3 @@ +using Flux.Data + +@test cmudict()["CATASTROPHE"] == :[K,AH0,T,AE1,S,T,R,AH0,F,IY0].args From 21ea93ffcd08c87ed5dfae5bc6645852744160fe Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 2 Nov 2017 11:44:39 +0000 Subject: [PATCH 13/32] rename treelike --- src/Flux.jl | 2 +- src/{tree.jl => treelike.jl} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/{tree.jl => treelike.jl} (100%) diff --git a/src/Flux.jl b/src/Flux.jl index 242c8b1f..ff78593f 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -22,7 +22,7 @@ using .Optimise include("utils.jl") include("onehot.jl") -include("tree.jl") +include("treelike.jl") include("layers/stateless.jl") include("layers/basic.jl") diff --git a/src/tree.jl b/src/treelike.jl similarity index 100% rename from src/tree.jl rename to src/treelike.jl From efa51f02e7a7ea28d79aabe496cdb57aedbae4fd Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 11 Oct 2017 11:54:18 +0100 Subject: [PATCH 14/32] basic batch type --- src/Flux.jl | 2 ++ src/batches/Batches.jl | 7 +++++++ src/batches/batch.jl | 8 ++++++++ 3 files changed, 17 insertions(+) create mode 100644 src/batches/Batches.jl create mode 100644 src/batches/batch.jl diff --git a/src/Flux.jl b/src/Flux.jl index ff78593f..acefff19 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -31,4 +31,6 @@ include("layers/normalisation.jl") include("data/Data.jl") +include("batches/Batches.jl") + end # module diff --git a/src/batches/Batches.jl b/src/batches/Batches.jl new file mode 100644 index 00000000..066f4d1c --- /dev/null +++ b/src/batches/Batches.jl @@ -0,0 +1,7 @@ +module Batches + +import ..Flux + +include("batch.jl") + +end diff --git a/src/batches/batch.jl b/src/batches/batch.jl new file mode 100644 index 00000000..5a2eb82e --- /dev/null +++ b/src/batches/batch.jl @@ -0,0 +1,8 @@ +struct Batch{T,A,M} + data::A + mask::M +end + +Batch{T}(data, mask) where T = Batch{T,typeof(data),typeof(mask)}(data, mask) + +Batch(xs) = Batch{typeof(first(xs))}(Flux.batch(xs),trues(length(xs))) From 97244e0a68fa8cbae17f8065160126897a674009 Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Sat, 4 Nov 2017 13:27:32 +0100 Subject: [PATCH 15/32] Allow array of optimisers to train! 
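
For example, something along these lines (an illustrative sketch; the layers, loss and data below are placeholders, not part of this change):

```julia
using Flux

# Two optimisers, one per layer; `train!` now calls each optimiser in the
# array (and each callback) after every gradient step.
d1, d2 = Dense(10, 5, relu), Dense(5, 2)
m = Chain(d1, d2)
loss(x, y) = Flux.mse(m(x), y)
data = [(rand(10), rand(2))]

opt = [SGD(params(d1), 0.1), ADAM(params(d2))]
Flux.train!(loss, data, opt)
```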
This allows an array of optimisers to be sent to `train!` --- src/optimise/train.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 2a2ec5eb..0809e86b 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -1,8 +1,8 @@ using Juno using Flux.Tracker: back! -tocb(f) = f -tocb(fs::AbstractVector) = () -> foreach(call, fs) +runall(f) = f +runall(fs::AbstractVector) = () -> foreach(call, fs) """ train!(loss, data, opt; cb = () -> ()) @@ -11,10 +11,11 @@ For each datapoint `d` in `data` computes the gradient of `loss(d...)` through backpropagation and calls the optimizer `opt` and the callback `cb` (i.e. `opt()` and `cb()`). -Multiple callbacks can be passed to `cb` as an array. +Multiple optimisers and callbacks can be passed to `opt` and `cb` as arrays. """ function train!(loss, data, opt; cb = () -> ()) - cb = tocb(cb) + cb = runall(cb) + opt = runall(opt) @progress for d in data l = loss(d...) isinf(l.data[]) && error("Loss is Inf") From d6423eefe54b8ba822ed49b8b5c0d52dbe58ae1d Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 7 Nov 2017 19:34:27 +0000 Subject: [PATCH 16/32] matrix-vector fast path --- src/tracker/Tracker.jl | 2 ++ src/tracker/lib.jl | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index d6fa6f35..5e26a051 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -38,6 +38,8 @@ TrackedArray(c::Call) = TrackedArray(c, c()) TrackedArray(x::AbstractArray) = TrackedArray(Call(nothing), x, zeros(x)) +isleaf(x::TrackedArray) = x.f == Call(nothing) + param(xs) = TrackedArray(AbstractFloat.(xs)) param(xs::Real) = param(fill(xs)) diff --git a/src/tracker/lib.jl b/src/tracker/lib.jl index a90eb932..2ee5d659 100644 --- a/src/tracker/lib.jl +++ b/src/tracker/lib.jl @@ -79,6 +79,16 @@ function back(::typeof(*), Δ, a::AbstractMatrix, b::AbstractVecOrMat) @back(b, At_mul_B(data(a), Δ)) end +# Fast path for matrix-vector +function back(::typeof(*), Δ::AbstractVector, W::TrackedMatrix, x::AbstractVector) + if isleaf(W) + W.grad .+= Δ .* data(x).' + else + back(W, A_mul_Bt(Δ, data(x))) + end + @back(x, At_mul_B(data(W), Δ)) +end + # NNlib import NNlib: softmax, ∇softmax From d4229c4815a265d2ba084dc2b5b6db264cea497d Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 7 Nov 2017 19:34:35 +0000 Subject: [PATCH 17/32] useful params method --- src/treelike.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/treelike.jl b/src/treelike.jl index 899fccea..097ccdc6 100644 --- a/src/treelike.jl +++ b/src/treelike.jl @@ -35,3 +35,5 @@ function params(m) prefor(p -> p isa TrackedArray && push!(ps, p), m) return ps end + +params(m...) 
= params(m) From fcd091e8f06fc7a8824c4ca12d38dd23a4da4f08 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 8 Nov 2017 22:00:19 +0000 Subject: [PATCH 18/32] Ac_mul_B derivatives --- src/tracker/lib.jl | 28 ++++++++++++++++++++-------- test/tracker.jl | 2 ++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/tracker/lib.jl b/src/tracker/lib.jl index 2ee5d659..aab26dfe 100644 --- a/src/tracker/lib.jl +++ b/src/tracker/lib.jl @@ -1,5 +1,3 @@ -import Base: * - toarray(xs::AbstractArray, ys::AbstractArray) = ys toarray(xs::AbstractArray, y) = similar(xs, typeof(y), ()) .= y @@ -66,19 +64,33 @@ back(::typeof(mean), Δ, xs::TrackedArray, region) = # BLAS -a::TrackedMatrix * b::TrackedMatrix = TrackedArray(Call(*, a, b)) -a::TrackedMatrix * b::AbstractMatrix = TrackedArray(Call(*, a, b)) -a::AbstractMatrix * b::TrackedMatrix = TrackedArray(Call(*, a, b)) +for f in :[*, Ac_mul_B].args + @eval begin + import Base.$f + $f(a::TrackedMatrix, b::TrackedMatrix) = TrackedArray(Call($f, a, b)) + $f(a::TrackedMatrix, b::AbstractMatrix) = TrackedArray(Call($f, a, b)) + $f(a::AbstractMatrix, b::TrackedMatrix) = TrackedArray(Call($f, a, b)) -a::TrackedMatrix * b::TrackedVector = TrackedArray(Call(*, a, b)) -a::TrackedMatrix * b::AbstractVector = TrackedArray(Call(*, a, b)) -a::AbstractMatrix * b::TrackedVector = TrackedArray(Call(*, a, b)) + $f(a::TrackedMatrix, b::TrackedVector) = TrackedArray(Call($f, a, b)) + $f(a::TrackedMatrix, b::AbstractVector) = TrackedArray(Call($f, a, b)) + $f(a::AbstractMatrix, b::TrackedVector) = TrackedArray(Call($f, a, b)) + + $f(a::TrackedVector, b::TrackedVector) = TrackedArray(Call($f, a, b)) + $f(a::TrackedVector, b::AbstractVector) = TrackedArray(Call($f, a, b)) + $f(a::AbstractVector, b::TrackedVector) = TrackedArray(Call($f, a, b)) + end +end function back(::typeof(*), Δ, a::AbstractMatrix, b::AbstractVecOrMat) @back(a, A_mul_Bt(Δ, data(b))) @back(b, At_mul_B(data(a), Δ)) end +function back(::typeof(Ac_mul_B), Δ, a::AbstractVecOrMat{<:Real}, b::AbstractVecOrMat{<:Real}) + @back(a, A_mul_Bt(Δ, data(b))') + @back(b, *(data(a), Δ)) +end + # Fast path for matrix-vector function back(::typeof(*), Δ::AbstractVector, W::TrackedMatrix, x::AbstractVector) if isleaf(W) diff --git a/test/tracker.jl b/test/tracker.jl index 52a73a07..69f37367 100644 --- a/test/tracker.jl +++ b/test/tracker.jl @@ -9,6 +9,8 @@ gradtest(f, dims...) = gradtest(f, rand.(dims)...) @test gradtest((x, W, b) -> σ.(W*x .+ b), 5, (2,5), 2) @test gradtest((x, W, b) -> σ.(W*x .+ b), (5,3), (2,5), 2) +@test gradtest((w, x) -> w'*x, randn(10, 2), randn(10)) + @test gradtest(x -> sin.(sum(x, (2, 3))), (3,4,5)) @test gradtest(x -> softmax(x).*(1:3), 3) From bdf02e42aee308125cf3a9a7a05bb3f7d24d4942 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 8 Nov 2017 22:00:31 +0000 Subject: [PATCH 19/32] test tweaks --- test/tracker.jl | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/test/tracker.jl b/test/tracker.jl index 69f37367..f2a369f8 100644 --- a/test/tracker.jl +++ b/test/tracker.jl @@ -39,18 +39,4 @@ end 2y + x end -for T in [Float32, Float64] - @test isa(param(T(1)), TrackedArray{T, 0}) - @test isa(param(rand(T, 2)), TrackedArray{T, 1}) - @test isa(param(rand(T, 2,2)), TrackedArray{T, 2}) -end - -# TODO: do we wand this behaviour ?? 
-F = typeof(AbstractFloat(1)) -for T in [Int32, Int64] - @test isa(param(T(1)), TrackedArray{F, 0}) - @test isa(param(rand(T, 2)), TrackedArray{F, 1}) - @test isa(param(rand(T, 2,2)), TrackedArray{F, 2}) -end - end #testset From e5d99d784ec23d32e679b9f5a72cacb32ac5d361 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 9 Nov 2017 14:53:26 +0000 Subject: [PATCH 20/32] fixes #79 --- src/onehot.jl | 11 +++++++++-- src/tracker/Tracker.jl | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/onehot.jl b/src/onehot.jl index 5414773c..f8061063 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -1,3 +1,5 @@ +import Base: * + struct OneHotVector <: AbstractVector{Bool} ix::UInt32 of::UInt32 @@ -7,7 +9,7 @@ Base.size(xs::OneHotVector) = (Int64(xs.of),) Base.getindex(xs::OneHotVector, i::Integer) = i == xs.ix -Base.:*(A::AbstractMatrix, b::OneHotVector) = A[:, b.ix] +A::AbstractMatrix * b::OneHotVector = A[:, b.ix] struct OneHotMatrix{A<:AbstractVector{OneHotVector}} <: AbstractMatrix{Bool} height::Int @@ -18,7 +20,7 @@ Base.size(xs::OneHotMatrix) = (Int64(xs.height),length(xs.data)) Base.getindex(xs::OneHotMatrix, i::Int, j::Int) = xs.data[j][i] -Base.:*(A::AbstractMatrix, B::OneHotMatrix) = A[:, map(x->x.ix, B.data)] +A::AbstractMatrix * B::OneHotMatrix = A[:, map(x->x.ix, B.data)] Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs...]) @@ -47,3 +49,8 @@ argmax(y::AbstractVector, labels = 1:length(y)) = argmax(y::AbstractMatrix, l...) = squeeze(mapslices(y -> argmax(y, l...), y, 1), 1) + +# Ambiguity hack + +a::TrackedMatrix * b::OneHotVector = TrackedArray(Tracker.Call(*, a, b)) +a::TrackedMatrix * b::OneHotMatrix = TrackedArray(Tracker.Call(*, a, b)) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 5e26a051..3a64fcb7 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -1,6 +1,6 @@ module Tracker -export TrackedArray, param, back! +export TrackedArray, TrackedVector, TrackedMatrix, param, back! 
data(x) = x istracked(x) = false From 2cb94981a0176f070eb2dec31c00ef125613ce3f Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Fri, 27 Oct 2017 12:05:37 +0100 Subject: [PATCH 21/32] gpu-ready log --- src/Flux.jl | 1 + src/layers/stateless.jl | 4 +-- src/numeric.jl | 80 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 src/numeric.jl diff --git a/src/Flux.jl b/src/Flux.jl index acefff19..ce3861e5 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -21,6 +21,7 @@ include("optimise/Optimise.jl") using .Optimise include("utils.jl") +include("numeric.jl") include("onehot.jl") include("treelike.jl") diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 3931c216..56d18349 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -3,12 +3,12 @@ mse(ŷ, y) = sum((ŷ .- y).^2)/length(y) crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat) = - -sum(y .* log.(ŷ)) / size(y, 2) + -sum(y .* log_fast.(ŷ)) / size(y, 2) @deprecate logloss(x, y) crossentropy(x, y) function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat) logŷ = logŷ .- maximum(logŷ, 1) - ypred = logŷ .- log.(sum(exp.(logŷ), 1)) + ypred = logŷ .- log_fast.(sum(exp.(logŷ), 1)) -sum(y .* ypred) / size(y, 2) end diff --git a/src/numeric.jl b/src/numeric.jl new file mode 100644 index 00000000..9c444043 --- /dev/null +++ b/src/numeric.jl @@ -0,0 +1,80 @@ +using Base.Math: @horner, significand_bits, exponent_raw_max, exponent_bias + +if VERSION < v"0.7.0-DEV.1430" + using Base.Math.fpinttype +else + using Base.uinttype +end + +# log_fast from +# https://github.com/musm/SLEEF.jl/blob/c9dcd2eb090d69ec40790f19798c5fef2aba2616/src/log.jl + +const MLN2 = 6.931471805599453094172321214581765680755001343602552541206800094933936219696955e-01 # log(2) + +@inline float2integer(d::Float64) = (reinterpret(Int64, d) >> significand_bits(Float64)) % Int +@inline float2integer(d::Float32) = (reinterpret(Int32, d) >> significand_bits(Float32)) % Int + +@inline function ilogb2k(d::T) where {T<:Union{Float32,Float64}} + (float2integer(d) & exponent_raw_max(T)) - exponent_bias(T) +end + +@inline function ldexp3k(x::T, e::Int) where {T<:Union{Float32,Float64}} + if VERSION < v"0.7.0-DEV.1430" + reinterpret(T, reinterpret(Unsigned, x) + (Int64(e) << significand_bits(T)) % fpinttype(T)) + else + reinterpret(T, reinterpret(Unsigned, x) + (Int64(e) << significand_bits(T)) % uinttype(T)) + end +end + +""" + log_fast(x) +Compute the natural logarithm of `x`. 
The inverse of the natural logarithm is +the natural expoenential function `exp(x)` +""" +function log_fast end + +let +global log_fast + +c8d = 0.153487338491425068243146 +c7d = 0.152519917006351951593857 +c6d = 0.181863266251982985677316 +c5d = 0.222221366518767365905163 +c4d = 0.285714294746548025383248 +c3d = 0.399999999950799600689777 +c2d = 0.6666666666667778740063 +c1d = 2.0 + +c5f = 0.2392828464508056640625f0 +c4f = 0.28518211841583251953125f0 +c3f = 0.400005877017974853515625f0 +c2f = 0.666666686534881591796875f0 +c1f = 2f0 + +global @inline log_fast_kernel(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d +global @inline log_fast_kernel(x::Float32) = @horner x c1f c2f c3f c4f c5f + +function log_fast(d::T) where {T<:Union{Float32,Float64}} + o = d < realmin(T) + o && (d *= T(Int64(1) << 32) * T(Int64(1) << 32)) + + e = ilogb2k(d * T(1.0/0.75)) + m = ldexp3k(d, -e) + o && (e -= 64) + + x = (m - 1) / (m + 1) + x2 = x * x + + t = log_fast_kernel(x2) + + x = x * t + T(MLN2) * e + + isinf(d) && (x = T(Inf)) + (d < 0 || isnan(d)) && (x = T(NaN)) + d == 0 && (x = -T(Inf)) + + return x +end +end + +log_fast(x::Union{Int32,Int64}) = log_fast(float(x)) From e0657d93ecccf1b1ac924a42909a0c79b9433df4 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 9 Nov 2017 15:03:57 +0000 Subject: [PATCH 22/32] mv numeric.jl to nnlib --- src/Flux.jl | 1 - src/layers/stateless.jl | 2 ++ src/numeric.jl | 80 ----------------------------------------- 3 files changed, 2 insertions(+), 81 deletions(-) delete mode 100644 src/numeric.jl diff --git a/src/Flux.jl b/src/Flux.jl index ce3861e5..acefff19 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -21,7 +21,6 @@ include("optimise/Optimise.jl") using .Optimise include("utils.jl") -include("numeric.jl") include("onehot.jl") include("treelike.jl") diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 56d18349..834068aa 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -1,3 +1,5 @@ +using NNlib: log_fast + # Cost functions mse(ŷ, y) = sum((ŷ .- y).^2)/length(y) diff --git a/src/numeric.jl b/src/numeric.jl deleted file mode 100644 index 9c444043..00000000 --- a/src/numeric.jl +++ /dev/null @@ -1,80 +0,0 @@ -using Base.Math: @horner, significand_bits, exponent_raw_max, exponent_bias - -if VERSION < v"0.7.0-DEV.1430" - using Base.Math.fpinttype -else - using Base.uinttype -end - -# log_fast from -# https://github.com/musm/SLEEF.jl/blob/c9dcd2eb090d69ec40790f19798c5fef2aba2616/src/log.jl - -const MLN2 = 6.931471805599453094172321214581765680755001343602552541206800094933936219696955e-01 # log(2) - -@inline float2integer(d::Float64) = (reinterpret(Int64, d) >> significand_bits(Float64)) % Int -@inline float2integer(d::Float32) = (reinterpret(Int32, d) >> significand_bits(Float32)) % Int - -@inline function ilogb2k(d::T) where {T<:Union{Float32,Float64}} - (float2integer(d) & exponent_raw_max(T)) - exponent_bias(T) -end - -@inline function ldexp3k(x::T, e::Int) where {T<:Union{Float32,Float64}} - if VERSION < v"0.7.0-DEV.1430" - reinterpret(T, reinterpret(Unsigned, x) + (Int64(e) << significand_bits(T)) % fpinttype(T)) - else - reinterpret(T, reinterpret(Unsigned, x) + (Int64(e) << significand_bits(T)) % uinttype(T)) - end -end - -""" - log_fast(x) -Compute the natural logarithm of `x`. 
The inverse of the natural logarithm is -the natural expoenential function `exp(x)` -""" -function log_fast end - -let -global log_fast - -c8d = 0.153487338491425068243146 -c7d = 0.152519917006351951593857 -c6d = 0.181863266251982985677316 -c5d = 0.222221366518767365905163 -c4d = 0.285714294746548025383248 -c3d = 0.399999999950799600689777 -c2d = 0.6666666666667778740063 -c1d = 2.0 - -c5f = 0.2392828464508056640625f0 -c4f = 0.28518211841583251953125f0 -c3f = 0.400005877017974853515625f0 -c2f = 0.666666686534881591796875f0 -c1f = 2f0 - -global @inline log_fast_kernel(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d -global @inline log_fast_kernel(x::Float32) = @horner x c1f c2f c3f c4f c5f - -function log_fast(d::T) where {T<:Union{Float32,Float64}} - o = d < realmin(T) - o && (d *= T(Int64(1) << 32) * T(Int64(1) << 32)) - - e = ilogb2k(d * T(1.0/0.75)) - m = ldexp3k(d, -e) - o && (e -= 64) - - x = (m - 1) / (m + 1) - x2 = x * x - - t = log_fast_kernel(x2) - - x = x * t + T(MLN2) * e - - isinf(d) && (x = T(Inf)) - (d < 0 || isnan(d)) && (x = T(NaN)) - d == 0 && (x = -T(Inf)) - - return x -end -end - -log_fast(x::Union{Int32,Int64}) = log_fast(float(x)) From 8991ce028ca02ed9d4c3286eba3468d2fe6e9ec1 Mon Sep 17 00:00:00 2001 From: Fredrik Bagge Carlson Date: Tue, 14 Nov 2017 17:32:16 +0100 Subject: [PATCH 23/32] Fix bug in rmsprop and adadelta MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `@. p.Δ = η * p.Δ / √acc` parses correctly while `@. p.Δ /= √acc*η` seems to parse like `@. p.Δ /= (√acc*η)`, hence the step size was de facto interpreted as `1/η` --- src/optimise/optimisers.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 95b31b98..1ffd8982 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -38,7 +38,7 @@ function rmsprop(p::Param; η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = 1e-8) acc = zeros(p.x) .+ ϵ function () @. acc = ρ * acc + (1 - ρ) * p.Δ ^ 2 - @. p.Δ /= √acc * η + @. p.Δ = η * p.Δ / √acc end end @@ -46,7 +46,7 @@ function adagrad(p::Param; η::Real = 0.01, ϵ::Real = 1e-8) acc = zeros(p.x) .+ ϵ function () @. acc += p.Δ ^ 2 - @. p.Δ /= √acc * η + @. p.Δ = η * p.Δ / √acc end end From 187fddc11c2f0733d5e6a1644c2167d8bde590ab Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 21 Nov 2017 12:29:02 +0100 Subject: [PATCH 24/32] doc fixes --- docs/src/models/layers.md | 1 + docs/src/training/optimisers.md | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 5d5d2ee8..f92f751a 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -36,5 +36,6 @@ swish These layers don't affect the structure of the network but may improve training times or reduce overfitting. ```@docs +Flux.testmode! 
Dropout ``` diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 3af5604b..56f511e4 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -58,8 +58,5 @@ All optimisers return a function that, when called, will update the parameters p SGD Momentum Nesterov -RMSProp ADAM -ADAGrad -ADADelta ``` From e51268caf57cb259a74a6f7f71bc4235b8891d90 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 21 Nov 2017 12:59:39 +0100 Subject: [PATCH 25/32] mention treelike --- docs/src/models/basics.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index 6fbd0792..02225279 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -151,3 +151,13 @@ m = Chain(x -> x^2, x -> x+1) m(5) # => 26 ``` + +## Layer helpers + +Flux provides a set of helpers for custom layers, which you can enable by calling + +```julia +Flux.treelike(Affine) +``` + +This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md). From 979949d01adab7bec0711771785eb02b6109788f Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 21 Nov 2017 15:25:09 +0100 Subject: [PATCH 26/32] style --- src/optimise/optimisers.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 1ffd8982..abc54090 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -38,7 +38,7 @@ function rmsprop(p::Param; η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = 1e-8) acc = zeros(p.x) .+ ϵ function () @. acc = ρ * acc + (1 - ρ) * p.Δ ^ 2 - @. p.Δ = η * p.Δ / √acc + @. p.Δ *= η / √acc end end @@ -46,7 +46,7 @@ function adagrad(p::Param; η::Real = 0.01, ϵ::Real = 1e-8) acc = zeros(p.x) .+ ϵ function () @. acc += p.Δ ^ 2 - @. p.Δ = η * p.Δ / √acc + @. p.Δ *= η / √acc end end From 11d53781b254bbb0fbe8a1c1313a3b05efc61112 Mon Sep 17 00:00:00 2001 From: skariel Date: Tue, 10 Oct 2017 23:33:37 +0300 Subject: [PATCH 27/32] adding layer normalization --- src/layers/basic.jl | 30 ++++++++++++++++++++++++++++++ src/layers/stateless.jl | 23 +++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 969a261c..03a340df 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -78,3 +78,33 @@ function Base.show(io::IO, l::Dense) l.σ == identity || print(io, ", ", l.σ) print(io, ")") end + +""" + ElementwiseLinear(in::Integer) + +Creates an element-wise linear transformation layer with learnable +vectors α and β: + + y = α .* x .+ b + +The input `x` must be a vector of length `in`, or a batch of vectors represented +as an `in × N` matrix. The out `y` will be a vector or batch of length `in`. 
+""" +struct ElementwiseLinear{T} + α::T + β::T +end + +ElementwiseLinear(in::Integer; initα = ones, initβ = zeros) = + ElementwiseLinear(param(initα(in)), param(initβ(in))) + +treelike(ElementwiseLinear) + +function (a::ElementwiseLinear)(x) + α, β = a.α, a.β + α.*x .+ β +end + +function Base.show(io::IO, l::ElementwiseLinear) + print(io, "ElementwiseLinear(", length(l.α), ")") +end diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 3931c216..8d0276e8 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -12,3 +12,26 @@ function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat) ypred = logŷ .- log.(sum(exp.(logŷ), 1)) -sum(y .* ypred) / size(y, 2) end + +""" + layernormalization(α=1.0, β=0.0) + +Creates a normalization layer based on https://arxiv.org/pdf/1607.06450.pdf + +The differences are: + +1) std here divides by N-1 (as does std in Julia) vs the paper N +2) this layer α and β are constant numbers (i.e. not learnable vectors) + +To achieve the same effect of learnable vectors α and β oe can use +the ElementwiseLinear layer +""" +function layernormalization(α=1.0, β=0.0) + function layer(y) + _mean = mean(y) + _std = sqrt.(sum((y.-_mean).^2) ./ (length(y)-1)) + _std /= α + _mean -= β*_std + return (y .- _mean) ./ _std + end +end From b06884b9123d9168104602c9855e4bc046bdecab Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Mon, 23 Oct 2017 12:53:07 +0100 Subject: [PATCH 28/32] LayerNorm tweaks --- docs/src/models/layers.md | 1 + src/Flux.jl | 2 +- src/layers/basic.jl | 19 +++++++++---------- src/layers/normalisation.jl | 22 ++++++++++++++++++++++ src/layers/stateless.jl | 24 ++++++------------------ 5 files changed, 39 insertions(+), 29 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index f92f751a..1fd87d41 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -38,4 +38,5 @@ These layers don't affect the structure of the network but may improve training ```@docs Flux.testmode! Dropout +LayerNorm ``` diff --git a/src/Flux.jl b/src/Flux.jl index acefff19..df4b1636 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -7,7 +7,7 @@ module Flux using Juno, Requires using Lazy: @forward -export Chain, Dense, RNN, LSTM, Dropout, +export Chain, Dense, RNN, LSTM, Dropout, LayerNorm, SGD, ADAM, Momentum, Nesterov, param, params, mapleaves diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 03a340df..3c47b595 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -80,31 +80,30 @@ function Base.show(io::IO, l::Dense) end """ - ElementwiseLinear(in::Integer) + Diagonal(in::Integer) Creates an element-wise linear transformation layer with learnable vectors α and β: y = α .* x .+ b -The input `x` must be a vector of length `in`, or a batch of vectors represented -as an `in × N` matrix. The out `y` will be a vector or batch of length `in`. +The input `x` must be a array where `size(x, 1) == in`. 
""" -struct ElementwiseLinear{T} +struct Diagonal{T} α::T β::T end -ElementwiseLinear(in::Integer; initα = ones, initβ = zeros) = - ElementwiseLinear(param(initα(in)), param(initβ(in))) +Diagonal(in::Integer; initα = ones, initβ = zeros) = + Diagonal(param(initα(in)), param(initβ(in))) -treelike(ElementwiseLinear) +treelike(Diagonal) -function (a::ElementwiseLinear)(x) +function (a::Diagonal)(x) α, β = a.α, a.β α.*x .+ β end -function Base.show(io::IO, l::ElementwiseLinear) - print(io, "ElementwiseLinear(", length(l.α), ")") +function Base.show(io::IO, l::Diagonal) + print(io, "Diagonal(", length(l.α), ")") end diff --git a/src/layers/normalisation.jl b/src/layers/normalisation.jl index 08c21428..d296b0a3 100644 --- a/src/layers/normalisation.jl +++ b/src/layers/normalisation.jl @@ -43,3 +43,25 @@ function (a::Dropout)(x) end _testmode!(a::Dropout, test) = (a.active = !test) + +""" + LayerNorm(h::Integer) + +A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be +used with recurrent hidden states of size `h`. Normalises the mean/stddev of +each input before applying a per-neuron gain/bias. +""" +struct LayerNorm{T} + diag::Diagonal{T} +end + +LayerNorm(h::Integer) = + LayerNorm(Diagonal(h)) + +treelike(LayerNorm) + +(a::LayerNorm)(x) = a.diag(normalise(x)) + +function Base.show(io::IO, l::LayerNorm) + print(io, "LayerNorm(", length(l.diag.α), ")") +end diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 8d0276e8..2a4b9a7c 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -14,24 +14,12 @@ function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat) end """ - layernormalization(α=1.0, β=0.0) + normalise(x::AbstractVecOrMat) -Creates a normalization layer based on https://arxiv.org/pdf/1607.06450.pdf - -The differences are: - -1) std here divides by N-1 (as does std in Julia) vs the paper N -2) this layer α and β are constant numbers (i.e. not learnable vectors) - -To achieve the same effect of learnable vectors α and β oe can use -the ElementwiseLinear layer +Normalise each column of `x` to mean 0 and standard deviation 1. """ -function layernormalization(α=1.0, β=0.0) - function layer(y) - _mean = mean(y) - _std = sqrt.(sum((y.-_mean).^2) ./ (length(y)-1)) - _std /= α - _mean -= β*_std - return (y .- _mean) ./ _std - end +function normalise(x::AbstractVecOrMat) + μ′ = mean(x, 1) + σ′ = std(x, 1, mean = μ′) + return (x .- μ′) ./ σ′ end From 351d3d4771da08e53d2a2f89547f91d5fdb47beb Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Tue, 21 Nov 2017 17:04:04 +0100 Subject: [PATCH 29/32] std derivative --- src/layers/basic.jl | 4 ++-- src/tracker/lib.jl | 6 ++++++ test/tracker.jl | 3 +++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 3c47b595..aa101c43 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -83,9 +83,9 @@ end Diagonal(in::Integer) Creates an element-wise linear transformation layer with learnable -vectors α and β: +vectors `α` and `β`: - y = α .* x .+ b + y = α .* x .+ β The input `x` must be a array where `size(x, 1) == in`. """ diff --git a/src/tracker/lib.jl b/src/tracker/lib.jl index aab26dfe..5065a40d 100644 --- a/src/tracker/lib.jl +++ b/src/tracker/lib.jl @@ -58,6 +58,12 @@ Base.findfirst(xs::TrackedArray, args...) = findfirst(xs.data, args...) 
Base.mean(xs::TrackedArray) = TrackedArray(Call(mean, xs), toarray(xs.data, mean(xs.data))) Base.mean(xs::TrackedArray, region) = TrackedArray(Call(mean, xs, region)) +# Hacks to get std working +Base.std(x::TrackedArray; mean = Base.mean(x)) = + sqrt.(sum((x .- mean).^2) ./ (length(x)-1)) +Base.std(x::TrackedArray, dim; mean = Base.mean(x, dim)) = + sqrt.(sum((x .- mean).^2, dim) ./ (size(x, dim)-1)) + back(::typeof(mean), Δ, xs::TrackedArray) = back(xs, similar(xs.data) .= Δ ./ length(xs.data)) back(::typeof(mean), Δ, xs::TrackedArray, region) = back(xs, similar(xs.data) .= Δ ./ prod(size(xs.data, region...))) diff --git a/test/tracker.jl b/test/tracker.jl index f2a369f8..81a72566 100644 --- a/test/tracker.jl +++ b/test/tracker.jl @@ -34,6 +34,9 @@ gradtest(f, dims...) = gradtest(f, rand.(dims)...) @test gradtest(x -> mean(x, [1, 2]), rand(2, 3, 4)) end +@test gradtest(x -> std(x), rand(5,5)) +@test gradtest(x -> std(x, 1), rand(5,5)) + @test gradtest(rand(5)) do x y = x.^2 2y + x From 2d33f19346b48dd76559926b62ba1dd7cd978ba7 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Wed, 29 Nov 2017 16:45:50 +0000 Subject: [PATCH 30/32] onehot unk arg --- src/onehot.jl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/onehot.jl b/src/onehot.jl index f8061063..f94fb93e 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -42,7 +42,14 @@ function onehot(l, labels) OneHotVector(i, length(labels)) end -onehotbatch(ls, labels) = OneHotMatrix(length(labels), [onehot(l, labels) for l in ls]) +function onehot(l, labels, unk) + i = findfirst(labels, l) + i > 0 || return onehot(unk, labels) + OneHotVector(i, length(labels)) +end + +onehotbatch(ls, labels, unk...) = + OneHotMatrix(length(labels), [onehot(l, labels, unk...) for l in ls]) argmax(y::AbstractVector, labels = 1:length(y)) = labels[findfirst(y, maximum(y))] From 19039f48819835bf01ea6f2f69792f53dfe7d4f8 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 30 Nov 2017 13:37:38 +0000 Subject: [PATCH 31/32] export sigmoid --- src/Flux.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Flux.jl b/src/Flux.jl index df4b1636..7671ddd2 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -12,7 +12,7 @@ export Chain, Dense, RNN, LSTM, Dropout, LayerNorm, param, params, mapleaves using NNlib -export σ, relu, leakyrelu, elu, swish, softmax +export σ, sigmoid, relu, leakyrelu, elu, swish, softmax include("tracker/Tracker.jl") using .Tracker From cab235a57863558aa060a28776f8934d5a0a0ed4 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 30 Nov 2017 13:51:31 +0000 Subject: [PATCH 32/32] gpu compat --- src/tracker/Tracker.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 3a64fcb7..74ed2d75 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -40,7 +40,7 @@ TrackedArray(x::AbstractArray) = TrackedArray(Call(nothing), x, zeros(x)) isleaf(x::TrackedArray) = x.f == Call(nothing) -param(xs) = TrackedArray(AbstractFloat.(xs)) +param(xs) = TrackedArray(map(x -> AbstractFloat(x), xs)) param(xs::Real) = param(fill(xs)) istracked(x::TrackedArray) = true
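
For reference, the new regularisation layers and `Flux.testmode!` compose with the existing layers along these lines (a minimal sketch; the layer sizes, dropout probability and random input are arbitrary placeholders):

```julia
using Flux

m = Chain(Dense(784, 128, relu),
          Dropout(0.5),     # while active, zeroes each activation with probability 0.5
                            # and rescales the survivors by 1/(1 - 0.5)
          LayerNorm(128),   # normalises the activations, then applies a learnable
                            # per-neuron gain and bias (the Diagonal layer)
          Dense(128, 10))

x = rand(784)
m(x)                      # training mode: dropout is applied stochastically

Flux.testmode!(m)         # put Dropout into test mode for evaluation
m(x)                      # deterministic: Dropout passes its input through unchanged
Flux.testmode!(m, false)  # back to training mode
```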