From 37d58e16dd234b16fb59bb2dd9cf1a37f54fcbde Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sat, 8 Feb 2020 16:33:18 +0530 Subject: [PATCH 01/10] common questions answered in docs --- docs/src/models/basics.md | 18 ++++++++++++++++++ docs/src/training/training.md | 17 +++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index d83fc462..76f93684 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -219,3 +219,21 @@ Flux.@functor Affine ``` This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md). + +By default all the fields in the `Affine` type are collected as its parameters, however, in some cases it may be desired to hold other metadata in our "layers" that may not be needed for training, and are hence supposed to be ignored while the parameters are collected. With Flux, it is possible to mark the fields of our layers that are trainable in two ways. + +The first way of achieving this is through overloading the `trainable` function. + +```julia +Flux.trainable(a::Affine) = (a.W, a.b,) +``` + +To add other fields is simply to add them to the tuple. + +Another way of achieving this is through the `@functor` macro. Here, wee can mark the fields we are interested in like so: + +```julia +Flux.@functor Affine (W,) +``` + +However, doing this requires the `struct` to have a corresponding constructor that accepts those parameters. diff --git a/docs/src/training/training.md b/docs/src/training/training.md index b42db7c9..7680a776 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -41,6 +41,23 @@ The model to be trained must have a set of tracked parameters that are used to c Such an object contains a reference to the model's parameters, not a copy, such that after their training, the model behaves according to their updated values. +When it is desired to not include all the model parameters (for e.g. transfer learning), we can simply not pass in those layers into our call to `params`. + +Consider the simple multi-layer model where we want to omit optimising the second layer. This setup would look something like so: + +```julia +m = Chain( + Dense(784, 64, σ), + Dense(64, 32), + Dense(32, 10), softmax) + +ps = Flux.params(m[1], m[3:end]) +``` + +`ps` now holds a reference to only the parameters of the layers passed to it. + +Handling all the parameters on a layer by layer basis is explained in the [Layer Helpers](../models/basics.md) section. + ## Datasets The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point: From ee6d950696e4f163ded8098624f4c8a79d978694 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 12 Feb 2020 11:25:50 +0530 Subject: [PATCH 02/10] Update docs/src/models/basics.md Co-Authored-By: Carlo Lucibello --- docs/src/models/basics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index 76f93684..8982fdfb 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -228,7 +228,7 @@ The first way of achieving this is through overloading the `trainable` function. Flux.trainable(a::Affine) = (a.W, a.b,) ``` -To add other fields is simply to add them to the tuple. 
+Only the fields returned by `trainable` will be collected as trainable parameters of the layer when calling `Flux.params`.
 
 Another way of achieving this is through the `@functor` macro. Here, wee can mark the fields we are interested in like so:
 
From d5ed9a447858745a4551d646d54788ed94074c23 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi
Date: Wed, 12 Feb 2020 11:26:11 +0530
Subject: [PATCH 03/10] Update docs/src/models/basics.md

Co-Authored-By: Carlo Lucibello
---
 docs/src/models/basics.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md
index 8982fdfb..3f43f29d 100644
--- a/docs/src/models/basics.md
+++ b/docs/src/models/basics.md
@@ -231,7 +231,7 @@ Flux.trainable(a::Affine) = (a.W, a.b,)
 
 Only the fields returned by `trainable` will be collected as trainable parameters of the layer when calling `Flux.params`.
 
 Another way of achieving this is through the `@functor` macro. Here, wee can mark the fields we are interested in like so:
-
+Another way of achieving this is through the `@functor` macro. Here, we can mark the fields we are interested in by grouping them in the second argument:
 ```julia
 Flux.@functor Affine (W,)
 ```
From 27949693f334b4d803526efa585cc957320e957c Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi
Date: Mon, 2 Mar 2020 12:40:19 +0530
Subject: [PATCH 04/10] refactor

---
 docs/make.jl                  |  3 +-
 docs/src/models/advanced.md   | 61 +++++++++++++++++++++++++++++++++++
 docs/src/models/basics.md     | 18 +----------
 docs/src/training/training.md | 15 +--------
 4 files changed, 65 insertions(+), 32 deletions(-)
 create mode 100644 docs/src/models/advanced.md

diff --git a/docs/make.jl b/docs/make.jl
index b950e959..365cdfc0 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -13,7 +13,8 @@ makedocs(modules=[Flux, NNlib],
           ["Basics" => "models/basics.md",
            "Recurrence" => "models/recurrence.md",
            "Regularisation" => "models/regularisation.md",
-           "Model Reference" => "models/layers.md"],
+           "Model Reference" => "models/layers.md",
+           "Advanced Model Building" => "models/advanced.md"],
          "Training Models" =>
           ["Optimisers" => "training/optimisers.md",
            "Training" => "training/training.md"],
diff --git a/docs/src/models/advanced.md b/docs/src/models/advanced.md
new file mode 100644
index 00000000..4a023709
--- /dev/null
+++ b/docs/src/models/advanced.md
@@ -0,0 +1,61 @@
+# Advanced Model Building and Customisation
+
+Here we describe some of the more advanced features that Flux provides, which give you more control over model building.
+
+## Customising Parameter Collection for a Model
+
+Throughout, we will use our example `Affine` layer from the [basics](basics.md#Building-Layers-1) as a reference.
+
+By default, all the fields in the `Affine` type are collected as its parameters. However, in some cases our "layers" may need to hold other metadata that is not needed for training and should therefore be ignored when the parameters are collected. With Flux, it is possible to mark the trainable fields of our layers in two ways.
+
+The first way of achieving this is through overloading the `trainable` function.
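+
+As a sketch of why this matters, consider a hypothetical layer that carries a pre-allocated scratch array alongside its weights. Since that array is an `AbstractArray`, `params` would pick it up by default, and overloading `trainable` is how we exclude it. This `MyLayer` is purely illustrative and not part of Flux or of these docs:
+
+```julia
+using Flux
+
+struct MyLayer
+  W
+  b
+  buffer   # scratch storage, not a learnable parameter
+end
+
+Flux.@functor MyLayer
+
+# Restrict parameter collection to the learnable fields only.
+Flux.trainable(m::MyLayer) = (m.W, m.b)
+```
+
+The REPL session below demonstrates the same mechanism on the `Affine` layer: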
+
+```julia-repl
+julia> @functor Affine
+
+julia> a = Affine(rand(3,3), rand(3))
+Affine{Array{Float64,2},Array{Float64,1}}([0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955])
+
+julia> Flux.params(a) # default behavior
+Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]])
+
+julia> Flux.trainable(a::Affine) = (a.W,)
+
+julia> Flux.params(a)
+Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297]])
+```
+
+Only the fields returned by `trainable` will be collected as trainable parameters of the layer when calling `Flux.params`.
+
+Another way of achieving this is through the `@functor` macro directly. Here, we can mark the fields we are interested in by grouping them in the second argument:
+
+```julia
+Flux.@functor Affine (W,)
+```
+
+However, doing this requires the `struct` to have a corresponding constructor that accepts those parameters.
+
+## Freezing Layer Parameters
+
+When we do not want to include all the model parameters (e.g. for transfer learning), we can simply leave those layers out of our call to `params`.
+
+Consider a simple multi-layer model where we want to avoid optimising the first two `Dense` layers. The setup would look something like this:
+
+```julia
+m = Chain(
+      Dense(784, 64, σ),
+      Dense(64, 32),
+      Dense(32, 10), softmax)
+
+ps = Flux.params(m[3:end])
+```
+
+`ps` now holds a reference to only the parameters of the layers passed to it.
+
+During training, the gradients will now only be applied to the last `Dense` layer (the final `softmax` is stateless and has no parameters of its own), so only that layer will have its parameters changed.
+
+`Flux.params` also takes multiple inputs to make it easy to collect parameters from heterogeneous models with a single call. For instance, if we only wanted to omit optimising the second `Dense` layer in the previous example, it would look like this:
+
+```julia
+Flux.params(m[1], m[3:end])
+```
diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md
index 3f43f29d..a5e3ca9a 100644
--- a/docs/src/models/basics.md
+++ b/docs/src/models/basics.md
@@ -220,20 +220,4 @@ Flux.@functor Affine
 ```
 
 This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md).
 
-By default all the fields in the `Affine` type are collected as its parameters, however, in some cases it may be desired to hold other metadata in our "layers" that may not be needed for training, and are hence supposed to be ignored while the parameters are collected. With Flux, it is possible to mark the fields of our layers that are trainable in two ways.
-
-The first way of achieving this is through overloading the `trainable` function.
-
-```julia
-Flux.trainable(a::Affine) = (a.W, a.b,)
-```
-
-Only the fields returned by `trainable` will be collected as trainable parameters of the layer when calling `Flux.params`.
-
-Another way of achieving this is through the `@functor` macro. Here, wee can mark the fields we are interested in like so:
-Another way of achieving this is through the `@functor` macro. Here, we can mark the fields we are interested in by grouping them in the second argument:
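+
+To round the section off, here is a minimal sketch of how the restricted parameter set is used in a manual training step. It assumes a `loss` function, an optimiser `opt`, and data `x`, `y` are already defined elsewhere; only the parameters referenced by `ps` are updated:
+
+```julia
+# Compute gradients only for the collected (non-frozen) parameters...
+gs = gradient(() -> loss(x, y), ps)
+
+# ...and apply the optimiser update to those parameters alone.
+Flux.Optimise.update!(opt, ps, gs)
+```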
-```julia
-Flux.@functor Affine (W,)
-```
-
-However, doing this requires the `struct` to have a corresponding constructor that accepts those parameters.
+For some more helpful tricks, including parameter freezing, please check out the [advanced usage guide](advanced.md).
diff --git a/docs/src/training/training.md b/docs/src/training/training.md
index 7680a776..153d0278 100644
--- a/docs/src/training/training.md
+++ b/docs/src/training/training.md
@@ -43,20 +43,7 @@ Such an object contains a reference to the model's parameters, not a copy, such
 
 When it is desired to not include all the model parameters (for e.g. transfer learning), we can simply not pass in those layers into our call to `params`.
 
-Consider the simple multi-layer model where we want to omit optimising the second layer. This setup would look something like so:
-
-```julia
-m = Chain(
-      Dense(784, 64, σ),
-      Dense(64, 32),
-      Dense(32, 10), softmax)
-
-ps = Flux.params(m[1], m[3:end])
-```
-
-`ps` now holds a reference to only the parameters of the layers passed to it.
-
-Handling all the parameters on a layer by layer basis is explained in the [Layer Helpers](../models/basics.md) section.
+Handling all the parameters on a layer by layer basis is explained in the [Layer Helpers](../models/basics.md) section. Also, for freezing model parameters, see the [Advanced Usage Guide](../models/advanced.md).
 
 ## Datasets
 
From bb5350591f5f9d84fbc1397da7aea7bd6d54c3a6 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi
Date: Mon, 2 Mar 2020 12:42:33 +0530
Subject: [PATCH 05/10] cleanup

---
 docs/src/training/training.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/src/training/training.md b/docs/src/training/training.md
index 153d0278..3775f5ba 100644
--- a/docs/src/training/training.md
+++ b/docs/src/training/training.md
@@ -41,8 +41,6 @@ The model to be trained must have a set of tracked parameters that are used to c
 
 Such an object contains a reference to the model's parameters, not a copy, such that after their training, the model behaves according to their updated values.
 
-When it is desired to not include all the model parameters (for e.g. transfer learning), we can simply not pass in those layers into our call to `params`.
-
 Handling all the parameters on a layer by layer basis is explained in the [Layer Helpers](../models/basics.md) section. Also, for freezing model parameters, see the [Advanced Usage Guide](../models/advanced.md).
## Datasets From 0def3523839f319e7b6b0e0f5343df657060fa72 Mon Sep 17 00:00:00 2001 From: Ian Date: Tue, 3 Mar 2020 11:49:34 -0500 Subject: [PATCH 06/10] Prevent breakage due to new `active` field in BatchNorm --- src/layers/normalise.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index fc781f70..0647e6b4 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -157,6 +157,8 @@ mutable struct BatchNorm{F,V,W,N} active::Union{Bool, Nothing} end +BatchNorm(λ, β, γ, μ, σ², ϵ, momentum) = BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing) + BatchNorm(chs::Integer, λ = identity; initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = BatchNorm(λ, initβ(chs), initγ(chs), From d9ea5fba761bb8471bac878237a7b4b836dbcf00 Mon Sep 17 00:00:00 2001 From: Ian Date: Tue, 3 Mar 2020 11:55:39 -0500 Subject: [PATCH 07/10] add `active` helpers for other normalise layers --- src/layers/normalise.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 0647e6b4..f9ef4de8 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -40,6 +40,8 @@ mutable struct Dropout{F,D} active::Union{Bool, Nothing} end +Dropout(p, dims) = Dropout(p, dims, nothing) + function Dropout(p; dims = :) @assert 0 ≤ p ≤ 1 Dropout{typeof(p),typeof(dims)}(p, dims, nothing) @@ -77,6 +79,8 @@ mutable struct AlphaDropout{F} end end +AlphaDropout(p) = AlphaDropout(p, nothing) + function (a::AlphaDropout)(x) _isactive(a) || return x λ = eltype(x)(1.0507009873554804934193349852946) @@ -253,6 +257,8 @@ mutable struct InstanceNorm{F,V,W,N} active::Union{Bool, Nothing} end +InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum) = InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing) + InstanceNorm(chs::Integer, λ = identity; initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = InstanceNorm(λ, initβ(chs), initγ(chs), @@ -344,6 +350,8 @@ mutable struct GroupNorm{F,V,W,N,T} active::Union{Bool, Nothing} end +GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum) = GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum, nothing) + GroupNorm(chs::Integer, G::Integer, λ = identity; initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = GroupNorm(G, λ, initβ(chs), initγ(chs), From d63fcf2cb46856fa091f6c353b33e09a649dd314 Mon Sep 17 00:00:00 2001 From: Ian Date: Tue, 3 Mar 2020 13:05:03 -0500 Subject: [PATCH 08/10] add depreciation reminder --- src/layers/normalise.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index f9ef4de8..858d4986 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -40,6 +40,7 @@ mutable struct Dropout{F,D} active::Union{Bool, Nothing} end +# TODO: deprecate in v0.11 Dropout(p, dims) = Dropout(p, dims, nothing) function Dropout(p; dims = :) @@ -79,6 +80,7 @@ mutable struct AlphaDropout{F} end end +# TODO: deprecate in v0.11 AlphaDropout(p) = AlphaDropout(p, nothing) function (a::AlphaDropout)(x) @@ -161,6 +163,7 @@ mutable struct BatchNorm{F,V,W,N} active::Union{Bool, Nothing} end +# TODO: deprecate in v0.11 BatchNorm(λ, β, γ, μ, σ², ϵ, momentum) = BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing) BatchNorm(chs::Integer, λ = identity; @@ -257,6 +260,7 @@ mutable struct InstanceNorm{F,V,W,N} active::Union{Bool, Nothing} end +# TODO: deprecate in v0.11 InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum) = InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, 
nothing) InstanceNorm(chs::Integer, λ = identity; @@ -350,6 +354,7 @@ mutable struct GroupNorm{F,V,W,N,T} active::Union{Bool, Nothing} end +# TODO: deprecate in v0.11 GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum) = GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum, nothing) GroupNorm(chs::Integer, G::Integer, λ = identity; From 078ad7dd500f75dfa3f125b72742e9a8c07b5f6a Mon Sep 17 00:00:00 2001 From: Ian Date: Tue, 3 Mar 2020 13:05:23 -0500 Subject: [PATCH 09/10] bump version to 0.10.3 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index a27d766b..451a73b7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.10.2" +version = "0.10.3" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" From 61f66e3dcdfe64bfe630bc2183420197bc6babe0 Mon Sep 17 00:00:00 2001 From: Ian Date: Tue, 3 Mar 2020 13:20:02 -0500 Subject: [PATCH 10/10] remove unnecessary helper for AlphaDropout --- src/layers/normalise.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 858d4986..250a06fc 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -80,9 +80,6 @@ mutable struct AlphaDropout{F} end end -# TODO: deprecate in v0.11 -AlphaDropout(p) = AlphaDropout(p, nothing) - function (a::AlphaDropout)(x) _isactive(a) || return x λ = eltype(x)(1.0507009873554804934193349852946)
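
To illustrate what the compatibility constructors introduced above provide, here is a stand-alone sketch (not part of the patch series; the argument values are arbitrary). The old seven-argument positional form still constructs a `BatchNorm`, with the new `active` field defaulting to `nothing`:

```julia
using Flux

# Positional construction without the trailing `active` argument, as written
# against the previous field layout; the compatibility method fills in
# `active = nothing`.
bn = BatchNorm(identity,
               zeros(Float32, 3), ones(Float32, 3),  # β, γ
               zeros(Float32, 3), ones(Float32, 3),  # μ, σ²
               1f-5, 0.1f0)                          # ϵ, momentum

bn.active === nothing  # true
```

The same positional fallbacks are added for `Dropout`, `InstanceNorm` and `GroupNorm`, so code or serialised models written before the `active` field existed keep working until the constructors are deprecated in v0.11.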