Merge #563

563: noise shape for dropout r=MikeInnes a=chengchingwen

I add noise-shape support for dropout, similar to the `noise_shape` argument in [`tf.nn.dropout`](https://www.tensorflow.org/api_docs/python/tf/nn/dropout).

Co-authored-by: chengchingwen <adgjl5645@hotmail.com>
Co-authored-by: Peter <adgjl5645@hotmail.com>
commit 68ba6e4e2f
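For context, a minimal usage sketch of what this PR enables (hypothetical model and data; only `Dropout` and its new `dims` keyword come from the diff below, and the v0.8-era Flux API is assumed):

```julia
using Flux

# With dims = 1 on a (features × batch) activation, the dropout mask has
# size (features, 1) and is broadcast across the batch: a dropped feature
# is dropped for every sample at once, analogous to noise_shape in
# tf.nn.dropout.
m = Chain(Dense(10, 5), Dropout(0.5, dims = 1))
x = rand(Float32, 10, 32)  # hypothetical batch of 32 samples
y = m(x)                   # whole rows of the activation are zeroed together
```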
NEWS.md
@@ -3,6 +3,7 @@
 # v0.8.0

+* [Dropout now has a `dims` argument for specifying the unbroadcast dimensions.](https://github.com/FluxML/Flux.jl/pull/563)
 * New [ConvTranspose layer](https://github.com/FluxML/Flux.jl/pull/311).
 * New [Maxout layer](https://github.com/FluxML/Flux.jl/pull/647)
 * Datasets are now [hash verified on download](https://github.com/FluxML/Flux.jl/pull/585) to avoid corruption.
@@ -13,32 +13,50 @@ end
 _testmode!(m, test) = nothing

 """
-    Dropout(p)
+    Dropout(p; dims = :)

 A Dropout layer. For each input, either sets that input to `0` (with probability
-`p`) or scales it by `1/(1-p)`. This is used as a regularisation, i.e. it
-reduces overfitting during training.
+`p`) or scales it by `1/(1-p)`. The `dims` argument specifies the unbroadcast
+dimensions, i.e. `dims=1` applies dropout along columns and `dims=2` along rows. This is
+used as a regularisation, i.e. it reduces overfitting during training. See also [`dropout`](@ref).

 Does nothing to the input once in [`testmode!`](@ref).
 """
 mutable struct Dropout{F}
   p::F
+  dims::Union{Colon, Int, NTuple{N, Int} where N}
   active::Bool
 end

-function Dropout(p)
+function Dropout(p; dims = :)
   @assert 0 ≤ p ≤ 1
-  Dropout{typeof(p)}(p, true)
+  Dropout{typeof(p)}(p, dims, true)
 end

+_dropout_shape(s, ::Colon) = size(s)
+_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...)
+
 _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0)

+"""
+    dropout(x, p; dims = :)
+
+The dropout function. For each input, either sets that input to `0` (with probability
+`p`) or scales it by `1/(1-p)`. The `dims` argument specifies the unbroadcast
+dimensions, i.e. `dims=1` applies dropout along columns and `dims=2` along rows. This is
+used as a regularisation, i.e. it reduces overfitting during training.
+"""
+function dropout(x, p; dims = :)
+  y = similar(x, _dropout_shape(x, dims))
+  rand!(y)
+  y .= _dropout_kernel.(y, p, 1 - p)
+  return x .* y
+end
+
 function (a::Dropout)(x)
   a.active || return x
-  y = similar(x)
-  rand!(y)
-  y .= _dropout_kernel.(y, a.p, 1 - a.p)
-  return x .* y
+  return dropout(x, a.p; dims = a.dims)
 end

 _testmode!(a::Dropout, test) = (a.active = !test)
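To make the `dims` semantics concrete, here is a short REPL-style sketch of the mask shapes `_dropout_shape` produces (the two definitions are copied from the hunk above so the snippet runs standalone):

```julia
# Copied from the hunk above.
_dropout_shape(s, ::Colon) = size(s)
_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...)

x = rand(100, 50)
_dropout_shape(x, :)  # (100, 50): independent keep/drop decision per element
_dropout_shape(x, 1)  # (100, 1):  mask varies along dim 1, reused for every column
_dropout_shape(x, 2)  # (1, 50):   one decision per column, shared by every row
```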
@@ -26,6 +26,16 @@ using Flux.Tracker: data
   testmode!(m)
   y = m(x)
   @test count(a->a == 0, y) == 0
+
+  x = rand(100, 50)
+  m = Dropout(0.5, dims = 2)
+  y = m(x)
+  c = map(i->count(a->a==0, @view y[i, :]), 1:100)
+  @test minimum(c) == maximum(c)
+  m = Dropout(0.5, dims = 1)
+  y = m(x)
+  c = map(i->count(a->a==0, @view y[:, i]), 1:50)
+  @test minimum(c) == maximum(c)
 end

 @testset "BatchNorm" begin
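Why the new tests pass: with `dims = 2` the mask has size `(1, 50)`, so each column is either zeroed in every row or kept in every row, and every row therefore contains the same number of zeros; the `dims = 1` case is symmetric. A standalone sketch of the same invariant in plain Julia (no Flux required):

```julia
# Emulate Dropout(0.5, dims = 2) on a 100×50 input: one keep/drop draw per column.
mask = rand(1, 50) .> 0.5
y = rand(100, 50) .* mask
c = map(i -> count(a -> a == 0, @view(y[i, :])), 1:100)
@assert minimum(c) == maximum(c)  # every row zeroes the same set of columns
```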