873: Make bias optional r=MikeInnes a=dhairyagandhi96

Addresses #868 



Co-authored-by: Dhairya Gandhi <dhairya@juliacopmuting.com>
Commit 5d9acc7e73 by bors[bot], 2020-05-01 13:28:15 +00:00, committed via GitHub.
4 changed files with 319 additions and 37 deletions
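In short: every convolutional layer now accepts `weight` and `bias` keywords, and passing `Flux.Zeros()` as the bias removes it from the forward pass and from optimisation. A minimal usage sketch (array sizes are illustrative, not from the diff):

```julia
using Flux

# A bias-less convolution: Flux.Zeros() stands in for the bias and is ignored by the optimisers.
m = Conv((2, 2), 1 => 3, relu, bias = Flux.Zeros())

x = rand(Float32, 28, 28, 1, 1)
size(m(x))  # (27, 27, 3, 1)
```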


@ -27,6 +27,7 @@ using CuArrays
const use_cuda = Ref(false)
include("utils.jl")
include("zeros.jl")
include("onehot.jl")
include("functor.jl")


@ -30,27 +30,36 @@ function calc_padding(::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T}
end
"""
Conv(size, in => out, σ = identity; init = glorot_uniform,
Conv(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1)
Standard convolutional layer. `size` should be a tuple like `(2, 2)`.
filter = (2,2)
in = 1
out = 16
Conv((2, 2), 1=>16, relu)
Standard convolutional layer. `filter` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively.
Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
# Examples
Apply a `Conv` layer to a 1-channel input using a 2×2 window size, giving us a
Apply a `Conv` layer to a 1-channel input using a 2×2 window filter size, giving us a
16-channel output. Output is activated with ReLU.
```julia
size = (2,2)
filter = (2,2)
in = 1
out = 16
Conv(size, in => out, relu)
Conv(filter, in => out, relu)
```
"""
struct Conv{N,M,F,A,V}
@ -62,7 +71,28 @@ struct Conv{N,M,F,A,V}
dilation::NTuple{N,Int}
end
function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
"""
Conv(weight::AbstractArray, bias::AbstractArray)
Conv(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the convolutional layer with user defined weight and bias arrays.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
There is also a keyword-only constructor available for all convolutional
layers.
```julia
weight = rand(Float32, 3, 3, 5)
bias = zeros(Float32, 5)
Conv(weight = weight,
     bias = bias,
     activation = sigmoid)
```
"""
function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
dilation = expand(Val(N-2), dilation)
@ -70,10 +100,32 @@ function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
return Conv(σ, w, b, stride, pad, dilation)
end
Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
Conv(init(k..., ch...), zeros(ch[2]), σ,
stride = stride, pad = pad, dilation = dilation)
function Conv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
"""
convfilter(filter::Tuple, in=>out)
Constructs a standard convolutional weight matrix with given `filter` and
channels from `in` to `out`.
Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
distribution.
See also: [`depthwiseconvfilter`](@ref)
"""
convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
init = glorot_uniform) where N = init(filter..., ch...)
function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
Conv(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation)
end
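As a sketch of the two construction paths added above (the 5×5 filter and 3=>7 channels are illustrative):

```julia
# Positional form, overriding the bias via the new keyword.
c1 = Conv((5, 5), 3 => 7, relu, bias = Flux.Zeros())

# Keyword-only form, building the weight explicitly with convfilter.
w  = Flux.convfilter((5, 5), 3 => 7)               # 5×5×3×7 Float32 array
c2 = Conv(weight = w, bias = zeros(Float32, 7), activation = relu)
```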
@functor Conv
@ -114,16 +166,22 @@ outdims(l::Conv, isize) =
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
"""
ConvTranspose(size, in => out, σ = identity; init = glorot_uniform,
ConvTranspose(filter, in=>out)
ConvTranspose(filter, in=>out, activation)
ConvTranspose(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1)
Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`.
Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively.
Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == stride * inputsize - stride + 1.
"""
struct ConvTranspose{N,M,F,A,V}
@ -135,18 +193,39 @@ struct ConvTranspose{N,M,F,A,V}
dilation::NTuple{N,Int}
end
function ConvTranspose(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
"""
ConvTranspose(weight::AbstractArray, bias::AbstractArray)
ConvTranspose(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the convolutional transpose layer with user defined weight and bias arrays
used for the forward pass.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
For the keyword-only constructor, see also [`Conv`](@ref)
"""
function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return ConvTranspose(σ, w, b, stride, pad, dilation)
end
ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
ConvTranspose(init(k..., reverse(ch)...), zeros(ch[2]), σ,
function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
ConvTranspose(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N
ConvTranspose(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation)
end
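A matching sketch for `ConvTranspose` (channel counts illustrative); note that the weight for the `in => out` form is built with the channels reversed, as in the constructor above:

```julia
# Standard form with the bias switched off.
ct = ConvTranspose((3, 3), 16 => 8, relu, bias = Flux.Zeros())

# Keyword-only form with an explicit weight array (reversed channels, so 3×3×8×16).
w   = Flux.convfilter((3, 3), reverse(16 => 8))
ct2 = ConvTranspose(weight = w, bias = zeros(Float32, 8), activation = relu)
```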
@functor ConvTranspose
@ -158,9 +237,9 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
batch_size = size(x)[end]
# Create DenseConvDims() that looks like the corresponding conv()
return DenseConvDims((I..., C_in, batch_size), size(c.weight);
stride=c.stride,
padding=c.pad,
dilation=c.dilation,
stride=c.stride,
padding=c.pad,
dilation=c.dilation,
)
end
@ -171,7 +250,7 @@ function (c::ConvTranspose)(x::AbstractArray)
# ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
cdims = conv_transpose_dims(c, x)
return σ.(∇conv_data(x, c.weight, cdims) .+ b)
σ.(∇conv_data(x, c.weight, cdims) .+ b)
end
function Base.show(io::IO, l::ConvTranspose)
@ -190,10 +269,12 @@ end
outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad)
"""
DepthwiseConv(size, in => out, σ = identity; init = glorot_uniform,
DepthwiseConv(filter::Tuple, in=>out)
DepthwiseConv(filter::Tuple, in=>out, activation)
DepthwiseConv(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1)
Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`.
Depthwise convolutional layer. `filter` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively.
Note that `out` must be an integer multiple of `in`.
@ -201,6 +282,10 @@ Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
"""
struct DepthwiseConv{N,M,F,A,V}
@ -212,20 +297,54 @@ struct DepthwiseConv{N,M,F,A,V}
dilation::NTuple{N,Int}
end
function DepthwiseConv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
"""
DepthwiseConv(weight::AbstractArray, bias::AbstractArray)
DepthwiseConv(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the `DepthwiseConv` layer with user defined weight and bias arrays
used for the forward pass.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
For the keyword-only constructor, see also [`Conv`](@ref)
"""
function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return DepthwiseConv(σ, w, b, stride, pad, dilation)
end
function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
"""
depthwiseconvfilter(filter::Tuple, in=>out)
Constructs a depthwise convolutional weight array defined by `filter` and channels
from `in` to `out`.
Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
distribution.
See also: [`convfilter`](@ref)
"""
depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1])
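A short sketch of the shape `depthwiseconvfilter` produces (channel counts illustrative):

```julia
# For in=>out = 3=>6 the channel multiplier is div(6, 3) = 2,
# so the weight has shape (filter..., multiplier, in) = (2, 2, 2, 3).
w = Flux.depthwiseconvfilter((2, 2), 3 => 6)
size(w)  # (2, 2, 2, 3)

d = DepthwiseConv((2, 2), 3 => 6, relu, bias = Flux.Zeros())
```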
function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N
@assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels"
return DepthwiseConv(
init(k..., div(ch[2], ch[1]), ch[1]),
zeros(ch[2]),
weight,
bias,
σ;
stride = stride,
pad = pad,
@ -258,24 +377,30 @@ outdims(l::DepthwiseConv, isize) =
output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
"""
CrossCor(size, in => out, σ = identity; init = glorot_uniform,
CrossCor(filter, in=>out)
CrossCor(filter, in=>out, activation)
CrossCor(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1)
Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`.
Standard cross convolutional layer. `filter` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively.
Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
# Examples
Apply a `CrossCor` layer to a 1-channel input using a 2×2 window size, giving us a
Apply a `CrossCor` layer to a 1-channel input using a 2×2 window filter size, giving us a
16-channel output. Output is activated with ReLU.
```julia
size = (2,2)
filter = (2,2)
in = 1
out = 16
CrossCor((2, 2), 1=>16, relu)
@ -290,18 +415,39 @@ struct CrossCor{N,M,F,A,V}
dilation::NTuple{N,Int}
end
function CrossCor(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
"""
CrossCor(weight::AbstractArray, bias::AbstractArray)
CrossCor(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the standard cross convolutional layer with user defined weight and bias
arrays.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
For the keyword-only constructor, see also [`Conv`](@ref)
"""
function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return CrossCor(σ, w, b, stride, pad, dilation)
end
CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
CrossCor(init(k..., ch...), zeros(ch[2]), σ,
function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
CrossCor(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation)
end
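`CrossCor` follows the same pattern; a minimal sketch (channel counts illustrative):

```julia
# Cross-correlation layer without bias; the weight comes from the shared convfilter helper.
cc = CrossCor((2, 2), 1 => 16, relu, bias = Flux.Zeros())

# Equivalent keyword-only construction.
w   = Flux.convfilter((2, 2), 1 => 16)
cc2 = CrossCor(weight = w, bias = Flux.Zeros(), activation = relu)
```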
@functor CrossCor

src/zeros.jl (new file, 106 lines)

@ -0,0 +1,106 @@
import Base: +, -, *, reshape, size
import Base.Broadcast: broadcasted, Broadcasted, BroadcastStyle
"""
Zeros()
Zeros(size...)
Zeros(Type, size...)
Acts as a stand-in for an array of zeros that can be
used during training which is ignored by the optimisers.
Useful to turn bias off for a forward pass of a layer.
## Examples
```julia
julia> Flux.Zeros(3,3)
3×3 Flux.Zeros{Bool,2}:
false false false
false false false
false false false
julia> Flux.Zeros(Float32, 3,3)
3×3 Flux.Zeros{Float32,2}:
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
julia> rand(3,3) .+ Flux.Zeros()
3×3 Array{Float64,2}:
0.198739 0.490459 0.785386
0.779074 0.39986 0.66383
0.854981 0.447292 0.314497
julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros())
Conv((2, 2), 1=>3)
```
"""
struct Zeros{T,N} <: AbstractArray{T,N}
size::Tuple
end
Zeros(::Type{T}, sz...) where T = Zeros{T,length(sz)}(sz)
Zeros(sz::Integer...) = Zeros(Bool, sz...)
Base.size(xs::Zeros) = xs.size
Base.axes(xs::Zeros) = Base.OneTo.(size(xs))
Base.IndexStyle(::Type{<:Zeros}) = IndexLinear()
Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T)
Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} =
Zeros(T, length(inds))
Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs))
@adjoint reshape(xs::Zeros{T}, dims...) where T =
reshape(xs, dims...), _ -> nothing
# Define basic ops
for f in (:+, :-)
@eval @inline function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros)
@assert size(a) == size(b) throw(DimensionMismatch("dimensions must match"))
a
end
end
+(a::Zeros, b::AbstractArray) = b + a
-(a::Zeros, b::AbstractArray) = -b + a
Base.copy(xs::Zeros{T,N}) where {T,N} = xs
# Define broadcasting behaviour
for op in (:+, :-)
@eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros)
bs = Broadcast.broadcast_shape(size(a), size(b))
size(a) == bs && return a
sz = similar(a, bs)
sz .= a
end
end
broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a)
broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a)
function broadcasted(::typeof(*), a::AbstractArray, b::Zeros)
Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end
broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a)
for op in (:+, :-, :*)
@eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end
# Some opportunities to avoid scalar indexing, intermediaries
# Since it replicates a little of what we expect Base to do,
# it should be possible to remove in the future, but for now,
# these help with performance.
broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b
broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b
broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a)
broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
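A short sketch of the behaviour these definitions give (values illustrative): adding or subtracting a `Zeros` hands back the other operand unchanged, while multiplying by a `Zeros` collapses to zeros, which is what lets a `Zeros` bias drop out of both the forward pass and the gradients.

```julia
z = Flux.Zeros()                  # 0-dim stand-in, broadcasts against anything
a = rand(Float32, 3, 3)

a .+ z === a                      # true: the array passes through untouched
a .* Flux.Zeros(3, 3)             # a 3×3 Flux.Zeros — the product is all zeros
collect(Flux.Zeros(2, 2))         # materialises as a 2×2 Matrix{Bool} of false
```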


@ -25,6 +25,35 @@ end
Dense(288, 10), softmax)
@test size(m(r)) == (10, 5)
# Test bias switch
bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3))
ip = zeros(Float32, 28,28,1,1)
op = bias(ip)
@test sum(op) == prod(size(op))
bias = Conv((2,2), 1=>3, bias = Flux.Zeros())
op = bias(ip)
@test sum(op) === 0.f0
gs = gradient(() -> sum(bias(ip)), Flux.params(bias))
@test gs[bias.bias] == nothing
# Train w/o bias and make sure no convergence happens
# when only bias can be converged
bias = Conv((2, 2), 1=>3, bias = Flux.Zeros());
ip = zeros(Float32, 28,28,1,1)
op = zeros(Float32, 27,27,3,1) .+ 2.f0
opt = Descent()
for _ = 1:10^3
gs = gradient(params(bias)) do
Flux.mse(bias(ip), op)
end
Flux.Optimise.update!(opt, params(bias), gs)
end
@test Flux.mse(bias(ip), op) ≈ 4.f0
end
@testset "asymmetric padding" begin