From 5ea6a33f443a4efe1fb2a2e045f501e67399fbc8 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Fri, 27 Sep 2019 11:48:12 +0530 Subject: [PATCH 01/39] make bias optional --- src/layers/conv.jl | 19 ++++++++++++------- test/layers/conv.jl | 11 +++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 519f129f..26a34306 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -21,7 +21,7 @@ Data should be stored in WHCN order (width, height, # channels, # batches). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `pad`, `stride` and `dilation`. +Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. """ struct Conv{N,M,F,A,V} σ::F @@ -30,29 +30,34 @@ struct Conv{N,M,F,A,V} stride::NTuple{N,Int} pad::NTuple{M,Int} dilation::NTuple{N,Int} + use_bias::Bool end function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T,N} + stride = 1, pad = 0, dilation = 1, use_bias = true) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - return Conv(σ, w, b, stride, pad, dilation) + return Conv(σ, w, b, stride, pad, dilation, use_bias) end Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N = + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N = Conv(init(k..., ch...), zeros(ch[2]), σ, - stride = stride, pad = pad, dilation = dilation) + stride = stride, pad = pad, dilation = dilation, use_bias = use_bias) @functor Conv function (c::Conv)(x::AbstractArray) # TODO: breaks gpu broadcast :( # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) - σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation) - σ.(conv(x, c.weight, cdims) .+ b) + if c.use_bias + σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) + σ.(conv(x, c.weight, cdims) .+ b) + else + c.σ.(conv(x, c.weight, cdims)) + end end function Base.show(io::IO, l::Conv) diff --git a/test/layers/conv.jl b/test/layers/conv.jl index aa3925f1..2ac61e24 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -20,6 +20,17 @@ end Dense(288, 10), softmax) @test size(m(r)) == (10, 5) + + # Test bias switch + bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3)) + ip = zeros(Float32, 28,28,1,1) + + op = bias(ip) + @test sum(op) == prod(size(op)) + + bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3), use_bias = false) + op = bias(ip) + @test sum(op) === 0.f0 end @testset "asymmetric padding" begin From 9f2ac8fdef99b2257d566af0f41d46c7a5f57172 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Fri, 27 Sep 2019 12:04:27 +0530 Subject: [PATCH 02/39] ditto remaining conv layers --- src/layers/conv.jl | 52 ++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 26a34306..a427c143 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -92,20 +92,21 @@ struct ConvTranspose{N,M,F,A,V} stride::NTuple{N,Int} pad::NTuple{M,Int} dilation::NTuple{N,Int} + use_bias::Bool end function ConvTranspose(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T,N} + stride = 
1, pad = 0, dilation = 1, use_bias = true) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - return ConvTranspose(σ, w, b, stride, pad, dilation) + return ConvTranspose(σ, w, b, stride, pad, dilation, use_bias) end ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N = + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N = ConvTranspose(init(k..., reverse(ch)...), zeros(ch[2]), σ, - stride = stride, pad = pad, dilation = dilation) + stride = stride, pad = pad, dilation = dilation, use_bias = use_bias) @functor ConvTranspose @@ -125,9 +126,13 @@ end function (c::ConvTranspose)(x::AbstractArray) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) - σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) cdims = conv_transpose_dims(c, x) - return σ.(∇conv_data(x, c.weight, cdims) .+ b) + if c.use_bias + σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) + σ.(∇conv_data(x, c.weight, cdims) .+ b) + else + c.σ.(∇conv_data(x, c.weight, cdims)) + end end function Base.show(io::IO, l::ConvTranspose) @@ -162,18 +167,19 @@ struct DepthwiseConv{N,M,F,A,V} stride::NTuple{N,Int} pad::NTuple{M,Int} dilation::NTuple{N,Int} + use_bias::Bool end function DepthwiseConv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T,N} + stride = 1, pad = 0, dilation = 1, use_bias = true) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - return DepthwiseConv(σ, w, b, stride, pad, dilation) + return DepthwiseConv(σ, w, b, stride, pad, dilation, use_bias) end function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels" return DepthwiseConv( init(k..., div(ch[2], ch[1]), ch[1]), @@ -181,16 +187,21 @@ function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = σ; stride = stride, pad = pad, - dilation = dilation + dilation = dilation, + use_bias = use_bias ) end @functor DepthwiseConv function (c::DepthwiseConv)(x) - σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) cdims = DepthwiseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation) - σ.(depthwiseconv(x, c.weight, cdims) .+ b) + if c.use_bias + σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) + σ.(depthwiseconv(x, c.weight, cdims) .+ b) + else + c.σ.(depthwiseconv(x, c.weight, cdims)) + end end function Base.show(io::IO, l::DepthwiseConv) @@ -234,20 +245,21 @@ struct CrossCor{N,M,F,A,V} stride::NTuple{N,Int} pad::NTuple{M,Int} dilation::NTuple{N,Int} + use_bias::Bool end function CrossCor(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T,N} + stride = 1, pad = 0, dilation = 1, use_bias = true) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - return CrossCor(σ, w, b, stride, pad, dilation) + return CrossCor(σ, w, b, stride, pad, dilation, use_bias) end CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, 
dilation = 1) where N = + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N = CrossCor(init(k..., ch...), zeros(ch[2]), σ, - stride = stride, pad = pad, dilation = dilation) + stride = stride, pad = pad, dilation = dilation, use_bias = use_bias) @functor CrossCor @@ -259,9 +271,13 @@ end function (c::CrossCor)(x::AbstractArray) # TODO: breaks gpu broadcast :( # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) - σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation) - σ.(crosscor(x, c.weight, cdims) .+ b) + if c.use_bias + σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) + σ.(crosscor(x, c.weight, cdims) .+ b) + else + c.σ.(crosscor(x, c.weight, cdims)) + end end function Base.show(io::IO, l::CrossCor) From a801fcb9e7e5075bad34cd0fde3d4eb85828cb5d Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Fri, 27 Sep 2019 12:07:55 +0530 Subject: [PATCH 03/39] docstrings --- src/layers/conv.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index a427c143..f77fb58c 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -83,7 +83,7 @@ Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`. Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `pad`, `stride` and `dilation`. +Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. """ struct ConvTranspose{N,M,F,A,V} σ::F @@ -158,7 +158,7 @@ Note that `out` must be an integer multiple of `in`. Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `pad`, `stride` and `dilation`. +Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. """ struct DepthwiseConv{N,M,F,A,V} σ::F @@ -236,7 +236,7 @@ Data should be stored in WHCN order (width, height, # channels, # batches). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `pad`, `stride` and `dilation`. +Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. """ struct CrossCor{N,M,F,A,V} σ::F From dced8c04e5e605c925433f138a169fc394959f93 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 1 Oct 2019 21:25:07 +0530 Subject: [PATCH 04/39] use ZeroType --- src/layers/conv.jl | 108 ++++++++++++++++++++------------------------ src/utils.jl | 10 ++++ test/layers/conv.jl | 6 ++- 3 files changed, 64 insertions(+), 60 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index f77fb58c..a8ab158f 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -21,7 +21,7 @@ Data should be stored in WHCN order (width, height, # channels, # batches). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. +Takes the keyword arguments `pad`, `stride` and `dilation`. 
""" struct Conv{N,M,F,A,V} σ::F @@ -30,34 +30,32 @@ struct Conv{N,M,F,A,V} stride::NTuple{N,Int} pad::NTuple{M,Int} dilation::NTuple{N,Int} - use_bias::Bool end -function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; - stride = 1, pad = 0, dilation = 1, use_bias = true) where {T,N} +function Conv(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; + stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - return Conv(σ, w, b, stride, pad, dilation, use_bias) + b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b + return Conv(σ, w, b, stride, pad, dilation) end -Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N = - Conv(init(k..., ch...), zeros(ch[2]), σ, - stride = stride, pad = pad, dilation = dilation, use_bias = use_bias) +function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N + b = use_bias ? zeros(ch[2]) : ZeroType((ch[2],)) + Conv(init(k..., ch...), b, σ, + stride = stride, pad = pad, dilation = dilation) +end @functor Conv function (c::Conv)(x::AbstractArray) # TODO: breaks gpu broadcast :( # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) + σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation) - if c.use_bias - σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) - σ.(conv(x, c.weight, cdims) .+ b) - else - c.σ.(conv(x, c.weight, cdims)) - end + σ.(conv(x, c.weight, cdims) .+ b) end function Base.show(io::IO, l::Conv) @@ -83,7 +81,7 @@ Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`. Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. +Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct ConvTranspose{N,M,F,A,V} σ::F @@ -92,21 +90,23 @@ struct ConvTranspose{N,M,F,A,V} stride::NTuple{N,Int} pad::NTuple{M,Int} dilation::NTuple{N,Int} - use_bias::Bool end -function ConvTranspose(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; - stride = 1, pad = 0, dilation = 1, use_bias = true) where {T,N} +function ConvTranspose(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; + stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - return ConvTranspose(σ, w, b, stride, pad, dilation, use_bias) + b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b + return ConvTranspose(σ, w, b, stride, pad, dilation) end -ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N = -ConvTranspose(init(k..., reverse(ch)...), zeros(ch[2]), σ, - stride = stride, pad = pad, dilation = dilation, use_bias = use_bias) +function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N + b = use_bias ? 
zeros(ch[2]) : ZeroType((ch[2], )) + ConvTranspose(init(k..., reverse(ch)...), b, σ, + stride = stride, pad = pad, dilation = dilation) +end @functor ConvTranspose @@ -126,13 +126,9 @@ end function (c::ConvTranspose)(x::AbstractArray) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) + σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) cdims = conv_transpose_dims(c, x) - if c.use_bias - σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) - σ.(∇conv_data(x, c.weight, cdims) .+ b) - else - c.σ.(∇conv_data(x, c.weight, cdims)) - end + σ.(∇conv_data(x, c.weight, cdims) .+ b) end function Base.show(io::IO, l::ConvTranspose) @@ -158,7 +154,7 @@ Note that `out` must be an integer multiple of `in`. Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. +Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct DepthwiseConv{N,M,F,A,V} σ::F @@ -167,41 +163,37 @@ struct DepthwiseConv{N,M,F,A,V} stride::NTuple{N,Int} pad::NTuple{M,Int} dilation::NTuple{N,Int} - use_bias::Bool end -function DepthwiseConv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; - stride = 1, pad = 0, dilation = 1, use_bias = true) where {T,N} +function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; + stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - return DepthwiseConv(σ, w, b, stride, pad, dilation, use_bias) + b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b + return DepthwiseConv(σ, w, b, stride, pad, dilation) end function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels" + b = use_bias ? zeros(ch[2]) : ZeroType((ch[2], )) return DepthwiseConv( init(k..., div(ch[2], ch[1]), ch[1]), - zeros(ch[2]), + b, σ; stride = stride, pad = pad, - dilation = dilation, - use_bias = use_bias + dilation = dilation ) end @functor DepthwiseConv function (c::DepthwiseConv)(x) + σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) cdims = DepthwiseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation) - if c.use_bias - σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) - σ.(depthwiseconv(x, c.weight, cdims) .+ b) - else - c.σ.(depthwiseconv(x, c.weight, cdims)) - end + σ.(depthwiseconv(x, c.weight, cdims) .+ b) end function Base.show(io::IO, l::DepthwiseConv) @@ -236,7 +228,7 @@ Data should be stored in WHCN order (width, height, # channels, # batches). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. +Takes the keyword arguments `pad`, `stride` and `dilation`. 
""" struct CrossCor{N,M,F,A,V} σ::F @@ -245,21 +237,23 @@ struct CrossCor{N,M,F,A,V} stride::NTuple{N,Int} pad::NTuple{M,Int} dilation::NTuple{N,Int} - use_bias::Bool end -function CrossCor(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; - stride = 1, pad = 0, dilation = 1, use_bias = true) where {T,N} +function CrossCor(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; + stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - return CrossCor(σ, w, b, stride, pad, dilation, use_bias) + b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b + return CrossCor(σ, w, b, stride, pad, dilation) end -CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N = - CrossCor(init(k..., ch...), zeros(ch[2]), σ, - stride = stride, pad = pad, dilation = dilation, use_bias = use_bias) +function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N + b = use_bias ? zeros(ch[2]) : ZeroType((ch[2],)) + CrossCor(init(k..., ch...), b, σ, + stride = stride, pad = pad, dilation = dilation) +end @functor CrossCor @@ -271,13 +265,9 @@ end function (c::CrossCor)(x::AbstractArray) # TODO: breaks gpu broadcast :( # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) + σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation) - if c.use_bias - σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) - σ.(crosscor(x, c.weight, cdims) .+ b) - else - c.σ.(crosscor(x, c.weight, cdims)) - end + σ.(crosscor(x, c.weight, cdims) .+ b) end function Base.show(io::IO, l::CrossCor) diff --git a/src/utils.jl b/src/utils.jl index 246c30d7..0507efa5 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -139,6 +139,16 @@ function throttle(f, timeout; leading=true, trailing=false) end end +import Base: +, reshape, size +struct ZeroType{T} <: Number + size::T +end ++(a::Number, ::ZeroType) = a ++(::ZeroType, a::Number) = a +size(xs::ZeroType) = xs.size +reshape(::ZeroType, args...) = ZeroType(args) +@adjoint reshape(xs::ZeroType, dims...) = ZeroType(dims), Δ -> (ZeroType(size(xs)), map(_ -> nothing, dims)...) + """ @jit ... diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 2ac61e24..fe5c575b 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -28,7 +28,11 @@ end op = bias(ip) @test sum(op) == prod(size(op)) - bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3), use_bias = false) + bias = Conv(ones(Float32, 2, 2, 1, 3), Flux.ZeroType((3,))) + op = bias(ip) + @test sum(op) === 0.f0 + + bias = Conv(ones(Float32, 2, 2, 1, 3), nothing) op = bias(ip) @test sum(op) === 0.f0 end From 1fe321781b38edc48233cf3a3a47dd54b81e569b Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 1 Oct 2019 21:29:18 +0530 Subject: [PATCH 05/39] add to docs --- src/layers/conv.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index a8ab158f..2a5ab981 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -21,7 +21,7 @@ Data should be stored in WHCN order (width, height, # channels, # batches). 
In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `pad`, `stride` and `dilation`. +Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. """ struct Conv{N,M,F,A,V} σ::F @@ -81,7 +81,7 @@ Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`. Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `pad`, `stride` and `dilation`. +Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. """ struct ConvTranspose{N,M,F,A,V} σ::F @@ -154,7 +154,7 @@ Note that `out` must be an integer multiple of `in`. Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `pad`, `stride` and `dilation`. +Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. """ struct DepthwiseConv{N,M,F,A,V} σ::F @@ -228,7 +228,7 @@ Data should be stored in WHCN order (width, height, # channels, # batches). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `pad`, `stride` and `dilation`. +Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. """ struct CrossCor{N,M,F,A,V} σ::F From 55ef7c1aba83479610b6ed0d8d48ffc769304f68 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sun, 6 Oct 2019 04:25:23 +0530 Subject: [PATCH 06/39] add weight and bias kwargs --- src/layers/conv.jl | 23 ++++++++++++++++++----- src/utils.jl | 9 --------- test/layers/conv.jl | 6 +----- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 2a5ab981..8a9edb64 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -32,19 +32,32 @@ struct Conv{N,M,F,A,V} dilation::NTuple{N,Int} end -function Conv(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; +""" + Conv(weight::AbstractArray, bias::AbstractArray) + Conv(weight::AbstractArray, bias::AbstractArray, relu) + +Constructs the convolutional layer with user defined weight and bias arrays. +All other behaviours of the Conv layer apply with regard to data order and +forward pass. + +Takes the keyword arguments `pad`, `stride` and `dilation`. +""" +function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b return Conv(σ, w, b, stride, pad, dilation) end +convweight(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; init = glorot_uniform) = init(k..., ch...) +const convbias = zeros + function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N - b = use_bias ? 
zeros(ch[2]) : ZeroType((ch[2],)) - Conv(init(k..., ch...), b, σ, + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, + weight = convweight(k, ch, init = init), bias = convbias(ch[2])) where N + + Conv(weight, bias, σ, stride = stride, pad = pad, dilation = dilation) end diff --git a/src/utils.jl b/src/utils.jl index 0507efa5..a12b59b7 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -139,15 +139,6 @@ function throttle(f, timeout; leading=true, trailing=false) end end -import Base: +, reshape, size -struct ZeroType{T} <: Number - size::T -end -+(a::Number, ::ZeroType) = a -+(::ZeroType, a::Number) = a -size(xs::ZeroType) = xs.size -reshape(::ZeroType, args...) = ZeroType(args) -@adjoint reshape(xs::ZeroType, dims...) = ZeroType(dims), Δ -> (ZeroType(size(xs)), map(_ -> nothing, dims)...) """ @jit ... diff --git a/test/layers/conv.jl b/test/layers/conv.jl index fe5c575b..169c3077 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -28,11 +28,7 @@ end op = bias(ip) @test sum(op) == prod(size(op)) - bias = Conv(ones(Float32, 2, 2, 1, 3), Flux.ZeroType((3,))) - op = bias(ip) - @test sum(op) === 0.f0 - - bias = Conv(ones(Float32, 2, 2, 1, 3), nothing) + bias = Conv((2,2), 1=>3, bias = zero(3)) op = bias(ip) @test sum(op) === 0.f0 end From 48a305bd21183e1d8df664a31c8b22611603509b Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sun, 6 Oct 2019 04:41:06 +0530 Subject: [PATCH 07/39] ditto remaining layers --- src/layers/conv.jl | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 8a9edb64..ee2b8f79 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -42,7 +42,7 @@ forward pass. Takes the keyword arguments `pad`, `stride` and `dilation`. """ -function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; +function Conv(w::AbstractArray{T,N}, b::Union{Number, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) @@ -105,19 +105,19 @@ struct ConvTranspose{N,M,F,A,V} dilation::NTuple{N,Int} end -function ConvTranspose(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; +function ConvTranspose(w::AbstractArray{T,N}, b::Union{Number, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b return ConvTranspose(σ, w, b, stride, pad, dilation) end function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N - b = use_bias ? 
zeros(ch[2]) : ZeroType((ch[2], )) - ConvTranspose(init(k..., reverse(ch)...), b, σ, + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, + weight = convweight(k, reverse(ch), init = init), bias = convbias(ch[2])) where N + + ConvTranspose(weight, bias, σ, stride = stride, pad = pad, dilation = dilation) end @@ -178,22 +178,24 @@ struct DepthwiseConv{N,M,F,A,V} dilation::NTuple{N,Int} end -function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; +function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Number AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b return DepthwiseConv(σ, w, b, stride, pad, dilation) end +depthwiseconvweight(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; + init = glorot_uniform) where N = init(k..., div(ch[2], ch[1]), ch[1]) + function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, + weight = depthwiseconvweight(k, ch, init = init), bias = convbias(ch[2])) where N @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels" - b = use_bias ? zeros(ch[2]) : ZeroType((ch[2], )) return DepthwiseConv( - init(k..., div(ch[2], ch[1]), ch[1]), - b, + weight, + bias, σ; stride = stride, pad = pad, @@ -252,7 +254,7 @@ struct CrossCor{N,M,F,A,V} dilation::NTuple{N,Int} end -function CrossCor(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; +function CrossCor(w::AbstractArray{T,N}, b::Union{Number, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) @@ -262,9 +264,9 @@ function CrossCor(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVec end function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, use_bias = true) where N - b = use_bias ? zeros(ch[2]) : ZeroType((ch[2],)) - CrossCor(init(k..., ch...), b, σ, + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, + weight = convweight(k, ch, init = init), bias = convbias(ch[2])) where N + CrossCor(weight, bias, σ, stride = stride, pad = pad, dilation = dilation) end From e97d61f2575628527d5571163646a7e9d59e4c3a Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sun, 6 Oct 2019 04:42:26 +0530 Subject: [PATCH 08/39] fixes --- src/layers/conv.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index ee2b8f79..0acf3551 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -21,7 +21,7 @@ Data should be stored in WHCN order (width, height, # channels, # batches). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. +Takes the keyword arguments `pad`, `stride` and `dilation`. 
""" struct Conv{N,M,F,A,V} σ::F From d00f833c17c2e18b4a5817390b93a1b35e4d8554 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sun, 6 Oct 2019 04:44:50 +0530 Subject: [PATCH 09/39] rm ZeroType --- src/layers/conv.jl | 1 - src/utils.jl | 1 - 2 files changed, 2 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 0acf3551..d05dfe4d 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -259,7 +259,6 @@ function CrossCor(w::AbstractArray{T,N}, b::Union{Number, AbstractVector{T}}, σ stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b return CrossCor(σ, w, b, stride, pad, dilation) end diff --git a/src/utils.jl b/src/utils.jl index a12b59b7..246c30d7 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -139,7 +139,6 @@ function throttle(f, timeout; leading=true, trailing=false) end end - """ @jit ... From 2ae3ad3b3182143423d8252db0261f1ff6a357a6 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sun, 6 Oct 2019 04:46:13 +0530 Subject: [PATCH 10/39] doc fixes --- src/layers/conv.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index d05dfe4d..f02adb41 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -94,7 +94,7 @@ Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`. Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. +Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct ConvTranspose{N,M,F,A,V} σ::F @@ -167,7 +167,7 @@ Note that `out` must be an integer multiple of `in`. Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. +Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct DepthwiseConv{N,M,F,A,V} σ::F @@ -243,7 +243,7 @@ Data should be stored in WHCN order (width, height, # channels, # batches). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. -Takes the keyword arguments `use_bias`, `pad`, `stride` and `dilation`. +Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct CrossCor{N,M,F,A,V} σ::F From 214f71f49273e135cfcffb93d87b92e041139128 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sun, 6 Oct 2019 04:55:33 +0530 Subject: [PATCH 11/39] add N --- src/layers/conv.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index f02adb41..53d138db 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -50,7 +50,8 @@ function Conv(w::AbstractArray{T,N}, b::Union{Number, AbstractVector{T}}, σ = i return Conv(σ, w, b, stride, pad, dilation) end -convweight(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; init = glorot_uniform) = init(k..., ch...) +convweight(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; + init = glorot_uniform) where N = init(k..., ch...) 
const convbias = zeros function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; From a1e826b888171541d32a6d59db2b14fdc62e95ff Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Sun, 6 Oct 2019 05:10:56 +0530 Subject: [PATCH 12/39] fixes --- src/layers/conv.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 53d138db..c7bda4ab 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -179,7 +179,7 @@ struct DepthwiseConv{N,M,F,A,V} dilation::NTuple{N,Int} end -function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Number AbstractVector{T}}, σ = identity; +function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Number, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) From f3904b4e0490bdd7a1b60c16bfc86372736f6cfa Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 8 Oct 2019 17:17:36 +0530 Subject: [PATCH 13/39] add ZeroType back --- src/utils.jl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/utils.jl b/src/utils.jl index 246c30d7..a42c37d5 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -139,6 +139,19 @@ function throttle(f, timeout; leading=true, trailing=false) end end +import Base: +, reshape, size + +struct ZeroType{T} <: Number + size::T +end + ++(a::Number, ::ZeroType) = a ++(::ZeroType, a::Number) = a +size(xs::ZeroType) = xs.size +reshape(::ZeroType, args...) = ZeroType(args) +@adjoint reshape(xs::ZeroType, dims...) = + ZeroType(dims), Δ -> (ZeroType(size(xs)), map(_ -> nothing, dims)...) + """ @jit ... From 040697fb2bee3987f40bd5e8d3c3b6a815cbcfcf Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 8 Oct 2019 17:18:19 +0530 Subject: [PATCH 14/39] add bias and weight kwarg --- src/layers/conv.jl | 108 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 25 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index c7bda4ab..5dcd400c 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -3,16 +3,16 @@ using NNlib: conv, ∇conv_data, depthwiseconv expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) """ - Conv(size, in=>out) - Conv(size, in=>out, relu) + Conv(filter::Tuple, in=>out) + Conv(filter::Tuple, in=>out, activation) -Standard convolutional layer. `size` should be a tuple like `(2, 2)`. +Standard convolutional layer. `filter` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. Example: Applying Conv layer to a 1-channel input using a 2x2 window size, giving us a 16-channel output. Output is activated with ReLU. - size = (2,2) + filter = (2,2) in = 1 out = 16 Conv((2, 2), 1=>16, relu) @@ -34,7 +34,7 @@ end """ Conv(weight::AbstractArray, bias::AbstractArray) - Conv(weight::AbstractArray, bias::AbstractArray, relu) + Conv(weight::AbstractArray, bias::AbstractArray, activation) Constructs the convolutional layer with user defined weight and bias arrays. All other behaviours of the Conv layer apply with regard to data order and @@ -42,21 +42,32 @@ forward pass. Takes the keyword arguments `pad`, `stride` and `dilation`. 
""" -function Conv(w::AbstractArray{T,N}, b::Union{Number, AbstractVector{T}}, σ = identity; +function Conv(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) + b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b return Conv(σ, w, b, stride, pad, dilation) end -convweight(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; - init = glorot_uniform) where N = init(k..., ch...) -const convbias = zeros +""" + convweight(filter::Tuple, in=>out) + +Constructs a standard convolutional weight matrix with given `filter` and +channels from `in` to `out`. + +Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling +distribution. + +See also: [`depthwiseconvweight`](@ref) +""" +convweight(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; + init = glorot_uniform) where N = init(filter..., ch...) function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = convweight(k, ch, init = init), bias = convbias(ch[2])) where N + weight = convweight(k, ch, init = init), bias = zeros(ch[2])) where N Conv(weight, bias, σ, stride = stride, pad = pad, dilation = dilation) @@ -86,10 +97,10 @@ end a(T.(x)) """ - ConvTranspose(size, in=>out) - ConvTranspose(size, in=>out, relu) + ConvTranspose(filter::Tuple, in=>out) + ConvTranspose(filter::Tuple, in=>out, relu) -Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`. +Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. Data should be stored in WHCN order. In other words, a 100×100 RGB image would @@ -106,17 +117,28 @@ struct ConvTranspose{N,M,F,A,V} dilation::NTuple{N,Int} end -function ConvTranspose(w::AbstractArray{T,N}, b::Union{Number, AbstractVector{T}}, σ = identity; +""" + ConvTranspose(weight::AbstractArray, bias::AbstractArray) + ConvTranspose(weight::AbstractArray, bias::AbstractArray, activation) + +Constructs the convolutional transpose layer with user defined weight and bias arrays. +All other behaviours of the ConvTranspose layer apply with regard to data order and +forward pass. + +Takes the keyword arguments `pad`, `stride` and `dilation`. +""" +function ConvTranspose(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) + b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b return ConvTranspose(σ, w, b, stride, pad, dilation) end function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = convweight(k, reverse(ch), init = init), bias = convbias(ch[2])) where N + weight = convweight(k, reverse(ch), init = init), bias = zeros(ch[2])) where N ConvTranspose(weight, bias, σ, stride = stride, pad = pad, dilation = dilation) @@ -157,11 +179,12 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) -""" - DepthwiseConv(size, in=>out) - DepthwiseConv(size, in=>out, relu) -Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`. 
+""" + DepthwiseConv(filter::Tuple, in=>out) + DepthwiseConv(filter::Tuple, in=>out, relu) + +Depthwise convolutional layer. `filter` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. Note that `out` must be an integer multiple of `in`. @@ -179,21 +202,44 @@ struct DepthwiseConv{N,M,F,A,V} dilation::NTuple{N,Int} end -function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Number, AbstractVector{T}}, σ = identity; +""" + DepthwiseConv(weight::AbstractArray, bias::AbstractArray) + DepthwiseConv(weight::AbstractArray, bias::AbstractArray, activation) + +Constructs the `DepthwiseConv` layer with user defined weight and bias arrays. +All other behaviours of the `DepthwiseConv` layer apply with regard to data order and +forward pass. + +Takes the keyword arguments `pad`, `stride` and `dilation`. +""" +function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) + b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b return DepthwiseConv(σ, w, b, stride, pad, dilation) end -depthwiseconvweight(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; - init = glorot_uniform) where N = init(k..., div(ch[2], ch[1]), ch[1]) +""" + depthwiseconvweight(filter::Tuple, in=>out) + +Constructs a depthwise convolutional weight array defined by `filter` and channels +from `in` to `out`. + +Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling +distribution. + +See also: [`convweight`](@ref) +""" +depthwiseconvweight(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; + init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1]) function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = depthwiseconvweight(k, ch, init = init), bias = convbias(ch[2])) where N + weight = depthwiseconvweight(k, ch, init = init), bias = zeros(ch[2])) where N @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels" + return DepthwiseConv( weight, bias, @@ -255,17 +301,29 @@ struct CrossCor{N,M,F,A,V} dilation::NTuple{N,Int} end -function CrossCor(w::AbstractArray{T,N}, b::Union{Number, AbstractVector{T}}, σ = identity; +""" + CrossCor(weight::AbstractArray, bias::AbstractArray) + CrossCor(weight::AbstractArray, bias::AbstractArray, activation) + +Constructs the standard cross convolutional layer with user defined weight and bias +arrays. All other behaviours of the CrossCor layer apply with regard to data order and +forward pass. + +Takes the keyword arguments `pad`, `stride` and `dilation`. +""" +function CrossCor(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) + b = b isa Nothing ? 
ZeroType((size(w, ndims(w)), )) : b return CrossCor(σ, w, b, stride, pad, dilation) end function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = convweight(k, ch, init = init), bias = convbias(ch[2])) where N + weight = convweight(k, ch, init = init), bias = zeros(ch[2])) where N + CrossCor(weight, bias, σ, stride = stride, pad = pad, dilation = dilation) end From b596faaffabc31a48af433e6da5382defeeb8eb0 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 8 Oct 2019 17:18:39 +0530 Subject: [PATCH 15/39] tests bias switch --- test/layers/conv.jl | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 169c3077..5b3eb326 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -28,9 +28,25 @@ end op = bias(ip) @test sum(op) == prod(size(op)) - bias = Conv((2,2), 1=>3, bias = zero(3)) + bias = Conv((2,2), 1=>3, bias = Flux.ZeroType((3,))) op = bias(ip) @test sum(op) === 0.f0 + + # Train w/o bias and make sure no convergence happens + # when only bias can be converged + bias = Conv((2, 2), 1=>3, bias = Flux.ZeroType((3,))); + ip = zeros(Float32, 28,28,1,1) + op = zeros(Float32, 27,27,3,1) .+ 2.f0 + opt = Descent() + + for _ = 1:10^3 + gs = gradient(params(bias)) do + Flux.mse(bias(ip), op) + end + Flux.Optimise.update!(opt, params(bias), gs) + end + + @test Flux.mse(bias(ip), op) ≈ 4.f0 end @testset "asymmetric padding" begin From 95c5845e99e7d4ccde36f090a4f9f9fdbe865f9c Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 8 Oct 2019 17:54:01 +0530 Subject: [PATCH 16/39] document bias switch --- src/layers/conv.jl | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 5dcd400c..58b6ccb5 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -21,6 +21,10 @@ Data should be stored in WHCN order (width, height, # channels, # batches). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. +Accepts keyword arguments `weight` and `bias` to set the corresponding fields. +Setting `bias` to `Flux.ZeroType((out,))` will switch bias off for the +layer. + Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct Conv{N,M,F,A,V} @@ -40,6 +44,9 @@ Constructs the convolutional layer with user defined weight and bias arrays. All other behaviours of the Conv layer apply with regard to data order and forward pass. +Setting `bias` to `nothing` or `Flux.ZeroType((out,))` would switch `bias` off for the +layer. + Takes the keyword arguments `pad`, `stride` and `dilation`. """ function Conv(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; @@ -106,6 +113,10 @@ Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)` Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. +Accepts keyword arguments `weight` and `bias` to set the corresponding fields. +Setting `bias` to `Flux.ZeroType((out,))` will switch bias off for the +layer. + Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct ConvTranspose{N,M,F,A,V} @@ -125,6 +136,9 @@ Constructs the convolutional transpose layer with user defined weight and bias a All other behaviours of the ConvTranspose layer apply with regard to data order and forward pass. 
+Setting `bias` to `nothing` or `Flux.ZeroType((out,))` would switch `bias` off for the +layer. + Takes the keyword arguments `pad`, `stride` and `dilation`. """ function ConvTranspose(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; @@ -191,6 +205,10 @@ Note that `out` must be an integer multiple of `in`. Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. +Accepts keyword arguments `weight` and `bias` to set the corresponding fields. +Setting `bias` to `Flux.ZeroType((out,))` will switch bias off for the +layer. + Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct DepthwiseConv{N,M,F,A,V} @@ -210,6 +228,9 @@ Constructs the `DepthwiseConv` layer with user defined weight and bias arrays. All other behaviours of the `DepthwiseConv` layer apply with regard to data order and forward pass. +Setting `bias` to `nothing` or `Flux.ZeroType((out,))` would switch `bias` off for the +layer. + Takes the keyword arguments `pad`, `stride` and `dilation`. """ function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; @@ -290,6 +311,10 @@ Data should be stored in WHCN order (width, height, # channels, # batches). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. +Accepts keyword arguments `weight` and `bias` to set the corresponding fields. +Setting `bias` to `Flux.ZeroType((out,))` will switch bias off for the +layer. + Takes the keyword arguments `pad`, `stride` and `dilation`. """ struct CrossCor{N,M,F,A,V} @@ -309,6 +334,9 @@ Constructs the standard cross convolutional layer with user defined weight and b arrays. All other behaviours of the CrossCor layer apply with regard to data order and forward pass. +Setting `bias` to `nothing` or `Flux.ZeroType((out,))` would switch `bias` off for the +layer. + Takes the keyword arguments `pad`, `stride` and `dilation`. """ function CrossCor(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; From 49ea43e711ea98f9d36184d08aa37832413f29f5 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 8 Oct 2019 20:02:04 +0530 Subject: [PATCH 17/39] ZeroType => Zeros --- src/layers/conv.jl | 32 ++++++++++++++++---------------- src/utils.jl | 28 ++++++++++++++++++++-------- test/layers/conv.jl | 4 ++-- 3 files changed, 38 insertions(+), 26 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 58b6ccb5..ad9164c4 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -22,7 +22,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. Accepts keyword arguments `weight` and `bias` to set the corresponding fields. -Setting `bias` to `Flux.ZeroType((out,))` will switch bias off for the +Setting `bias` to `Flux.Zeros()` will switch bias off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. @@ -44,17 +44,17 @@ Constructs the convolutional layer with user defined weight and bias arrays. All other behaviours of the Conv layer apply with regard to data order and forward pass. -Setting `bias` to `nothing` or `Flux.ZeroType((out,))` would switch `bias` off for the +Setting `bias` to `nothing` or `Flux.Zeros()` would switch `bias` off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. 
""" -function Conv(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; +function Conv(w::AbstractArray{T,N}, b::Union{Nothing, Zeros, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b + b = b isa Nothing ? Zeros((size(w, ndims(w)), )) : b return Conv(σ, w, b, stride, pad, dilation) end @@ -114,7 +114,7 @@ Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. Accepts keyword arguments `weight` and `bias` to set the corresponding fields. -Setting `bias` to `Flux.ZeroType((out,))` will switch bias off for the +Setting `bias` to `Flux.Zeros()` will switch bias off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. @@ -136,17 +136,17 @@ Constructs the convolutional transpose layer with user defined weight and bias a All other behaviours of the ConvTranspose layer apply with regard to data order and forward pass. -Setting `bias` to `nothing` or `Flux.ZeroType((out,))` would switch `bias` off for the +Setting `bias` to `nothing` or `Flux.Zeros()` would switch `bias` off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. """ -function ConvTranspose(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; +function ConvTranspose(w::AbstractArray{T,N}, b::Union{Nothing, Zeros, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b + b = b isa Nothing ? Zeros((size(w, ndims(w)), )) : b return ConvTranspose(σ, w, b, stride, pad, dilation) end @@ -206,7 +206,7 @@ Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. Accepts keyword arguments `weight` and `bias` to set the corresponding fields. -Setting `bias` to `Flux.ZeroType((out,))` will switch bias off for the +Setting `bias` to `Flux.Zeros()` will switch bias off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. @@ -228,17 +228,17 @@ Constructs the `DepthwiseConv` layer with user defined weight and bias arrays. All other behaviours of the `DepthwiseConv` layer apply with regard to data order and forward pass. -Setting `bias` to `nothing` or `Flux.ZeroType((out,))` would switch `bias` off for the +Setting `bias` to `nothing` or `Flux.Zeros()` would switch `bias` off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. """ -function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; +function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Nothing, Zeros, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b + b = b isa Nothing ? Zeros((size(w, ndims(w)), )) : b return DepthwiseConv(σ, w, b, stride, pad, dilation) end @@ -312,7 +312,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. 
Accepts keyword arguments `weight` and `bias` to set the corresponding fields. -Setting `bias` to `Flux.ZeroType((out,))` will switch bias off for the +Setting `bias` to `Flux.Zeros()` will switch bias off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. @@ -334,17 +334,17 @@ Constructs the standard cross convolutional layer with user defined weight and b arrays. All other behaviours of the CrossCor layer apply with regard to data order and forward pass. -Setting `bias` to `nothing` or `Flux.ZeroType((out,))` would switch `bias` off for the +Setting `bias` to `nothing` or `Flux.Zeros()` would switch `bias` off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. """ -function CrossCor(w::AbstractArray{T,N}, b::Union{Nothing, ZeroType, AbstractVector{T}}, σ = identity; +function CrossCor(w::AbstractArray{T,N}, b::Union{Nothing, Zeros, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - b = b isa Nothing ? ZeroType((size(w, ndims(w)), )) : b + b = b isa Nothing ? Zeros((size(w, ndims(w)), )) : b return CrossCor(σ, w, b, stride, pad, dilation) end diff --git a/src/utils.jl b/src/utils.jl index a42c37d5..9e095811 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -141,16 +141,28 @@ end import Base: +, reshape, size -struct ZeroType{T} <: Number - size::T +""" + Zeros() + Zeros(T, a::Union{Colon, Int}...) + +Acts as a stand-in for an array of zeros that can be used during training which is +ignored by the optimisers. +""" +struct Zeros{T} <: Number + size::Tuple end -+(a::Number, ::ZeroType) = a -+(::ZeroType, a::Number) = a -size(xs::ZeroType) = xs.size -reshape(::ZeroType, args...) = ZeroType(args) -@adjoint reshape(xs::ZeroType, dims...) = - ZeroType(dims), Δ -> (ZeroType(size(xs)), map(_ -> nothing, dims)...) +Zeros(::Type{T}, sz...) where T = Zeros{T}(sz) +Zeros(sz::Union{Integer, Colon}...) = Zeros(Bool, sz...) + ++(a::Number, ::Zeros) = a ++(::Zeros, a::Number) = a + +size(xs::Zeros) = xs.size +reshape(z::Zeros{T}, args...) where T = Zeros(T, args...) + +@adjoint reshape(xs::Zeros{T}, dims...) where T = + Zeros(T, dims...), Δ -> (Zeros(T, size(xs)...), map(_ -> nothing, dims)...) """ @jit ... 
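For context, a minimal usage sketch of the bias switch as it stands after this
commit. This is illustration only, not part of the diff; it relies solely on the
`Flux.Zeros` type and the `bias` keyword shown above, and the array shapes are
arbitrary.

    using Flux

    # A convolution without a trainable bias: `Flux.Zeros()` stands in for the
    # bias vector, and adding it to a number or array leaves the other operand
    # unchanged.
    layer = Conv((2, 2), 1 => 3, bias = Flux.Zeros())

    x = rand(Float32, 28, 28, 1, 1)
    y = layer(x)    # same as applying the activation to conv(x, layer.weight)

    # Broadcasting Zeros over an array is a no-op on the array's values:
    a = rand(Float32, 3, 3)
    a .+ Flux.Zeros() == a    # true

Per the docstring added above, the stand-in bias is also meant to be ignored by
the optimisers; the updated test below trains such a layer and checks that its
output does not move.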
diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 5b3eb326..4a3f8c16 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -28,13 +28,13 @@ end op = bias(ip) @test sum(op) == prod(size(op)) - bias = Conv((2,2), 1=>3, bias = Flux.ZeroType((3,))) + bias = Conv((2,2), 1=>3, bias = Flux.Zeros()) op = bias(ip) @test sum(op) === 0.f0 # Train w/o bias and make sure no convergence happens # when only bias can be converged - bias = Conv((2, 2), 1=>3, bias = Flux.ZeroType((3,))); + bias = Conv((2, 2), 1=>3, bias = Flux.Zeros()); ip = zeros(Float32, 28,28,1,1) op = zeros(Float32, 27,27,3,1) .+ 2.f0 opt = Descent() From c85bad4427ca96631700c7c224317fa7fac7d439 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 8 Oct 2019 20:26:09 +0530 Subject: [PATCH 18/39] replace weight with filter --- src/layers/conv.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index ad9164c4..a60749e3 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -59,7 +59,7 @@ function Conv(w::AbstractArray{T,N}, b::Union{Nothing, Zeros, AbstractVector{T}} end """ - convweight(filter::Tuple, in=>out) + convfilter(filter::Tuple, in=>out) Constructs a standard convolutional weight matrix with given `filter` and channels from `in` to `out`. @@ -67,14 +67,14 @@ channels from `in` to `out`. Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling distribution. -See also: [`depthwiseconvweight`](@ref) +See also: [`depthwiseconvfilter`](@ref) """ -convweight(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; +convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; init = glorot_uniform) where N = init(filter..., ch...) function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = convweight(k, ch, init = init), bias = zeros(ch[2])) where N + weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N Conv(weight, bias, σ, stride = stride, pad = pad, dilation = dilation) @@ -152,7 +152,7 @@ end function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = convweight(k, reverse(ch), init = init), bias = zeros(ch[2])) where N + weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N ConvTranspose(weight, bias, σ, stride = stride, pad = pad, dilation = dilation) @@ -243,7 +243,7 @@ function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Nothing, Zeros, AbstractV end """ - depthwiseconvweight(filter::Tuple, in=>out) + depthwiseconvfilter(filter::Tuple, in=>out) Constructs a depthwise convolutional weight array defined by `filter` and channels from `in` to `out`. @@ -251,14 +251,14 @@ from `in` to `out`. Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling distribution. 
-See also: [`convweight`](@ref) +See also: [`convfilter`](@ref) """ -depthwiseconvweight(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; +depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1]) function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = depthwiseconvweight(k, ch, init = init), bias = zeros(ch[2])) where N + weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels" return DepthwiseConv( @@ -350,7 +350,7 @@ end function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = convweight(k, ch, init = init), bias = zeros(ch[2])) where N + weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N CrossCor(weight, bias, σ, stride = stride, pad = pad, dilation = dilation) From 4a183aeaf02a9de9a98f21ee5eddfd0e7f8219f4 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 22 Oct 2019 16:11:27 +0530 Subject: [PATCH 19/39] make Zeros a dimensionlesss number --- src/utils.jl | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 9e095811..ee5f2db7 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -139,30 +139,40 @@ function throttle(f, timeout; leading=true, trailing=false) end end -import Base: +, reshape, size +import Base: +, -, reshape, size +import Base.Broadcast: broadcasted """ Zeros() - Zeros(T, a::Union{Colon, Int}...) -Acts as a stand-in for an array of zeros that can be used during training which is -ignored by the optimisers. +Acts as a stand-in for an array of zeros that can be +used during training which is ignored by the optimisers. + +Used to turn bias off for a forward pass of a layer. + +## Examples + +```julia +julia> rand(3,3) .+ Flux.Zeros() +3×3 Array{Float64,2}: + 0.198739 0.490459 0.785386 + 0.779074 0.39986 0.66383 + 0.854981 0.447292 0.314497 + +julia> bias = Conv((2,2), 1=>3, bias = Flux.Zeros()) +Conv((2, 2), 1=>3) +``` """ -struct Zeros{T} <: Number - size::Tuple +struct Zeros <: Number end +for f in (:+, :-) + @eval $f(a::Union{Number, Zeros}, b::Zeros) = a end +Base.:*(a::Union{Number, Zeros}, b::Zeros) = zero(a) -Zeros(::Type{T}, sz...) where T = Zeros{T}(sz) -Zeros(sz::Union{Integer, Colon}...) = Zeros(Bool, sz...) - -+(a::Number, ::Zeros) = a -+(::Zeros, a::Number) = a - -size(xs::Zeros) = xs.size -reshape(z::Zeros{T}, args...) where T = Zeros(T, args...) - -@adjoint reshape(xs::Zeros{T}, dims...) where T = - Zeros(T, dims...), Δ -> (Zeros(T, size(xs)...), map(_ -> nothing, dims)...) +broadcasted(::typeof(+), arr::AbstractArray, ::Zeros) = arr +broadcasted(::typeof(*), arr::AbstractArray, ::Zeros) = zero(arr) +Base.reshape(xs::Zeros, args...) = xs +@adjoint reshape(xs::Zeros, dims...) = reshape(xs, dims...), _ -> nothing """ @jit ... 
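
With `Zeros` now a dimensionless number, turning bias off is a single keyword argument on the layer constructor. A usage sketch mirroring the updated tests, assuming Flux built from this point in the series rather than a released version:

```julia
using Flux

layer = Conv((2, 2), 1 => 3, bias = Flux.Zeros())   # bias-free convolution
x = zeros(Float32, 28, 28, 1, 1)

sum(layer(x)) == 0f0   # with zero input and no bias, the output is exactly zero
```
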
From 7c90fb469d19585d63d95aeb28e68041af7e35b7 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 23 Oct 2019 20:02:15 +0530 Subject: [PATCH 20/39] use array to define Zeros --- src/utils.jl | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index ee5f2db7..155326ab 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -163,16 +163,39 @@ julia> bias = Conv((2,2), 1=>3, bias = Flux.Zeros()) Conv((2, 2), 1=>3) ``` """ -struct Zeros <: Number end -for f in (:+, :-) - @eval $f(a::Union{Number, Zeros}, b::Zeros) = a +struct Zeros{T,N} <: AbstractArray{T,N} + size::Tuple end -Base.:*(a::Union{Number, Zeros}, b::Zeros) = zero(a) + +Zeros(::Type{T}, sz...) where T = Zeros{T,length(sz)}(sz) +Zeros(sz::Integer...) = Zeros(Bool, sz...) + ++(a::Union{AbstractVecOrMat, Number}, ::Zeros) = a + +Base.size(xs::Zeros) = xs.size +Base.IndexStyle(::Type{<:Zeros}) = IndexLinear() + +Base.axes(xs::Zeros) = Base.OneTo.(size(xs)) + +Base.getindex(xs::Zeros{T,N}, i::Int) where {T,N} = zero(T) +Base.setindex(xs::Zeros, args...) = + error("setindex disallowed on Zeros Array") +Base.setindex!(xs::Zeros, args...) = + error("setindex! disallowed on Zeros Array") + +Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs)) + +@adjoint reshape(xs::Zeros{T}, dims...) where T = + reshape(xs, dims...), _ -> nothing + +for f in (:+, :-) + @eval $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros) = a +end +Base.:*(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros) = zero(a) broadcasted(::typeof(+), arr::AbstractArray, ::Zeros) = arr +broadcasted(::typeof(-), arr::AbstractArray, ::Zeros) = arr broadcasted(::typeof(*), arr::AbstractArray, ::Zeros) = zero(arr) -Base.reshape(xs::Zeros, args...) = xs -@adjoint reshape(xs::Zeros, dims...) = reshape(xs, dims...), _ -> nothing """ @jit ... From a4a987f0b0c7745a05f4322eaaa87f422ce990b6 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Thu, 7 Nov 2019 16:53:41 +0530 Subject: [PATCH 21/39] hook into bcasting --- src/utils.jl | 66 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 155326ab..6e5ab8a2 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -139,27 +139,45 @@ function throttle(f, timeout; leading=true, trailing=false) end end -import Base: +, -, reshape, size -import Base.Broadcast: broadcasted +import Base: +, -, *, reshape, size +import Base.Broadcast: broadcasted, Broadcasted, BroadcastStyle """ Zeros() + Zeros(size...) + Zeros(Type, size...) Acts as a stand-in for an array of zeros that can be used during training which is ignored by the optimisers. -Used to turn bias off for a forward pass of a layer. +Useful to turn bias off for a forward pass of a layer. + +!!! warning + Zeros acts a scalar while broadcasting, so does not + expand dims. Checks for shape compatibility by default. ## Examples ```julia +julia> Flux.Zeros(3,3) +3×3 Flux.Zeros{Bool,2}: + false false false + false false false + false false false + +julia> Flux.Zeros(Float32, 3,3) +3×3 Flux.Zeros{Float32,2}: + 0.0 0.0 0.0 + 0.0 0.0 0.0 + 0.0 0.0 0.0 + julia> rand(3,3) .+ Flux.Zeros() 3×3 Array{Float64,2}: 0.198739 0.490459 0.785386 0.779074 0.39986 0.66383 0.854981 0.447292 0.314497 -julia> bias = Conv((2,2), 1=>3, bias = Flux.Zeros()) +julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros()) Conv((2, 2), 1=>3) ``` """ @@ -170,14 +188,15 @@ end Zeros(::Type{T}, sz...) 
where T = Zeros{T,length(sz)}(sz) Zeros(sz::Integer...) = Zeros(Bool, sz...) -+(a::Union{AbstractVecOrMat, Number}, ::Zeros) = a - Base.size(xs::Zeros) = xs.size -Base.IndexStyle(::Type{<:Zeros}) = IndexLinear() - Base.axes(xs::Zeros) = Base.OneTo.(size(xs)) -Base.getindex(xs::Zeros{T,N}, i::Int) where {T,N} = zero(T) +Base.IndexStyle(::Type{<:Zeros}) = IndexCartesian() + +Base.getindex(xs::Zeros{T,N}, I::Vararg{Int, N}) where {T,N} = zero(T) +Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} = + Zeros(T, inds.stop) + Base.setindex(xs::Zeros, args...) = error("setindex disallowed on Zeros Array") Base.setindex!(xs::Zeros, args...) = @@ -185,17 +204,40 @@ Base.setindex!(xs::Zeros, args...) = Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs)) +# Ignore during backwards pass @adjoint reshape(xs::Zeros{T}, dims...) where T = reshape(xs, dims...), _ -> nothing +# Define basic ops for f in (:+, :-) @eval $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros) = a end +Base.:+(a::Zeros, b::AbstractArray) = b +Base.:-(a::Zeros, b::AbstractArray) = -b Base.:*(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros) = zero(a) +Base.:*(a::Zeros, b::AbstractArray) = zero(a) -broadcasted(::typeof(+), arr::AbstractArray, ::Zeros) = arr -broadcasted(::typeof(-), arr::AbstractArray, ::Zeros) = arr -broadcasted(::typeof(*), arr::AbstractArray, ::Zeros) = zero(arr) +# Hook into broadcasting API - to allow using as a regular array +Base.BroadcastStyle(::Type{<:Zeros}) = Broadcast.ArrayStyle{Zeros}() +Broadcast.broadcastable(xs::Zeros) = xs +Base.BroadcastStyle(::Broadcast.ArrayStyle{Zeros}, ::Broadcast.DefaultArrayStyle{N}) where N = + Broadcast.ArrayStyle{Zeros}() + +function Base.similar(bc::Broadcasted{Broadcast.ArrayStyle{Flux.Zeros}}, ::Type{T}) where T + similar(Array{T}, axes(bc)) +end + +Base.copy(xs::Zeros{T,N}) where {T,N} = Zeros(T, size(xs)...) + +isZeros(x::Zeros) = true +isZeros(x) = false + +function Base.copyto!(dest::AbstractArray, bc::Broadcasted{Broadcast.ArrayStyle{Flux.Zeros}}) + bc = Broadcast.flatten(bc) + + i = isZeros(first(bc.args)) ? 2 : 1 # findfirst(!isZeros, bc.args) + dest .= bc.args[i] +end """ @jit ... From e89b8eba774fc32b3fd782352422fb88baee74e6 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 13 Nov 2019 01:12:26 +0530 Subject: [PATCH 22/39] fixes --- src/utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils.jl b/src/utils.jl index 6e5ab8a2..ae2910cc 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -215,7 +215,7 @@ end Base.:+(a::Zeros, b::AbstractArray) = b Base.:-(a::Zeros, b::AbstractArray) = -b Base.:*(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros) = zero(a) -Base.:*(a::Zeros, b::AbstractArray) = zero(a) +Base.:*(a::Zeros, b::AbstractArray) = zero(b) # Hook into broadcasting API - to allow using as a regular array Base.BroadcastStyle(::Type{<:Zeros}) = Broadcast.ArrayStyle{Zeros}() From eb41715d26998d2ad711f1644ee0f7127dd01b14 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Tue, 19 Nov 2019 13:30:33 +0530 Subject: [PATCH 23/39] define manual rules --- src/utils.jl | 78 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index ae2910cc..57e62cca 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -191,54 +191,78 @@ Zeros(sz::Integer...) = Zeros(Bool, sz...) 
Base.size(xs::Zeros) = xs.size Base.axes(xs::Zeros) = Base.OneTo.(size(xs)) -Base.IndexStyle(::Type{<:Zeros}) = IndexCartesian() +Base.IndexStyle(::Type{<:Zeros}) = IndexLinear() -Base.getindex(xs::Zeros{T,N}, I::Vararg{Int, N}) where {T,N} = zero(T) +Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T) Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} = - Zeros(T, inds.stop) + Zeros(T, inds.stop) Base.setindex(xs::Zeros, args...) = - error("setindex disallowed on Zeros Array") + error("setindex disallowed on Zeros Array") Base.setindex!(xs::Zeros, args...) = - error("setindex! disallowed on Zeros Array") + error("setindex! disallowed on Zeros Array") Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs)) -# Ignore during backwards pass @adjoint reshape(xs::Zeros{T}, dims...) where T = - reshape(xs, dims...), _ -> nothing + reshape(xs, dims...), _ -> nothing # Define basic ops for f in (:+, :-) - @eval $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros) = a + @eval function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros) + @assert size(a) == size(b) throw(DimensionMismatch("dimensions must match")) + a + end end -Base.:+(a::Zeros, b::AbstractArray) = b -Base.:-(a::Zeros, b::AbstractArray) = -b -Base.:*(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros) = zero(a) -Base.:*(a::Zeros, b::AbstractArray) = zero(b) -# Hook into broadcasting API - to allow using as a regular array -Base.BroadcastStyle(::Type{<:Zeros}) = Broadcast.ArrayStyle{Zeros}() -Broadcast.broadcastable(xs::Zeros) = xs -Base.BroadcastStyle(::Broadcast.ArrayStyle{Zeros}, ::Broadcast.DefaultArrayStyle{N}) where N = - Broadcast.ArrayStyle{Zeros}() ++(a::Zeros, b::AbstractArray) = b + a +-(a::Zeros, b::AbstractArray) = -b + a -function Base.similar(bc::Broadcasted{Broadcast.ArrayStyle{Flux.Zeros}}, ::Type{T}) where T - similar(Array{T}, axes(bc)) +function *(a::AbstractArray{S,2}, b::Zeros{T,2}) where {T,S} + @assert size(a,2) == size(b,1) throw(DimensionMismatch("A has dimensions $(size(a)) but B has dimensions $(size(b))")) + res = similar(a, size(a,1), size(b,2)) + res .= zero(S) +end + +function *(a::Zeros{T,2}, b::AbstractArray{S,2}) where {T,S} + @assert size(a,2) == size(b,1) throw(DimensionMismatch("A has dimensions $(size(a)) but B has dimensions $(size(b))")) + res = similar(b, size(a,1), size(b,2)) + res .= zero(S) end Base.copy(xs::Zeros{T,N}) where {T,N} = Zeros(T, size(xs)...) -isZeros(x::Zeros) = true -isZeros(x) = false - -function Base.copyto!(dest::AbstractArray, bc::Broadcasted{Broadcast.ArrayStyle{Flux.Zeros}}) - bc = Broadcast.flatten(bc) - - i = isZeros(first(bc.args)) ? 2 : 1 # findfirst(!isZeros, bc.args) - dest .= bc.args[i] +# Define broadcasting behaviour +for op in (:+, :-) + @eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros) + sz = similar(a, Broadcast.broadcast_shape(size(a), size(b))) + sz .= a + end end +broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(typeof(+), b, a) +broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(typeof(+), -b, a) + +function broadcasted(::typeof(*), a::AbstractArray, b::Zeros) + sz = similar(a, Broadcast.broadcast_shape(size(a), size(b))) + sz .= zero(a) +end + +broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(typeof(*), b, a) + +for op in (:+, :-, :*) + @eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...) 
+end + +# Some opportunities to avoid scalar indexing, intermediaries +broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a +broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b +broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a +broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b +broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a) +broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b) +broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b) + """ @jit ... From 245563077b614e78d6b765b3d24aae0612decc0e Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 27 Nov 2019 19:40:58 +0530 Subject: [PATCH 24/39] cleaner API --- src/layers/conv.jl | 70 +++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 41 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index a60749e3..751689f5 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -22,8 +22,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. Accepts keyword arguments `weight` and `bias` to set the corresponding fields. -Setting `bias` to `Flux.Zeros()` will switch bias off for the -layer. +Setting `bias` to `Flux.Zeros()` will switch bias off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. """ @@ -44,17 +43,15 @@ Constructs the convolutional layer with user defined weight and bias arrays. All other behaviours of the Conv layer apply with regard to data order and forward pass. -Setting `bias` to `nothing` or `Flux.Zeros()` would switch `bias` off for the -layer. +Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. """ -function Conv(w::AbstractArray{T,N}, b::Union{Nothing, Zeros, AbstractVector{T}}, σ = identity; +function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - b = b isa Nothing ? Zeros((size(w, ndims(w)), )) : b return Conv(σ, w, b, stride, pad, dilation) end @@ -70,14 +67,14 @@ distribution. See also: [`depthwiseconvfilter`](@ref) """ convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; - init = glorot_uniform) where N = init(filter..., ch...) + init = glorot_uniform) where N = init(filter..., ch...) function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, + weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N Conv(weight, bias, σ, - stride = stride, pad = pad, dilation = dilation) + stride = stride, pad = pad, dilation = dilation) end @functor Conv @@ -114,8 +111,7 @@ Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. Accepts keyword arguments `weight` and `bias` to set the corresponding fields. -Setting `bias` to `Flux.Zeros()` will switch bias off for the -layer. +Setting `bias` to `Flux.Zeros()` will switch bias off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. 
""" @@ -136,23 +132,21 @@ Constructs the convolutional transpose layer with user defined weight and bias a All other behaviours of the ConvTranspose layer apply with regard to data order and forward pass. -Setting `bias` to `nothing` or `Flux.Zeros()` would switch `bias` off for the -layer. +Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. """ -function ConvTranspose(w::AbstractArray{T,N}, b::Union{Nothing, Zeros, AbstractVector{T}}, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T,N} +function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity; + stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - b = b isa Nothing ? Zeros((size(w, ndims(w)), )) : b return ConvTranspose(σ, w, b, stride, pad, dilation) end function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, + weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N ConvTranspose(weight, bias, σ, stride = stride, pad = pad, dilation = dilation) @@ -168,9 +162,9 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray) batch_size = size(x)[end] # Create DenseConvDims() that looks like the corresponding conv() return DenseConvDims((I..., C_in, batch_size), size(c.weight); - stride=c.stride, - padding=c.pad, - dilation=c.dilation, + stride=c.stride, + padding=c.pad, + dilation=c.dilation, ) end @@ -206,8 +200,7 @@ Data should be stored in WHCN order. In other words, a 100×100 RGB image would be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. Accepts keyword arguments `weight` and `bias` to set the corresponding fields. -Setting `bias` to `Flux.Zeros()` will switch bias off for the -layer. +Setting `bias` to `Flux.Zeros()` will switch bias off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. """ @@ -228,17 +221,15 @@ Constructs the `DepthwiseConv` layer with user defined weight and bias arrays. All other behaviours of the `DepthwiseConv` layer apply with regard to data order and forward pass. -Setting `bias` to `nothing` or `Flux.Zeros()` would switch `bias` off for the -layer. +Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. """ -function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Nothing, Zeros, AbstractVector{T}}, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T,N} +function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity; + stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - b = b isa Nothing ? Zeros((size(w, ndims(w)), )) : b return DepthwiseConv(σ, w, b, stride, pad, dilation) end @@ -254,11 +245,11 @@ distribution. 
See also: [`convfilter`](@ref) """ depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; - init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1]) + init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1]) function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, + weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels" return DepthwiseConv( @@ -312,8 +303,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. Accepts keyword arguments `weight` and `bias` to set the corresponding fields. -Setting `bias` to `Flux.Zeros()` will switch bias off for the -layer. +Setting `bias` to `Flux.Zeros()` will switch bias off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. """ @@ -334,23 +324,21 @@ Constructs the standard cross convolutional layer with user defined weight and b arrays. All other behaviours of the CrossCor layer apply with regard to data order and forward pass. -Setting `bias` to `nothing` or `Flux.Zeros()` would switch `bias` off for the -layer. +Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer. Takes the keyword arguments `pad`, `stride` and `dilation`. """ -function CrossCor(w::AbstractArray{T,N}, b::Union{Nothing, Zeros, AbstractVector{T}}, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T,N} +function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity; + stride = 1, pad = 0, dilation = 1) where {T,N} stride = expand(Val(N-2), stride) pad = expand(Val(2*(N-2)), pad) dilation = expand(Val(N-2), dilation) - b = b isa Nothing ? 
Zeros((size(w, ndims(w)), )) : b return CrossCor(σ, w, b, stride, pad, dilation) end function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, + weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N CrossCor(weight, bias, σ, stride = stride, pad = pad, dilation = dilation) From ec872bb57905023388eb7bb808e475edfaefcfd9 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 27 Nov 2019 19:45:04 +0530 Subject: [PATCH 25/39] test that bias has no grads with Zeros --- test/layers/conv.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 4a3f8c16..d3345929 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -31,6 +31,8 @@ end bias = Conv((2,2), 1=>3, bias = Flux.Zeros()) op = bias(ip) @test sum(op) === 0.f0 + gs = gradient(() -> sum(bias(ip)), Flux.params(bias)) + @test gs[bias.bias] == nothing # Train w/o bias and make sure no convergence happens # when only bias can be converged From f39e1848144a023d3ac2ba8a0d105121c234e018 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Mon, 9 Dec 2019 21:07:30 +0530 Subject: [PATCH 26/39] rm Zeros warning --- src/utils.jl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 57e62cca..7f244724 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -152,10 +152,6 @@ used during training which is ignored by the optimisers. Useful to turn bias off for a forward pass of a layer. -!!! warning - Zeros acts a scalar while broadcasting, so does not - expand dims. Checks for shape compatibility by default. - ## Examples ```julia From 894c075b6d5d9a3d3a9c7ae6e7f279a5fc384977 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Mon, 9 Dec 2019 21:40:58 +0530 Subject: [PATCH 27/39] rm Zeros setindex --- src/utils.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 7f244724..ed0f95fc 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -193,11 +193,6 @@ Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T) Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} = Zeros(T, inds.stop) -Base.setindex(xs::Zeros, args...) = - error("setindex disallowed on Zeros Array") -Base.setindex!(xs::Zeros, args...) = - error("setindex! disallowed on Zeros Array") - Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs)) @adjoint reshape(xs::Zeros{T}, dims...) 
where T = From a72ca2b05db5bb6e528627121eff832f5fbe64f6 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Mon, 9 Dec 2019 23:18:01 +0530 Subject: [PATCH 28/39] fix args --- src/utils.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index ed0f95fc..97bfd3cd 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -231,15 +231,15 @@ for op in (:+, :-) end end -broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(typeof(+), b, a) -broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(typeof(+), -b, a) +broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a) +broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a) function broadcasted(::typeof(*), a::AbstractArray, b::Zeros) sz = similar(a, Broadcast.broadcast_shape(size(a), size(b))) sz .= zero(a) end -broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(typeof(*), b, a) +broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a) for op in (:+, :-, :*) @eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...) From b9fbee1ff024ae9dd66e4b9b4ffebdc42ead1c51 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Fri, 31 Jan 2020 12:24:36 +0530 Subject: [PATCH 29/39] ::typeof(op) -> op --- src/utils.jl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 97bfd3cd..3c8abb5e 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -225,34 +225,34 @@ Base.copy(xs::Zeros{T,N}) where {T,N} = Zeros(T, size(xs)...) # Define broadcasting behaviour for op in (:+, :-) - @eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros) + @eval function broadcasted($op, a::AbstractArray, b::Zeros) sz = similar(a, Broadcast.broadcast_shape(size(a), size(b))) sz .= a end end -broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a) -broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a) +broadcasted(+, a::Zeros, b::AbstractArray) = broadcasted(+, b, a) +broadcasted(-, a::Zeros, b::AbstractArray) = broadcasted(+, -b, a) -function broadcasted(::typeof(*), a::AbstractArray, b::Zeros) +function broadcasted(*, a::AbstractArray, b::Zeros) sz = similar(a, Broadcast.broadcast_shape(size(a), size(b))) sz .= zero(a) end -broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a) +broadcasted(*, a::Zeros, b::AbstractArray) = broadcasted(*, b, a) for op in (:+, :-, :*) - @eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...) + @eval broadcasted($op, a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...) 
end # Some opportunities to avoid scalar indexing, intermediaries -broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a -broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b -broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a -broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b -broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a) -broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b) -broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b) +broadcasted(+, a::AbstractArray, b::Zeros{T,0}) where T = a +broadcasted(+, a::Zeros{T,0}, b::AbstractArray) where T = b +broadcasted(-, a::AbstractArray, b::Zeros{T,0}) where T = a +broadcasted(-, a::Zeros{T,0}, b::AbstractArray) where T = -b +broadcasted(*, a::AbstractArray, b::Zeros{T,0}) where T = zero(a) +broadcasted(*, a::Zeros{T,0}, b::AbstractArray) where T = zero(b) +broadcasted(/, a::Zeros{T,0}, b::AbstractArray) where T = zero(b) """ @jit ... From bc20103ea6dd1034951cd053fe0fc1a68f7b0bcf Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Fri, 31 Jan 2020 13:23:33 +0530 Subject: [PATCH 30/39] no-op copy --- src/utils.jl | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 3c8abb5e..36bab5a9 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -221,38 +221,38 @@ function *(a::Zeros{T,2}, b::AbstractArray{S,2}) where {T,S} res .= zero(S) end -Base.copy(xs::Zeros{T,N}) where {T,N} = Zeros(T, size(xs)...) +Base.copy(xs::Zeros{T,N}) where {T,N} = xs # Define broadcasting behaviour for op in (:+, :-) - @eval function broadcasted($op, a::AbstractArray, b::Zeros) + @eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros) sz = similar(a, Broadcast.broadcast_shape(size(a), size(b))) sz .= a end end -broadcasted(+, a::Zeros, b::AbstractArray) = broadcasted(+, b, a) -broadcasted(-, a::Zeros, b::AbstractArray) = broadcasted(+, -b, a) +broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a) +broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a) -function broadcasted(*, a::AbstractArray, b::Zeros) +function broadcasted(::typeof(*), a::AbstractArray, b::Zeros) sz = similar(a, Broadcast.broadcast_shape(size(a), size(b))) sz .= zero(a) end -broadcasted(*, a::Zeros, b::AbstractArray) = broadcasted(*, b, a) +broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a) for op in (:+, :-, :*) - @eval broadcasted($op, a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...) + @eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...) 
end # Some opportunities to avoid scalar indexing, intermediaries -broadcasted(+, a::AbstractArray, b::Zeros{T,0}) where T = a -broadcasted(+, a::Zeros{T,0}, b::AbstractArray) where T = b -broadcasted(-, a::AbstractArray, b::Zeros{T,0}) where T = a -broadcasted(-, a::Zeros{T,0}, b::AbstractArray) where T = -b -broadcasted(*, a::AbstractArray, b::Zeros{T,0}) where T = zero(a) -broadcasted(*, a::Zeros{T,0}, b::AbstractArray) where T = zero(b) -broadcasted(/, a::Zeros{T,0}, b::AbstractArray) where T = zero(b) +broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a +broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b +broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a +broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b +broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a) +broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b) +broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b) """ @jit ... From f889d0c4d4fd03c5209d66f640e05f7c48cbd454 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 26 Feb 2020 22:19:17 +0530 Subject: [PATCH 31/39] add kwarg constructors --- src/layers/conv.jl | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 751689f5..c2cc15bf 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -55,6 +55,11 @@ function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = id return Conv(σ, w, b, stride, pad, dilation) end +function Conv(;weight::AbstractArray, bias::Union{Zeros, AbstractVector{T}}, activation = identity, + stride = 1, pad = 0, dilation = 1) where {T,N} + Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) +end + """ convfilter(filter::Tuple, in=>out) @@ -144,6 +149,11 @@ function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}} return ConvTranspose(σ, w, b, stride, pad, dilation) end +function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, + activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} + ConvTranspose(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) +end + function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N @@ -233,6 +243,11 @@ function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}} return DepthwiseConv(σ, w, b, stride, pad, dilation) end +function DepthwiseConv(;weight::AbstractArray, bias::Union{Zeros, AbstractVector{T}}, + activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} + DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) +end + """ depthwiseconvfilter(filter::Tuple, in=>out) From 58211e31bd51408a1c80138d8ccbb7d27dbd8117 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 26 Feb 2020 22:22:11 +0530 Subject: [PATCH 32/39] docs improve --- src/layers/conv.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index c2cc15bf..5b19269a 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -107,7 +107,7 @@ end """ ConvTranspose(filter::Tuple, in=>out) - ConvTranspose(filter::Tuple, in=>out, relu) + ConvTranspose(filter::Tuple, in=>out, activation) Standard convolutional transpose layer. 
`filter` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. @@ -200,7 +200,7 @@ end """ DepthwiseConv(filter::Tuple, in=>out) - DepthwiseConv(filter::Tuple, in=>out, relu) + DepthwiseConv(filter::Tuple, in=>out, activation) Depthwise convolutional layer. `filter` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. From cf82393ae8aa1e1c44df28777f7a36bc765c3eb6 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 26 Feb 2020 22:36:25 +0530 Subject: [PATCH 33/39] type signatures --- src/layers/conv.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 94a10606..41f0e2e3 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -60,8 +60,8 @@ function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = id return Conv(σ, w, b, stride, pad, dilation) end -function Conv(;weight::AbstractArray, bias::Union{Zeros, AbstractVector{T}}, activation = identity, - stride = 1, pad = 0, dilation = 1) where {T,N} +function Conv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, + activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) end @@ -268,7 +268,7 @@ function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}} return DepthwiseConv(σ, w, b, stride, pad, dilation) end -function DepthwiseConv(;weight::AbstractArray, bias::Union{Zeros, AbstractVector{T}}, +function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) end @@ -379,7 +379,7 @@ function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ return CrossCor(σ, w, b, stride, pad, dilation) end -function CrossCor(;weight::AbstractArray, bias::Union{Zeros, AbstractVector{T}}, +function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) end From 20e78e274ecaccc7677373a891ddadcb5777dda7 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 26 Feb 2020 22:41:45 +0530 Subject: [PATCH 34/39] docs fix --- src/layers/conv.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 41f0e2e3..997b96e5 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -7,6 +7,7 @@ _convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+ expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) + """ Conv(filter::Tuple, in=>out) Conv(filter::Tuple, in=>out, activation) @@ -127,7 +128,7 @@ outdims(l::Conv, isize) = """ ConvTranspose(size, in=>out) - ConvTranspose(size, in=>out, relu) + ConvTranspose(size, in=>out, activation) Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. 
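
The keyword constructors added above give a direct way to build a layer from pre-existing arrays. A hedged sketch of that usage, with `convfilter`, the `activation` keyword and `Flux.Zeros()` as introduced by these patches, assuming Flux from this branch:

```julia
using Flux

w = Flux.convfilter((3, 3), 1 => 8)                        # 3×3×1×8 filter, glorot_uniform init
layer = Conv(weight = w, bias = Flux.Zeros(), activation = relu)

size(layer(rand(Float32, 28, 28, 1, 1)))                   # (26, 26, 8, 1)
```
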
From 7e308e77fd4b4c60906772b61351d326605ae753 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 4 Mar 2020 17:57:16 +0530 Subject: [PATCH 35/39] rm unneccesary fns --- src/utils.jl | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index dbf85c95..6ad410b3 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -256,18 +256,6 @@ end +(a::Zeros, b::AbstractArray) = b + a -(a::Zeros, b::AbstractArray) = -b + a -function *(a::AbstractArray{S,2}, b::Zeros{T,2}) where {T,S} - @assert size(a,2) == size(b,1) throw(DimensionMismatch("A has dimensions $(size(a)) but B has dimensions $(size(b))")) - res = similar(a, size(a,1), size(b,2)) - res .= zero(S) -end - -function *(a::Zeros{T,2}, b::AbstractArray{S,2}) where {T,S} - @assert size(a,2) == size(b,1) throw(DimensionMismatch("A has dimensions $(size(a)) but B has dimensions $(size(b))")) - res = similar(b, size(a,1), size(b,2)) - res .= zero(S) -end - Base.copy(xs::Zeros{T,N}) where {T,N} = xs # Define broadcasting behaviour @@ -282,8 +270,7 @@ broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a) broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a) function broadcasted(::typeof(*), a::AbstractArray, b::Zeros) - sz = similar(a, Broadcast.broadcast_shape(size(a), size(b))) - sz .= zero(a) + Zeros(Broadcast.broadcast_shape(size(a), size(b))...) end broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a) From d8e44fcc1c4fe98f4b87668d498575f303813701 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 4 Mar 2020 18:22:45 +0530 Subject: [PATCH 36/39] correct broadcasting for addition --- src/utils.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 6ad410b3..5e8eb270 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -247,7 +247,7 @@ Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs)) # Define basic ops for f in (:+, :-) - @eval function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros) + @eval @inline function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros) @assert size(a) == size(b) throw(DimensionMismatch("dimensions must match")) a end @@ -261,7 +261,9 @@ Base.copy(xs::Zeros{T,N}) where {T,N} = xs # Define broadcasting behaviour for op in (:+, :-) @eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros) - sz = similar(a, Broadcast.broadcast_shape(size(a), size(b))) + bs = Broadcast.broadcast_shape(size(a), size(b)) + size(a) == bs && return a + sz = similar(a, bs) sz .= a end end From 534809ae78b6b09baa60c6d3e2f055d9a47d6db5 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 29 Apr 2020 16:15:35 +0530 Subject: [PATCH 37/39] move zeros to its own file --- src/Flux.jl | 1 + src/utils.jl | 104 --------------------------------------------------- src/zeros.jl | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 104 deletions(-) create mode 100644 src/zeros.jl diff --git a/src/Flux.jl b/src/Flux.jl index 5799fe42..90dcb630 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -27,6 +27,7 @@ using CuArrays const use_cuda = Ref(false) include("utils.jl") +include("zeros.jl") include("onehot.jl") include("functor.jl") diff --git a/src/utils.jl b/src/utils.jl index c321bd91..7842c961 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -321,110 +321,6 @@ function throttle(f, timeout; leading=true, trailing=false) end end -import Base: +, -, *, reshape, size -import Base.Broadcast: broadcasted, Broadcasted, 
BroadcastStyle - -""" - Zeros() - Zeros(size...) - Zeros(Type, size...) - -Acts as a stand-in for an array of zeros that can be -used during training which is ignored by the optimisers. - -Useful to turn bias off for a forward pass of a layer. - -## Examples - -```julia -julia> Flux.Zeros(3,3) -3×3 Flux.Zeros{Bool,2}: - false false false - false false false - false false false - -julia> Flux.Zeros(Float32, 3,3) -3×3 Flux.Zeros{Float32,2}: - 0.0 0.0 0.0 - 0.0 0.0 0.0 - 0.0 0.0 0.0 - -julia> rand(3,3) .+ Flux.Zeros() -3×3 Array{Float64,2}: - 0.198739 0.490459 0.785386 - 0.779074 0.39986 0.66383 - 0.854981 0.447292 0.314497 - -julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros()) -Conv((2, 2), 1=>3) -``` -""" -struct Zeros{T,N} <: AbstractArray{T,N} - size::Tuple -end - -Zeros(::Type{T}, sz...) where T = Zeros{T,length(sz)}(sz) -Zeros(sz::Integer...) = Zeros(Bool, sz...) - -Base.size(xs::Zeros) = xs.size -Base.axes(xs::Zeros) = Base.OneTo.(size(xs)) - -Base.IndexStyle(::Type{<:Zeros}) = IndexLinear() - -Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T) -Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} = - Zeros(T, inds.stop) - -Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs)) - -@adjoint reshape(xs::Zeros{T}, dims...) where T = - reshape(xs, dims...), _ -> nothing - -# Define basic ops -for f in (:+, :-) - @eval @inline function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros) - @assert size(a) == size(b) throw(DimensionMismatch("dimensions must match")) - a - end -end - -+(a::Zeros, b::AbstractArray) = b + a --(a::Zeros, b::AbstractArray) = -b + a - -Base.copy(xs::Zeros{T,N}) where {T,N} = xs - -# Define broadcasting behaviour -for op in (:+, :-) - @eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros) - bs = Broadcast.broadcast_shape(size(a), size(b)) - size(a) == bs && return a - sz = similar(a, bs) - sz .= a - end -end - -broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a) -broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a) - -function broadcasted(::typeof(*), a::AbstractArray, b::Zeros) - Zeros(Broadcast.broadcast_shape(size(a), size(b))...) -end - -broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a) - -for op in (:+, :-, :*) - @eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...) -end - -# Some opportunities to avoid scalar indexing, intermediaries -broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a -broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b -broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a -broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b -broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a) -broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b) -broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b) - """ @jit ... diff --git a/src/zeros.jl b/src/zeros.jl new file mode 100644 index 00000000..d281d3eb --- /dev/null +++ b/src/zeros.jl @@ -0,0 +1,103 @@ +import Base: +, -, *, reshape, size +import Base.Broadcast: broadcasted, Broadcasted, BroadcastStyle + +""" + Zeros() + Zeros(size...) + Zeros(Type, size...) + +Acts as a stand-in for an array of zeros that can be +used during training which is ignored by the optimisers. + +Useful to turn bias off for a forward pass of a layer. 
+ +## Examples + +```julia +julia> Flux.Zeros(3,3) +3×3 Flux.Zeros{Bool,2}: + false false false + false false false + false false false + +julia> Flux.Zeros(Float32, 3,3) +3×3 Flux.Zeros{Float32,2}: + 0.0 0.0 0.0 + 0.0 0.0 0.0 + 0.0 0.0 0.0 + +julia> rand(3,3) .+ Flux.Zeros() +3×3 Array{Float64,2}: + 0.198739 0.490459 0.785386 + 0.779074 0.39986 0.66383 + 0.854981 0.447292 0.314497 + +julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros()) +Conv((2, 2), 1=>3) +``` +""" +struct Zeros{T,N} <: AbstractArray{T,N} + size::Tuple +end + +Zeros(::Type{T}, sz...) where T = Zeros{T,length(sz)}(sz) +Zeros(sz::Integer...) = Zeros(Bool, sz...) + +Base.size(xs::Zeros) = xs.size +Base.axes(xs::Zeros) = Base.OneTo.(size(xs)) + +Base.IndexStyle(::Type{<:Zeros}) = IndexLinear() + +Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T) +Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} = + Zeros(T, inds.stop) + +Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs)) + +@adjoint reshape(xs::Zeros{T}, dims...) where T = + reshape(xs, dims...), _ -> nothing + +# Define basic ops +for f in (:+, :-) + @eval @inline function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros) + @assert size(a) == size(b) throw(DimensionMismatch("dimensions must match")) + a + end +end + ++(a::Zeros, b::AbstractArray) = b + a +-(a::Zeros, b::AbstractArray) = -b + a + +Base.copy(xs::Zeros{T,N}) where {T,N} = xs + +# Define broadcasting behaviour +for op in (:+, :-) + @eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros) + bs = Broadcast.broadcast_shape(size(a), size(b)) + size(a) == bs && return a + sz = similar(a, bs) + sz .= a + end +end + +broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a) +broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a) + +function broadcasted(::typeof(*), a::AbstractArray, b::Zeros) + Zeros(Broadcast.broadcast_shape(size(a), size(b))...) +end + +broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a) + +for op in (:+, :-, :*) + @eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...) +end + +# Some opportunities to avoid scalar indexing, intermediaries +broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a +broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b +broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a +broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b +broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a) +broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b) +broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b) \ No newline at end of file From 29215fa5d7c8d82c5d8d19cb3cb7fafc9ef34324 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Wed, 29 Apr 2020 16:17:44 +0530 Subject: [PATCH 38/39] comment on possible future deprecations --- src/zeros.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/zeros.jl b/src/zeros.jl index d281d3eb..d31adcd9 100644 --- a/src/zeros.jl +++ b/src/zeros.jl @@ -94,6 +94,9 @@ for op in (:+, :-, :*) end # Some opportunities to avoid scalar indexing, intermediaries +# Since it replicates a little of what we expect Base to do, +# it should be possible to remove in the future, but for now, +# these help with performance. 
broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a From 8f877f2dbfd4937f52615a0b798a0d582c456da1 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Fri, 1 May 2020 14:22:46 +0100 Subject: [PATCH 39/39] quick fix --- src/zeros.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zeros.jl b/src/zeros.jl index d31adcd9..1aec7b02 100644 --- a/src/zeros.jl +++ b/src/zeros.jl @@ -50,7 +50,7 @@ Base.IndexStyle(::Type{<:Zeros}) = IndexLinear() Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T) Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} = - Zeros(T, inds.stop) + Zeros(T, length(inds)) Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs)) @@ -103,4 +103,4 @@ broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a) broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b) -broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b) \ No newline at end of file +broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
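
Taken together, the series leaves `Zeros` as an array-like placeholder for a switched-off bias, and such a bias collects no gradient. A closing usage sketch mirroring the gradient test added earlier in the series, assuming Flux from this branch:

```julia
using Flux

m = Conv((2, 2), 1 => 3, bias = Flux.Zeros())
x = rand(Float32, 28, 28, 1, 1)

gs = gradient(() -> sum(m(x)), Flux.params(m))
gs[m.bias] == nothing   # the Zeros bias has no gradient, so optimisers never update it
```
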