Resolving Merge Conflicts
commit c7c0ee2cbc

@@ -0,0 +1,29 @@
@article{Flux.jl-2018,
  author        = {Michael Innes and
                   Elliot Saba and
                   Keno Fischer and
                   Dhairya Gandhi and
                   Marco Concetto Rudilosso and
                   Neethu Mariya Joy and
                   Tejan Karmali and
                   Avik Pal and
                   Viral Shah},
  title         = {Fashionable Modelling with Flux},
  journal       = {CoRR},
  volume        = {abs/1811.01457},
  year          = {2018},
  url           = {http://arxiv.org/abs/1811.01457},
  archivePrefix = {arXiv},
  eprint        = {1811.01457},
  timestamp     = {Thu, 22 Nov 2018 17:58:30 +0100},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1811-01457},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}

@article{innes:2018,
  author  = {Mike Innes},
  title   = {Flux: Elegant Machine Learning with Julia},
  journal = {Journal of Open Source Software},
  year    = {2018},
  doi     = {10.21105/joss.00602},
}
NEWS.md
@@ -1,5 +1,10 @@
# v0.9.0
* [Depthwise convolutional layer API changes](https://github.com/FluxML/Flux.jl/pull/756) from `in => mult` channel specification to `in => out` channel specification, and deprecates implicit `out` constructor.
* New [SkipConnection](https://github.com/FluxML/Flux.jl/pull/446), which can be used to train residual neural network architectures.

# v0.8.0

* [Dropout now has a `dims` argument for specifying the unbroadcast dimensions.](https://github.com/FluxML/Flux.jl/pull/563)
* New [ConvTranspose layer](https://github.com/FluxML/Flux.jl/pull/311).
* New [Maxout layer](https://github.com/FluxML/Flux.jl/pull/647)
* Datasets are now [hash verified on download](https://github.com/FluxML/Flux.jl/pull/585) to avoid corruption.
@@ -13,6 +18,7 @@
* [Data.Iris](https://github.com/FluxML/Flux.jl/pull/652) makes Fisher's Iris dataset available with `Iris.labels` and `Iris.features`.
* New [InstanceNorm](https://github.com/FluxML/Flux.jl/pull/634), as popularized by [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022).
* New [GroupNorm](https://github.com/FluxML/Flux.jl/pull/696), as described in [Group Normalization](https://arxiv.org/abs/1803.08494).
* New [CrossCor](https://github.com/FluxML/Flux.jl/pull/762).

AD Changes:
@@ -17,6 +17,7 @@ MaxPool
MeanPool
DepthwiseConv
ConvTranspose
CrossCor
```

## Recurrent Layers
@@ -36,17 +37,7 @@ But in contrast to the layers described in the other sections are not readily gr

```@docs
Maxout
```

# Normalisation & Regularisation

These layers don't affect the structure of the network but may improve training times or reduce overfitting.

```@docs
Flux.testmode!
BatchNorm
Dropout
LayerNorm
SkipConnection
```

## Activation Functions
@@ -18,7 +18,7 @@ And you use less memory.
Not only should your activation functions be [type-stable](https://docs.julialang.org/en/v1/manual/performance-tips/#Write-%22type-stable%22-functions-1),
they should also preserve the type of their inputs.

A very artificial example using an activatioon function like
A very artificial example using an activation function like

```
my_tanh(x) = Float64(tanh(x))
@@ -73,4 +73,4 @@ end
```

When doing this kind of concatenation use `reduce(hcat, xs)` rather than `hcat(xs...)`.
This will avoid the splatting penality, and will hit the optimised `reduce` method.
This will avoid the splatting penalty, and will hit the optimised `reduce` method.
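To make the advice concrete, a small illustrative sketch (not part of the diff itself):

```julia
# A thousand column vectors to stack into a 10×1000 matrix.
xs = [rand(Float32, 10) for _ in 1:1000]

good = reduce(hcat, xs)   # dispatches to the optimised reduce(hcat, ...) method
bad  = hcat(xs...)        # splats 1000 arguments into a single call

@assert good == bad
```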
@@ -10,7 +10,7 @@ using Zygote: Params, @adjoint, gradient

export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, ConvTranspose, MaxPool, MeanPool,
       DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
       params, mapleaves, cpu, gpu, f32, f64, param, data
       SkipConnection,params, mapleaves, cpu, gpu, f32, f64, param, data

include("optimise/Optimise.jl")
using .Optimise
@@ -190,5 +190,37 @@ function (mo::Maxout)(input::AbstractArray)
  mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over)
end

"""
    SkipConnection(layers...)

Creates a Skip Connection, which constitutes of a layer or Chain of consecutive layers
and a shortcut connection linking the input to the block to the
output through a user-supplied callable.

`SkipConnection` requires the output dimension to be the same as the input.

A 'ResNet'-type skip-connection with identity shortcut would simply be
```julia
SkipConnection(layer, (a,b) -> a + b)
```
"""
struct SkipConnection
  layers
  connection  # user can pass arbitrary connections here, such as (a,b) -> a + b
end

@treelike SkipConnection

function (skip::SkipConnection)(input)
  # We apply the layers to the input and return the result of the application of the layers and the original input
  skip.connection(skip.layers(input), input)
end

function Base.show(io::IO, b::SkipConnection)
  print(io, "SkipConnection(")
  join(io, b.layers, ", ")
  print(io, ")")
end
param(x) = x
data(x) = x
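For orientation, a minimal usage sketch of the new layer (illustrative only, not part of the commit; it relies on the `SkipConnection`, `Chain` and `Dense` exports above):

```julia
using Flux

# A residual-style block: the inner Chain must preserve the feature dimension
# (10 in, 10 out) so its output can be added elementwise to the original input.
block = SkipConnection(Chain(Dense(10, 10, relu), Dense(10, 10)), (a, b) -> a + b)

x = rand(Float32, 10, 16)    # 10 features, batch of 16
y = block(x)                 # equals Chain(...)(x) + x
@assert size(y) == size(x)
```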
@@ -14,11 +14,11 @@ Example: Applying Conv layer to a 1-channel input using a 2x2 window size,

    size = (2,2)
    in = 1
    out = 16
    Conv((2, 2), 1=>16, relu)

Data should be stored in WHCN order (width, height, # channels, # batches).
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.

Takes the keyword arguments `pad`, `stride` and `dilation`.
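A quick shape check illustrating the WHCN convention described in the docstring (a sketch, not taken from the diff):

```julia
using Flux

c = Conv((2, 2), 1=>16, relu)            # 2×2 window, 1 input channel, 16 output channels
x = rand(Float32, 28, 28, 1, 50)         # WHCN: 28×28 images, 1 channel, batch of 50
@assert size(c(x)) == (27, 27, 16, 50)   # no padding, so each spatial dim shrinks by 1
```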
@@ -136,18 +136,17 @@ end
(a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
  a(T.(x))
"""
    DepthwiseConv(size, in)
    DepthwiseConv(size, in=>mul)
    DepthwiseConv(size, in=>mul, relu)
    DepthwiseConv(size, in=>out)
    DepthwiseConv(size, in=>out, relu)

Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`.
`in` and `mul` specify the number of input channels and channel multiplier respectively.
In case the `mul` is not specified it is taken as 1.
`in` and `out` specify the number of input and output channels respectively.
Note that `out` must be an integer multiple of `in`.

Data should be stored in WHCN order. In other words, a 100×100 RGB image would
be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array.

Takes the keyword arguments `pad` and `stride`.
Takes the keyword arguments `pad`, `stride` and `dilation`.
"""
struct DepthwiseConv{N,M,F,A,V}
  σ::F
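To make the new `in=>out` channel specification concrete, a small sketch (it mirrors the tests added later in this commit, and is not an additional change):

```julia
using Flux

# 3 input channels, 9 output channels: each input channel gets 3 filters.
d = DepthwiseConv((2, 2), 3=>9, relu)
x = rand(Float32, 28, 28, 3, 5)
@assert size(d(x)) == (27, 27, 9, 5)

# 3=>10 is rejected, because 10 is not an integer multiple of 3:
# DepthwiseConv((2, 2), 3=>10)   # throws an AssertionError
```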
@@ -187,8 +186,8 @@ function (c::DepthwiseConv)(x)
end

function Base.show(io::IO, l::DepthwiseConv)
  print(io, "DepthwiseConv(", size(l.weight)[1:ndims(l.weight)-2])
  print(io, ", ", size(l.weight, ndims(l.weight)), "=>", size(l.weight, ndims(l.weight)-1))
  print(io, "DepthwiseConv(", size(l.weight)[1:end-2])
  print(io, ", ", size(l.weight)[end], "=>", prod(size(l.weight)[end-1:end]))
  l.σ == identity || print(io, ", ", l.σ)
  print(io, ")")
end
@@ -198,6 +197,76 @@ end

(a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
  a(T.(x))
"""
    CrossCor(size, in=>out)
    CrossCor(size, in=>out, relu)

Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively.

Example: Applying CrossCor layer to a 1-channel input using a 2x2 window size,
giving us a 16-channel output. Output is activated with ReLU.

    size = (2,2)
    in = 1
    out = 16
    CrossCor((2, 2), 1=>16, relu)

Data should be stored in WHCN order (width, height, # channels, # batches).
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.

Takes the keyword arguments `pad`, `stride` and `dilation`.
"""
struct CrossCor{N,M,F,A,V}
  σ::F
  weight::A
  bias::V
  stride::NTuple{N,Int}
  pad::NTuple{M,Int}
  dilation::NTuple{N,Int}
end

function CrossCor(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
                  stride = 1, pad = 0, dilation = 1) where {T,N}
  stride = expand(Val(N-2), stride)
  pad = expand(Val(2*(N-2)), pad)
  dilation = expand(Val(N-2), dilation)
  return CrossCor(σ, w, b, stride, pad, dilation)
end

CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
         init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
  CrossCor(param(init(k..., ch...)), param(zeros(ch[2])), σ,
           stride = stride, pad = pad, dilation = dilation)

@treelike CrossCor

function crosscor(x, w, ddims::DenseConvDims)
  ddims = DenseConvDims(ddims, F=true)
  return conv(x, w, ddims)
end

function (c::CrossCor)(x::AbstractArray)
  # TODO: breaks gpu broadcast :(
  # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
  σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
  cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
  σ.(crosscor(x, c.weight, cdims) .+ b)
end

function Base.show(io::IO, l::CrossCor)
  print(io, "CrossCor(", size(l.weight)[1:ndims(l.weight)-2])
  print(io, ", ", size(l.weight, ndims(l.weight)-1), "=>", size(l.weight, ndims(l.weight)))
  l.σ == identity || print(io, ", ", l.σ)
  print(io, ")")
end

(a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{T}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
  invoke(a, Tuple{AbstractArray}, x)

(a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
  a(T.(x))

"""
    MaxPool(k)
@@ -3,11 +3,12 @@ istraining() = false
@adjoint istraining() = true, _ -> nothing

"""
    Dropout(p)
    Dropout(p, dims = :)

A Dropout layer. For each input, either sets that input to `0` (with probability
`p`) or scales it by `1/(1-p)`. This is used as a regularisation, i.e. it
reduces overfitting during training.
`p`) or scales it by `1/(1-p)`. The `dims` argument is to specified the unbroadcasted
dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is
used as a regularisation, i.e. it reduces overfitting during training. see also [`dropout`](@ref).

Does nothing to the input once in [`testmode!`](@ref).
"""
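To pin down the `dims` semantics, a small sketch based on the `_dropout_shape` helper that this commit adds in the next hunk (illustrative only, not an additional change):

```julia
# The dropout mask is sampled with size `_dropout_shape(x, dims)` and then
# broadcast against `x`:
#   dims = :  ->  (100, 50)  independent keep/drop decision per element
#   dims = 1  ->  (100, 1)   one decision per row, shared by every column
#   dims = 2  ->  (1, 50)    one decision per column, shared by every row
_dropout_shape(s, ::Colon) = size(s)
_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...)

x = rand(Float32, 100, 50)
@assert _dropout_shape(x, :) == (100, 50)
@assert _dropout_shape(x, 1) == (100, 1)
@assert _dropout_shape(x, 2) == (1, 50)
```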
@@ -19,13 +20,16 @@ mutable struct Dropout{F}
  end
end

_dropout_shape(s, ::Colon) = size(s)
_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...)

_dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0)

function (a::Dropout)(x)
  istraining() || return x
  y = similar(x)
  rand!(y)
  y .= _dropout_kernel.(y, a.p, 1 - a.p)
  y .= _dropout_kernel.(y, p, 1 - p)
  return x .* y
end
@@ -36,6 +36,10 @@ c = gpu(Conv((2,2),3=>4))
  l = c(gpu(rand(10,10,3,2)))
  Flux.back!(sum(l))

  c = gpu(CrossCor((2,2),3=>4))
  l = c(gpu(rand(10,10,3,2)))
  Flux.back!(sum(l))

end

@testset "onecold gpu" begin
@@ -1,16 +1,17 @@
using Flux, CuArrays, Test

trainmode(f, x...) = forward(f, x...)[1]
#
# @testset "CUDNN BatchNorm" begin
# @testset "4D Input" begin
# x = TrackedArray(Float64.(collect(reshape(1:12, 2, 2, 3, 1))))
# x = Float64.(collect(reshape(1:12, 2, 2, 3, 1)))
# m = BatchNorm(3)
# cx = gpu(x)
# cm = gpu(m)
#
# y = m(x)
# cy = cm(cx)
# y = trainmode(m, x)
# cy = trainmode(cm, cx)
#
# @test cy isa TrackedArray{Float32,4,CuArray{Float32,4}}
# # @test cy isa TrackedArray{Float32,4,CuArray{Float32,4}}
#
# @test cpu(data(cy)) ≈ data(y)
#
@@ -72,4 +72,16 @@ import Flux: activations
    @test length(ps) == 8 #4 alts, each with weight and bias
  end
end

@testset "SkipConnection" begin
  @testset "zero sum" begin
    input = randn(10, 10, 10, 10)
    @test SkipConnection(x -> zeros(size(x)), (a,b) -> a + b)(input) == input
  end

  @testset "concat size" begin
    input = randn(10, 2)
    @test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4)
  end
end
end
@@ -39,20 +39,14 @@ end

@testset "Depthwise Conv" begin
  r = zeros(Float32, 28, 28, 3, 5)
  m1 = DepthwiseConv((2, 2), 3=>5)
  m1 = DepthwiseConv((2, 2), 3=>15)
  @test size(m1(r), 3) == 15
  m2 = DepthwiseConv((2, 2), 3)
  @test size(m2(r), 3) == 3

  x = zeros(Float64, 28, 28, 3, 5)
  m3 = DepthwiseConv((2, 3), 3=>9)
  @test size(m3(r), 3) == 9

  m3 = DepthwiseConv((2, 2), 3 => 5)

  @test size(m3(r), 3) == 15

  m4 = DepthwiseConv((2, 2), 3)

  @test size(m4(r), 3) == 3
  # Test that we cannot ask for non-integer multiplication factors
  @test_throws AssertionError DepthwiseConv((2,2), 3=>10)
end

@testset "ConvTranspose" begin
@@ -61,3 +55,50 @@ end
  x_hat = ConvTranspose((3, 3), 1 => 1)(y)
  @test size(x_hat) == size(x)
end

@testset "CrossCor" begin
  x = rand(Float32, 28, 28, 1, 1)
  w = rand(2,2,1,1)
  y = CrossCor(w, [0.0])

  @test sum(w .* x[1:2, 1:2, :, :]) == y(x)[1, 1, 1, 1]

  r = zeros(Float32, 28, 28, 1, 5)
  m = Chain(
    CrossCor((2, 2), 1=>16, relu),
    MaxPool((2,2)),
    CrossCor((2, 2), 16=>8, relu),
    MaxPool((2,2)),
    x -> reshape(x, :, size(x, 4)),
    Dense(288, 10), softmax)

  @test size(m(r)) == (10, 5)
  @test y(x) != Conv(w, [0.0])(x)
  @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x) == Conv(w, [0.0])(x)
end
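The last two assertions capture the key design point: cross-correlation is convolution without flipping the kernel. A standalone sketch of the same relationship (illustrative, not an additional test):

```julia
using Flux

w = rand(Float32, 2, 2, 1, 1)
x = rand(Float32, 5, 5, 1, 1)

c  = CrossCor(w, zeros(Float32, 1))                        # slides w over x as-is
cv = Conv(w[end:-1:1, end:-1:1, :, :], zeros(Float32, 1))  # Conv flips the kernel first

@assert c(x) ≈ cv(x)
```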
@testset "Conv with non quadratic window #700" begin
  data = zeros(Float32, 7,7,1,1)
  data[4,4,1,1] = 1

  l = Conv((3,3), 1=>1)
  expected = zeros(eltype(l.weight),5,5,1,1)
  expected[2:end-1,2:end-1,1,1] = l.weight
  @test expected == l(data)

  l = Conv((3,1), 1=>1)
  expected = zeros(eltype(l.weight),5,7,1,1)
  expected[2:end-1,4,1,1] = l.weight
  @test expected == l(data)

  l = Conv((1,3), 1=>1)
  expected = zeros(eltype(l.weight),7,5,1,1)
  expected[4,2:end-1,1,1] = l.weight
  @test expected == l(data)

  @test begin
    # we test that the next expression does not throw
    randn(Float32, 10,10,1,1) |> Conv((6,1), 1=>1, Flux.σ)
    true
  end
end
@@ -25,6 +25,16 @@ trainmode(f, x...) = forward(f, x...)[1]
  @test count(a->a == 0, y) > 50
  y = m(x)
  @test count(a->a == 0, y) == 0

  x = rand(100, 50)
  m = Dropout(0.5, dims = 2)
  y = m(x)
  c = map(i->count(a->a==0, @view y[i, :]), 1:100)
  @test minimum(c) == maximum(c)
  m = Dropout(0.5, dims = 1)
  y = m(x)
  c = map(i->count(a->a==0, @view y[:, i]), 1:50)
  @test minimum(c) == maximum(c)
end

@testset "BatchNorm" begin