merge conflicts

2019-09-24 00:31:44 +05:30 · 2019-09-24 00:31:44 +05:30 · 822288d63d
commit 822288d63d
parent d8a069b304 b60df53ba1
10 changed files with 64 additions and 24 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1,2 @@
 paper/* linguist-documentation
+CITATION.bib linguist-detectable=false
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -46,9 +46,9 @@ version = "0.6.2"

 [[CUDAapi]]
 deps = ["Libdl", "Logging"]
-git-tree-sha1 = "9b2b4b71d6b7f946c9689bb4dea03ff92e3c7091"
+git-tree-sha1 = "e063efb91cfefd7e6afd92c435d01398107a500b"
 uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
-version = "1.1.0"
+version = "1.2.0"

 [[CUDAdrv]]
 deps = ["CUDAapi", "Libdl", "Printf"]
@@ -147,9 +147,9 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"

 [[FFTW]]
 deps = ["AbstractFFTs", "BinaryProvider", "Conda", "Libdl", "LinearAlgebra", "Reexport", "Test"]
-git-tree-sha1 = "03f8776fbdae28c20c0d1d2ae4e090cd1dfcd247"
+git-tree-sha1 = "6c5b420da0b8c12098048561b8d58f81adea506f"
 uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
-version = "1.0.0"
+version = "1.0.1"

 [[FillArrays]]
 deps = ["LinearAlgebra", "Random", "SparseArrays"]
@@ -170,9 +176,9 @@ version = "0.10.3"

 [[GPUArrays]]
 deps = ["Adapt", "FFTW", "FillArrays", "LinearAlgebra", "Printf", "Random", "Serialization", "StaticArrays", "Test"]
-git-tree-sha1 = "b5009ac44b141ded5e6f04c4db83807970f56e91"
+git-tree-sha1 = "77e27264276fe97a7e7fb928bf8999a145abc018"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "1.0.2"
+version = "1.0.3"

 [[IRTools]]
 deps = ["InteractiveUtils", "MacroTools", "Test"]
@@ -388,7 +400,7 @@ version = "0.8.3"

 [[Zygote]]
 deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
-git-tree-sha1 = "ce6d7142d665b1e4c71c678fa7db4da3bbc6743f"
+git-tree-sha1 = "38241b40ebd8748bcacad5e6c7ba3ab3cc7a15c9"
 repo-rev = "master"
 repo-url = "https://github.com/FluxML/Zygote.jl.git"
 uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
@@ -396,6 +408,8 @@ version = "0.3.4"

 [[ZygoteRules]]
 deps = ["MacroTools"]
-git-tree-sha1 = "def5f96ac2895fd9b48435f6b97020979ee0a4c6"
+git-tree-sha1 = "c4c29b30b8ff3be13d4244e78be7df2a42bc54d0"
+repo-rev = "master"
+repo-url = "https://github.com/FluxML/ZygoteRules.jl.git"
 uuid = "700de1a5-db45-46bc-99cf-38207098b444"
-version = "0.1.0"
+version = "0.2.0"
--- a/Project.toml
+++ b/Project.toml
@@ -24,6 +24,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+ZygoteRules = "700de1a5-db45-46bc-99cf-38207098b444"

 [compat]
 CUDAapi = "1.1"
--- a/docs/src/training/optimisers.md
+++ b/docs/src/training/optimisers.md
@@ -5,7 +5,7 @@ Consider a [simple linear regression](../models/basics.md). We create some dummy
 ```julia
 using Flux

-W = rand(2, 5))
+W = rand(2, 5)
 b = rand(2)

 predict(x) = (W * x) .+ b
@@ -15,7 +15,7 @@ x, y = rand(5), rand(2) # Dummy data
 l = loss(x, y) # ~ 3

 θ = Params([W, b])
-grads = Zygote.gradient(() -> loss(x, y), θ)
+grads = gradient(() -> loss(x, y), θ)
 ```

 We want to update each parameter, using the gradient, in order to improve (reduce) the loss. Here's one way to do that:
--- a/src/Flux.jl
+++ b/src/Flux.jl
@@ -6,7 +6,7 @@ using Base: tail
 using Zygote, MacroTools, Juno, Reexport, Statistics, Random
 using MacroTools: @forward
 @reexport using NNlib
-using Zygote: Params, @adjoint, gradient, forward
+using Zygote: Params, @adjoint, gradient, pullback
 export gradient

 export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool,
--- a/src/cuda/curnn.jl
+++ b/src/cuda/curnn.jl
@@ -271,7 +271,8 @@ function desc(rnn)
  return d
 end

-using ..Flux: @adjoint
+import Zygote
+using Zygote: @adjoint

 function (m::CuRNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
  y, h′ = forward(desc(m), x, h)
@@ -299,15 +300,29 @@ unbroadcast(x::AbstractArray, Δ) =
  length(x) == length(Δ) ? trim(x, Δ) :
    trim(x, sum(Δ, dims = ntuple(i -> size(x, i) == 1 ? i : ndims(Δ)+1, Val(ndims(Δ)))))

+coerce_cuda(x::Union{CuArray,Nothing}) = x
+coerce_cuda(x::Tuple) = coerce_cuda.(x)
+
+coerce_cuda(x) = x .+ CuArrays.fill(0)
+
+function struct_grad!(cx::Zygote.Context, x, x̄)
+  for f in fieldnames(typeof(x))
+    Zygote.accum_param(cx, getfield(x, f), getfield(x̄, f))
+  end
+  dx = Zygote.grad_mut(cx, x)
+  dx[] = Zygote.accum(dx[], x̄)
+  return dx
+end
+
 for RNN in (CuRNN, CuGRU)
  @eval @adjoint function (m::$RNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
    reserve, (y, ho) = forwardTrain(desc(m), x, h)
    (ho, y), function (Δ)
-      dho, dy = Δ
+      dho, dy = coerce_cuda(Δ)
      h_ = hBatch(x, h)
      dx, dh = backwardData(descs[m], y, dy, dho, h_, reserve)
      (dWi, dWh), db = backwardWeights(descs[m], x, h_, y, reserve)
-      dm = Ref{Any}((σ=nothing,Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing))
+      dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing))
      (dm, unbroadcast(h, dh), dx)
    end
  end
@@ -316,13 +331,13 @@ end
 @adjoint function (m::CuLSTM)((h, c)::Tuple{CuArray{T},CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64}
  reserve, (y, ho, co) = forwardTrain(desc(m), x, h, c)
  ((ho, co), y), function (Δ)
-    dhc, dy = Δ
+    dhc, dy = coerce_cuda(Δ)
    dho, dco = dhc === nothing ? (nothing, nothing) : dhc
    h_ = hBatch(x, h)
    c_ = hBatch(x, c)
    dx, dh, dc = backwardData(descs[m], y, dy, dho, dco, h_, c_, reserve)
    (dWi, dWh), db = backwardWeights(descs[m], x, h_, y, reserve)
-    dm = Ref{Any}((Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing,c=nothing))
+    dm = struct_grad!(__context__, m, (Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing,c=nothing))
    (dm, (unbroadcast(h, dh), unbroadcast(c, dc)), dx)
  end
 end
--- a/test/cuda/cudnn.jl
+++ b/test/cuda/cudnn.jl
@@ -1,5 +1,5 @@
 using Flux, CuArrays, Test
-using Flux: forward
+using Flux: pullback

 @testset "CUDNN BatchNorm" begin
    @testset "4D Input" begin
@@ -8,8 +8,8 @@ using Flux: forward
        cx = gpu(x)
        cm = gpu(m)

-        y, back = forward((m, x) -> m(x), m, x)
-        cy, cback = forward((m, x) -> m(x), cm, cx)
+        y, back = pullback((m, x) -> m(x), m, x)
+        cy, cback = pullback((m, x) -> m(x), cm, cx)

        @test cpu(cy) ≈ y

@@ -28,8 +28,8 @@ using Flux: forward
        cx = gpu(x)
        cm = gpu(m)

-        y, back = forward((m, x) -> m(x), m, x)
-        cy, cback = forward((m, x) -> m(x), cm, cx)
+        y, back = pullback((m, x) -> m(x), m, x)
+        cy, cback = pullback((m, x) -> m(x), cm, cx)

        @test cpu(cy) ≈ y

--- a/test/cuda/curnn.jl
+++ b/test/cuda/curnn.jl
@@ -1,5 +1,14 @@
 using Flux, CuArrays, Test
-using Flux: forward
+using Flux: pullback
+
+@testset for R in [RNN, GRU, LSTM]
+  m = R(10, 5) |> gpu
+  x = gpu(rand(10))
+  (m̄,) = gradient(m -> sum(m(x)), m)
+  Flux.reset!(m)
+  θ = gradient(() -> sum(m(x)), params(m))
+  @test collect(m̄[].cell[].Wi) == collect(θ[m.cell.Wi])
+end

 @testset "RNN" begin
  @testset for R in [RNN, GRU, LSTM], batch_size in (1, 5)
@@ -13,8 +22,8 @@ using Flux: forward
      rand(10, batch_size)
    cux = gpu(x)

-    y, back = forward((r, x) -> (r(x)), rnn, x)
-    cuy, cuback = forward((r, x) -> (r(x)), curnn, cux)
+    y, back = pullback((r, x) -> (r(x)), rnn, x)
+    cuy, cuback = pullback((r, x) -> (r(x)), curnn, cux)

    @test y ≈ collect(cuy)
    @test haskey(Flux.CUDA.descs, curnn.cell)
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -1,7 +1,7 @@
 using Flux, Test, Statistics
-using Zygote: forward
+using Zygote: pullback

-trainmode(f, x...) = forward(f, x...)[1]
+trainmode(f, x...) = pullback(f, x...)[1]
 trainmode(f) = (x...) -> trainmode(f, x...)

 @testset "Dropout" begin
--- a/test/layers/stateless.jl
+++ b/test/layers/stateless.jl
@@ -55,7 +55,7 @@ const ϵ = 1e-7
      y = rand(T, 2)
      ŷ = rand(T, 2)
      for f in (mse, crossentropy, logitcrossentropy)
-        fwd, back = Flux.forward(f, ŷ, y)
+        fwd, back = Flux.pullback(f, ŷ, y)
        @test fwd isa T
        @test eltype(back(one(T))[1]) == T
      end