diff --git a/docs/src/index.md b/docs/src/index.md index 4fc58f72..4b5668a1 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -3,7 +3,7 @@ Flux is a library for machine learning. It comes "batteries-included" with many useful tools built in, but also lets you use the full power of the Julia language where you need it. We follow a few key principles: * **Doing the obvious thing**. Flux has relatively few explicit APIs for features like regularisation or embeddings. Instead, writing down the mathematical form will work – and be fast. -* **You could have written Flux**. All of it, from [LSTMs](https://github.com/FluxML/Flux.jl/blob/ec16a2c77dbf6ab8b92b0eecd11661be7a62feef/src/layers/recurrent.jl#L131) to [GPU kernels](https://github.com/JuliaGPU/CuArrays.jl), is straightforward Julia code. When it doubt, it’s well worth looking at [the source](https://github.com/FluxML/Flux.jl/). If you need something different, you can easily roll your own. +* **You could have written Flux**. All of it, from [LSTMs](https://github.com/FluxML/Flux.jl/blob/ec16a2c77dbf6ab8b92b0eecd11661be7a62feef/src/layers/recurrent.jl#L131) to [GPU kernels](https://github.com/JuliaGPU/CuArrays.jl), is straightforward Julia code. When in doubt, it’s well worth looking at [the source](https://github.com/FluxML/Flux.jl/). If you need something different, you can easily roll your own. * **Play nicely with others**. Flux works well with Julia libraries from [data frames](https://github.com/JuliaComputing/JuliaDB.jl) and [images](https://github.com/JuliaImages/Images.jl) to [differential equation solvers](https://github.com/JuliaDiffEq/DifferentialEquations.jl), so you can easily build complex data processing pipelines that integrate Flux models. ## Installation diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index 88fa0a05..a0a39ab5 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -10,14 +10,14 @@ using Flux.Tracker f(x) = 3x^2 + 2x + 1 # df/dx = 6x + 2 -f′(x) = Tracker.gradient(f, x)[1] +df(x) = Tracker.gradient(f, x)[1] -f′(2) # 14.0 (tracked) +df(2) # 14.0 (tracked) # d²f/dx² = 6 -f′′(x) = Tracker.gradient(f′, x)[1] +d2f(x) = Tracker.gradient(df, x)[1] -f′′(2) # 6.0 (tracked) +d2f(2) # 6.0 (tracked) ``` (We'll learn more about why these numbers show up as `(tracked)` below.) 
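The `docs/src/models/basics.md` excerpt above only differentiates a function of one argument. For context, `Tracker.gradient` also accepts functions of several arguments and returns one gradient per argument; a minimal sketch (the function `g(x, y)` and the input values are made-up for illustration, not part of this patch):

```julia
using Flux.Tracker

# A made-up two-argument function: ∂g/∂x = 6x + 2y, ∂g/∂y = 2x + 2y
g(x, y) = 3x^2 + 2x*y + y^2

dx, dy = Tracker.gradient(g, 2, 3)

dx # 18.0 (tracked)
dy # 10.0 (tracked)
```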
diff --git a/src/cuda/cuda.jl b/src/cuda/cuda.jl
index 36961800..8b79a9e3 100644
--- a/src/cuda/cuda.jl
+++ b/src/cuda/cuda.jl
@@ -2,7 +2,7 @@ module CUDA
using ..CuArrays
-if CuArrays.cudnn_available()
+if CuArrays.libcudnn != nothing
  include("curnn.jl")
  include("cudnn.jl")
end
diff --git a/src/data/Data.jl b/src/data/Data.jl
index d5b5f38d..ddf0624b 100644
--- a/src/data/Data.jl
+++ b/src/data/Data.jl
@@ -13,6 +13,9 @@ end
include("mnist.jl")
export MNIST
+include("fashion-mnist.jl")
+export FashionMNIST
+
include("cmudict.jl")
using .CMUDict
diff --git a/src/data/fashion-mnist.jl b/src/data/fashion-mnist.jl
new file mode 100644
index 00000000..e4510b47
--- /dev/null
+++ b/src/data/fashion-mnist.jl
@@ -0,0 +1,64 @@
+module FashionMNIST
+
+using ..MNIST: gzopen, imageheader, rawimage, labelheader, rawlabel
+
+const dir = joinpath(@__DIR__, "../../deps/fashion-mnist")
+
+function load()
+  mkpath(dir)
+  cd(dir) do
+    for file in ["train-images-idx3-ubyte",
+                 "train-labels-idx1-ubyte",
+                 "t10k-images-idx3-ubyte",
+                 "t10k-labels-idx1-ubyte"]
+      isfile(file) && continue
+      @info "Downloading Fashion-MNIST dataset"
+      download("http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/$file.gz", "$file.gz")
+      open(file, "w") do io
+        write(io, gzopen(read, "$file.gz"))
+      end
+    end
+  end
+end
+
+const TRAINIMAGES = joinpath(dir, "train-images-idx3-ubyte")
+const TRAINLABELS = joinpath(dir, "train-labels-idx1-ubyte")
+const TESTIMAGES = joinpath(dir, "t10k-images-idx3-ubyte")
+const TESTLABELS = joinpath(dir, "t10k-labels-idx1-ubyte")
+
+"""
+    images()
+    images(:test)
+
+Load the Fashion-MNIST images.
+
+Each image is a 28×28 array of `Gray` colour values (see Colors.jl).
+
+Returns the 60,000 training images by default; pass `:test` to retrieve the
+10,000 test images.
+"""
+function images(set = :train)
+  load()
+  io = IOBuffer(read(set == :train ? TRAINIMAGES : TESTIMAGES))
+  _, N, nrows, ncols = imageheader(io)
+  [rawimage(io) for _ in 1:N]
+end
+
+"""
+    labels()
+    labels(:test)
+
+Load the labels corresponding to each of the images returned from `images()`.
+Each label is a number from 0-9.
+
+Returns the 60,000 training labels by default; pass `:test` to retrieve the
+10,000 test labels.
+"""
+function labels(set = :train)
+  load()
+  io = IOBuffer(read(set == :train ? TRAINLABELS : TESTLABELS))
+  _, N = labelheader(io)
+  [rawlabel(io) for _ = 1:N]
+end
+
+end
diff --git a/src/data/sentiment.jl b/src/data/sentiment.jl
index a58cd9d4..56c9e8ea 100644
--- a/src/data/sentiment.jl
+++ b/src/data/sentiment.jl
@@ -4,7 +4,7 @@ using ZipFile
using ..Data: deps
function load()
-  isfile(deps("sentiment.zip")) || return
+  isfile(deps("sentiment.zip")) && return
  @info "Downloading sentiment treebank dataset"
  download("https://cache.julialang.org/https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip",
           deps("sentiment.zip"))
@@ -26,9 +26,10 @@ totree_(n, a, b) = Tree{Any}((parse(Int, n), nothing), totree(a), totree(b))
totree(t::Expr) = totree_(t.args...)
function parsetree(s) - s = replace(s, r"\$", s -> "\\\$") - s = replace(s, r"[^\s\(\)]+", s -> "\"$s\"") - s = replace(s, " ", ", ") + s = replace(s, "\\" => "") + s = replace(s, "\$" => "\\\$") + s = replace(s, r"[^ \n\(\)]+" => s -> "\"$s\"") + s = replace(s, " " => ", ") return totree(Meta.parse(s)) end diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 3e887472..0c2d3715 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -75,7 +75,7 @@ end @treelike Dense -function (a::Dense)(x) +function (a::Dense)(x::AbstractArray) W, b, σ = a.W, a.b, a.σ σ.(W*x .+ b) end diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 3b40af04..40cd322a 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -148,7 +148,7 @@ Base.show(io::IO, l::LSTMCell) = print(io, "LSTMCell(", size(l.Wi, 2), ", ", size(l.Wi, 1)÷4, ")") """ - LSTM(in::Integer, out::Integer, σ = tanh) + LSTM(in::Integer, out::Integer) Long Short Term Memory recurrent layer. Behaves like an RNN but generally exhibits a longer memory span over sequences. @@ -189,7 +189,7 @@ Base.show(io::IO, l::GRUCell) = print(io, "GRUCell(", size(l.Wi, 2), ", ", size(l.Wi, 1)÷3, ")") """ - GRU(in::Integer, out::Integer, σ = tanh) + GRU(in::Integer, out::Integer) Gated Recurrent Unit layer. Behaves like an RNN but generally exhibits a longer memory span over sequences. diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 190837ab..94f9a94c 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -108,10 +108,8 @@ param(xs::AbstractArray) = TrackedArray(float.(xs)) param(x::TrackedReal) = track(identity, x) param(x::TrackedArray) = track(identity, x) -import NNlib.cudata import Adapt.adapt -cudata(x::TrackedArray) = data(x) adapt(T, xs::TrackedArray) = param(adapt(T, data(xs))) end diff --git a/src/tracker/array.jl b/src/tracker/array.jl index 882a866c..3d9836d0 100644 --- a/src/tracker/array.jl +++ b/src/tracker/array.jl @@ -1,6 +1,8 @@ import Base: * import LinearAlgebra +import LinearAlgebra: inv, \, / + using Statistics using LinearAlgebra: Transpose, Adjoint, diagm, diag @@ -205,6 +207,41 @@ Base.kron(a::TrackedMatrix, b::TrackedMatrix) = _kron(a, b) Base.kron(a::TrackedMatrix, b::AbstractMatrix) = _kron(a, b) Base.kron(a::AbstractMatrix, b::TrackedMatrix) = _kron(a, b) + +inv(A::TrackedArray) = Tracker.track(inv, A) +@grad function inv(A) + return inv(Tracker.data(A)), function (Δ) + Ainv = inv(A) + ∇A = - Ainv' * Δ * Ainv' + return (∇A, ) + end +end + +# (/) rdivide +A::TrackedArray / B::TrackedArray = Tracker.track(/, A, B) +A::AbstractVecOrMat / B::TrackedArray = Tracker.track(/, A, B) +A::TrackedArray / B::AbstractVecOrMat = Tracker.track(/, A, B) +@grad function (A / B) + return Tracker.data(A) / Tracker.data(B), function (Δ) + Binv = inv(B) + ∇B = - Binv' * A' * Δ * Binv' + return (Δ * Binv', ∇B) + end +end + +# (\) ldivide (left vec divide needs more work to resolve dispatch ambiguity) +A::TrackedArray \ B::TrackedArray = Tracker.track(\, A, B) +A::AbstractArray \ B::TrackedArray = Tracker.track(\, A, B) +A::TrackedArray \ B::AbstractVecOrMat = Tracker.track(\, A, B) +@grad function (A \ B) + return Tracker.data(A) \ Tracker.data(B), function (Δ) + Ainv = inv(A) + ∇A = - Ainv' * Δ * B' * Ainv' + return (∇A, Ainv' * Δ) + end +end + + # Reductions Base.sum(xs::TrackedArray; dims = :) = track(sum, xs, dims = dims) @@ -353,9 +390,9 @@ end eltype(y) <: Real || return y eltype(y) == Bool && return y function back(Δ) - Δargs = ntuple(i -> partial.(f, data(Δ), i, args...), Val(N)) 
- dxs = unbroadcast.(args, Δargs) - return nobacksies(:broadcast, dxs) + Δargs = ntuple(i -> partial.(f, Δ, i, args...), Val(N)) + dxs = map(unbroadcast, args, Δargs) + return dxs end # So we can return non-tracked arrays track(Call(back, tracker.(args)), y) diff --git a/src/tracker/scalar.jl b/src/tracker/scalar.jl index 81ccb9a3..1b6098fb 100644 --- a/src/tracker/scalar.jl +++ b/src/tracker/scalar.jl @@ -63,7 +63,9 @@ for (M, f, arity) in DiffRules.diffrules() da, db = DiffRules.diffrule(M, f, :a, :b) f = :($M.$f) @eval begin - @grad $f(a::Real, b::Real) = $f(data(a), data(b)), Δ -> (Δ * $da, Δ * $db) + @grad $f(a::TrackedReal, b::TrackedReal) = $f(data(a), data(b)), Δ -> (Δ * $da, Δ * $db) + @grad $f(a::TrackedReal, b::Real) = $f(data(a), b), Δ -> (Δ * $da, zero(b)) + @grad $f(a::Real, b::TrackedReal) = $f(a, data(b)), Δ -> (zero(a), Δ * $db) $f(a::TrackedReal, b::TrackedReal) = track($f, a, b) $f(a::TrackedReal, b::Real) = track($f, a, b) $f(a::Real, b::TrackedReal) = track($f, a, b) diff --git a/src/treelike.jl b/src/treelike.jl index 3d83d448..9b3518d3 100644 --- a/src/treelike.jl +++ b/src/treelike.jl @@ -54,7 +54,7 @@ function loadparams!(m, xs) for (p, x) in zip(params(m), xs) size(p) == size(x) || error("Expected param size $(size(p)), got $(size(x))") - copy!(data(p), data(x)) + copyto!(data(p), data(x)) end end diff --git a/src/utils.jl b/src/utils.jl index c53f7864..74d479bd 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -24,7 +24,7 @@ julia> chunk(1:10, 3) """ chunk(xs, n) = collect(Iterators.partition(xs, ceil(Int, length(xs)/n))) -batchindex(xs, i) = (reverse(Base.tail(reverse(indices(xs))))..., i) +batchindex(xs, i) = (reverse(Base.tail(reverse(axes(xs))))..., i) """ frequencies(xs) @@ -66,7 +66,7 @@ julia> batch([[1,2,3],[4,5,6]]) function batch(xs) data = first(xs) isa AbstractArray ? similar(first(xs), size(first(xs))..., length(xs)) : - Vector{eltype(xs)}(length(xs)) + Vector{eltype(xs)}(undef, length(xs)) for (i, x) in enumerate(xs) data[batchindex(data, i)...] = x end @@ -153,3 +153,18 @@ function jacobian(m,x) end J' end + +""" + @jit ... + +The `@jit` annotation can be applied to any code, and the code will be compiled +for performance. + + @jit f(x) = @jit(x) + @jit(x) + +Note that compilation happens regardless of the `@jit` macro, so it should only +be used for aesthetic purposes, or by recovering Python users. 
+""" +macro jit(ex) + esc(ex) +end diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index ddc070f7..aa422dfd 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -1,42 +1,42 @@ using Flux, Flux.Tracker, CuArrays, Test using Flux: gpu -# @info "Testing GPU Support" -# -# @testset "CuArrays" begin -# -# CuArrays.allowscalar(false) -# -# x = param(randn(5, 5)) -# cx = gpu(x) -# @test cx isa TrackedArray && cx.data isa CuArray -# -# x = Flux.onehotbatch([1, 2, 3], 1:3) -# cx = gpu(x) -# @test cx isa Flux.OneHotMatrix && cx.data isa CuArray -# @test (cx .+ 1) isa CuArray -# -# m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) -# cm = gpu(m) -# -# @test all(p isa TrackedArray && p.data isa CuArray for p in params(cm)) -# @test cm(gpu(rand(10, 10))) isa TrackedArray{Float32,2,CuArray{Float32,2}} -# -# x = [1,2,3] -# cx = gpu(x) -# @test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx) -# -# xs = param(rand(5,5)) -# ys = Flux.onehotbatch(1:5,1:5) -# @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) -# -# c = gpu(Conv((2,2),3=>4)) -# l = c(gpu(rand(10,10,3,2))) -# Flux.back!(sum(l)) -# -# end +@info "Testing GPU Support" -if CuArrays.cudnn_available() +@testset "CuArrays" begin + +CuArrays.allowscalar(false) + +x = param(randn(5, 5)) +cx = gpu(x) +@test cx isa TrackedArray && cx.data isa CuArray + +x = Flux.onehotbatch([1, 2, 3], 1:3) +cx = gpu(x) +@test cx isa Flux.OneHotMatrix && cx.data isa CuArray +@test (cx .+ 1) isa CuArray + +m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) +cm = gpu(m) + +@test all(p isa TrackedArray && p.data isa CuArray for p in params(cm)) +@test cm(gpu(rand(10, 10))) isa TrackedArray{Float32,2,CuArray{Float32,2}} + +x = [1,2,3] +cx = gpu(x) +@test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx) + +xs = param(rand(5,5)) +ys = Flux.onehotbatch(1:5,1:5) +@test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) + +c = gpu(Conv((2,2),3=>4)) +l = c(gpu(rand(10,10,3,2))) +Flux.back!(sum(l)) + +end + +if CuArrays.libcudnn != nothing @info "Testing Flux/CUDNN BatchNorm" include("cudnn.jl") @info "Testing Flux/CUDNN RNN" diff --git a/test/data.jl b/test/data.jl index 7a27c651..a73d1ec3 100644 --- a/test/data.jl +++ b/test/data.jl @@ -9,3 +9,8 @@ using Test @test MNIST.images()[1] isa Matrix @test MNIST.labels() isa Vector{Int64} + +@test FashionMNIST.images()[1] isa Matrix +@test FashionMNIST.labels() isa Vector{Int64} + +@test Data.Sentiment.train() isa Vector{Data.Tree{Any}} diff --git a/test/layers/basic.jl b/test/layers/basic.jl new file mode 100644 index 00000000..b8d9efd1 --- /dev/null +++ b/test/layers/basic.jl @@ -0,0 +1,33 @@ +using Test, Random + +@testset "basic" begin + @testset "Chain" begin + @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10)) + @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn(10)) + # numeric test should be put into testset of corresponding layer + end + + @testset "Dense" begin + @test length(Dense(10, 5)(randn(10))) == 5 + @test_throws DimensionMismatch Dense(10, 5)(randn(1)) + @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting + @test_throws MethodError Dense(10, 5).(randn(10)) # avoid broadcasting + + @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(1, 1) + @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2) + @test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1) + @test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] + + end + + @testset 
"Diagonal" begin + @test length(Flux.Diagonal(10)(randn(10))) == 10 + @test length(Flux.Diagonal(10)(1)) == 10 + @test length(Flux.Diagonal(10)(randn(1))) == 10 + @test_throws DimensionMismatch Flux.Diagonal(10)(randn(2)) + + @test Flux.Diagonal(2)([1 2]) == [1 2; 1 2] + @test Flux.Diagonal(2)([1,2]) == [1,2] + @test Flux.Diagonal(2)([1 2; 3 4]) == [1 2; 3 4] + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 892b9ffb..ef7ed208 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -32,6 +32,7 @@ include("data.jl") @info "Testing Layers" +include("layers/basic.jl") include("layers/normalisation.jl") include("layers/stateless.jl") include("layers/conv.jl") diff --git a/test/tracker.jl b/test/tracker.jl index 9a4cb793..a4772f2e 100644 --- a/test/tracker.jl +++ b/test/tracker.jl @@ -129,6 +129,11 @@ end @test gradtest(f-> Matrix(Diagonal(f)), rand(3)) +@test gradtest(W -> inv(log.(W * W)), (5,5)) +@test gradtest((A, B) -> A / B , (1,5), (5,5)) +@test gradtest((A, B) -> log.(A * A) / exp.(B * B), (5,5), (5,5)) +@test gradtest((A, B) -> log.(A * A) \ exp.(B * B), (5,5), (5,5)) + @testset "mean" begin @test gradtest(mean, rand(2, 3))