diff --git a/docs/src/index.md b/docs/src/index.md index 4fc58f72..4b5668a1 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -3,7 +3,7 @@ Flux is a library for machine learning. It comes "batteries-included" with many useful tools built in, but also lets you use the full power of the Julia language where you need it. We follow a few key principles: * **Doing the obvious thing**. Flux has relatively few explicit APIs for features like regularisation or embeddings. Instead, writing down the mathematical form will work – and be fast. -* **You could have written Flux**. All of it, from [LSTMs](https://github.com/FluxML/Flux.jl/blob/ec16a2c77dbf6ab8b92b0eecd11661be7a62feef/src/layers/recurrent.jl#L131) to [GPU kernels](https://github.com/JuliaGPU/CuArrays.jl), is straightforward Julia code. When it doubt, it’s well worth looking at [the source](https://github.com/FluxML/Flux.jl/). If you need something different, you can easily roll your own. +* **You could have written Flux**. All of it, from [LSTMs](https://github.com/FluxML/Flux.jl/blob/ec16a2c77dbf6ab8b92b0eecd11661be7a62feef/src/layers/recurrent.jl#L131) to [GPU kernels](https://github.com/JuliaGPU/CuArrays.jl), is straightforward Julia code. When in doubt, it’s well worth looking at [the source](https://github.com/FluxML/Flux.jl/). If you need something different, you can easily roll your own. * **Play nicely with others**. Flux works well with Julia libraries from [data frames](https://github.com/JuliaComputing/JuliaDB.jl) and [images](https://github.com/JuliaImages/Images.jl) to [differential equation solvers](https://github.com/JuliaDiffEq/DifferentialEquations.jl), so you can easily build complex data processing pipelines that integrate Flux models. ## Installation diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index 88fa0a05..a0a39ab5 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -10,14 +10,14 @@ using Flux.Tracker f(x) = 3x^2 + 2x + 1 # df/dx = 6x + 2 -f′(x) = Tracker.gradient(f, x)[1] +df(x) = Tracker.gradient(f, x)[1] -f′(2) # 14.0 (tracked) +df(2) # 14.0 (tracked) # d²f/dx² = 6 -f′′(x) = Tracker.gradient(f′, x)[1] +d2f(x) = Tracker.gradient(df, x)[1] -f′′(2) # 6.0 (tracked) +d2f(2) # 6.0 (tracked) ``` (We'll learn more about why these numbers show up as `(tracked)` below.) 
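The `docs/src/models/basics.md` excerpt above only differentiates a function of one argument. For context, `Tracker.gradient` also accepts functions of several arguments and returns one gradient per argument; a minimal sketch (the function `g(x, y)` and the input values are made-up for illustration, not part of this patch):

```julia
using Flux.Tracker

# A made-up two-argument function: ∂g/∂x = 6x + 2y, ∂g/∂y = 2x + 2y
g(x, y) = 3x^2 + 2x*y + y^2

dx, dy = Tracker.gradient(g, 2, 3)

dx # 18.0 (tracked)
dy # 10.0 (tracked)
```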
diff --git a/src/cuda/cuda.jl b/src/cuda/cuda.jl
index 36961800..8b79a9e3 100644
--- a/src/cuda/cuda.jl
+++ b/src/cuda/cuda.jl
@@ -2,7 +2,7 @@ module CUDA
using ..CuArrays
-if CuArrays.cudnn_available()
+if CuArrays.libcudnn != nothing
  include("curnn.jl")
  include("cudnn.jl")
end
diff --git a/src/data/Data.jl b/src/data/Data.jl
index d5b5f38d..ddf0624b 100644
--- a/src/data/Data.jl
+++ b/src/data/Data.jl
@@ -13,6 +13,9 @@ end
include("mnist.jl")
export MNIST
+include("fashion-mnist.jl")
+export FashionMNIST
+
include("cmudict.jl")
using .CMUDict
diff --git a/src/data/fashion-mnist.jl b/src/data/fashion-mnist.jl
new file mode 100644
index 00000000..e4510b47
--- /dev/null
+++ b/src/data/fashion-mnist.jl
@@ -0,0 +1,64 @@
+module FashionMNIST
+
+using ..MNIST: gzopen, imageheader, rawimage, labelheader, rawlabel
+
+const dir = joinpath(@__DIR__, "../../deps/fashion-mnist")
+
+function load()
+  mkpath(dir)
+  cd(dir) do
+    for file in ["train-images-idx3-ubyte",
+                 "train-labels-idx1-ubyte",
+                 "t10k-images-idx3-ubyte",
+                 "t10k-labels-idx1-ubyte"]
+      isfile(file) && continue
+      @info "Downloading Fashion-MNIST dataset"
+      download("http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/$file.gz", "$file.gz")
+      open(file, "w") do io
+        write(io, gzopen(read, "$file.gz"))
+      end
+    end
+  end
+end
+
+const TRAINIMAGES = joinpath(dir, "train-images-idx3-ubyte")
+const TRAINLABELS = joinpath(dir, "train-labels-idx1-ubyte")
+const TESTIMAGES = joinpath(dir, "t10k-images-idx3-ubyte")
+const TESTLABELS = joinpath(dir, "t10k-labels-idx1-ubyte")
+
+"""
+    images()
+    images(:test)
+
+Load the Fashion-MNIST images.
+
+Each image is a 28×28 array of `Gray` colour values (see Colors.jl).
+
+Returns the 60,000 training images by default; pass `:test` to retrieve the
+10,000 test images.
+"""
+function images(set = :train)
+  load()
+  io = IOBuffer(read(set == :train ? TRAINIMAGES : TESTIMAGES))
+  _, N, nrows, ncols = imageheader(io)
+  [rawimage(io) for _ in 1:N]
+end
+
+"""
+    labels()
+    labels(:test)
+
+Load the labels corresponding to each of the images returned from `images()`.
+Each label is a number from 0-9.
+
+Returns the 60,000 training labels by default; pass `:test` to retrieve the
+10,000 test labels.
+"""
+function labels(set = :train)
+  load()
+  io = IOBuffer(read(set == :train ? TRAINLABELS : TESTLABELS))
+  _, N = labelheader(io)
+  [rawlabel(io) for _ = 1:N]
+end
+
+end
diff --git a/src/data/sentiment.jl b/src/data/sentiment.jl
index a58cd9d4..56c9e8ea 100644
--- a/src/data/sentiment.jl
+++ b/src/data/sentiment.jl
@@ -4,7 +4,7 @@ using ZipFile
using ..Data: deps
function load()
-  isfile(deps("sentiment.zip")) || return
+  isfile(deps("sentiment.zip")) && return
  @info "Downloading sentiment treebank dataset"
  download("https://cache.julialang.org/https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip",
           deps("sentiment.zip"))
@@ -26,9 +26,10 @@ totree_(n, a, b) = Tree{Any}((parse(Int, n), nothing), totree(a), totree(b))
totree(t::Expr) = totree_(t.args...)
function parsetree(s) - s = replace(s, r"\$", s -> "\\\$") - s = replace(s, r"[^\s\(\)]+", s -> "\"$s\"") - s = replace(s, " ", ", ") + s = replace(s, "\\" => "") + s = replace(s, "\$" => "\\\$") + s = replace(s, r"[^ \n\(\)]+" => s -> "\"$s\"") + s = replace(s, " " => ", ") return totree(Meta.parse(s)) end diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 3e887472..0c2d3715 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -75,7 +75,7 @@ end @treelike Dense -function (a::Dense)(x) +function (a::Dense)(x::AbstractArray) W, b, σ = a.W, a.b, a.σ σ.(W*x .+ b) end diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 3b40af04..40cd322a 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -148,7 +148,7 @@ Base.show(io::IO, l::LSTMCell) = print(io, "LSTMCell(", size(l.Wi, 2), ", ", size(l.Wi, 1)÷4, ")") """ - LSTM(in::Integer, out::Integer, σ = tanh) + LSTM(in::Integer, out::Integer) Long Short Term Memory recurrent layer. Behaves like an RNN but generally exhibits a longer memory span over sequences. @@ -189,7 +189,7 @@ Base.show(io::IO, l::GRUCell) = print(io, "GRUCell(", size(l.Wi, 2), ", ", size(l.Wi, 1)÷3, ")") """ - GRU(in::Integer, out::Integer, σ = tanh) + GRU(in::Integer, out::Integer) Gated Recurrent Unit layer. Behaves like an RNN but generally exhibits a longer memory span over sequences. diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 190837ab..94f9a94c 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -108,10 +108,8 @@ param(xs::AbstractArray) = TrackedArray(float.(xs)) param(x::TrackedReal) = track(identity, x) param(x::TrackedArray) = track(identity, x) -import NNlib.cudata import Adapt.adapt -cudata(x::TrackedArray) = data(x) adapt(T, xs::TrackedArray) = param(adapt(T, data(xs))) end diff --git a/src/tracker/array.jl b/src/tracker/array.jl index 882a866c..3d9836d0 100644 --- a/src/tracker/array.jl +++ b/src/tracker/array.jl @@ -1,6 +1,8 @@ import Base: * import LinearAlgebra +import LinearAlgebra: inv, \, / + using Statistics using LinearAlgebra: Transpose, Adjoint, diagm, diag @@ -205,6 +207,41 @@ Base.kron(a::TrackedMatrix, b::TrackedMatrix) = _kron(a, b) Base.kron(a::TrackedMatrix, b::AbstractMatrix) = _kron(a, b) Base.kron(a::AbstractMatrix, b::TrackedMatrix) = _kron(a, b) + +inv(A::TrackedArray) = Tracker.track(inv, A) +@grad function inv(A) + return inv(Tracker.data(A)), function (Δ) + Ainv = inv(A) + ∇A = - Ainv' * Δ * Ainv' + return (∇A, ) + end +end + +# (/) rdivide +A::TrackedArray / B::TrackedArray = Tracker.track(/, A, B) +A::AbstractVecOrMat / B::TrackedArray = Tracker.track(/, A, B) +A::TrackedArray / B::AbstractVecOrMat = Tracker.track(/, A, B) +@grad function (A / B) + return Tracker.data(A) / Tracker.data(B), function (Δ) + Binv = inv(B) + ∇B = - Binv' * A' * Δ * Binv' + return (Δ * Binv', ∇B) + end +end + +# (\) ldivide (left vec divide needs more work to resolve dispatch ambiguity) +A::TrackedArray \ B::TrackedArray = Tracker.track(\, A, B) +A::AbstractArray \ B::TrackedArray = Tracker.track(\, A, B) +A::TrackedArray \ B::AbstractVecOrMat = Tracker.track(\, A, B) +@grad function (A \ B) + return Tracker.data(A) \ Tracker.data(B), function (Δ) + Ainv = inv(A) + ∇A = - Ainv' * Δ * B' * Ainv' + return (∇A, Ainv' * Δ) + end +end + + # Reductions Base.sum(xs::TrackedArray; dims = :) = track(sum, xs, dims = dims) @@ -353,9 +390,9 @@ end eltype(y) <: Real || return y eltype(y) == Bool && return y function back(Δ) - Δargs = ntuple(i -> partial.(f, data(Δ), i, args...), Val(N)) 
- dxs = unbroadcast.(args, Δargs) - return nobacksies(:broadcast, dxs) + Δargs = ntuple(i -> partial.(f, Δ, i, args...), Val(N)) + dxs = map(unbroadcast, args, Δargs) + return dxs end # So we can return non-tracked arrays track(Call(back, tracker.(args)), y) diff --git a/src/tracker/scalar.jl b/src/tracker/scalar.jl index 81ccb9a3..1b6098fb 100644 --- a/src/tracker/scalar.jl +++ b/src/tracker/scalar.jl @@ -63,7 +63,9 @@ for (M, f, arity) in DiffRules.diffrules() da, db = DiffRules.diffrule(M, f, :a, :b) f = :($M.$f) @eval begin - @grad $f(a::Real, b::Real) = $f(data(a), data(b)), Δ -> (Δ * $da, Δ * $db) + @grad $f(a::TrackedReal, b::TrackedReal) = $f(data(a), data(b)), Δ -> (Δ * $da, Δ * $db) + @grad $f(a::TrackedReal, b::Real) = $f(data(a), b), Δ -> (Δ * $da, zero(b)) + @grad $f(a::Real, b::TrackedReal) = $f(a, data(b)), Δ -> (zero(a), Δ * $db) $f(a::TrackedReal, b::TrackedReal) = track($f, a, b) $f(a::TrackedReal, b::Real) = track($f, a, b) $f(a::Real, b::TrackedReal) = track($f, a, b) diff --git a/src/treelike.jl b/src/treelike.jl index 3d83d448..9b3518d3 100644 --- a/src/treelike.jl +++ b/src/treelike.jl @@ -54,7 +54,7 @@ function loadparams!(m, xs) for (p, x) in zip(params(m), xs) size(p) == size(x) || error("Expected param size $(size(p)), got $(size(x))") - copy!(data(p), data(x)) + copyto!(data(p), data(x)) end end diff --git a/src/utils.jl b/src/utils.jl index c53f7864..74d479bd 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -24,7 +24,7 @@ julia> chunk(1:10, 3) """ chunk(xs, n) = collect(Iterators.partition(xs, ceil(Int, length(xs)/n))) -batchindex(xs, i) = (reverse(Base.tail(reverse(indices(xs))))..., i) +batchindex(xs, i) = (reverse(Base.tail(reverse(axes(xs))))..., i) """ frequencies(xs) @@ -66,7 +66,7 @@ julia> batch([[1,2,3],[4,5,6]]) function batch(xs) data = first(xs) isa AbstractArray ? similar(first(xs), size(first(xs))..., length(xs)) : - Vector{eltype(xs)}(length(xs)) + Vector{eltype(xs)}(undef, length(xs)) for (i, x) in enumerate(xs) data[batchindex(data, i)...] = x end @@ -153,3 +153,18 @@ function jacobian(m,x) end J' end + +""" + @jit ... + +The `@jit` annotation can be applied to any code, and the code will be compiled +for performance. + + @jit f(x) = @jit(x) + @jit(x) + +Note that compilation happens regardless of the `@jit` macro, so it should only +be used for aesthetic purposes, or by recovering Python users. 
+""" +macro jit(ex) + esc(ex) +end diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index ddc070f7..aa422dfd 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -1,42 +1,42 @@ using Flux, Flux.Tracker, CuArrays, Test using Flux: gpu -# @info "Testing GPU Support" -# -# @testset "CuArrays" begin -# -# CuArrays.allowscalar(false) -# -# x = param(randn(5, 5)) -# cx = gpu(x) -# @test cx isa TrackedArray && cx.data isa CuArray -# -# x = Flux.onehotbatch([1, 2, 3], 1:3) -# cx = gpu(x) -# @test cx isa Flux.OneHotMatrix && cx.data isa CuArray -# @test (cx .+ 1) isa CuArray -# -# m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) -# cm = gpu(m) -# -# @test all(p isa TrackedArray && p.data isa CuArray for p in params(cm)) -# @test cm(gpu(rand(10, 10))) isa TrackedArray{Float32,2,CuArray{Float32,2}} -# -# x = [1,2,3] -# cx = gpu(x) -# @test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx) -# -# xs = param(rand(5,5)) -# ys = Flux.onehotbatch(1:5,1:5) -# @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) -# -# c = gpu(Conv((2,2),3=>4)) -# l = c(gpu(rand(10,10,3,2))) -# Flux.back!(sum(l)) -# -# end +@info "Testing GPU Support" -if CuArrays.cudnn_available() +@testset "CuArrays" begin + +CuArrays.allowscalar(false) + +x = param(randn(5, 5)) +cx = gpu(x) +@test cx isa TrackedArray && cx.data isa CuArray + +x = Flux.onehotbatch([1, 2, 3], 1:3) +cx = gpu(x) +@test cx isa Flux.OneHotMatrix && cx.data isa CuArray +@test (cx .+ 1) isa CuArray + +m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) +cm = gpu(m) + +@test all(p isa TrackedArray && p.data isa CuArray for p in params(cm)) +@test cm(gpu(rand(10, 10))) isa TrackedArray{Float32,2,CuArray{Float32,2}} + +x = [1,2,3] +cx = gpu(x) +@test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx) + +xs = param(rand(5,5)) +ys = Flux.onehotbatch(1:5,1:5) +@test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) + +c = gpu(Conv((2,2),3=>4)) +l = c(gpu(rand(10,10,3,2))) +Flux.back!(sum(l)) + +end + +if CuArrays.libcudnn != nothing @info "Testing Flux/CUDNN BatchNorm" include("cudnn.jl") @info "Testing Flux/CUDNN RNN" diff --git a/test/data.jl b/test/data.jl index 7a27c651..a73d1ec3 100644 --- a/test/data.jl +++ b/test/data.jl @@ -9,3 +9,8 @@ using Test @test MNIST.images()[1] isa Matrix @test MNIST.labels() isa Vector{Int64} + +@test FashionMNIST.images()[1] isa Matrix +@test FashionMNIST.labels() isa Vector{Int64} + +@test Data.Sentiment.train() isa Vector{Data.Tree{Any}} diff --git a/test/layers/basic.jl b/test/layers/basic.jl new file mode 100644 index 00000000..b8d9efd1 --- /dev/null +++ b/test/layers/basic.jl @@ -0,0 +1,33 @@ +using Test, Random + +@testset "basic" begin + @testset "Chain" begin + @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10)) + @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn(10)) + # numeric test should be put into testset of corresponding layer + end + + @testset "Dense" begin + @test length(Dense(10, 5)(randn(10))) == 5 + @test_throws DimensionMismatch Dense(10, 5)(randn(1)) + @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting + @test_throws MethodError Dense(10, 5).(randn(10)) # avoid broadcasting + + @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(1, 1) + @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2) + @test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1) + @test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] + + end + + @testset 
"Diagonal" begin + @test length(Flux.Diagonal(10)(randn(10))) == 10 + @test length(Flux.Diagonal(10)(1)) == 10 + @test length(Flux.Diagonal(10)(randn(1))) == 10 + @test_throws DimensionMismatch Flux.Diagonal(10)(randn(2)) + + @test Flux.Diagonal(2)([1 2]) == [1 2; 1 2] + @test Flux.Diagonal(2)([1,2]) == [1,2] + @test Flux.Diagonal(2)([1 2; 3 4]) == [1 2; 3 4] + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 892b9ffb..ef7ed208 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -32,6 +32,7 @@ include("data.jl") @info "Testing Layers" +include("layers/basic.jl") include("layers/normalisation.jl") include("layers/stateless.jl") include("layers/conv.jl") diff --git a/test/tracker.jl b/test/tracker.jl index 9a4cb793..a4772f2e 100644 --- a/test/tracker.jl +++ b/test/tracker.jl @@ -129,6 +129,11 @@ end @test gradtest(f-> Matrix(Diagonal(f)), rand(3)) +@test gradtest(W -> inv(log.(W * W)), (5,5)) +@test gradtest((A, B) -> A / B , (1,5), (5,5)) +@test gradtest((A, B) -> log.(A * A) / exp.(B * B), (5,5), (5,5)) +@test gradtest((A, B) -> log.(A * A) \ exp.(B * B), (5,5), (5,5)) + @testset "mean" begin @test gradtest(mean, rand(2, 3))