From c8d460ff8445c2a1f677ba03cb66f334a5903d79 Mon Sep 17 00:00:00 2001
From: Mike Innes
Date: Tue, 10 Sep 2019 15:02:43 +0100
Subject: [PATCH] doctests passing

---
 Project.toml              |  3 +-
 docs/src/models/basics.md | 81 ++++++++++++++++++---------------
 src/data/iris.jl          | 21 +++++-----
 src/onehot.jl             | 29 +++++++-------
 test/runtests.jl          |  7 ++--
 5 files changed, 69 insertions(+), 72 deletions(-)

diff --git a/Project.toml b/Project.toml
index b0d50b27..2fcdc943 100644
--- a/Project.toml
+++ b/Project.toml
@@ -33,7 +33,8 @@ Zygote = "0.3"
 julia = "1.1"
 
 [extras]
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test"]
+test = ["Test", "Documenter"]
diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md
index 3b7b2a8e..ddd81992 100644
--- a/docs/src/models/basics.md
+++ b/docs/src/models/basics.md
@@ -5,55 +5,56 @@
 Flux's core feature is taking gradients of Julia code. The `gradient` function takes another Julia function `f` and a set of arguments, and returns the gradient with respect to each argument. (It's a good idea to try pasting these examples in the Julia terminal.)
 
 ```jldoctest basics
-julia> using Flux.Tracker
+julia> using Flux
 
 julia> f(x) = 3x^2 + 2x + 1;
 
-julia> df(x) = Tracker.gradient(f, x; nest = true)[1]; # df/dx = 6x + 2
+julia> df(x) = gradient(f, x)[1]; # df/dx = 6x + 2
 
 julia> df(2)
-14.0 (tracked)
+14
 
-julia> d2f(x) = Tracker.gradient(df, x; nest = true)[1]; # d²f/dx² = 6
+julia> d2f(x) = gradient(df, x)[1]; # d²f/dx² = 6
 
 julia> d2f(2)
-6.0 (tracked)
+6
 ```
 
-(We'll learn more about why these numbers show up as `(tracked)` below.)
-
-When a function has many parameters, we can pass them all in explicitly:
+When a function has many parameters, we can get gradients of each one at the same time:
 
 ```jldoctest basics
-julia> f(W, b, x) = W * x + b;
+julia> f(x, y) = sum((x .- y).^2);
 
-julia> Tracker.gradient(f, 2, 3, 4)
-(4.0 (tracked), 1.0 (tracked), 2.0 (tracked))
+julia> gradient(f, [2, 1], [2, 0])
+([0, 2], [0, -2])
 ```
 
-But machine learning models can have *hundreds* of parameters! Flux offers a nice way to handle this. We can tell Flux to treat something as a parameter via `param`. Then we can collect these together and tell `gradient` to collect the gradients of all `params` at once.
+But machine learning models can have *hundreds* of parameters! To handle this, Flux lets you work with collections of parameters, via `params`. You can get the gradient of all parameters used in a program without explicitly passing them in.
 
 ```jldoctest basics
 julia> using Flux
 
-julia> W = param(2)
-2.0 (tracked)
+julia> x = [2, 1];
 
-julia> b = param(3)
-3.0 (tracked)
+julia> y = [2, 0];
 
-julia> f(x) = W * x + b;
+julia> gs = gradient(params(x, y)) do
+         f(x, y)
+       end
+Grads(...)
 
-julia> grads = Tracker.gradient(() -> f(4), params(W, b));
+julia> gs[x]
+2-element Array{Int64,1}:
+ 0
+ 2
 
-julia> grads[W]
-4.0 (tracked)
-
-julia> grads[b]
-1.0 (tracked)
+julia> gs[y]
+2-element Array{Int64,1}:
+  0
+ -2
 ```
 
-There are a few things to notice here. Firstly, `W` and `b` now show up as *tracked*. Tracked things behave like normal numbers or arrays, but keep records of everything you do with them, allowing Flux to calculate their gradients. `gradient` takes a zero-argument function; no arguments are necessary because the `params` tell it what to differentiate.
+Here, `gradient` takes a zero-argument function; no arguments are necessary because the `params` tell it what to differentiate.
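+
+The `do` block above is just Julia's syntax for passing an anonymous function as the first argument, so the same call can be spelled out explicitly; a rough sketch of the equivalent form:
+
+```julia
+# A zero-argument closure over `x` and `y`; `params(x, y)` tells Flux what to differentiate.
+gs = gradient(() -> f(x, y), params(x, y))
+```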
 
 This will come in really handy when dealing with big, complicated models. For now, though, let's start with something simple.
 
@@ -76,26 +77,20 @@ x, y = rand(5), rand(2) # Dummy data
 loss(x, y) # ~ 3
 ```
 
-To improve the prediction we can take the gradients of `W` and `b` with respect to the loss and perform gradient descent. Let's tell Flux that `W` and `b` are parameters, just like we did above.
+To improve the prediction we can take the gradients of `W` and `b` with respect to the loss and perform gradient descent.
 
 ```julia
-using Flux.Tracker
+using Flux
 
-W = param(W)
-b = param(b)
-
-gs = Tracker.gradient(() -> loss(x, y), params(W, b))
+gs = gradient(() -> loss(x, y), params(W, b))
 ```
 
-Now that we have gradients, we can pull them out and update `W` to train the model. The `update!(W, Δ)` function applies `W = W + Δ`, which we can use for gradient descent.
+Now that we have gradients, we can pull them out and update `W` to train the model.
 
 ```julia
-using Flux.Tracker: update!
+W̄ = gs[W]
 
-Δ = gs[W]
-
-# Update the parameter and reset the gradient
-update!(W, -0.1Δ)
+W .-= 0.1 .* W̄
 
 loss(x, y) # ~ 2.5
 ```
@@ -111,12 +106,12 @@ It's common to create more complex models than the linear regression above. For
 ```julia
 using Flux
 
-W1 = param(rand(3, 5))
-b1 = param(rand(3))
+W1 = rand(3, 5)
+b1 = rand(3)
 layer1(x) = W1 * x .+ b1
 
-W2 = param(rand(2, 3))
-b2 = param(rand(2))
+W2 = rand(2, 3)
+b2 = rand(2)
 layer2(x) = W2 * x .+ b2
 
 model(x) = layer2(σ.(layer1(x)))
@@ -128,8 +123,8 @@ This works but is fairly unwieldy, with a lot of repetition – especially as we
 
 ```julia
 function linear(in, out)
-  W = param(randn(out, in))
-  b = param(randn(out))
+  W = randn(out, in)
+  b = randn(out)
   x -> W * x .+ b
 end
 
@@ -150,7 +145,7 @@ struct Affine
 end
 
 Affine(in::Integer, out::Integer) =
-  Affine(param(randn(out, in)), param(randn(out)))
+  Affine(randn(out, in), randn(out))
 
 # Overload call, so the object can be used as a function
 (m::Affine)(x) = m.W * x .+ m.b
diff --git a/src/data/iris.jl b/src/data/iris.jl
index 3da90330..d78606d8 100644
--- a/src/data/iris.jl
+++ b/src/data/iris.jl
@@ -1,14 +1,10 @@
-
 """
-
-    Iris
-
 Fisher's classic iris dataset.
 
-Measurements from 3 different species of iris: setosa, versicolor and 
+Measurements from 3 different species of iris: setosa, versicolor and
 virginica. There are 50 examples of each species.
 
-There are 4 measurements for each example: sepal length, sepal width, petal 
+There are 4 measurements for each example: sepal length, sepal width, petal
 length and petal width. The measurements are in centimeters.
 
 The module retrieves the data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/iris).
@@ -35,10 +31,12 @@ end
 
     labels()
 
-Get the labels of the iris dataset, a 150 element array of strings listing the 
+Get the labels of the iris dataset, a 150 element array of strings listing the
 species of each example.
 
 ```jldoctest
+julia> using Flux
+
 julia> labels = Flux.Data.Iris.labels();
 
 julia> summary(labels)
 "150-element Array{String,1}"
 
 julia> labels[1]
 "Iris-setosa"
@@ -58,11 +56,13 @@ end
 
     features()
 
-Get the features of the iris dataset. This is a 4x150 matrix of Float64 
-elements. It has a row for each feature (sepal length, sepal width, 
+Get the features of the iris dataset. This is a 4x150 matrix of Float64
+elements. It has a row for each feature (sepal length, sepal width,
 petal length, petal width) and a column for each example.
 
 ```jldoctest
+julia> using Flux
+
 julia> features = Flux.Data.Iris.features();
 
 julia> summary(features)
 "4×150 Array{Float64,2}"
 
 julia> features[:, 1]
 4-element Array{Float64,1}:
  5.1
  3.5
  1.4
  0.2
@@ -81,6 +81,5 @@ function features()
   iris = readdlm(deps("iris.data"), ',')
   Matrix{Float64}(iris[1:end, 1:4]')
 end
+
 end
-
-
diff --git a/src/onehot.jl b/src/onehot.jl
index c9f77412..fe93c5c5 100644
--- a/src/onehot.jl
+++ b/src/onehot.jl
@@ -54,17 +54,19 @@ it will error.
 ## Examples
 
 ```jldoctest
+julia> using Flux: onehot
+
 julia> onehot(:b, [:a, :b, :c])
 3-element Flux.OneHotVector:
- false
-  true
- false
+ 0
+ 1
+ 0
 
 julia> onehot(:c, [:a, :b, :c])
 3-element Flux.OneHotVector:
- false
- false
-  true
+ 0
+ 0
+ 1
 ```
 """
 function onehot(l, labels)
@@ -88,12 +90,13 @@ Create an [`OneHotMatrix`](@ref) with a batch of labels based on possible `label
 
 ## Examples
 
 ```jldoctest
-julia> onehotbatch([:b, :a, :b], [:a, :b, :c])
-3×3 Flux.OneHotMatrix:
- false   true  false
-  true  false   true
- false  false  false
+julia> using Flux: onehotbatch
+julia> onehotbatch([:b, :a, :b], [:a, :b, :c])
+3×3 Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}:
+ 0  1  0
+ 1  0  1
+ 0  0  0
 ```
 """
 onehotbatch(ls, labels, unk...) =
@@ -106,9 +109,9 @@ Base.argmax(xs::OneHotVector) = xs.ix
 
 Inverse operations of [`onehot`](@ref).
 
-## Examples
-
 ```jldoctest
+julia> using Flux: onecold
+
 julia> onecold([true, false, false], [:a, :b, :c])
 :a
 
diff --git a/test/runtests.jl b/test/runtests.jl
index bd66e254..1da02de4 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,11 +1,8 @@
-using Flux, Test, Random, Statistics
+using Flux, Test, Random, Statistics, Documenter
 
 using Random
 Random.seed!(0)
 
-# So we can use the system CuArrays
-insert!(LOAD_PATH, 2, "@v#.#")
-
 @testset "Flux" begin
 
 @info "Testing Basics"
@@ -32,4 +29,6 @@ else
   @warn "CUDA unavailable, not testing GPU support"
 end
 
+doctest(Flux)
+
 end
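
For reference, the doctest hook added in `test/runtests.jl` can also be invoked by hand from the REPL; a minimal sketch, assuming Flux and Documenter are both installed in the active environment:

```julia
using Documenter, Flux

# Run the `jldoctest` blocks in Flux's documentation, the same
# `doctest(Flux)` call the test suite makes above.
doctest(Flux)
```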