Merge #1238

1238: Fix inline code block r=dhairyagandhi96 a=harryscholes ### PR Checklist - [ ] Tests are added - [ ] Entry in NEWS.md - [x] Documentation, if applicable - [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes). Co-authored-by: harryscholes <harryscholes@gmail.com>
Fix inline code block
2020-06-19 08:28:41 +00:00 · 2020-06-19 09:24:44 +01:00 · 2020-06-16 17:21:28 +00:00 · 2020-06-16 13:04:20 +00:00 · 2020-06-16 14:02:24 +01:00 · 2020-06-16 13:32:27 +02:00
28 changed files with 449 additions and 179 deletions
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@ -0,0 +1,12 @@
 [Please delete this text and describe your change here.
 For bugfixes, please detail the bug and include a test case which your patch fixes.
 If you are adding a new feature, please clearly describe the design, its rationale, the possible alternatives considered.
 It is easiest to merge new features when there is clear precedent in other systems; we need to know we're taking
 the right direction since it can be hard to change later.]
 ### PR Checklist
 - [ ] Tests are added
 - [ ] Entry in NEWS.md
 - [ ] Documentation, if applicable
 - [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes).
--- a/Manifest.toml
+++ b/Manifest.toml
@ -8,35 +8,35 @@ version = "0.5.0"
 [[AbstractTrees]]
 deps = ["Markdown"]
-git-tree-sha1 = "86d092c2599f1f7bb01668bf8eb3412f98d61e47"
+git-tree-sha1 = "33e450545eaf7699da1a6e755f9ea65f14077a45"
 uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
-version = "0.3.2"
+version = "0.3.3"
 [[Adapt]]
 deps = ["LinearAlgebra"]
-git-tree-sha1 = "c88cfc7f9c1f9f8633cddf0b56e86302b70f64c5"
+git-tree-sha1 = "fd04049c7dd78cfef0b06cdc1f0f181467655712"
 uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-version = "1.0.1"
+version = "1.1.0"
 [[ArrayLayouts]]
 deps = ["FillArrays", "LinearAlgebra"]
-git-tree-sha1 = "41956a49a8a4fefa1bf6664bca4a3035aba4c3a0"
+git-tree-sha1 = "a504dca2ac7eda8761c8f7c1ed52427a1be75a3c"
 uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
-version = "0.2.3"
+version = "0.2.6"
 [[Base64]]
 uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
 [[BinaryProvider]]
-deps = ["Libdl", "SHA"]
+deps = ["Libdl", "Logging", "SHA"]
-git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c"
+git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
 uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
-version = "0.5.8"
+version = "0.5.10"
 [[CEnum]]
-git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb"
+git-tree-sha1 = "1b77a77c3b28e0b3f413f7567c9bb8dd9bdccd14"
 uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
-version = "0.2.0"
+version = "0.3.0"
 [[CUDAapi]]
 deps = ["Libdl", "Logging"]
@ -46,21 +46,21 @@ version = "4.0.0"
 [[CUDAdrv]]
 deps = ["CEnum", "CUDAapi", "Printf"]
-git-tree-sha1 = "e650cbaee92b60433313157926b1e80d0c3a0e2e"
+git-tree-sha1 = "f56bbf18c86bcff7a961a32a4947a5abb2963a29"
 uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
-version = "6.2.2"
+version = "6.3.0"
 [[CUDAnative]]
-deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "MacroTools", "Pkg", "Printf", "TimerOutputs"]
+deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "ExprTools", "GPUCompiler", "LLVM", "Libdl", "Pkg", "Printf"]
-git-tree-sha1 = "d1fc99635d0002c8a819b78cb1f441eb44310725"
+git-tree-sha1 = "ac86db2b05fdfec96b011e25a504ffe7476e8a68"
 uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
-version = "3.0.2"
+version = "3.1.0"
 [[CodeTracking]]
 deps = ["InteractiveUtils", "UUIDs"]
-git-tree-sha1 = "0becdab7e6fbbcb7b88d8de5b72e5bb2f28239f3"
+git-tree-sha1 = "cab4da992adc0a64f63fa30d2db2fd8bec40cab4"
 uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
-version = "0.5.8"
+version = "0.5.11"
 [[CodecZlib]]
 deps = ["TranscodingStreams", "Zlib_jll"]
@ -70,15 +70,15 @@ version = "0.7.0"
 [[ColorTypes]]
 deps = ["FixedPointNumbers", "Random"]
-git-tree-sha1 = "c4c1cca28748906265ed62c788d6fe6f0134d264"
+git-tree-sha1 = "c73d9cfc2a9d8433dc77f5bff4bddf46b1d78c20"
 uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
-version = "0.10.0"
+version = "0.10.3"
 [[Colors]]
 deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"]
-git-tree-sha1 = "2fdeb981ebcf52cd800ddb6a0aa5eac34153552d"
+git-tree-sha1 = "1e9bba7984e78aa8cdeea7f9f7cc984ad4e4b1c7"
 uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
-version = "0.12.0"
+version = "0.12.2"
 [[CommonSubexpressions]]
 deps = ["Test"]
@ -93,27 +93,27 @@ uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
 version = "0.3.3+0"
 [[Cthulhu]]
-deps = ["CodeTracking", "InteractiveUtils", "REPL", "Unicode"]
+deps = ["CodeTracking", "InteractiveUtils", "REPL", "UUIDs", "Unicode"]
-git-tree-sha1 = "484790098c85c26f8e59051f8ff1a0745c034a7d"
+git-tree-sha1 = "f3643e78353199d3097821e806348bd83f364155"
 uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f"
-version = "1.0.1"
+version = "1.1.1"
 [[CuArrays]]
 deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"]
-git-tree-sha1 = "e8c55b38dcca955f5aed8ec4479cdc95810db1e1"
+git-tree-sha1 = "1582b74d2322df7dd94549d4ac9d095e0f20e884"
 uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
-version = "2.0.1"
+version = "2.2.1"
 [[DataAPI]]
-git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252"
+git-tree-sha1 = "176e23402d80e7743fc26c19c681bfb11246af32"
 uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
-version = "1.1.0"
+version = "1.3.0"
 [[DataStructures]]
 deps = ["InteractiveUtils", "OrderedCollections"]
-git-tree-sha1 = "73eb18320fe3ba58790c8b8f6f89420f0a622773"
+git-tree-sha1 = "af6d9c86e191c917c2276fbede1137e8ea20157f"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.17.11"
+version = "0.17.17"
 [[Dates]]
 deps = ["Printf"]
@ -139,11 +139,16 @@ version = "1.0.1"
 deps = ["Random", "Serialization", "Sockets"]
 uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 [[ExprTools]]
 git-tree-sha1 = "6f0517056812fd6aa3af23d4b70d5325a2ae4e95"
 uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
 version = "0.1.1"
 [[FillArrays]]
 deps = ["LinearAlgebra", "Random", "SparseArrays"]
-git-tree-sha1 = "51cc2f9bc4eb9c6c0e81ec2f779d1085583cc956"
+git-tree-sha1 = "44f561e293987ffc84272cd3d2b14b0b93123d63"
 uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
-version = "0.8.7"
+version = "0.8.10"
 [[FixedPointNumbers]]
 git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238"
@ -162,17 +167,27 @@ git-tree-sha1 = "f40adc6422f548176bb4351ebd29e4abf773040a"
 uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
 version = "0.1.0"
 [[Future]]
 deps = ["Random"]
 uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
 [[GPUArrays]]
 deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
-git-tree-sha1 = "d586762b08dcda13228df8967119b9cb6f22ade5"
+git-tree-sha1 = "d887693eb1bd5e1fd573262a978745481895ec7d"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "3.1.0"
+version = "3.4.1"
 [[GPUCompiler]]
 deps = ["Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "TimerOutputs"]
 git-tree-sha1 = "5275aa268ecd09640b32560e1eae90c78816e4d1"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
 version = "0.2.0"
 [[IRTools]]
 deps = ["InteractiveUtils", "MacroTools", "Test"]
-git-tree-sha1 = "1a4355e4b5b50be2311ebb644f34f3306dbd0410"
+git-tree-sha1 = "90ee39f9beaaa186e4968417ea2b8ed5673c91c0"
 uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
-version = "0.3.1"
+version = "0.3.3"
 [[InteractiveUtils]]
 deps = ["Markdown"]
@ -180,15 +195,15 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 [[Juno]]
 deps = ["Base64", "Logging", "Media", "Profile"]
-git-tree-sha1 = "e1ba2a612645b3e07c773c3a208f215745081fe6"
+git-tree-sha1 = "a686b0cf235fa3e491b79b4783c2d2382292b436"
 uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
-version = "0.8.1"
+version = "0.8.2"
 [[LLVM]]
 deps = ["CEnum", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "b6b86801ae2f2682e0a4889315dc76b68db2de71"
+git-tree-sha1 = "dd3f584c3dbefe39b2a8fbafa1a3b77e31e21255"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "1.3.4"
+version = "1.5.1"
 [[LibGit2]]
 deps = ["Printf"]
@ -247,10 +262,9 @@ uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
 version = "0.5.3+3"
 [[OrderedCollections]]
-deps = ["Random", "Serialization", "Test"]
+git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3"
 git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
 uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-version = "1.1.0"
+version = "1.2.0"
 [[Pkg]]
 deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
@ -305,15 +319,15 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 [[SpecialFunctions]]
 deps = ["OpenSpecFun_jll"]
-git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c"
+git-tree-sha1 = "d8d8b8a9f4119829410ecd706da4cc8594a1e020"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "0.10.0"
+version = "0.10.3"
 [[StaticArrays]]
 deps = ["LinearAlgebra", "Random", "Statistics"]
-git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c"
+git-tree-sha1 = "5c06c0aeb81bef54aed4b3f446847905eb6cbda0"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "0.12.1"
+version = "0.12.3"
 [[Statistics]]
 deps = ["LinearAlgebra", "SparseArrays"]
@ -331,9 +345,9 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 [[TimerOutputs]]
 deps = ["Printf"]
-git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03"
+git-tree-sha1 = "f458ca23ff80e46a630922c555d838303e4b9603"
 uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
-version = "0.5.3"
+version = "0.5.6"
 [[TranscodingStreams]]
 deps = ["Random", "Test"]
@ -350,21 +364,21 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 [[ZipFile]]
 deps = ["Libdl", "Printf", "Zlib_jll"]
-git-tree-sha1 = "8748302cfdec02c4ae9c97b112cf10003f7f767f"
+git-tree-sha1 = "254975fef2fc526583bb9b7c9420fe66ffe09f2f"
 uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
-version = "0.9.1"
+version = "0.9.2"
 [[Zlib_jll]]
 deps = ["Libdl", "Pkg"]
-git-tree-sha1 = "2f6c3e15e20e036ee0a0965879b31442b7ec50fa"
+git-tree-sha1 = "a2e0d558f6031002e380a90613b199e37a8565bf"
 uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
-version = "1.2.11+9"
+version = "1.2.11+10"
 [[Zygote]]
-deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
+deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "Future", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
-git-tree-sha1 = "1ccbfbe8930376e31752b812daa2532c723dc332"
+git-tree-sha1 = "707ceea58e2bd0ff3077ab13a92f8355181d3ee4"
 uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
-version = "0.4.13"
+version = "0.4.20"
 [[ZygoteRules]]
 deps = ["MacroTools"]
--- a/NEWS.md
+++ b/NEWS.md
@ -1,5 +1,18 @@
 # v0.11
 * Change to `DataLoader`'s constructor [https://github.com/FluxML/Flux.jl/pull/1152]
 * Use `DataLoader` with `NamedTuple`s, so that tensors can be accessed by name [https://github.com/FluxML/Flux.jl/pull/1221].
 * Error if Dense layers weights and biases are not arrays [https://github.com/FluxML/Flux.jl/pull/1218].
 # v0.10.5
 * Add option for [same padding](https://github.com/FluxML/Flux.jl/pull/901) to conv and pooling layers by setting `pad=SamePad()`.
 * Added option to set `bias` to [Flux.Zeros](https://github.com/FluxML/Flux.jl/pull/873) to prevent `bias` from being trained.
 * Added `GlobalMaxPool` and `GlobalMeanPool` [layers](https://github.com/FluxML/Flux.jl/pull/950) for performing global pooling operations.
 * Added `ClipValue` and `ClipNorm` in this [pr](https://github.com/FluxML/Flux.jl/pull/1133) to `Flux.Optimise` to provide a cleaner API for gradient clipping.
 * Added new kwarg-only [constructors](https://github.com/FluxML/Flux.jl/pull/873) for the various convolutional layers.
 * Documented the convolutional layer constructors accepting `weight` and `bias` keyword arguments to supply custom arrays for those fields.
 * Testing suite improvements now test for gradients of all layers along with GPU support.
 * Functors have now moved to [Functors.jl](https://github.com/FluxML/Flux.jl/pull/1174) to allow for their use outside of Flux.
 * Added [helper functions](https://github.com/FluxML/Flux.jl/pull/873) `Flux.convfilter` and `Flux.depthwiseconvfilter` to construct weight arrays for convolutions outside of layer constructors so as to not have to depend on the default layers for custom implementations.
 # v0.10.0
 * The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669)
--- a/Project.toml
+++ b/Project.toml
@ -1,6 +1,6 @@
 name = "Flux"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.10.5"
+version = "0.11.0-DEV"
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@ -11,6 +11,7 @@ CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
 Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@ -26,10 +27,11 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 [compat]
 AbstractTrees = "0.2, 0.3"
-Adapt = "1"
+Adapt = "1, 2.0"
 CodecZlib = "0.5, 0.6, 0.7"
 Colors = "0.8, 0.9, 0.10, 0.11, 0.12"
 CuArrays = "2"
 Functors = "0.1"
 Juno = "0.5, 0.6, 0.7, 0.8"
 MacroTools = "0.3, 0.4, 0.5"
 NNlib = "0.6"
--- a/docs/src/data/onehot.md
+++ b/docs/src/data/onehot.md
@ -7,15 +7,15 @@ julia> using Flux: onehot, onecold
 julia> onehot(:b, [:a, :b, :c])
 3-element Flux.OneHotVector:
- false
+ 0
-  true
+ 1
- false
+ 0
 julia> onehot(:c, [:a, :b, :c])
 3-element Flux.OneHotVector:
- false
+ 0
- false
+ 0
-  true
+ 1
 ```
 The inverse is `onecold` (which can take a general probability distribution, as well as just booleans).
--- a/docs/src/models/advanced.md
+++ b/docs/src/models/advanced.md
@ -19,7 +19,7 @@ Affine{Array{Float64,2},Array{Float64,1}}([0.66722 0.774872 0.249809; 0.843321 0
 julia> Flux.params(a) # default behavior
 Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]])
-julia> Flux.trainable(a::Affine) = (a.W, a.b,)
+julia> Flux.trainable(a::Affine) = (a.W,)
 julia> Flux.params(a)
 Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297]])
--- a/docs/src/models/layers.md
+++ b/docs/src/models/layers.md
@ -20,7 +20,11 @@ GlobalMeanPool
 DepthwiseConv
 ConvTranspose
 CrossCor
 SamePad
 flatten
 Flux.Zeros
 Flux.convfilter
 Flux.depthwiseconvfilter
 ```
 ## Recurrent Layers
--- a/docs/src/performance.md
+++ b/docs/src/performance.md
@ -39,7 +39,7 @@ E.g. the following will have run into the same problem as above:
    leaky_tanh(x) = 0.01*x + tanh(x)
 ```
-While one could change the activation function (e.g. to use `0.01f0x`), the idiomatic (and safe way)  to avoid type casts whenever inputs changes is to use `oftype`:
+While one could change the activation function (e.g. to use `0.01f0*x`), the idiomatic (and safe) way to avoid type casts whenever the input type changes is to use `oftype`:
 ```
    leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x)
 ```
--- a/docs/src/training/optimisers.md
+++ b/docs/src/training/optimisers.md
@ -140,3 +140,16 @@ ExpDecay
 InvDecay
 WeightDecay
 ```
 ## Gradient Clipping
 Gradient clipping is useful for training recurrent neural networks, which have a tendency to suffer from the exploding gradient problem. An example usage is
 ```julia
 opt = Optimiser(ClipValue(1e-3), ADAM(1e-3))
 ```
 ```@docs
 ClipValue
 ClipNorm
 ```
--- a/docs/src/training/training.md
+++ b/docs/src/training/training.md
@ -142,7 +142,7 @@ function my_custom_train!(loss, ps, data, opt)
  for d in data
    gs = gradient(ps) do
      training_loss = loss(d...)
-      # Insert what ever code you want here that needs Training loss, e.g. logging
+      # Insert whatever code you want here that needs Training loss, e.g. logging
      return training_loss
    end
    # insert whatever code you want here that needs gradient
--- a/src/Flux.jl
+++ b/src/Flux.jl
@ -3,7 +3,8 @@ module Flux
 # Zero Flux Given
 using Base: tail
-using Zygote, MacroTools, Juno, Reexport, Statistics, Random
+using Statistics, Random, LinearAlgebra
 using Zygote, MacroTools, Juno, Reexport
 using MacroTools: @forward
@reexport using NNlib
 using Zygote: Params, @adjoint, gradient, pullback, @nograd
@ -20,7 +21,8 @@ using .Optimise
 using .Optimise: @epochs
 export Descent, ADAM, Momentum, Nesterov, RMSProp,
  ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
-  ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay
+  ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay,
  ClipValue, ClipNorm
 using CuArrays
--- a/src/data/Data.jl
+++ b/src/data/Data.jl
@ -51,4 +51,6 @@ export Iris
 include("housing.jl")
 export Housing
@deprecate DataLoader(x...; kws...) DataLoader(x; kws...)
 end
--- a/src/data/dataloader.jl
+++ b/src/data/dataloader.jl
@ -1,7 +1,7 @@
 # Adapted from Knet's src/data.jl (author: Deniz Yuret)
-struct DataLoader
+struct DataLoader{D}
-    data
+    data::D
    batchsize::Int
    nobs::Int
    partial::Bool
@ -11,21 +11,20 @@ struct DataLoader
 end
 """
-    DataLoader(data...; batchsize=1, shuffle=false, partial=true)
+    DataLoader(data; batchsize=1, shuffle=false, partial=true)
 An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations
 (except possibly the last one). 
-Takes as input one or more data tensors, e.g. X in unsupervised learning, X and Y in 
+Takes as input a single data tensor, or a tuple (or a named tuple) of tensors.
-supervised learning. The last dimension in each tensor is considered to be the observation
+The last dimension in each tensor is considered to be the observation dimension.
 dimension. 
 If `shuffle=true`, shuffles the observations each time iterations are re-started.
 If `partial=false`, drops the last mini-batch if it is smaller than the batchsize.
-The original data is preserved as a tuple in the `data` field of the DataLoader. 
+The original data is preserved in the `data` field of the DataLoader. 
-Example usage:
+Usage example:
    Xtrain = rand(10, 100)
    train_loader = DataLoader(Xtrain, batchsize=2) 
@ -37,9 +36,16 @@ Example usage:
    train_loader.data   # original dataset
    # similar, but yielding tuples
    train_loader = DataLoader((Xtrain,), batchsize=2) 
    for (x,) in train_loader
        @assert size(x) == (10, 2)
        ...
    end
    Xtrain = rand(10, 100)
    Ytrain = rand(100)
-    train_loader = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true) 
+    train_loader = DataLoader((Xtrain, Ytrain), batchsize=2, shuffle=true) 
    for epoch in 1:100
        for (x, y) in train_loader
            @assert size(x) == (10, 2)
@ -51,26 +57,26 @@ Example usage:
    # train for 10 epochs
    using IterTools: ncycle 
    Flux.train!(loss, ps, ncycle(train_loader, 10), opt)
    # can use NamedTuple to name tensors
    train_loader = DataLoader((images=Xtrain, labels=Ytrain), batchsize=2, shuffle=true)
    for datum in train_loader
        @assert size(datum.images) == (10, 2)
        @assert size(datum.labels) == (2,)
    end
 """
-function DataLoader(data...; batchsize=1, shuffle=false, partial=true)
+function DataLoader(data; batchsize=1, shuffle=false, partial=true)
    length(data) > 0 || throw(ArgumentError("Need at least one data input"))
    batchsize > 0 || throw(ArgumentError("Need positive batchsize"))
-    nx = size(data[1])[end]
+    n = _nobs(data) 
-    for i=2:length(data)
+    if n < batchsize
-        nx != size(data[i])[end] && throw(DimensionMismatch("All data should contain same number of observations"))
+        @warn "Number of observations less than batchsize, decreasing the batchsize to $n"
        batchsize = n
    end
-    if nx < batchsize
+    imax = partial ? n : n - batchsize + 1
-        @warn "Number of data points less than batchsize, decreasing the batchsize to $nx"
+    DataLoader(data, batchsize, n, partial, imax, [1:n;], shuffle)
        batchsize = nx
    end
    imax = partial ? nx : nx - batchsize + 1
    ids = 1:min(nx, batchsize)
    DataLoader(data, batchsize, nx, partial, imax, [1:nx;], shuffle)
 end
 getdata(x::AbstractArray, ids) = x[(Base.Colon() for _=1:ndims(x)-1)..., ids]
@propagate_inbounds function Base.iterate(d::DataLoader, i=0)     # returns data in d.indices[i+1:i+batchsize]
    i >= d.imax && return nothing
    if d.shuffle && i == 0
@ -78,11 +84,7 @@ getdata(x::AbstractArray, ids) = x[(Base.Colon() for _=1:ndims(x)-1)..., ids]
    end
    nexti = min(i + d.batchsize, d.nobs)
    ids = d.indices[i+1:nexti]
-    if length(d.data) == 1
+    batch = _getobs(d.data, ids)
        batch = getdata(d.data[1], ids)
    else
        batch = ((getdata(x, ids) for x in d.data)...,)
    end
    return (batch, nexti)
 end
@ -90,3 +92,19 @@ function Base.length(d::DataLoader)
    n = d.nobs / d.batchsize
    d.partial ? ceil(Int,n) : floor(Int,n)
 end
 # Number of observations in an array: the length of its last dimension.
 _nobs(data::AbstractArray) = last(size(data))
 # Number of observations shared by every element of a tuple or named tuple.
 # Throws `ArgumentError` for an empty container and `DimensionMismatch` when
 # the elements do not all report the same count.
 function _nobs(data::Union{Tuple, NamedTuple})
     isempty(data) && throw(ArgumentError("Need at least one data input"))
     expected = _nobs(data[1])
     for rest in Base.tail(data)
         _nobs(rest) == expected ||
             throw(DimensionMismatch("All data should contain same number of observations"))
     end
     return expected
 end
 # Select the observations `i` along the last dimension of `data`, keeping every
 # leading dimension whole (one `Colon()` per leading dimension).
 # The `i` inside the `ntuple` closure is that closure's own argument: it shadows
 # the outer `i` and is not used.
 _getobs(data::AbstractArray, i) = data[ntuple(i -> Colon(), Val(ndims(data) - 1))..., i]
 # Apply the same selection to each element of a tuple or named tuple.
 _getobs(data::Union{Tuple, NamedTuple}, i) = map(Base.Fix2(_getobs, i), data)
 Base.eltype(::DataLoader{D}) where D = D
--- a/src/functor.jl
+++ b/src/functor.jl
@ -24,7 +24,7 @@ testmode!(m, mode = true) = m
    trainmode!(m, mode = true)
 Set a layer or model's train mode (see below).
-Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)).
+Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`).
 _Note_: if you manually set a model into train mode, you need to manually place
 it into test mode during testing phase.
--- a/src/layers/basic.jl
+++ b/src/layers/basic.jl
@ -102,7 +102,7 @@ julia> d(rand(5))
  -0.16210233
   0.12311903```
 """
-struct Dense{F,S,T}
+struct Dense{F,S<:AbstractArray,T<:AbstractArray}
  W::S
  b::T
  σ::F
--- a/src/layers/conv.jl
+++ b/src/layers/conv.jl
@ -132,7 +132,7 @@ end
 function (c::Conv)(x::AbstractArray)
  # TODO: breaks gpu broadcast :(
  # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
-  σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
+  σ, b = c.σ, reshape(c.bias, ntuple(_->1, length(c.stride))..., :, 1)
  cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
  σ.(conv(x, c.weight, cdims) .+ b)
 end
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@ -46,9 +46,10 @@ given the prediction `ŷ` and true values `y`.
    Huber loss = |
                 |  δ * (|ŷ - y| - 0.5 * δ), otherwise
 """
 #TODO: remove dropgrad when Zygote can handle this function with CuArrays
 function huber_loss(ŷ, y;  δ=eltype(ŷ)(1))
   abs_error = abs.(ŷ .- y)
-   temp = abs_error .<  δ
+   temp = Zygote.dropgrad(abs_error .<  δ)
   x = eltype(ŷ)(0.5)
   hub_loss = sum(((abs_error.^2) .* temp) .* x .+ δ*(abs_error .- x*δ) .* (1 .- temp)) * 1 // length(y)
 end
@ -132,7 +133,7 @@ CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1
    logitbinarycrossentropy(ŷ, y)
 `logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to
-[`Flux.binarycrossentropy(σ(log(ŷ)), y)`](@ref) but it is more numerically stable.
+[`Flux.binarycrossentropy(σ(ŷ), y)`](@ref) but it is more numerically stable.
 See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref)
--- a/src/onehot.jl
+++ b/src/onehot.jl
@ -27,7 +27,8 @@ Base.getindex(xs::OneHotMatrix, ::Colon, ::Colon) = OneHotMatrix(xs.height, copy
 Base.getindex(xs::OneHotMatrix, i::Integer, ::Colon) = map(x -> x[i], xs.data)
-A::AbstractMatrix * B::OneHotMatrix = A[:, map(x->x.ix, B.data)]
+# remove workaround when https://github.com/JuliaGPU/CuArrays.jl/issues/676 is fixed
 A::AbstractMatrix * B::OneHotMatrix = A[:, cpu(map(x->x.ix, B.data))]
 Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs...])
@ -48,7 +49,7 @@ cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.d
 Create a `OneHotVector` with its `l`-th element `true` based on the
 possible set of `labels`.
 If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found
-in `labels`; otherwise it will error.
+in `labels`; otherwise, it will raise an error.
 # Examples
 ```jldoctest
--- a/src/optimise/Optimise.jl
+++ b/src/optimise/Optimise.jl
@ -1,9 +1,12 @@
 module Optimise
 using LinearAlgebra
 export train!, update!,
 	Descent, ADAM, Momentum, Nesterov, RMSProp,
 	ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM, 
-	InvDecay, ExpDecay, WeightDecay, stop, Optimiser
+	InvDecay, ExpDecay, WeightDecay, stop, Optimiser,
 	ClipValue, ClipNorm
 include("optimisers.jl")
 include("train.jl")
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@ -509,7 +509,7 @@ function apply!(o::ExpDecay, x, Δ)
  η, s, decay = o.eta, o.step, o.decay
  n = o.current[x] = get(o.current, x, 0) + 1
  if o.current[x]%s == 0 && count(x -> x%s == 0, values(o.current)) == 1
-    η = max(η * decay^(s / n), o.clip)
+    η = max(η * decay, o.clip)
    o.eta = η
  end
  @. Δ *= η
@ -533,3 +533,31 @@ function apply!(o::WeightDecay, x, Δ)
  wd = o.wd
  @. Δ += wd * x
 end
 """
    ClipValue(thresh)
 Clamp every gradient entry to the interval `[-thresh, thresh]`.
 """
 mutable struct ClipValue{T}
    thresh::T
 end
 # Clamps `Δ` in place and returns it; the parameter `x` is not consulted.
 function apply!(o::ClipValue, x, Δ)
    bound = o.thresh
    return clamp!(Δ, -bound, bound)
 end
 """
    ClipNorm(thresh)
 Rescale gradients whose L2 norm exceeds `thresh` so that the norm equals `thresh`.
 """
 mutable struct ClipNorm{T}
    thresh::T
 end
 # Scales `Δ` in place and returns it; the parameter `x` is not consulted.
 function apply!(o::ClipNorm, x, Δ)
    gradnorm = norm(Δ)
    # Only ever shrink: gradients already within the threshold are left untouched.
    gradnorm > o.thresh && rmul!(Δ, o.thresh / gradnorm)
    return Δ
 end
--- a/src/optimise/train.jl
+++ b/src/optimise/train.jl
@ -68,8 +68,7 @@ and compute the gradient of `loss(d)`.
 A callback is given with the keyword argument `cb`. For example, this will print
 "training" every 10 seconds (using [`Flux.throttle`](@ref)):
-  train!(loss, params, data, opt,
+    train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10))
         cb = throttle(() -> println("training"), 10))
 The callback can call [`Flux.stop`](@ref) to interrupt the training loop.
--- a/src/utils.jl
+++ b/src/utils.jl
@ -246,6 +246,10 @@ function _restructure(m, xs)
  end
 end
@adjoint function _restructure(m, xs)
  _restructure(m, xs), dm -> (nothing,destructure(dm)[1])
 end
 """
    destructure(m)
--- a/test/cuda/cuda.jl
+++ b/test/cuda/cuda.jl
@ -69,6 +69,7 @@ if CuArrays.has_cudnn()
  @info "Testing Flux/CUDNN"
  include("cudnn.jl")
  include("curnn.jl")
  include("layers.jl")
 else
  @warn "CUDNN unavailable, not testing GPU DNN support"
 end
--- a/test/cuda/layers.jl
+++ b/test/cuda/layers.jl
@ -0,0 +1,98 @@
 # Test layers and data/model movements on and off the GPU
 # Add tests for layers and their gradients on the GPU
 # Most of the forward passes should be fine being applied
 # to bitstype objects, but this gives higher coverage for our use-cases
 # Check that getting the gradients does not throw
 # generic movement tests
@testset "Basic GPU Movement" begin
  @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple
  @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple
 end
 # TODO: These layers get into scalar indexing
 # `AlphaDropout` throws a compilation error on GPUs,
 # whereas, the rest are scalar indexing issues.
 const BROKEN_LAYERS = [DepthwiseConv,
 		       AlphaDropout,
                       InstanceNorm,
                       GroupNorm]
 # Build each layer in `layers` from `args`, move it and the input `xs` to the
 # GPU, and check that taking gradients w.r.t. its parameters yields `Grads`.
 # Layers that are instances of a type in `BROKEN_LAYERS` are recorded with
 # `@test_broken` instead.
 function gradtest(name::String, layers::Vector, xs = nothing, args...)
  xs === nothing && error("Missing input to test the layers against.")
  @testset "$name GPU grad tests" begin
    for layer in layers
      @testset "$layer GPU grad test" begin
        model = gpu(layer(args...))
        xs = gpu(xs)
        ps = Flux.params(model)
        if !any(T -> model isa T, BROKEN_LAYERS)
          @test gradient(() -> sum(model(xs)), ps) isa Flux.Zygote.Grads
          gs = gradient(() -> sum(model(xs)), ps)
          # Parameter-free layers (e.g. pooling) have nothing to index into.
          isempty(ps) || @test gs[first(ps)] isa Flux.CuArrays.CuArray
        else
          @test_broken gradient(() -> sum(model(xs)), ps) isa Flux.Zygote.Grads
        end
      end
    end
  end
 end
 # Repeats from Conv, CrossCor
 r = rand(Float32, 28, 28, 1, 1)
 conv_layers = [Conv, ConvTranspose, CrossCor, DepthwiseConv]
 gradtest("Conv", conv_layers, r, (2,2), 1=>3)
 pooling_layers = [MaxPool, MeanPool]
 gradtest("Pooling", pooling_layers, r, (2,2))
 dropout_layers = [Dropout, AlphaDropout]
 gradtest("Dropout", dropout_layers, r, 0.5f0)
 norm_layers = [LayerNorm, BatchNorm]
 gradtest("Normalising", norm_layers, rand(Float32, 28,28,3,1), 1)
 instancenorm = [InstanceNorm]
 gradtest("InstanceNorm", instancenorm, r, 1)
 groupnorm = [GroupNorm]
 gradtest("GroupNorm", groupnorm, rand(Float32, 28,28,3,1), 3, 1)
 const stateless_layers = [Flux.mse,
                          Flux.crossentropy,
                          Flux.logitcrossentropy,
                          Flux.normalise]
 const stateless_layers_broadcasted = [Flux.binarycrossentropy,
                                      Flux.logitbinarycrossentropy]
 # Check that the gradient of `sum(f(args...))` w.r.t. the first argument is a `CuArray`.
 function stateless_gradtest(f, args...)
  @test first(gradient((xs...) -> sum(f(xs...)), args...)) isa CuArray
 end
 # Same check, with `f` broadcast elementwise over the arguments.
 function stateless_gradtest_broadcasted(f, args...)
  @test first(gradient((xs...) -> sum(f.(xs...)), args...)) isa CuArray
 end
@testset "Stateless GPU grad tests" begin
  x = gpu(rand(3,3))
  y = gpu(rand(3,3))
  for layer in stateless_layers
    if layer == Flux.normalise
      stateless_gradtest(layer, x)
    else
      stateless_gradtest(layer, x, y)
    end
  end
  for layer in stateless_layers_broadcasted
    stateless_gradtest_broadcasted(layer, x, y)
  end
 end
--- a/test/data.jl
+++ b/test/data.jl
@ -3,20 +3,34 @@
    Y = [1:5;]
    d = DataLoader(X, batchsize=2)
    @inferred first(d)
    batches = collect(d)
    @test eltype(batches) == eltype(d) == typeof(X)
    @test length(batches) == 3
    @test batches[1] == X[:,1:2]
    @test batches[2] == X[:,3:4]
    @test batches[3] == X[:,5:5]
    d = DataLoader(X, batchsize=2, partial=false)
    @inferred first(d)
    batches = collect(d)
    @test eltype(batches) == eltype(d) == typeof(X)
    @test length(batches) == 2
    @test batches[1] == X[:,1:2]
    @test batches[2] == X[:,3:4]
-    d = DataLoader(X, Y, batchsize=2)
+    d = DataLoader((X,), batchsize=2, partial=false)
    @inferred first(d)
    batches = collect(d)
    @test eltype(batches) == eltype(d) == Tuple{typeof(X)}
    @test length(batches) == 2
    @test batches[1] == (X[:,1:2],)
    @test batches[2] == (X[:,3:4],)
    d = DataLoader((X, Y), batchsize=2)
    @inferred first(d)
    batches = collect(d)
    @test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)}
    @test length(batches) == 3
    @test length(batches[1]) == 2
    @test length(batches[2]) == 2
@ -28,6 +42,22 @@
    @test batches[3][1] == X[:,5:5]
    @test batches[3][2] == Y[5:5]
    # test with NamedTuple
    d = DataLoader((x=X, y=Y), batchsize=2)
    @inferred first(d)
    batches = collect(d)
    @test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}}
    @test length(batches) == 3
    @test length(batches[1]) == 2
    @test length(batches[2]) == 2
    @test length(batches[3]) == 2
    @test batches[1][1] == batches[1].x == X[:,1:2]
    @test batches[1][2] == batches[1].y == Y[1:2]
    @test batches[2][1] == batches[2].x == X[:,3:4]
    @test batches[2][2] == batches[2].y == Y[3:4]
    @test batches[3][1] == batches[3].x == X[:,5:5]
    @test batches[3][2] == batches[3].y == Y[5:5]
    # test interaction with `train!`
    θ = ones(2)
    X = zeros(2, 10)
@ -41,7 +71,7 @@
    X = ones(2, 10)
    Y = fill(2, 10)
    loss(x, y) = sum((y - x'*θ).^2)
-    d  = DataLoader(X, Y) 
+    d  = DataLoader((X, Y)) 
    Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1))
    @test norm(θ .- 1) < 1e-10
 end
--- a/test/layers/basic.jl
+++ b/test/layers/basic.jl
@ -28,6 +28,14 @@ import Flux: activations
  end
  @testset "Dense" begin
    @testset "constructors" begin
      @test size(Dense(10, 100).W) == (100, 10)
      @test Dense(rand(100,10), rand(10)).σ == identity
      @test_throws MethodError Dense(10, 10.5)
      @test_throws MethodError Dense(10, 10.5, tanh)
    end
    @test  length(Dense(10, 5)(randn(10))) == 5
    @test_throws DimensionMismatch Dense(10, 5)(randn(1))
    @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting
@ -37,7 +45,6 @@ import Flux: activations
    @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2)
    @test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1)
    @test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]
  end
  @testset "Diagonal" begin
--- a/test/optimise.jl
+++ b/test/optimise.jl
@ -57,35 +57,57 @@ end
 end
@testset "ExpDecay" begin
-    w = randn(10, 10)
+
-    o = ExpDecay(0.1, 0.1, 1000, 1e-4)
+  @testset "Sanity Check" begin
-    w1 = randn(10,10)
+    o = ExpDecay(0.2, 0.5, 1, 1e-3)
-    loss(x) = Flux.mse(w*x, w1*x)
+    p = [0.0]
-    flag = 1
+    steps = 1:8
-    decay_steps = []
+    eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip)
-    for t = 1:10^5
+    eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps]
-      prev_eta = o.eta
+    @test eta_actual == eta_expected
-      θ = Params([w1])
+  end
-      x = rand(10)
+
-      θ̄ = gradient(() -> loss(x), θ)
+  w = randn(10, 10)
-      prev_grad = collect(θ̄[w1])
+  o = ExpDecay(0.1, 0.1, 1000, 1e-4)
-      delta = Optimise.apply!(o, w1, θ̄[w1])
+  w1 = randn(10,10)
-      w1 .-= delta
+  loss(x) = Flux.mse(w*x, w1*x)
-      new_eta = o.eta
+  flag = 1
-      if new_eta != prev_eta
+  decay_steps = []
-        push!(decay_steps, t)
+  for t = 1:10^5
-      end
+    prev_eta = o.eta
-      array = fill(o.eta, size(prev_grad))
+    θ = Params([w1])
-      if array .* prev_grad != delta
+    x = rand(10)
-        flag = 0
+    θ̄ = gradient(() -> loss(x), θ)
-      end
+    prev_grad = collect(θ̄[w1])
    delta = Optimise.apply!(o, w1, θ̄[w1])
    w1 .-= delta
    new_eta = o.eta
    if new_eta != prev_eta
      push!(decay_steps, t)
    end
-    @test flag == 1
+    array = fill(o.eta, size(prev_grad))
-    # Test to check if decay happens at decay steps. Eta reaches clip value eventually.
+    if array .* prev_grad != delta
-    ground_truth = []
+      flag = 0
    for i in 1:11
      push!(ground_truth, 1000*i)  # Expected decay steps for this example.
    end
-    @test decay_steps == ground_truth
+  end
-    @test o.eta == o.clip
+  @test flag == 1
  # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1).
  ground_truth = []
  for i in 1:4
    push!(ground_truth, 1000*i)  # Expected decay steps for this example.
  end
  @test decay_steps == ground_truth
  @test o.eta == o.clip
 end
@testset "Clipping" begin
    # Build a gradient with large entries of both signs: the gradient of
    # `sum(w * x)` with respect to `w` is made of the entries of `x`, and
    # `x` is scaled by 1000.
    w = randn(10, 10)
    loss(x) = sum(w * x)
    θ = Params([w])
    x = 1000 * randn(10)
    w̄ = gradient(() -> loss(x), θ)[w]

    # Value clipping must bound the magnitude of every entry. Checking only
    # `.<= 1` would let unclipped large negative entries pass, so compare
    # absolute values. `copy` keeps `w̄` intact for the second check.
    w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄))
    @test all(abs.(w̄_value) .<= 1)

    # Norm clipping must bring the overall gradient norm down to the threshold.
    w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄))
    @test norm(w̄_norm) <= 1
 end
--- a/test/runtests.jl
+++ b/test/runtests.jl
@ -2,49 +2,45 @@ using Flux
 using Flux.Data
 using Test 
 using Random, Statistics, LinearAlgebra
 using Documenter
 using IterTools: ncycle
 Random.seed!(0)
-@testset "Flux" begin
+@testset "Utils" begin
  include("utils.jl")
 end
-  @testset "Utils" begin
+@testset "Onehot" begin
-    include("utils.jl")
+  include("onehot.jl")
-  end
+end
-
+
-  @testset "Onehot" begin
+@testset "Optimise" begin
-    include("onehot.jl")
+  include("optimise.jl")
-  end
+end
-
+
-  @testset "Optimise" begin
+@testset "Data" begin
-    include("optimise.jl")
+  include("data.jl")
-  end
+end
-
+
-  @testset "Data" begin
+@testset "Layers" begin
-    include("data.jl")
+  include("layers/basic.jl")
-  end
+  include("layers/normalisation.jl")
-
+  include("layers/stateless.jl")
-  @testset "Layers" begin
+  include("layers/conv.jl")
-    include("layers/basic.jl")
+end
-    include("layers/normalisation.jl")
+
-    include("layers/stateless.jl")
+@testset "CUDA" begin
-    include("layers/conv.jl")
+  if Flux.use_cuda[]
-  end
+    include("cuda/cuda.jl")
-
+  else
-  @testset "CUDA" begin
+    @warn "CUDA unavailable, not testing GPU support"
    if Flux.use_cuda[]
      include("cuda/cuda.jl")
    else
      @warn "CUDA unavailable, not testing GPU support"
    end
  end
 end
@static if VERSION >= v"1.4"
  using Documenter
  @testset "Docs" begin
-    if VERSION >= v"1.4"
+    DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
-      DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
+    doctest(Flux)
      doctest(Flux)
    end
  end
-
+end
 end # testset Flux
Author	SHA1	Message	Date
bors[bot]	7035ee9bea	Merge #1238 1238: Fix inline code block r=dhairyagandhi96 a=harryscholes ### PR Checklist - [ ] Tests are added - [ ] Entry in NEWS.md - [x] Documentation, if applicable - [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes). Co-authored-by: harryscholes <harryscholes@gmail.com>	2020-06-19 08:28:41 +00:00
harryscholes	57efd7fead	Fix inline code block	2020-06-19 09:24:44 +01:00
bors[bot]	19b45b49d3	Merge #1221 1221: DataLoader with NamedTuple r=CarloLucibello a=cossio Just a couple of small changes, so that `DataLoader` can be created with a `NamedTuple` of tensors instead of `Tuple`. This way the tensors can be referred to by name. For example ``` train_loader = DataLoader((images = Xtrain, labels = Ytrain), batchsize=16) batch = first(train_loader) y = model(batch.images) logitcrossentropy(y, batch.labels) ``` If we only use tuples, then in datasets with multiple tensors one has to be careful about the order in which the tensors are fed into the `DataLoader` constructor and be consistent with this elsewhere. With `NamedTuples` one just have to be consistent about the names used, which I think is a minor improvement. CC @CarloLucibello ### PR Checklist - [x] Tests are added - [x] Entry in NEWS.md - [x] Documentation, if applicable I don't think this qualifies as an API change. It's just a minor feature addition. So final review probably not required. - [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes). Co-authored-by: cossio <j.cossio.diaz@gmail.com> Co-authored-by: cossio <cossio@users.noreply.github.com>	2020-06-16 17:21:28 +00:00
bors[bot]	254e4a7058	Merge #1231 1231: use `ntuple` in conv r=MikeInnes a=MikeInnes This is the right abstraction over `map`, and in particular is a bit easier to compile away in some cases. As this is a trivial change from Flux's perspective it's not easy to test here, but there are downstream tests in XLA.jl. Co-authored-by: Mike J Innes <mike.j.innes@gmail.com>	2020-06-16 13:04:20 +00:00
Mike J Innes	9f931dd7fa	use `ntuple` in conv	2020-06-16 14:02:24 +01:00
cossio	9078f85096	revert selectdim selectdim can lead to type instability, see https://discourse.julialang.org/t/why-selectdim-is-type-instable/25271/5	2020-06-16 13:32:27 +02:00
cossio	1dbaf32810	DataLoader type inference tests	2020-06-16 13:32:27 +02:00
cossio	cb34bb848b	simplify _getobs	2020-06-16 13:32:27 +02:00
cossio	75692161a7	Apply suggestions from code review accept suggested changes Co-authored-by: Carlo Lucibello <carlo.lucibello@gmail.com>	2020-06-16 13:32:27 +02:00
cossio	909a55ac10	news and docs	2020-06-16 13:32:27 +02:00
cossio	02ee6ba426	DataLoader with NamedTuple	2020-06-16 13:31:29 +02:00
bors[bot]	97406507fd	Merge #1218 1218: Require weight and bias to be AbstractArrays r=CarloLucibello a=oxinabox closes #1199 While in theory someone could be using Dense with weights and biases that are not abstract arrays, I would be surprised. So allowing it is just leaving a food-gun laying around. If it is common then we can instead close #1199 by adding a special constructor for `Number` subtypes that error if they are not integers, or something a long those lines. ### PR Checklist - [x] Tests are added - [x] Entry in NEWS.md I think this is a bug-fix thus the following are not required: - [ ] Documentation, if applicable - [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes). Co-authored-by: Lyndon White <lyndon.white@invenialabs.co.uk> Co-authored-by: Lyndon White <oxinabox@ucc.asn.au>	2020-06-15 15:21:21 +00:00
Lyndon White	e61787c1c8	Update test/layers/basic.jl	2020-06-12 13:58:10 +01:00
Lyndon White	601f842eaf	bonus test	2020-06-11 23:17:40 +01:00
bors[bot]	99ec30c8c2	Merge #1220 1220: CompatHelper: bump compat for "Adapt" to "2.0" r=CarloLucibello a=github-actions[bot] This pull request changes the compat entry for the `Adapt` package from `1` to `1, 2.0`. This keeps the compat entries for earlier versions. Note: I have not tested your package with this new compat entry. It is your responsibility to make sure that your package tests pass before you merge this pull request. Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>	2020-06-11 09:54:46 +00:00
github-actions[bot]	fbfc973011	CompatHelper: bump compat for "Adapt" to "2.0"	2020-06-11 00:18:47 +00:00
Lyndon White	a1623aca76	move into 0.11 news	2020-06-10 12:39:00 +01:00
Lyndon White	15c7354c4e	Make release as DEV	2020-06-10 12:38:33 +01:00
Lyndon White	97b0aa4d36	bump version	2020-06-10 12:14:47 +01:00
Lyndon White	cf90517a8a	update news.md	2020-06-10 12:14:19 +01:00
Lyndon White	df84628c29	Require weight and bias to be AbstractArrays	2020-06-10 12:06:57 +01:00
bors[bot]	e1f80d4627	Merge #1213 1213: Fixing indentation in train! docstring r=CarloLucibello a=natema One code block is not correctly displayed in the doc of [Flux.Optimise.train! ](https://fluxml.ai/Flux.jl/stable/training/training/#Flux.Optimise.train!). Based on the previous code block, I guess it's an indentation problem. Co-authored-by: natema <natema@users.noreply.github.com>	2020-06-08 18:29:46 +00:00
bors[bot]	a7bbd3d35b	Merge #1152 1152: extend dataloader r=CarloLucibello a=CarloLucibello cfr discussion in #1149. Currently DataLoader interface supports 1. `for x in DataLoader(X)` 2. `for (x, y) in DataLoader(X, Y)` This PR adds 3. `for (x,) in DataLoader((X,))` 4. `for (x, y) in DataLoader((X, Y))` Edit: the constructor in 2. is removed in this PR Co-authored-by: CarloLucibello <carlo.lucibello@gmail.com>	2020-06-08 18:01:06 +00:00
CarloLucibello	0cf46432cf	cleanup	2020-06-08 19:59:34 +02:00
natema	70bbf18180	Fixing indentation in train! docstring One code block is not correctly displayed in the doc of [Flux.Optimise.train! ](https://fluxml.ai/Flux.jl/stable/training/training/#Flux.Optimise.train!). Based on the previous code block, I guess it's an indentation problem.	2020-06-07 15:44:04 +02:00
bors[bot]	d9b07475b0	Merge #1129 1129: Added dropgrad in huber_loss r=CarloLucibello a=HenriDeh Workaround to prevent `iterate(::nothing)` when working with CuArrays. See issue #1128 Co-authored-by: HenriDeh <47037088+HenriDeh@users.noreply.github.com>	2020-06-06 17:21:19 +00:00
bors[bot]	9ebbe8cb4c	Merge #1141 1141: Speedup matmul of CuMatrix and OneHotMatrix r=CarloLucibello a=AStupidBear This solves #189. ```julia julia> using Flux julia> using Flux: CuArrays julia> A = zeros(300, 10000) \|> gpu; julia> B = Flux.onehotbatch(rand(1:10000, 256), 1:10000) \|> gpu; julia> A * B; CuArrays.@time A * B; ┌ Warning: Performing scalar operations on GPU arrays: This is very slow, consider disallowing these operations with `allowscalar(false)` └ @ GPUArrays ~/shared/.julia/packages/GPUArrays/OXvxB/src/host/indexing.jl:43 0.002824 seconds (951 CPU allocations: 38.156 KiB) (2 GPU allocations: 301.000 KiB, 2.32% gc time of which 46.42% spent allocating) julia> import Base: * julia> A::AbstractMatrix * B::Flux.OneHotMatrix = @inbounds A[:, map(x->x.ix, B.data)] * (generic function with 522 methods) julia> A * B; CuArrays.@time A * B; 0.000343 seconds (169 CPU allocations: 5.000 KiB) (2 GPU allocations: 301.000 KiB, 15.53% gc time of which 65.97% spent allocating) ``` Co-authored-by: Yao Lu <luyaocns@gmail.com>	2020-06-06 17:00:01 +00:00
CarloLucibello	b1f226eb34	add news	2020-06-06 18:15:04 +02:00
CarloLucibello	a643cb6758	extend dataloader	2020-06-06 18:02:03 +02:00
bors[bot]	792a1c54f8	Merge #1211 1211: Fixing syntax in onehot docstring r=CarloLucibello a=natema `otherwise, it will error` -> `otherwise, it will raise an error` Co-authored-by: natema <natema@users.noreply.github.com>	2020-06-06 15:02:40 +00:00
natema	8f6aed5770	Fixing syntax in onehot docstring `otherwise, it will error` -> `otherwise, it will raise an error`	2020-06-05 18:20:50 +02:00
bors[bot]	22d5e318e5	Merge #1192 1192: Improve `restructure` performance r=dhairyagandhi96 a=MikeInnes A small change, but it significantly improves the performance on the following test case: ```julia julia> VERSION v"1.5.0-DEV.876" julia> using Flux, DiffEqFlux, BenchmarkTools julia> using Flux: mse julia> fastdense = FastDense(784, 32, tanh); julia> p = initial_params(fastdense); julia> dense = Dense(784, 32, tanh); julia> p,re = Flux.destructure(dense); julia> x = rand(Float32, 784, 10); julia> y = rand(Float32, 32, 10); julia> @btime gradient((x,p) -> mse(fastdense(x, p), y), x, p); 505.530 μs (87 allocations: 240.73 KiB) julia> @btime gradient((x,p) -> mse(re(p)(x), y), x, p); 107.796 μs (139 allocations: 340.94 KiB) ``` Co-authored-by: Mike J Innes <mike.j.innes@gmail.com>	2020-06-05 14:53:11 +00:00
bors[bot]	71ebd51e45	Merge #1208 1208: Fixing output format for `onehot` r=dhairyagandhi96 a=natema Currently `Flux.OneHotVector` is displayed as a binary vector (0/1) rather than a boolean one (true/false). This is also shown in successive examples in the same page. I fixed the `onehot(:b, [:a, :b, :c])` and `onehot(:c, [:a, :b, :c])` outputs in the first example of the page accordingly. Co-authored-by: natema <natema@users.noreply.github.com>	2020-06-05 09:17:12 +00:00
bors[bot]	b5a73f8532	Merge #1207 1207: Fixing typo in docs r=dhairyagandhi96 a=natema `what ever` -> `whatever` Co-authored-by: natema <natema@users.noreply.github.com>	2020-06-05 09:00:06 +00:00
natema	48d6f2d0c0	Fixing output format for `onehot` `Flux.OneHotVector` is displayed as a binary vector (0/1) rather than a boolean (true/false) one, as is also shown in successive examples in the same page, so I fixed the `onehot(:b, [:a, :b, :c])` and `onehot(:c, [:a, :b, :c])` output as given by the current Julia version 1.4.2.	2020-06-03 17:03:08 +02:00
natema	2c4b1e521e	Fixing typo in docs `what ever` -> `whatever`	2020-06-02 19:20:41 +02:00
bors[bot]	ca1b1b2c7c	Merge #1206 1206: Fixing ambiguous remark in Preserve inputs' types r=dhairyagandhi96 a=natema This PR is based on the [discussion in the forum](https://discourse.julialang.org/t/not-clear-what-0-01f0x-is-in-the-flux-docs/40553?u=mathematics) on the ambiguity of `0.01f0x` in the line > While one could change the activation function (e.g. to use `0.01f0x`) Co-authored-by: natema <natema@users.noreply.github.com>	2020-06-02 17:09:58 +00:00
natema	a24f46b606	Fixing ambiguous remark in Preserve inputs' types This PR is based on the [discussion in the forum](https://discourse.julialang.org/t/not-clear-what-0-01f0x-is-in-the-flux-docs/40553?u=mathematics) on the ambiguity of `0.01f0x` in the line > While one could change the activation function (e.g. to use `0.01f0x`)	2020-06-02 18:48:07 +02:00
Mike J Innes	089ec0832c	improved restructure adjoint	2020-05-27 12:28:22 +01:00
bors[bot]	ddd0f4e747	Merge #1191 1191: Pull Request Template r=MikeInnes a=MikeInnes Hopefully makes it a little clearer what the requirements are, which will lead to easier review, and encourage things like NEWS.md that we want to be better in sync. cc @dhairyagandhi96 and @CarloLucibello for thoughts. Co-authored-by: Mike J Innes <mike.j.innes@gmail.com>	2020-05-27 11:15:26 +00:00
Mike J Innes	e10818bbad	Update pull_request_template.md	2020-05-27 12:12:13 +01:00
Mike J Innes	8c3a80c940	Create pull_request_template.md	2020-05-26 12:52:28 +01:00
bors[bot]	85c39e2309	Merge #1190 1190: Correcting advanced.md r=dhairyagandhi96 a=Sleort To make the example consistent, it should be ``` julia> Flux.trainable(a::Affine) = (a.W,) ``` not ``` julia> Flux.trainable(a::Affine) = (a.W, a.b) ``` Co-authored-by: Troels Arnfred Bojesen <tr-ab@online.no>	2020-05-25 14:47:42 +00:00
Troels Arnfred Bojesen	17bb00a3fa	Correcting advanced.md To make the example consistent, it should be ``` julia> Flux.trainable(a::Affine) = (a.W,) ``` not ``` julia> Flux.trainable(a::Affine) = (a.W, a.b) ```	2020-05-25 23:33:09 +09:00
bors[bot]	bd152ca099	Merge #1177 1177: Align ExpDecay implementation with documentation r=dhairyagandhi96 a=DrChainsaw Fix for #1176 Co-authored-by: DrChainsaw <Christian.kyril.skarby@gmail.com>	2020-05-21 14:33:20 +00:00
bors[bot]	f343172daf	Merge #1185 1185: Add some news r=dhairyagandhi96 a=dhairyagandhi96 cc @CarloLucibello please add to this list as well Co-authored-by: Dhairya Gandhi <dhairya@juliacopmuting.com>	2020-05-21 12:46:39 +00:00
bors[bot]	472e1fbf5e	Merge #957 957: Add some gradient checking tests on GPUs r=dhairyagandhi96 a=dhairyagandhi96 Good to add generic tests for tracking gradients through the various layers on the GPU. Co-authored-by: Dhairya Gandhi <dhairya@juliacopmuting.com> Co-authored-by: Dhairya Gandhi <dhairya@juliacomputing.com>	2020-05-21 12:25:53 +00:00
Dhairya Gandhi	0801064d50	add comment on broken layers	2020-05-20 00:11:38 +05:30
Dhairya Gandhi	c4409fa6d1	clearing failures	2020-05-19 23:54:18 +05:30
bors[bot]	87ba651add	Merge #1165 1165: Fix docstring of logitcrossentropy r=dhairyagandhi96 a=cossio Since `y` is a logit, there is no log (see the diff). Co-authored-by: cossio <cossio@users.noreply.github.com>	2020-05-19 11:07:15 +00:00
Dhairya Gandhi	55430e207d	add news	2020-05-19 16:34:28 +05:30
bors[bot]	0b10f1a8df	Merge #1184 1184: Add some functions to docs r=dhairyagandhi96 a=dhairyagandhi96 Co-authored-by: Dhairya Gandhi <dhairya@juliacopmuting.com>	2020-05-18 21:10:46 +00:00
DrChainsaw	9a24ee0bd7	Change intendation to 2 spaces	2020-05-18 21:52:40 +02:00
Dhairya Gandhi	bdfe567519	add some layers to docs	2020-05-18 23:53:11 +05:30
bors[bot]	b6a5dd7152	Merge #1133 1133: add ClipValue and ClipNorm r=CarloLucibello a=AStupidBear Co-authored-by: Yao Lu <luyaocns@gmail.com>	2020-05-15 17:15:07 +00:00
Yao Lu	007586858c	fix export merge conflict	2020-05-14 17:13:35 +08:00
Dhairya Gandhi	fab53e0a01	Merge pull request #1179 from FluxML/compathelper/new_version/2020-05-13-00-13-17-919-1190174363 CompatHelper: add new compat entry for "Functors" at version "0.1"	2020-05-13 11:27:40 +05:30
github-actions[bot]	3fa9e91c41	CompatHelper: add new compat entry for "Functors" at version "0.1"	2020-05-13 00:13:46 +00:00
DrChainsaw	e8433d0abe	Align ExpDecay implementation with documentation	2020-05-12 22:50:17 +02:00
bors[bot]	de39d1095b	Merge #1175 1175: xlogy broadcast adjoint r=MikeInnes a=MikeInnes This is helpful for performance, since it avoids having to differentiate `xlogy` itself inside of a map. Co-authored-by: Mike J Innes <mike.j.innes@gmail.com>	2020-05-12 17:10:58 +00:00
Yao Lu	5a9eb7411a	cpu	2020-05-10 14:39:48 +08:00
Yao Lu	888f286c51	use @inbounds	2020-05-09 19:40:46 +08:00
Yao Lu	63cb70dd23	remove importing CuMatrix	2020-05-09 19:13:52 +08:00
Yao Lu	30648910c8	transfer onehot indices back to cpu	2020-05-09 19:10:46 +08:00
Yao Lu	d1ad8db625	add to docs	2020-05-09 16:40:26 +08:00
cossio	9e1fd883d5	Fix docstring of logitbinarycrossentropy and logitcrossentropy	2020-05-05 16:29:29 +02:00
Yao Lu	114f63a214	norm(Δ)	2020-04-26 17:28:07 +08:00
Yao Lu	eb6898ea19	speedup matmul of CuMatrix and OneHotMatrix	2020-04-25 23:22:46 +08:00
Yao Lu	7d6f711c6f	Merge branch 'master' into clip	2020-04-25 22:18:58 +08:00
Yao Lu	58a72ec879	Merge branch 'master' of https://github.com/FluxML/Flux.jl into clip	2020-04-22 01:29:13 +08:00
Yao Lu	c4f5e83697	resolve conflict	2020-04-22 01:24:13 +08:00
Yao Lu	1dfec7f38b	add test	2020-04-22 01:22:34 +08:00
Yao Lu	def19b058e	simplify docstrings	2020-04-21 10:56:38 +08:00
Yao Lu	cc1dcd5590	rm requires	2020-04-20 20:02:29 +08:00
Yao Lu	68b84bba36	add LinearAlgebra	2020-04-20 19:54:44 +08:00
Yao Lu	ba0fca5a19	remove onehot	2020-04-20 19:45:15 +08:00
Yao Lu	b33c4b49be	add ClipValue and ClipNorm	2020-04-20 19:41:10 +08:00
Yao Lu	427c55af92	speedup matmul of CuMatrix and OneHotMatrix	2020-04-20 19:11:57 +08:00
HenriDeh	ac94754281	Update stateless.jl	2020-04-18 13:23:11 +02:00
HenriDeh	1f2643c95c	Add dropgrad in huber_loss Workaround for issue #1128	2020-04-17 13:34:04 +02:00
Dhairya Gandhi	26631e1361	test_broken AlphaDropout	2020-02-16 21:22:37 +05:30
Dhairya Gandhi	29ab410794	test gradients are allocated on the gpu	2020-01-17 15:52:26 +05:30
Dhairya Gandhi	b1e68813a8	cpu -> test_throws	2019-12-20 23:02:44 +05:30
Dhairya Gandhi	efa2cbfd0e	checkin Manifest#master	2019-12-11 14:13:41 +05:30
Dhairya Gandhi	9b6155c77d	Merge branch 'master' into dg/gradtests	2019-12-05 18:17:47 +05:30
Dhairya Gandhi	76dc8ea9d4	formatting fixes	2019-12-05 18:14:04 +05:30
Dhairya Gandhi	717ad9328d	add some grad tests on GPU	2019-12-05 18:12:23 +05:30