Compare commits


No commits in common. "master" and "functors" have entirely different histories.

34 changed files with 262 additions and 915 deletions

View File

@ -1,12 +0,0 @@
[Please delete this text and describe your change here.
For bugfixes, please detail the bug and include a test case which your patch fixes.
If you are adding a new feature, please clearly describe the design, its rationale, the possible alternatives considered.
It is easiest to merge new features when there is clear precedent in other systems; we need to know we're taking
the right direction since it can be hard to change later.]
### PR Checklist
- [ ] Tests are added
- [ ] Entry in NEWS.md
- [ ] Documentation, if applicable
- [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes).

View File

@ -6,8 +6,16 @@ on:
jobs: jobs:
CompatHelper: CompatHelper:
runs-on: ubuntu-latest runs-on: ${{ matrix.os }}
strategy:
matrix:
julia-version: [1.3]
julia-arch: [x64]
os: [ubuntu-latest]
steps: steps:
- uses: julia-actions/setup-julia@latest
with:
version: ${{ matrix.julia-version }}
- name: Pkg.add("CompatHelper") - name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")' run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main() - name: CompatHelper.main()

View File

@ -16,7 +16,7 @@ notifications:
jobs: jobs:
include: include:
- stage: "Documentation" - stage: "Documentation"
julia: 1.3 julia: 1
os: linux os: linux
script: script:
- julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd()));

View File

@ -8,35 +8,35 @@ version = "0.5.0"
[[AbstractTrees]] [[AbstractTrees]]
deps = ["Markdown"] deps = ["Markdown"]
git-tree-sha1 = "33e450545eaf7699da1a6e755f9ea65f14077a45" git-tree-sha1 = "86d092c2599f1f7bb01668bf8eb3412f98d61e47"
uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
version = "0.3.3" version = "0.3.2"
[[Adapt]] [[Adapt]]
deps = ["LinearAlgebra"] deps = ["LinearAlgebra"]
git-tree-sha1 = "fd04049c7dd78cfef0b06cdc1f0f181467655712" git-tree-sha1 = "c88cfc7f9c1f9f8633cddf0b56e86302b70f64c5"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "1.1.0" version = "1.0.1"
[[ArrayLayouts]] [[ArrayLayouts]]
deps = ["FillArrays", "LinearAlgebra"] deps = ["FillArrays", "LinearAlgebra"]
git-tree-sha1 = "a504dca2ac7eda8761c8f7c1ed52427a1be75a3c" git-tree-sha1 = "41956a49a8a4fefa1bf6664bca4a3035aba4c3a0"
uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a" uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
version = "0.2.6" version = "0.2.3"
[[Base64]] [[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinaryProvider]] [[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"] deps = ["Libdl", "SHA"]
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.10" version = "0.5.8"
[[CEnum]] [[CEnum]]
git-tree-sha1 = "1b77a77c3b28e0b3f413f7567c9bb8dd9bdccd14" git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.3.0" version = "0.2.0"
[[CUDAapi]] [[CUDAapi]]
deps = ["Libdl", "Logging"] deps = ["Libdl", "Logging"]
@ -46,21 +46,21 @@ version = "4.0.0"
[[CUDAdrv]] [[CUDAdrv]]
deps = ["CEnum", "CUDAapi", "Printf"] deps = ["CEnum", "CUDAapi", "Printf"]
git-tree-sha1 = "f56bbf18c86bcff7a961a32a4947a5abb2963a29" git-tree-sha1 = "e650cbaee92b60433313157926b1e80d0c3a0e2e"
uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
version = "6.3.0" version = "6.2.2"
[[CUDAnative]] [[CUDAnative]]
deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "ExprTools", "GPUCompiler", "LLVM", "Libdl", "Pkg", "Printf"] deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "MacroTools", "Pkg", "Printf", "TimerOutputs"]
git-tree-sha1 = "ac86db2b05fdfec96b011e25a504ffe7476e8a68" git-tree-sha1 = "d1fc99635d0002c8a819b78cb1f441eb44310725"
uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
version = "3.1.0" version = "3.0.2"
[[CodeTracking]] [[CodeTracking]]
deps = ["InteractiveUtils", "UUIDs"] deps = ["InteractiveUtils", "UUIDs"]
git-tree-sha1 = "cab4da992adc0a64f63fa30d2db2fd8bec40cab4" git-tree-sha1 = "0becdab7e6fbbcb7b88d8de5b72e5bb2f28239f3"
uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
version = "0.5.11" version = "0.5.8"
[[CodecZlib]] [[CodecZlib]]
deps = ["TranscodingStreams", "Zlib_jll"] deps = ["TranscodingStreams", "Zlib_jll"]
@ -70,15 +70,15 @@ version = "0.7.0"
[[ColorTypes]] [[ColorTypes]]
deps = ["FixedPointNumbers", "Random"] deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "c73d9cfc2a9d8433dc77f5bff4bddf46b1d78c20" git-tree-sha1 = "c4c1cca28748906265ed62c788d6fe6f0134d264"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.10.3" version = "0.10.0"
[[Colors]] [[Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"] deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"]
git-tree-sha1 = "1e9bba7984e78aa8cdeea7f9f7cc984ad4e4b1c7" git-tree-sha1 = "2fdeb981ebcf52cd800ddb6a0aa5eac34153552d"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.2" version = "0.12.0"
[[CommonSubexpressions]] [[CommonSubexpressions]]
deps = ["Test"] deps = ["Test"]
@ -93,27 +93,27 @@ uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "0.3.3+0" version = "0.3.3+0"
[[Cthulhu]] [[Cthulhu]]
deps = ["CodeTracking", "InteractiveUtils", "REPL", "UUIDs", "Unicode"] deps = ["CodeTracking", "InteractiveUtils", "REPL", "Unicode"]
git-tree-sha1 = "f3643e78353199d3097821e806348bd83f364155" git-tree-sha1 = "484790098c85c26f8e59051f8ff1a0745c034a7d"
uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f" uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f"
version = "1.1.1" version = "1.0.1"
[[CuArrays]] [[CuArrays]]
deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"]
git-tree-sha1 = "1582b74d2322df7dd94549d4ac9d095e0f20e884" git-tree-sha1 = "e8c55b38dcca955f5aed8ec4479cdc95810db1e1"
uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
version = "2.2.1" version = "2.0.1"
[[DataAPI]] [[DataAPI]]
git-tree-sha1 = "176e23402d80e7743fc26c19c681bfb11246af32" git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.3.0" version = "1.1.0"
[[DataStructures]] [[DataStructures]]
deps = ["InteractiveUtils", "OrderedCollections"] deps = ["InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "af6d9c86e191c917c2276fbede1137e8ea20157f" git-tree-sha1 = "73eb18320fe3ba58790c8b8f6f89420f0a622773"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.17.17" version = "0.17.11"
[[Dates]] [[Dates]]
deps = ["Printf"] deps = ["Printf"]
@ -139,16 +139,11 @@ version = "1.0.1"
deps = ["Random", "Serialization", "Sockets"] deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[ExprTools]]
git-tree-sha1 = "6f0517056812fd6aa3af23d4b70d5325a2ae4e95"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.1"
[[FillArrays]] [[FillArrays]]
deps = ["LinearAlgebra", "Random", "SparseArrays"] deps = ["LinearAlgebra", "Random", "SparseArrays"]
git-tree-sha1 = "44f561e293987ffc84272cd3d2b14b0b93123d63" git-tree-sha1 = "51cc2f9bc4eb9c6c0e81ec2f779d1085583cc956"
uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
version = "0.8.10" version = "0.8.7"
[[FixedPointNumbers]] [[FixedPointNumbers]]
git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238" git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238"
@ -167,27 +162,17 @@ git-tree-sha1 = "f40adc6422f548176bb4351ebd29e4abf773040a"
uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196" uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
version = "0.1.0" version = "0.1.0"
[[Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[GPUArrays]] [[GPUArrays]]
deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
git-tree-sha1 = "d887693eb1bd5e1fd573262a978745481895ec7d" git-tree-sha1 = "d586762b08dcda13228df8967119b9cb6f22ade5"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "3.4.1" version = "3.1.0"
[[GPUCompiler]]
deps = ["Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "TimerOutputs"]
git-tree-sha1 = "5275aa268ecd09640b32560e1eae90c78816e4d1"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.2.0"
[[IRTools]] [[IRTools]]
deps = ["InteractiveUtils", "MacroTools", "Test"] deps = ["InteractiveUtils", "MacroTools", "Test"]
git-tree-sha1 = "90ee39f9beaaa186e4968417ea2b8ed5673c91c0" git-tree-sha1 = "1a4355e4b5b50be2311ebb644f34f3306dbd0410"
uuid = "7869d1d1-7146-5819-86e3-90919afe41df" uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
version = "0.3.3" version = "0.3.1"
[[InteractiveUtils]] [[InteractiveUtils]]
deps = ["Markdown"] deps = ["Markdown"]
@ -195,15 +180,15 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[Juno]] [[Juno]]
deps = ["Base64", "Logging", "Media", "Profile"] deps = ["Base64", "Logging", "Media", "Profile"]
git-tree-sha1 = "a686b0cf235fa3e491b79b4783c2d2382292b436" git-tree-sha1 = "e1ba2a612645b3e07c773c3a208f215745081fe6"
uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
version = "0.8.2" version = "0.8.1"
[[LLVM]] [[LLVM]]
deps = ["CEnum", "Libdl", "Printf", "Unicode"] deps = ["CEnum", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "dd3f584c3dbefe39b2a8fbafa1a3b77e31e21255" git-tree-sha1 = "b6b86801ae2f2682e0a4889315dc76b68db2de71"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0" uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "1.5.1" version = "1.3.4"
[[LibGit2]] [[LibGit2]]
deps = ["Printf"] deps = ["Printf"]
@ -262,9 +247,10 @@ uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.3+3" version = "0.5.3+3"
[[OrderedCollections]] [[OrderedCollections]]
git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3" deps = ["Random", "Serialization", "Test"]
git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.2.0" version = "1.1.0"
[[Pkg]] [[Pkg]]
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
@ -319,15 +305,15 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]] [[SpecialFunctions]]
deps = ["OpenSpecFun_jll"] deps = ["OpenSpecFun_jll"]
git-tree-sha1 = "d8d8b8a9f4119829410ecd706da4cc8594a1e020" git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b" uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "0.10.3" version = "0.10.0"
[[StaticArrays]] [[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"] deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "5c06c0aeb81bef54aed4b3f446847905eb6cbda0" git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c"
uuid = "90137ffa-7385-5640-81b9-e52037218182" uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "0.12.3" version = "0.12.1"
[[Statistics]] [[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"] deps = ["LinearAlgebra", "SparseArrays"]
@ -345,9 +331,9 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TimerOutputs]] [[TimerOutputs]]
deps = ["Printf"] deps = ["Printf"]
git-tree-sha1 = "f458ca23ff80e46a630922c555d838303e4b9603" git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.6" version = "0.5.3"
[[TranscodingStreams]] [[TranscodingStreams]]
deps = ["Random", "Test"] deps = ["Random", "Test"]
@ -364,21 +350,21 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[ZipFile]] [[ZipFile]]
deps = ["Libdl", "Printf", "Zlib_jll"] deps = ["Libdl", "Printf", "Zlib_jll"]
git-tree-sha1 = "254975fef2fc526583bb9b7c9420fe66ffe09f2f" git-tree-sha1 = "8748302cfdec02c4ae9c97b112cf10003f7f767f"
uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
version = "0.9.2" version = "0.9.1"
[[Zlib_jll]] [[Zlib_jll]]
deps = ["Libdl", "Pkg"] deps = ["Libdl", "Pkg"]
git-tree-sha1 = "a2e0d558f6031002e380a90613b199e37a8565bf" git-tree-sha1 = "2f6c3e15e20e036ee0a0965879b31442b7ec50fa"
uuid = "83775a58-1f1d-513f-b197-d71354ab007a" uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.11+10" version = "1.2.11+9"
[[Zygote]] [[Zygote]]
deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "Future", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
git-tree-sha1 = "707ceea58e2bd0ff3077ab13a92f8355181d3ee4" git-tree-sha1 = "1ccbfbe8930376e31752b812daa2532c723dc332"
uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
version = "0.4.20" version = "0.4.13"
[[ZygoteRules]] [[ZygoteRules]]
deps = ["MacroTools"] deps = ["MacroTools"]

NEWS.md
View File

@ -1,19 +1,3 @@
# v0.11
* Change to `DataLoader`'s constructor [https://github.com/FluxML/Flux.jl/pull/1152]
* Use `DataLoader` with `NamedTuple`s, so that tensors can be accessed by name [https://github.com/FluxML/Flux.jl/pull/1221].
* Error if a Dense layer's weights and biases are not arrays [https://github.com/FluxML/Flux.jl/pull/1218].
# v0.10.5
* Add option for [same padding](https://github.com/FluxML/Flux.jl/pull/901) to conv and pooling layers by setting `pad=SamePad()`.
* Added option to set `bias` to [Flux.Zeros](https://github.com/FluxML/Flux.jl/pull/873) to exclude `bias` from being trained.
* Added `GlobalMaxPool` and `GlobalMeanPool` [layers](https://github.com/FluxML/Flux.jl/pull/950) for performing global pooling operations.
* Added `ClipValue` and `ClipNorm` in this [pr](https://github.com/FluxML/Flux.jl/pull/1133) to `Flux.Optimise` to provide a cleaner API for gradient clipping.
* Added new kwarg-only [constructors](https://github.com/FluxML/Flux.jl/pull/873) for the various convolutional layers.
* Documented the convolutional layer constructors accepting `weight` and `bias` keyword arguments to supply custom arrays for those fields.
* Testing suite improvements now test for gradients of all layers along with GPU support.
* Functors have now moved to [Functors.jl](https://github.com/FluxML/Flux.jl/pull/1174) to allow for their use outside of Flux.
* Added [helper functions](https://github.com/FluxML/Flux.jl/pull/873) `Flux.convfilter` and `Flux.depthwiseconvfilter` to construct weight arrays for convolutions outside of layer constructors so as to not have to depend on the default layers for custom implementations.
# v0.10.0 # v0.10.0
* The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669) * The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669)
- The dependency on Tracker.jl has been removed. - The dependency on Tracker.jl has been removed.

View File

@ -1,6 +1,6 @@
name = "Flux" name = "Flux"
uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
version = "0.11.0-DEV" version = "0.10.4"
[deps] [deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@ -11,7 +11,6 @@ CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d" Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@ -27,11 +26,10 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
[compat] [compat]
AbstractTrees = "0.2, 0.3" AbstractTrees = "0.2, 0.3"
Adapt = "1, 2.0" Adapt = "1"
CodecZlib = "0.5, 0.6, 0.7" CodecZlib = "0.5, 0.6, 0.7"
Colors = "0.8, 0.9, 0.10, 0.11, 0.12" Colors = "0.8, 0.9, 0.10, 0.11, 0.12"
CuArrays = "2" CuArrays = "2"
Functors = "0.1"
Juno = "0.5, 0.6, 0.7, 0.8" Juno = "0.5, 0.6, 0.7, 0.8"
MacroTools = "0.3, 0.4, 0.5" MacroTools = "0.3, 0.4, 0.5"
NNlib = "0.6" NNlib = "0.6"

View File

@ -7,15 +7,15 @@ julia> using Flux: onehot, onecold
julia> onehot(:b, [:a, :b, :c]) julia> onehot(:b, [:a, :b, :c])
3-element Flux.OneHotVector: 3-element Flux.OneHotVector:
0 false
1 true
0 false
julia> onehot(:c, [:a, :b, :c]) julia> onehot(:c, [:a, :b, :c])
3-element Flux.OneHotVector: 3-element Flux.OneHotVector:
0 false
0 false
1 true
``` ```
The inverse is `onecold` (which can take a general probability distribution, as well as just booleans). The inverse is `onecold` (which can take a general probability distribution, as well as just booleans).
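As a quick illustration of the inverse mentioned above, a minimal sketch (not part of this diff) assuming the same `onehot`/`onecold` imports used in the doctest:
```julia
julia> using Flux: onehot, onecold

julia> onecold(onehot(:b, [:a, :b, :c]), [:a, :b, :c])
:b

julia> onecold([0.1, 0.7, 0.2], [:a, :b, :c])  # also works on a probability vector
:b
```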

View File

@ -19,7 +19,7 @@ Affine{Array{Float64,2},Array{Float64,1}}([0.66722 0.774872 0.249809; 0.843321 0
julia> Flux.params(a) # default behavior julia> Flux.params(a) # default behavior
Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]]) Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]])
julia> Flux.trainable(a::Affine) = (a.W,) julia> Flux.trainable(a::Affine) = (a.W, a.b,)
julia> Flux.params(a) julia> Flux.params(a)
Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297]]) Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297]])
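For context, a minimal sketch of how `trainable` interacts with `params`; the `Affine` layer is assumed from the surrounding guide and is not part of this diff:
```julia
using Flux

struct Affine
  W
  b
end
(a::Affine)(x) = a.W * x .+ a.b
Flux.@functor Affine                 # both fields participate in fmap/gpu/cpu

Flux.trainable(a::Affine) = (a.W,)   # but only W is collected for training

a = Affine(rand(3, 3), rand(3))
length(Flux.params(a))               # == 1: only the weight matrix
```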

View File

@ -32,6 +32,8 @@ julia> gradient(f, [2, 1], [2, 0])
But machine learning models can have *hundreds* of parameters! To handle this, Flux lets you work with collections of parameters, via `params`. You can get the gradient of all parameters used in a program without explicitly passing them in. But machine learning models can have *hundreds* of parameters! To handle this, Flux lets you work with collections of parameters, via `params`. You can get the gradient of all parameters used in a program without explicitly passing them in.
```jldoctest basics ```jldoctest basics
julia> using Flux
julia> x = [2, 1]; julia> x = [2, 1];
julia> y = [2, 0]; julia> y = [2, 0];
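A short, self-contained sketch of the implicit-parameter style this doctest introduces; `W`, `b` and `predict` are placeholder names, not part of the diff:
```julia
using Flux

W = rand(2, 5)
b = rand(2)
predict(x) = W * x .+ b
loss(x, y) = sum((predict(x) .- y) .^ 2)

x, y = rand(5), rand(2)
gs = gradient(() -> loss(x, y), params(W, b))  # no explicit parameter passing
gs[W]                                          # gradient of the loss w.r.t. W
```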

View File

@ -20,11 +20,7 @@ GlobalMeanPool
DepthwiseConv DepthwiseConv
ConvTranspose ConvTranspose
CrossCor CrossCor
SamePad
flatten flatten
Flux.Zeros
Flux.convfilter
Flux.depthwiseconvfilter
``` ```
## Recurrent Layers ## Recurrent Layers

View File

@ -39,7 +39,7 @@ E.g. the following will have run into the same problem as above:
leaky_tanh(x) = 0.01*x + tanh(x) leaky_tanh(x) = 0.01*x + tanh(x)
``` ```
While one could change the activation function (e.g. to use `0.01f0*x`), the idiomatic (and safe) way to avoid type casts whenever inputs change is to use `oftype`: While one could change the activation function (e.g. to use `0.01f0x`), the idiomatic (and safe) way to avoid type casts whenever inputs change is to use `oftype`:
``` ```
leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x) leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x)
``` ```
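A rough check of the promotion behaviour described above, assuming a `Float32` input (illustrative only):
```julia
leaky_tanh_bad(x)  = 0.01 * x + tanh(x)                 # 0.01 is Float64, so the result promotes
leaky_tanh_good(x) = oftype(x / 1, 0.01) * x + tanh(x)  # constant converted to the input's float type

x = 0.5f0
typeof(leaky_tanh_bad(x))   # Float64
typeof(leaky_tanh_good(x))  # Float32
```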

View File

@ -80,7 +80,7 @@ Momentum(eta::Real, rho::Real) = Momentum(eta, rho, IdDict())
The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked. The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked.
```julia ```julia
function Flux.Optimise.apply!(o::Momentum, x, Δ) function apply!(o::Momentum, x, Δ)
η, ρ = o.eta, o.rho η, ρ = o.eta, o.rho
v = get!(o.velocity, x, zero(x))::typeof(x) v = get!(o.velocity, x, zero(x))::typeof(x)
@. v = ρ * v - η * Δ @. v = ρ * v - η * Δ
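As an aside, a sketch of how an optimiser with this `apply!` interface is driven; it uses Flux's built-in `Momentum`, which takes the same `(eta, rho)` arguments as the hand-written one above, and the model and loss here are placeholders, not part of the diff:
```julia
using Flux

m = Dense(5, 2)
loss(x, y) = Flux.mse(m(x), y)
x, y = rand(Float32, 5, 10), rand(Float32, 2, 10)

opt = Momentum(0.01, 0.9)
ps  = params(m)
gs  = gradient(() -> loss(x, y), ps)
Flux.Optimise.update!(opt, ps, gs)   # calls apply!(opt, p, gs[p]) for each parameter p
```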
@ -140,16 +140,3 @@ ExpDecay
InvDecay InvDecay
WeightDecay WeightDecay
``` ```
## Gradient Clipping
Gradient clipping is useful for training recurrent neural networks, which have a tendency to suffer from the exploding gradient problem. An example usage is
```julia
opt = Optimiser(ClipValue(1e-3), ADAM(1e-3))
```
```@docs
ClipValue
ClipNorm
```

View File

@ -3,15 +3,14 @@ module Flux
# Zero Flux Given # Zero Flux Given
using Base: tail using Base: tail
using Statistics, Random, LinearAlgebra using Zygote, MacroTools, Juno, Reexport, Statistics, Random
using Zygote, MacroTools, Juno, Reexport
using MacroTools: @forward using MacroTools: @forward
@reexport using NNlib @reexport using NNlib
using Zygote: Params, @adjoint, gradient, pullback, @nograd using Zygote: Params, @adjoint, gradient, pullback, @nograd
export gradient export gradient
export Chain, Dense, Maxout, RNN, LSTM, GRU, SamePad, Conv, CrossCor, ConvTranspose, export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose,
GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, flatten, GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, flatten,
DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode! SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode!
@ -19,17 +18,15 @@ export Chain, Dense, Maxout, RNN, LSTM, GRU, SamePad, Conv, CrossCor, ConvTransp
include("optimise/Optimise.jl") include("optimise/Optimise.jl")
using .Optimise using .Optimise
using .Optimise: @epochs using .Optimise: @epochs
export Descent, ADAM, Momentum, Nesterov, RMSProp, export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay, ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay
ClipValue, ClipNorm
using CuArrays using CuArrays
const use_cuda = Ref(false) const use_cuda = Ref(false)
include("utils.jl") include("utils.jl")
include("zeros.jl")
include("onehot.jl") include("onehot.jl")
include("functor.jl") include("functor.jl")

View File

@ -51,6 +51,4 @@ export Iris
include("housing.jl") include("housing.jl")
export Housing export Housing
@deprecate DataLoader(x...; kws...) DataLoader(x; kws...)
end end

View File

@ -1,7 +1,7 @@
# Adapted from Knet's src/data.jl (author: Deniz Yuret) # Adapted from Knet's src/data.jl (author: Deniz Yuret)
struct DataLoader{D} struct DataLoader
data::D data
batchsize::Int batchsize::Int
nobs::Int nobs::Int
partial::Bool partial::Bool
@ -11,20 +11,21 @@ struct DataLoader{D}
end end
""" """
DataLoader(data; batchsize=1, shuffle=false, partial=true) DataLoader(data...; batchsize=1, shuffle=false, partial=true)
An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations
(except possibly the last one). (except possibly the last one).
Takes as input a single data tensor, or a tuple (or a named tuple) of tensors. Takes as input one or more data tensors, e.g. X in unsupervised learning, X and Y in
The last dimension in each tensor is considered to be the observation dimension. supervised learning. The last dimension in each tensor is considered to be the observation
dimension.
If `shuffle=true`, shuffles the observations each time iterations are re-started. If `shuffle=true`, shuffles the observations each time iterations are re-started.
If `partial=false`, drops the last mini-batch if it is smaller than the batchsize. If `partial=false`, drops the last mini-batch if it is smaller than the batchsize.
The original data is preserved in the `data` field of the DataLoader. The original data is preserved as a tuple in the `data` field of the DataLoader.
Usage example: Example usage:
Xtrain = rand(10, 100) Xtrain = rand(10, 100)
train_loader = DataLoader(Xtrain, batchsize=2) train_loader = DataLoader(Xtrain, batchsize=2)
@ -36,16 +37,9 @@ Usage example:
train_loader.data # original dataset train_loader.data # original dataset
# similar, but yielding tuples
train_loader = DataLoader((Xtrain,), batchsize=2)
for (x,) in train_loader
@assert size(x) == (10, 2)
...
end
Xtrain = rand(10, 100) Xtrain = rand(10, 100)
Ytrain = rand(100) Ytrain = rand(100)
train_loader = DataLoader((Xtrain, Ytrain), batchsize=2, shuffle=true) train_loader = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true)
for epoch in 1:100 for epoch in 1:100
for (x, y) in train_loader for (x, y) in train_loader
@assert size(x) == (10, 2) @assert size(x) == (10, 2)
@ -57,25 +51,25 @@ Usage example:
# train for 10 epochs # train for 10 epochs
using IterTools: ncycle using IterTools: ncycle
Flux.train!(loss, ps, ncycle(train_loader, 10), opt) Flux.train!(loss, ps, ncycle(train_loader, 10), opt)
# can use NamedTuple to name tensors
train_loader = DataLoader((images=Xtrain, labels=Ytrain), batchsize=2, shuffle=true)
for datum in train_loader
@assert size(datum.images) == (10, 2)
@assert size(datum.labels) == (2,)
end
""" """
function DataLoader(data; batchsize=1, shuffle=false, partial=true) function DataLoader(data...; batchsize=1, shuffle=false, partial=true)
length(data) > 0 || throw(ArgumentError("Need at least one data input"))
batchsize > 0 || throw(ArgumentError("Need positive batchsize")) batchsize > 0 || throw(ArgumentError("Need positive batchsize"))
n = _nobs(data) nx = size(data[1])[end]
if n < batchsize for i=2:length(data)
@warn "Number of observations less than batchsize, decreasing the batchsize to $n" nx != size(data[i])[end] && throw(DimensionMismatch("All data should contain same number of observations"))
batchsize = n
end end
imax = partial ? n : n - batchsize + 1 if nx < batchsize
DataLoader(data, batchsize, n, partial, imax, [1:n;], shuffle) @warn "Number of data points less than batchsize, decreasing the batchsize to $nx"
batchsize = nx
end end
imax = partial ? nx : nx - batchsize + 1
ids = 1:min(nx, batchsize)
DataLoader(data, batchsize, nx, partial, imax, [1:nx;], shuffle)
end
getdata(x::AbstractArray, ids) = x[(Base.Colon() for _=1:ndims(x)-1)..., ids]
@propagate_inbounds function Base.iterate(d::DataLoader, i=0) # returns data in d.indices[i+1:i+batchsize] @propagate_inbounds function Base.iterate(d::DataLoader, i=0) # returns data in d.indices[i+1:i+batchsize]
i >= d.imax && return nothing i >= d.imax && return nothing
@ -84,7 +78,11 @@ end
end end
nexti = min(i + d.batchsize, d.nobs) nexti = min(i + d.batchsize, d.nobs)
ids = d.indices[i+1:nexti] ids = d.indices[i+1:nexti]
batch = _getobs(d.data, ids) if length(d.data) == 1
batch = getdata(d.data[1], ids)
else
batch = ((getdata(x, ids) for x in d.data)...,)
end
return (batch, nexti) return (batch, nexti)
end end
@ -92,19 +90,3 @@ function Base.length(d::DataLoader)
n = d.nobs / d.batchsize n = d.nobs / d.batchsize
d.partial ? ceil(Int,n) : floor(Int,n) d.partial ? ceil(Int,n) : floor(Int,n)
end end
_nobs(data::AbstractArray) = size(data)[end]
function _nobs(data::Union{Tuple, NamedTuple})
length(data) > 0 || throw(ArgumentError("Need at least one data input"))
n = _nobs(data[1])
if !all(x -> _nobs(x) == n, Base.tail(data))
throw(DimensionMismatch("All data should contain same number of observations"))
end
return n
end
_getobs(data::AbstractArray, i) = data[ntuple(i -> Colon(), Val(ndims(data) - 1))..., i]
_getobs(data::Union{Tuple, NamedTuple}, i) = map(Base.Fix2(_getobs, i), data)
Base.eltype(::DataLoader{D}) where D = D
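For orientation, a sketch of the `DataLoader` API on the master side of this diff, where the constructor takes one array, a tuple, or a `NamedTuple` (illustrative, assuming that version of `dataloader.jl`):
```julia
using Flux: DataLoader

X = rand(10, 100)
Y = rand(100)

dl = DataLoader(X, batchsize=25)                       # single array: batches are arrays
size(first(dl))                                        # (10, 25)

dl2 = DataLoader((X, Y), batchsize=25, shuffle=true)   # tuple: batches are tuples
x, y = first(dl2)
size(x), size(y)                                       # ((10, 25), (25,))

dl3 = DataLoader((images=X, labels=Y), batchsize=25)   # NamedTuple: access batches by name
size(first(dl3).images)                                # (10, 25)
```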

View File

@ -24,7 +24,7 @@ testmode!(m, mode = true) = m
trainmode!(m, mode = true) trainmode!(m, mode = true)
Set a layer of model's train mode (see below). Set a layer of model's train mode (see below).
Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`). Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)).
_Note_: if you manually set a model into train mode, you need to manually place _Note_: if you manually set a model into train mode, you need to manually place
it into test mode during testing phase. it into test mode during testing phase.
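For reference, a small sketch of the calls this docstring pair describes (illustrative, not part of the diff):
```julia
using Flux

m = Chain(Dense(10, 5), Dropout(0.5), Dense(5, 2))

Flux.testmode!(m)    # whole model in test mode: dropout disabled
Flux.trainmode!(m)   # back to train mode; equivalent to testmode!(m, false)
```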

View File

@ -102,7 +102,7 @@ julia> d(rand(5))
-0.16210233 -0.16210233
0.12311903``` 0.12311903```
""" """
struct Dense{F,S<:AbstractArray,T<:AbstractArray} struct Dense{F,S,T}
W::S W::S
b::T b::T
σ::F σ::F
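The master side of this hunk constrains `Dense`'s fields to arrays. A sketch of the explicit-array constructor that remains valid either way (illustrative, not part of the diff):
```julia
using Flux

W = rand(Float32, 3, 5)
b = rand(Float32, 3)
d = Dense(W, b, tanh)     # fine: W and b are AbstractArrays
d(rand(Float32, 5))       # 3-element output

# With the constrained struct, a call like Dense(1.0, 2.0) fails with a
# MethodError at construction time instead of breaking later in the forward pass.
```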

View File

@ -7,59 +7,26 @@ _convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+
expand(N, i::Tuple) = i expand(N, i::Tuple) = i
expand(N, i::Integer) = ntuple(_ -> i, N) expand(N, i::Integer) = ntuple(_ -> i, N)
""" """
SamePad Conv(size, in => out, σ = identity; init = glorot_uniform,
Padding for convolutional layers will be calculated so that outputshape == inputshape when stride = 1.
For stride > 1 the output shape depends on the type of convolution layer.
"""
struct SamePad end
calc_padding(pad, k::NTuple{N,T}, dilation, stride) where {T,N}= expand(Val(2*N), pad)
function calc_padding(::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T}
#Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/pdf/1603.07285
# Effective kernel size, including dilation
k_eff = @. k + (k - 1) * (dilation - 1)
# How much total padding needs to be applied?
pad_amt = @. k_eff - 1
# In case amount of padding is odd we need to apply different amounts to each side.
return Tuple(mapfoldl(i -> [ceil(Int, i/2), floor(Int, i/2)], vcat, pad_amt))
end
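A worked instance of the `SamePad` arithmetic above, assuming a 3×5 kernel with no dilation (illustrative, not part of the diff):
```julia
k, dilation = (3, 5), (1, 1)

k_eff   = @. k + (k - 1) * (dilation - 1)   # effective kernel size: (3, 5)
pad_amt = @. k_eff - 1                      # total padding needed:  (2, 4)

# split each total into (ceil, floor) halves, one pair per spatial dimension
pad = Tuple(mapfoldl(i -> [cld(i, 2), fld(i, 2)], vcat, pad_amt))   # (1, 1, 2, 2)
```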
"""
Conv(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1) stride = 1, pad = 0, dilation = 1)
filter = (2,2) Standard convolutional layer. `size` should be a tuple like `(2, 2)`.
in = 1
out = 16
Conv((2, 2), 1=>16, relu)
Standard convolutional layer. `filter` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
Data should be stored in WHCN order (width, height, # channels, batch size). Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array, In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array. and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
# Examples # Examples
Apply a `Conv` layer to a 1-channel input using a 2×2 window filter size, giving us a Apply a `Conv` layer to a 1-channel input using a 2×2 window size, giving us a
16-channel output. Output is activated with ReLU. 16-channel output. Output is activated with ReLU.
```julia ```julia
filter = (2,2) size = (2,2)
in = 1 in = 1
out = 16 out = 16
Conv(filter, in => out, relu) Conv(size, in => out, relu)
``` ```
""" """
struct Conv{N,M,F,A,V} struct Conv{N,M,F,A,V}
@ -71,68 +38,25 @@ struct Conv{N,M,F,A,V}
dilation::NTuple{N,Int} dilation::NTuple{N,Int}
end end
""" function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
Conv(weight::AbstractArray, bias::AbstractArray)
Conv(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the convolutional layer with user defined weight and bias arrays.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
There is also a keyword-only constructor available for all convolutional
layers.
```julia
weight = rand(Float32, 3, 3, 5)
bias = zeros(Float32, 5)
Conv(weight = weight,
bias = bias,
σ = sigmoid)
```
"""
function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N} stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride) stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation) dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return Conv(σ, w, b, stride, pad, dilation) return Conv(σ, w, b, stride, pad, dilation)
end end
function Conv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) Conv(init(k..., ch...), zeros(ch[2]), σ,
end
"""
convfilter(filter::Tuple, in=>out)
Constructs a standard convolutional weight matrix with given `filter` and
channels from `in` to `out`.
Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
distribution.
See also: [`depthwiseconvfilter`](@ref)
"""
convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
init = glorot_uniform) where N = init(filter..., ch...)
function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
Conv(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation) stride = stride, pad = pad, dilation = dilation)
end
@functor Conv @functor Conv
function (c::Conv)(x::AbstractArray) function (c::Conv)(x::AbstractArray)
# TODO: breaks gpu broadcast :( # TODO: breaks gpu broadcast :(
# ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
σ, b = c.σ, reshape(c.bias, ntuple(_->1, length(c.stride))..., :, 1) σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation) cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
σ.(conv(x, c.weight, cdims) .+ b) σ.(conv(x, c.weight, cdims) .+ b)
end end
@ -166,23 +90,15 @@ outdims(l::Conv, isize) =
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
""" """
ConvTranspose(filter, in=>out) ConvTranspose(size, in => out, σ = identity; init = glorot_uniform,
ConvTranspose(filter, in=>out, activation)
ConvTranspose(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1) stride = 1, pad = 0, dilation = 1)
Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`. Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
Data should be stored in WHCN order (width, height, # channels, batch size). Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array, In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array. and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == stride * inputsize - stride + 1.
""" """
struct ConvTranspose{N,M,F,A,V} struct ConvTranspose{N,M,F,A,V}
σ::F σ::F
@ -193,39 +109,18 @@ struct ConvTranspose{N,M,F,A,V}
dilation::NTuple{N,Int} dilation::NTuple{N,Int}
end end
""" function ConvTranspose(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
ConvTranspose(weight::AbstractArray, bias::AbstractArray)
ConvTranspose(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the convolutional transpose layer with user defined weight and bias arrays used for the forward pass.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
For the keyword-only constructor, see also [`Conv`](@ref)
"""
function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N} stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride) stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation) dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return ConvTranspose(σ, w, b, stride, pad, dilation) return ConvTranspose(σ, w, b, stride, pad, dilation)
end end
function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
ConvTranspose(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) ConvTranspose(init(k..., reverse(ch)...), zeros(ch[2]), σ,
end
function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N
ConvTranspose(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation) stride = stride, pad = pad, dilation = dilation)
end
@functor ConvTranspose @functor ConvTranspose
@ -250,7 +145,7 @@ function (c::ConvTranspose)(x::AbstractArray)
# ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
cdims = conv_transpose_dims(c, x) cdims = conv_transpose_dims(c, x)
σ.(∇conv_data(x, c.weight, cdims) .+ b) return σ.(∇conv_data(x, c.weight, cdims) .+ b)
end end
function Base.show(io::IO, l::ConvTranspose) function Base.show(io::IO, l::ConvTranspose)
@ -269,24 +164,16 @@ end
outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad) outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad)
""" """
DepthwiseConv(filter::Tuple, in=>out) DepthwiseConv(size, in => out, σ = identity; init = glorot_uniform,
DepthwiseConv(filter::Tuple, in=>out, activation)
DepthwiseConv(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1) stride = 1, pad = 0, dilation = 1)
Depthwise convolutional layer. `filter` should be a tuple like `(2, 2)`. Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
Note that `out` must be an integer multiple of `in`. Note that `out` must be an integer multiple of `in`.
Data should be stored in WHCN order (width, height, # channels, batch size). Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array, In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array. and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
""" """
struct DepthwiseConv{N,M,F,A,V} struct DepthwiseConv{N,M,F,A,V}
σ::F σ::F
@ -297,54 +184,20 @@ struct DepthwiseConv{N,M,F,A,V}
dilation::NTuple{N,Int} dilation::NTuple{N,Int}
end end
""" function DepthwiseConv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
DepthwiseConv(weight::AbstractArray, bias::AbstractArray)
DepthwiseConv(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the `DepthwiseConv` layer with user defined weight and bias arrays used for the forward pass.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
For the keyword-only constructor, see also [`Conv`](@ref)
"""
function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N} stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride) stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation) dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return DepthwiseConv(σ, w, b, stride, pad, dilation) return DepthwiseConv(σ, w, b, stride, pad, dilation)
end end
function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
"""
depthwiseconvfilter(filter::Tuple, in=>out)
Constructs a depthwise convolutional weight array defined by `filter` and channels
from `in` to `out`.
Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
distribution.
See also: [`convfilter`](@ref)
"""
depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1])
function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1, init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N
weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N
@assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels" @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels"
return DepthwiseConv( return DepthwiseConv(
weight, init(k..., div(ch[2], ch[1]), ch[1]),
bias, zeros(ch[2]),
σ; σ;
stride = stride, stride = stride,
pad = pad, pad = pad,
@ -377,30 +230,22 @@ outdims(l::DepthwiseConv, isize) =
output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
""" """
CrossCor(filter, in=>out) CrossCor(size, in => out, σ = identity; init = glorot_uniform,
CrossCor(filter, in=>out, activation)
CrossCor(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1) stride = 1, pad = 0, dilation = 1)
Standard cross convolutional layer. `filter` should be a tuple like `(2, 2)`. Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
Data should be stored in WHCN order (width, height, # channels, batch size). Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array, In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array. and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
# Examples # Examples
Apply a `CrossCor` layer to a 1-channel input using a 2×2 window filter size, giving us a Apply a `CrossCor` layer to a 1-channel input using a 2×2 window size, giving us a
16-channel output. Output is activated with ReLU. 16-channel output. Output is activated with ReLU.
```julia ```julia
filter = (2,2) size = (2,2)
in = 1 in = 1
out = 16 out = 16
CrossCor((2, 2), 1=>16, relu) CrossCor((2, 2), 1=>16, relu)
@ -415,39 +260,18 @@ struct CrossCor{N,M,F,A,V}
dilation::NTuple{N,Int} dilation::NTuple{N,Int}
end end
""" function CrossCor(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
CrossCor(weight::AbstractArray, bias::AbstractArray)
CrossCor(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the standard cross convolutional layer with user defined weight and bias
arrays.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
For the keyword-only constructor, see also [`Conv`](@ref)
"""
function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N} stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride) stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation) dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return CrossCor(σ, w, b, stride, pad, dilation) return CrossCor(σ, w, b, stride, pad, dilation)
end end
function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) CrossCor(init(k..., ch...), zeros(ch[2]), σ,
end
function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
CrossCor(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation) stride = stride, pad = pad, dilation = dilation)
end
@functor CrossCor @functor CrossCor
@ -534,9 +358,6 @@ end
MaxPool(k; pad = 0, stride = k) MaxPool(k; pad = 0, stride = k)
Max pooling layer. `k` is the size of the window for each dimension of the input. Max pooling layer. `k` is the size of the window for each dimension of the input.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
=======
""" """
struct MaxPool{N,M} struct MaxPool{N,M}
k::NTuple{N,Int} k::NTuple{N,Int}
@ -546,7 +367,8 @@ end
function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
stride = expand(Val(N), stride) stride = expand(Val(N), stride)
pad = calc_padding(pad, k, 1, stride) pad = expand(Val(2*N), pad)
return MaxPool(k, pad, stride) return MaxPool(k, pad, stride)
end end
@ -565,8 +387,6 @@ outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.
MeanPool(k; pad = 0, stride = k) MeanPool(k; pad = 0, stride = k)
Mean pooling layer. `k` is the size of the window for each dimension of the input. Mean pooling layer. `k` is the size of the window for each dimension of the input.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
""" """
struct MeanPool{N,M} struct MeanPool{N,M}
k::NTuple{N,Int} k::NTuple{N,Int}
@ -576,7 +396,7 @@ end
function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
stride = expand(Val(N), stride) stride = expand(Val(N), stride)
pad = calc_padding(pad, k, 1, stride) pad = expand(Val(2*N), pad)
return MeanPool(k, pad, stride) return MeanPool(k, pad, stride)
end end

View File

@ -46,24 +46,23 @@ given the prediction `ŷ` and true values `y`.
             | 0.5 * |ŷ - y|^2,           for |ŷ - y| <= δ
Huber loss = |
             | δ * (|ŷ - y| - 0.5 * δ),   otherwise
""" """
#TODO: remove dropgrad when Zygote can handle this function with CuArrays
function huber_loss(ŷ, y; δ=eltype(ŷ)(1)) function huber_loss(ŷ, y; δ=eltype(ŷ)(1))
abs_error = abs.(ŷ .- y) abs_error = abs.(ŷ .- y)
temp = Zygote.dropgrad(abs_error .< δ) temp = abs_error .< δ
x = eltype(ŷ)(0.5) x = eltype(ŷ)(0.5)
hub_loss = sum(((abs_error.^2) .* temp) .* x .+ δ*(abs_error .- x*δ) .* (1 .- temp)) * 1 // length(y) hub_loss = sum(((abs_error.^2) .* temp) .* x .+ δ*(abs_error .- x*δ) .* (1 .- temp)) * 1 // length(y)
end end
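A quick numeric check of `huber_loss` as defined above, with the default δ = 1 (illustrative; assumes a Flux version that provides it):
```julia
using Flux: huber_loss

ŷ = [1.0, 2.0, 5.0]
y = [1.0, 2.0, 2.0]     # one residual of 3, the rest exact

# per element: 0 + 0 + 1 * (3 - 0.5) = 2.5, averaged over length(y) = 3
huber_loss(ŷ, y)        # ≈ 0.8333
```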
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing) function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
return -sum(xlogy.(y, ŷ)) * 1 // size(y, 2) return -sum(y .* log.(ŷ)) * 1 // size(y, 2)
end end
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number) function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number)
return -sum(xlogy.(y, ŷ)) .* weight * 1 // size(y, 2) return -sum(y .* log.(ŷ)) .* weight * 1 // size(y, 2)
end end
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector) function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector)
return -sum(xlogy.(y, ŷ) .* weight) * 1 // size(y, 2) return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2)
end end
""" """
@ -92,7 +91,7 @@ Return the crossentropy computed after a [`Flux.logsoftmax`](@ref) operation;
calculated as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`. calculated as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`.
`logitcrossentropy(ŷ, y)` is mathematically equivalent to `logitcrossentropy(ŷ, y)` is mathematically equivalent to
[`Flux.crossentropy(softmax(ŷ), y)`](@ref) but it is more numerically stable. [`Flux.crossentropy(softmax(log(ŷ)), y)`](@ref) but it is more numerically stable.
See also: [`Flux.crossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref) See also: [`Flux.crossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
@ -124,7 +123,7 @@ julia> Flux.binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
0.8616703662235441 0.8616703662235441
``` ```
""" """
binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ) binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
# Re-definition to fix interaction with CuArrays. # Re-definition to fix interaction with CuArrays.
CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
@ -133,7 +132,7 @@ CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1
logitbinarycrossentropy(ŷ, y) logitbinarycrossentropy(ŷ, y)
`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to `logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to
[`Flux.binarycrossentropy(σ(ŷ), y)`](@ref) but it is more numerically stable. [`Flux.binarycrossentropy(σ(log(ŷ)), y)`](@ref) but it is more numerically stable.
See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref) See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref)
@ -196,7 +195,7 @@ It is always non-negative and zero only when both the distributions are equal
everywhere. everywhere.
""" """
function kldivergence(ŷ, y) function kldivergence(ŷ, y)
entropy = sum(xlogx.(y)) * 1 //size(y,2) entropy = sum(y .* log.(y)) * 1 //size(y,2)
cross_entropy = crossentropy(ŷ, y) cross_entropy = crossentropy(ŷ, y)
return entropy + cross_entropy return entropy + cross_entropy
end end
@ -209,7 +208,7 @@ distribution `y`; calculated as `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`.
[More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). [More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
""" """
poisson(ŷ, y) = sum(ŷ .- xlogy.(y, ŷ)) * 1 // size(y,2) poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) * 1 // size(y,2)
""" """
hinge(ŷ, y) hinge(ŷ, y)
@ -263,34 +262,3 @@ by linearizing all values for each element in the batch.
function flatten(x::AbstractArray) function flatten(x::AbstractArray)
return reshape(x, :, size(x)[end]) return reshape(x, :, size(x)[end])
end end
"""
xlogx(x)
Return `x * log(x)` for `x ≥ 0`, handling `x = 0` by taking the downward limit.
"""
function xlogx(x)
result = x * log(x)
ifelse(iszero(x), zero(result), result)
end
CuArrays.@cufunc function xlogx(x)
result = x * log(x)
ifelse(iszero(x), zero(result), result)
end
"""
xlogy(x, y)
Return `x * log(y)` for `y > 0` with correct limit at `x = 0`.
"""
function xlogy(x, y)
result = x * log(y)
ifelse(iszero(x), zero(result), result)
end
CuArrays.@cufunc function xlogy(x, y)
result = x * log(y)
ifelse(iszero(x), zero(result), result)
end
@adjoint function broadcasted(::typeof(xlogy), x::Zygote.Numeric, y::Zygote.Numeric)
res = xlogy.(x, y)
res, Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y))
end
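The two helpers removed in this hunk exist to make `0 * log(0)` terms well defined; a self-contained sketch of that boundary behaviour, using the same definitions:
```julia
function xlogx(x)
  result = x * log(x)
  ifelse(iszero(x), zero(result), result)
end

function xlogy(x, y)
  result = x * log(y)
  ifelse(iszero(x), zero(result), result)
end

xlogx(0.0)        # 0.0 rather than NaN, taking the downward limit
xlogy(0.0, 0.0)   # 0.0, so crossentropy terms with y == 0 drop out cleanly
xlogy(1.0, 0.5)   # log(0.5) ≈ -0.693
```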

View File

@ -27,8 +27,7 @@ Base.getindex(xs::OneHotMatrix, ::Colon, ::Colon) = OneHotMatrix(xs.height, copy
Base.getindex(xs::OneHotMatrix, i::Integer, ::Colon) = map(x -> x[i], xs.data) Base.getindex(xs::OneHotMatrix, i::Integer, ::Colon) = map(x -> x[i], xs.data)
# remove workaround when https://github.com/JuliaGPU/CuArrays.jl/issues/676 is fixed A::AbstractMatrix * B::OneHotMatrix = A[:, map(x->x.ix, B.data)]
A::AbstractMatrix * B::OneHotMatrix = A[:, cpu(map(x->x.ix, B.data))]
Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs...]) Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs...])
@ -49,7 +48,7 @@ cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.d
Create a `OneHotVector` with its `l`-th element `true` based on the Create a `OneHotVector` with its `l`-th element `true` based on the
possible set of `labels`. possible set of `labels`.
If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found
in `labels`; otherwise, it will raise an error. in `labels`; otherwise it will error.
# Examples # Examples
```jldoctest ```jldoctest

View File

@ -1,12 +1,9 @@
module Optimise module Optimise
using LinearAlgebra
export train!, update!, export train!, update!,
Descent, ADAM, Momentum, Nesterov, RMSProp, SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM,
InvDecay, ExpDecay, WeightDecay, stop, Optimiser, InvDecay, ExpDecay, WeightDecay, stop, Optimiser
ClipValue, ClipNorm
include("optimisers.jl") include("optimisers.jl")
include("train.jl") include("train.jl")

View File

@ -509,7 +509,7 @@ function apply!(o::ExpDecay, x, Δ)
η, s, decay = o.eta, o.step, o.decay η, s, decay = o.eta, o.step, o.decay
n = o.current[x] = get(o.current, x, 0) + 1 n = o.current[x] = get(o.current, x, 0) + 1
if o.current[x]%s == 0 && count(x -> x%s == 0, values(o.current)) == 1 if o.current[x]%s == 0 && count(x -> x%s == 0, values(o.current)) == 1
η = max(η * decay, o.clip) η = max(η * decay^(s / n), o.clip)
o.eta = η o.eta = η
end end
@. Δ *= η @. Δ *= η
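`ExpDecay` scales the effective learning rate by `decay` once every `s` steps until it reaches `clip`, and it is normally composed with another optimiser. A usage sketch:

```julia
using Flux

# Scale the step by 0.1 every 1000 updates, but never below 1e-4, then apply ADAM.
opt = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), ADAM())
```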
@ -533,31 +533,3 @@ function apply!(o::WeightDecay, x, Δ)
wd = o.wd wd = o.wd
@. Δ += wd * x @. Δ += wd * x
end end
"""
ClipValue(thresh)
Clip gradients when their absolute value exceeds `thresh`.
"""
mutable struct ClipValue{T}
thresh::T
end
apply!(o::ClipValue, x, Δ) = clamp!(Δ, -o.thresh, o.thresh)
"""
ClipNorm(thresh)
Clip gradients when their L2 norm exceeds `thresh`.
"""
mutable struct ClipNorm{T}
thresh::T
end
function apply!(o::ClipNorm, x, Δ)
Δnrm = norm(Δ)
if Δnrm > o.thresh
rmul!(Δ, o.thresh / Δnrm)
end
return Δ
end
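Both clip the gradient before it reaches whatever optimiser follows them in an `Optimiser` chain. A usage sketch:

```julia
using Flux

# Clamp every gradient entry to [-1e-3, 1e-3] before the ADAM step,
opt = Optimiser(ClipValue(1e-3), ADAM(1e-3))
# or bound the overall L2 norm of each gradient array instead.
opt = Optimiser(ClipNorm(1.0), ADAM(1e-3))
```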

View File

@ -68,7 +68,8 @@ and compute the gradient of `loss(d)`.
A callback is given with the keyword argument `cb`. For example, this will print A callback is given with the keyword argument `cb`. For example, this will print
"training" every 10 seconds (using [`Flux.throttle`](@ref)): "training" every 10 seconds (using [`Flux.throttle`](@ref)):
train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10)) train!(loss, params, data, opt,
cb = throttle(() -> println("training"), 10))
The callback can call [`Flux.stop`](@ref) to interrupt the training loop. The callback can call [`Flux.stop`](@ref) to interrupt the training loop.
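For instance, a throttled callback can also stop training once the loss is good enough (a sketch; `loss`, `ps`, `data`, `opt`, `xtest` and `ytest` are hypothetical names standing in for the surrounding training setup):

```julia
function evalcb()
    # `loss`, `xtest`, `ytest` are placeholders for whatever is being trained
    l = loss(xtest, ytest)
    @show l
    l < 0.01 && Flux.stop()   # interrupt the training loop early
end

Flux.train!(loss, ps, data, opt, cb = Flux.throttle(evalcb, 10))
```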

View File

@ -24,7 +24,7 @@ glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum
glorot_normal(dims...) glorot_normal(dims...)
Return an `Array` of size `dims` containing random variables taken from a normal Return an `Array` of size `dims` containing random variables taken from a normal
distribution with mean 0 and standard deviation `sqrt(2 / sum(dims))`. distribution with mean 0 and standard deviation `(2 / sum(dims))`.
# Examples # Examples
```jldoctest; setup = :(using Random; Random.seed!(0)) ```jldoctest; setup = :(using Random; Random.seed!(0))
@ -246,10 +246,6 @@ function _restructure(m, xs)
end end
end end
@adjoint function _restructure(m, xs)
_restructure(m, xs), dm -> (nothing,destructure(dm)[1])
end
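This adjoint is what lets gradients flow back through the rebuilt model, so a flat parameter vector can be optimised directly (a minimal sketch):

```julia
using Flux

m = Dense(2, 3)
θ, re = Flux.destructure(m)   # flat parameter vector plus a rebuilding closure
# Differentiating through re(θ) relies on the _restructure adjoint above.
g = gradient(θ -> sum(re(θ)(ones(Float32, 2))), θ)[1]
length(g) == length(θ)
```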
""" """
destructure(m) destructure(m)

View File

@ -1,106 +0,0 @@
import Base: +, -, *, reshape, size
import Base.Broadcast: broadcasted, Broadcasted, BroadcastStyle
"""
Zeros()
Zeros(size...)
Zeros(Type, size...)
Acts as a stand-in for an array of zeros that can be
used during training and is ignored by the optimisers.
Useful for turning the bias off in a layer's forward pass.
## Examples
```julia
julia> Flux.Zeros(3,3)
3×3 Flux.Zeros{Bool,2}:
false false false
false false false
false false false
julia> Flux.Zeros(Float32, 3,3)
3×3 Flux.Zeros{Float32,2}:
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
julia> rand(3,3) .+ Flux.Zeros()
3×3 Array{Float64,2}:
0.198739 0.490459 0.785386
0.779074 0.39986 0.66383
0.854981 0.447292 0.314497
julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros())
Conv((2, 2), 1=>3)
```
"""
struct Zeros{T,N} <: AbstractArray{T,N}
size::Tuple
end
Zeros(::Type{T}, sz...) where T = Zeros{T,length(sz)}(sz)
Zeros(sz::Integer...) = Zeros(Bool, sz...)
Base.size(xs::Zeros) = xs.size
Base.axes(xs::Zeros) = Base.OneTo.(size(xs))
Base.IndexStyle(::Type{<:Zeros}) = IndexLinear()
Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T)
Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} =
Zeros(T, length(inds))
Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs))
@adjoint reshape(xs::Zeros{T}, dims...) where T =
reshape(xs, dims...), _ -> nothing
# Define basic ops
for f in (:+, :-)
@eval @inline function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros)
@assert size(a) == size(b) throw(DimensionMismatch("dimensions must match"))
a
end
end
+(a::Zeros, b::AbstractArray) = b + a
-(a::Zeros, b::AbstractArray) = -b + a
Base.copy(xs::Zeros{T,N}) where {T,N} = xs
# Define broadcasting behaviour
for op in (:+, :-)
@eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros)
bs = Broadcast.broadcast_shape(size(a), size(b))
size(a) == bs && return a
sz = similar(a, bs)
sz .= a
end
end
broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a)
broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a)
function broadcasted(::typeof(*), a::AbstractArray, b::Zeros)
Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end
broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a)
for op in (:+, :-, :*)
@eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end
# Some opportunities to avoid scalar indexing, intermediaries
# Since it replicates a little of what we expect Base to do,
# it should be possible to remove in the future, but for now,
# these help with performance.
broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b
broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b
broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a)
broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
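Taken together, these methods make `Zeros` an additive identity (and a multiplicative annihilator) that never materialises an actual array, which is how a `Zeros()` bias drops out of both the forward pass and the optimiser state. A few identities implied by the definitions above (a sketch):

```julia
using Flux

a = rand(3, 3)

a .+ Flux.Zeros()                      # returns `a` itself; nothing is allocated
a .* Flux.Zeros(3, 3)                  # a Zeros of the broadcast shape
Flux.Zeros(3, 3) .+ Flux.Zeros(3, 3)   # still a Zeros
```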

View File

@ -69,7 +69,6 @@ if CuArrays.has_cudnn()
@info "Testing Flux/CUDNN" @info "Testing Flux/CUDNN"
include("cudnn.jl") include("cudnn.jl")
include("curnn.jl") include("curnn.jl")
include("layers.jl")
else else
@warn "CUDNN unavailable, not testing GPU DNN support" @warn "CUDNN unavailable, not testing GPU DNN support"
end end

View File

@ -1,98 +0,0 @@
# Test layers and data/model movements on and off the GPU
# Add tests for layers and their gradients on the GPU
# Most of the forward passes should be fine being applied
# to bitstype objects, but this gives higher coverage for our use-cases
# Check that getting the gradients does not throw
# generic movement tests
@testset "Basic GPU Movement" begin
@test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple
@test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple
end
# TODO: These layers get into scalar indexing
# `AlphaDropout` throws a compilation error on GPUs,
# whereas the rest are scalar indexing issues.
const BROKEN_LAYERS = [DepthwiseConv,
AlphaDropout,
InstanceNorm,
GroupNorm]
function gradtest(name::String, layers::Vector, xs = nothing, args...)
isnothing(xs) && error("Missing input to test the layers against.")
@testset "$name GPU grad tests" begin
for layer in layers
@testset "$layer GPU grad test" begin
l = gpu(layer(args...))
xs = gpu(xs)
if any(x -> isa(l, x), BROKEN_LAYERS)
ps = Flux.params(l)
@test_broken gradient(() -> sum(l(xs)), ps) isa Flux.Zygote.Grads
else
ps = Flux.params(l)
@test gradient(() -> sum(l(xs)), ps) isa Flux.Zygote.Grads
gs = gradient(() -> sum(l(xs)), ps)
# Handle pooling layers
if !isempty(ps)
@test gs[first(ps)] isa Flux.CuArrays.CuArray
end
end
end
end
end
end
# Repeats from Conv, CrossCor
r = rand(Float32, 28, 28, 1, 1)
conv_layers = [Conv, ConvTranspose, CrossCor, DepthwiseConv]
gradtest("Conv", conv_layers, r, (2,2), 1=>3)
pooling_layers = [MaxPool, MeanPool]
gradtest("Pooling", pooling_layers, r, (2,2))
dropout_layers = [Dropout, AlphaDropout]
gradtest("Dropout", dropout_layers, r, 0.5f0)
norm_layers = [LayerNorm, BatchNorm]
gradtest("Normalising", norm_layers, rand(Float32, 28,28,3,1), 1)
instancenorm = [InstanceNorm]
gradtest("InstanceNorm", instancenorm, r, 1)
groupnorm = [GroupNorm]
gradtest("GroupNorm", groupnorm, rand(Float32, 28,28,3,1), 3, 1)
const stateless_layers = [Flux.mse,
Flux.crossentropy,
Flux.logitcrossentropy,
Flux.normalise]
const stateless_layers_broadcasted = [Flux.binarycrossentropy,
Flux.logitbinarycrossentropy]
function stateless_gradtest(f, args...)
@test gradient((args...) -> sum(f(args...)), args...)[1] isa CuArray
end
function stateless_gradtest_broadcasted(f, args...)
@test gradient((args...) -> sum(f.(args...)), args...)[1] isa CuArray
end
@testset "Stateless GPU grad tests" begin
x = gpu(rand(3,3))
y = gpu(rand(3,3))
for layer in stateless_layers
if layer == Flux.normalise
stateless_gradtest(layer, x)
else
stateless_gradtest(layer, x, y)
end
end
for layer in stateless_layers_broadcasted
stateless_gradtest_broadcasted(layer, x, y)
end
end

View File

@ -3,34 +3,20 @@
Y = [1:5;] Y = [1:5;]
d = DataLoader(X, batchsize=2) d = DataLoader(X, batchsize=2)
@inferred first(d)
batches = collect(d) batches = collect(d)
@test eltype(batches) == eltype(d) == typeof(X)
@test length(batches) == 3 @test length(batches) == 3
@test batches[1] == X[:,1:2] @test batches[1] == X[:,1:2]
@test batches[2] == X[:,3:4] @test batches[2] == X[:,3:4]
@test batches[3] == X[:,5:5] @test batches[3] == X[:,5:5]
d = DataLoader(X, batchsize=2, partial=false) d = DataLoader(X, batchsize=2, partial=false)
@inferred first(d)
batches = collect(d) batches = collect(d)
@test eltype(batches) == eltype(d) == typeof(X)
@test length(batches) == 2 @test length(batches) == 2
@test batches[1] == X[:,1:2] @test batches[1] == X[:,1:2]
@test batches[2] == X[:,3:4] @test batches[2] == X[:,3:4]
d = DataLoader((X,), batchsize=2, partial=false) d = DataLoader(X, Y, batchsize=2)
@inferred first(d)
batches = collect(d) batches = collect(d)
@test eltype(batches) == eltype(d) == Tuple{typeof(X)}
@test length(batches) == 2
@test batches[1] == (X[:,1:2],)
@test batches[2] == (X[:,3:4],)
d = DataLoader((X, Y), batchsize=2)
@inferred first(d)
batches = collect(d)
@test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)}
@test length(batches) == 3 @test length(batches) == 3
@test length(batches[1]) == 2 @test length(batches[1]) == 2
@test length(batches[2]) == 2 @test length(batches[2]) == 2
@ -42,22 +28,6 @@
@test batches[3][1] == X[:,5:5] @test batches[3][1] == X[:,5:5]
@test batches[3][2] == Y[5:5] @test batches[3][2] == Y[5:5]
# test with NamedTuple
d = DataLoader((x=X, y=Y), batchsize=2)
@inferred first(d)
batches = collect(d)
@test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}}
@test length(batches) == 3
@test length(batches[1]) == 2
@test length(batches[2]) == 2
@test length(batches[3]) == 2
@test batches[1][1] == batches[1].x == X[:,1:2]
@test batches[1][2] == batches[1].y == Y[1:2]
@test batches[2][1] == batches[2].x == X[:,3:4]
@test batches[2][2] == batches[2].y == Y[3:4]
@test batches[3][1] == batches[3].x == X[:,5:5]
@test batches[3][2] == batches[3].y == Y[5:5]
# test interaction with `train!` # test interaction with `train!`
θ = ones(2) θ = ones(2)
X = zeros(2, 10) X = zeros(2, 10)
@ -71,7 +41,7 @@
X = ones(2, 10) X = ones(2, 10)
Y = fill(2, 10) Y = fill(2, 10)
loss(x, y) = sum((y - x'*θ).^2) loss(x, y) = sum((y - x'*θ).^2)
d = DataLoader((X, Y)) d = DataLoader(X, Y)
Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1)) Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1))
@test norm(θ .- 1) < 1e-10 @test norm(θ .- 1) < 1e-10
end end
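With the tuple (or `NamedTuple`) form, each batch is itself a tuple, so it destructures naturally in a training loop (a usage sketch):

```julia
using Flux
using Flux.Data: DataLoader

X, Y   = rand(Float32, 10, 100), rand(Float32, 1, 100)
loader = DataLoader((X, Y), batchsize = 16, shuffle = true)

for (x, y) in loader   # x is 10×16, y is 1×16 (the last batch may be smaller)
    # run one training step on (x, y) here
end
```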

View File

@ -28,14 +28,6 @@ import Flux: activations
end end
@testset "Dense" begin @testset "Dense" begin
@testset "constructors" begin
@test size(Dense(10, 100).W) == (100, 10)
@test Dense(rand(100,10), rand(10)).σ == identity
@test_throws MethodError Dense(10, 10.5)
@test_throws MethodError Dense(10, 10.5, tanh)
end
@test length(Dense(10, 5)(randn(10))) == 5 @test length(Dense(10, 5)(randn(10))) == 5
@test_throws DimensionMismatch Dense(10, 5)(randn(1)) @test_throws DimensionMismatch Dense(10, 5)(randn(1))
@test_throws MethodError Dense(10, 5)(1) # avoid broadcasting @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting
@ -45,6 +37,7 @@ import Flux: activations
@test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2) @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2)
@test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1) @test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1)
@test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] @test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]
end end
@testset "Diagonal" begin @testset "Diagonal" begin

View File

@ -25,35 +25,6 @@ end
Dense(288, 10), softmax) Dense(288, 10), softmax)
@test size(m(r)) == (10, 5) @test size(m(r)) == (10, 5)
# Test bias switch
bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3))
ip = zeros(Float32, 28,28,1,1)
op = bias(ip)
@test sum(op) == prod(size(op))
bias = Conv((2,2), 1=>3, bias = Flux.Zeros())
op = bias(ip)
@test sum(op) === 0.f0
gs = gradient(() -> sum(bias(ip)), Flux.params(bias))
@test gs[bias.bias] == nothing
# Train w/o bias and make sure no convergence happens
# when only bias can be converged
bias = Conv((2, 2), 1=>3, bias = Flux.Zeros());
ip = zeros(Float32, 28,28,1,1)
op = zeros(Float32, 27,27,3,1) .+ 2.f0
opt = Descent()
for _ = 1:10^3
gs = gradient(params(bias)) do
Flux.mse(bias(ip), op)
end
Flux.Optimise.update!(opt, params(bias), gs)
end
@test Flux.mse(bias(ip), op) ≈ 4.f0
end end
@testset "asymmetric padding" begin @testset "asymmetric padding" begin
@ -192,27 +163,3 @@ end
m = MeanPool((2, 2); stride = 2, pad = 3) m = MeanPool((2, 2); stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5) @test Flux.outdims(m, (5, 5)) == (5, 5)
end end
@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, CrossCor), k in ( (1,), (2,), (3,), (4,5), (6,7,8))
data = ones(Float32, (k .+ 3)..., 1,1)
l = ltype(k, 1=>1, pad=SamePad())
@test size(l(data)) == size(data)
l = ltype(k, 1=>1, pad=SamePad(), dilation = k .÷ 2)
@test size(l(data)) == size(data)
stride = 3
l = ltype(k, 1=>1, pad=SamePad(), stride = stride)
if ltype == ConvTranspose
@test size(l(data))[1:end-2] == stride .* size(data)[1:end-2] .- stride .+ 1
else
@test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ stride)
end
end
@testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), k in ( (1,), (2,), (3,), (4,5), (6,7,8))
data = ones(Float32, (k .+ 3)..., 1,1)
l = ltype(k, pad=SamePad())
@test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ k)
end

View File

@ -1,26 +1,9 @@
using Test using Test
using Flux: onehotbatch, mse, crossentropy, logitcrossentropy, using Flux: onehotbatch, mse, crossentropy, logitcrossentropy,
σ, binarycrossentropy, logitbinarycrossentropy, flatten, σ, binarycrossentropy, logitbinarycrossentropy, flatten
xlogx, xlogy
const ϵ = 1e-7 const ϵ = 1e-7
@testset "xlogx & xlogy" begin
@test iszero(xlogx(0))
@test isnan(xlogx(NaN))
@test xlogx(2) ≈ 2.0 * log(2.0)
@inferred xlogx(2)
@inferred xlogx(0)
@test iszero(xlogy(0, 1))
@test isnan(xlogy(NaN, 1))
@test isnan(xlogy(1, NaN))
@test isnan(xlogy(NaN, NaN))
@test xlogy(2, 3) ≈ 2.0 * log(3.0)
@inferred xlogy(2, 3)
@inferred xlogy(0, 1)
end
@testset "losses" begin @testset "losses" begin
# First, regression-style y's # First, regression-style y's
y = [1, 1, 0, 0] y = [1, 1, 0, 0]
@ -52,7 +35,6 @@ end
lossvalue = 1.203972804325936 lossvalue = 1.203972804325936
@testset "crossentropy" begin @testset "crossentropy" begin
@test crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ crossentropy([0.1,0.9], [0.1,0.9])
@test crossentropy(ŷ, y) ≈ lossvalue @test crossentropy(ŷ, y) ≈ lossvalue
end end
@ -85,7 +67,6 @@ end
y = [1 2 3] y = [1 2 3]
ŷ = [4.0 5.0 6.0] ŷ = [4.0 5.0 6.0]
@testset "kldivergence" begin @testset "kldivergence" begin
@test Flux.kldivergence([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ Flux.kldivergence([0.1,0.9], [0.1,0.9])
@test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457 @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457
@test Flux.kldivergence(y, y) ≈ 0 @test Flux.kldivergence(y, y) ≈ 0
end end

View File

@ -57,16 +57,6 @@ end
end end
@testset "ExpDecay" begin @testset "ExpDecay" begin
@testset "Sanity Check" begin
o = ExpDecay(0.2, 0.5, 1, 1e-3)
p = [0.0]
steps = 1:8
eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip)
eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps]
@test eta_actual == eta_expected
end
w = randn(10, 10) w = randn(10, 10)
o = ExpDecay(0.1, 0.1, 1000, 1e-4) o = ExpDecay(0.1, 0.1, 1000, 1e-4)
w1 = randn(10,10) w1 = randn(10,10)
@ -91,23 +81,11 @@ end
end end
end end
@test flag == 1 @test flag == 1
# Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). # Test to check if decay happens at decay steps. Eta reaches clip value eventually.
ground_truth = [] ground_truth = []
for i in 1:4 for i in 1:11
push!(ground_truth, 1000*i) # Expected decay steps for this example. push!(ground_truth, 1000*i) # Expected decay steps for this example.
end end
@test decay_steps == ground_truth @test decay_steps == ground_truth
@test o.eta == o.clip @test o.eta == o.clip
end end
@testset "Clipping" begin
w = randn(10, 10)
loss(x) = sum(w * x)
θ = Params([w])
x = 1000 * randn(10)
w̄ = gradient(() -> loss(x), θ)[w]
w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄))
@test all(w̄_value .<= 1)
w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄))
@test norm(w̄_norm) <= 1
end

View File

@ -2,10 +2,13 @@ using Flux
using Flux.Data using Flux.Data
using Test using Test
using Random, Statistics, LinearAlgebra using Random, Statistics, LinearAlgebra
using Documenter
using IterTools: ncycle using IterTools: ncycle
Random.seed!(0) Random.seed!(0)
@testset "Flux" begin
@testset "Utils" begin @testset "Utils" begin
include("utils.jl") include("utils.jl")
end end
@ -37,10 +40,11 @@ end
end end
end end
@static if VERSION >= v"1.4"
using Documenter
@testset "Docs" begin @testset "Docs" begin
if VERSION >= v"1.4"
DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true) DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
doctest(Flux) doctest(Flux)
end end
end end
end # testset Flux