Merge #1238

1238: Fix inline code block r=dhairyagandhi96 a=harryscholes ### PR Checklist - [ ] Tests are added - [ ] Entry in NEWS.md - [x] Documentation, if applicable - [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes). Co-authored-by: harryscholes <harryscholes@gmail.com>
Fix inline code block
2020-06-19 08:28:41 +00:00 · 2020-06-19 09:24:44 +01:00 · 2020-06-16 17:21:28 +00:00 · 2020-06-16 13:04:20 +00:00 · 2020-06-16 14:02:24 +01:00 · 2020-06-16 13:32:27 +02:00
78 changed files with 4384 additions and 2397 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -1 +1,2 @@
 paper/* linguist-documentation
+CITATION.bib linguist-detectable=false
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@ -0,0 +1 @@
+custom: https://numfocus.salsalabs.org/donate-to-julia/index.html
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@ -0,0 +1,12 @@
+[Please delete this text and describe your change here.
+For bugfixes, please detail the bug and include a test case which your patch fixes.
+If you are adding a new feature, please clearly describe the design, its rationale, and the possible alternatives considered.
+It is easiest to merge new features when there is clear precedent in other systems; we need to know we're taking
+the right direction since it can be hard to change later.]
+
+### PR Checklist
+
+- [ ] Tests are added
+- [ ] Entry in NEWS.md
+- [ ] Documentation, if applicable
+- [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes).
--- a/.github/workflows/CompatHelper.yml
+++ b/.github/workflows/CompatHelper.yml
@ -0,0 +1,16 @@
+name: CompatHelper
+
+on:
+  schedule:
+    - cron: '00 00 * * *'
+
+jobs:
+  CompatHelper:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Pkg.add("CompatHelper")
+        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
+      - name: CompatHelper.main()
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: julia -e 'using CompatHelper; CompatHelper.main()'
--- a/.github/workflows/TagBot.yml
+++ b/.github/workflows/TagBot.yml
@ -0,0 +1,11 @@
+name: TagBot
+on:
+  schedule:
+    - cron: 0 * * * *
+jobs:
+  TagBot:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: JuliaRegistries/TagBot@v1
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -0,0 +1,41 @@
+include:
+  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v6.yml'
+
+image: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
+
+
+# julia:1.0:
+#   extends:
+#     - .julia:1.0
+#     - .test
+#   tags:
+#     - nvidia
+# 
+# julia:1.1:
+#   extends:
+#     - .julia:1.1
+#     - .test
+#   tags:
+#     - nvidia
+# 
+# julia:1.2:
+#   extends:
+#     - .julia:1.2
+#     - .test
+#   tags:
+#     - nvidia
+
+julia:1.3:
+  extends:
+    - .julia:1.3
+    - .test
+  tags:
+    - nvidia
+
+julia:nightly:
+  extends:
+    - .julia:nightly
+    - .test
+  tags:
+    - nvidia
+  allow_failure: true
--- a/.travis.yml
+++ b/.travis.yml
@ -6,17 +6,17 @@ os:
  # - osx

 julia:
-  - 1.0
+  - 1.3
+  - 1
  - nightly

-matrix:
-  allow_failures:
-    - julia: nightly
+notifications:
+  email: false

 jobs:
  include:
    - stage: "Documentation"
-      julia: 1.0
+      julia: 1.3
      os: linux
      script:
        - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd()));
@ -24,6 +24,9 @@ jobs:
        - julia --project=docs/ docs/make.jl
      after_success: skip

+  allow_failures:
+    - julia: nightly
+
 ## uncomment the following lines to override the default test script
 script:
 - julia --color=yes -e 'using Pkg; Pkg.activate(); Pkg.instantiate(); Pkg.test()'
--- a/CITATION.bib
+++ b/CITATION.bib
@ -0,0 +1,29 @@
+@article{Flux.jl-2018,
+  author    = {Michael Innes and
+               Elliot Saba and
+               Keno Fischer and
+               Dhairya Gandhi and
+               Marco Concetto Rudilosso and
+               Neethu Mariya Joy and
+               Tejan Karmali and
+               Avik Pal and
+               Viral Shah},
+  title     = {Fashionable Modelling with Flux},
+  journal   = {CoRR},
+  volume    = {abs/1811.01457},
+  year      = {2018},
+  url       = {http://arxiv.org/abs/1811.01457},
+  archivePrefix = {arXiv},
+  eprint    = {1811.01457},
+  timestamp = {Thu, 22 Nov 2018 17:58:30 +0100},
+  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1811-01457},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{innes:2018,
+  author    = {Mike Innes},
+  title     = {Flux: Elegant Machine Learning with Julia},
+  journal   = {Journal of Open Source Software},
+  year      = {2018},
+  doi       = {10.21105/joss.00602},
+}
--- a/LICENSE.md
+++ b/LICENSE.md
@ -1,6 +1,6 @@
 The Flux.jl package is licensed under the MIT "Expat" License:

-> Copyright (c) 2016: Mike Innes.
+> Copyright (c) 2016-19: Julia Computing, Inc., Mike Innes and Contributors
 >
 > Permission is hereby granted, free of charge, to any person obtaining
 > a copy of this software and associated documentation files (the
--- a/Manifest.toml
+++ b/Manifest.toml
@ -1,49 +1,84 @@
 # This file is machine-generated - editing it directly is not advised

+[[AbstractFFTs]]
+deps = ["LinearAlgebra"]
+git-tree-sha1 = "051c95d6836228d120f5f4b984dd5aba1624f716"
+uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
+version = "0.5.0"
+
 [[AbstractTrees]]
-deps = ["Markdown", "Test"]
-git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b"
+deps = ["Markdown"]
+git-tree-sha1 = "33e450545eaf7699da1a6e755f9ea65f14077a45"
 uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
-version = "0.2.1"
+version = "0.3.3"

 [[Adapt]]
-deps = ["LinearAlgebra", "Test"]
-git-tree-sha1 = "53d8fec4f662088c1202530e338a11a919407f3b"
+deps = ["LinearAlgebra"]
+git-tree-sha1 = "fd04049c7dd78cfef0b06cdc1f0f181467655712"
 uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-version = "0.4.2"
+version = "1.1.0"
+
+[[ArrayLayouts]]
+deps = ["FillArrays", "LinearAlgebra"]
+git-tree-sha1 = "a504dca2ac7eda8761c8f7c1ed52427a1be75a3c"
+uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
+version = "0.2.6"

 [[Base64]]
 uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"

-[[BinDeps]]
-deps = ["Compat", "Libdl", "SHA", "URIParser"]
-git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9"
-uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
-version = "0.8.10"
-
 [[BinaryProvider]]
-deps = ["Libdl", "Pkg", "SHA", "Test"]
-git-tree-sha1 = "055eb2690182ebc31087859c3dd8598371d3ef9e"
+deps = ["Libdl", "Logging", "SHA"]
+git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
 uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
-version = "0.5.3"
+version = "0.5.10"
+
+[[CEnum]]
+git-tree-sha1 = "1b77a77c3b28e0b3f413f7567c9bb8dd9bdccd14"
+uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
+version = "0.3.0"
+
+[[CUDAapi]]
+deps = ["Libdl", "Logging"]
+git-tree-sha1 = "831b825d10104bd29e28f6da93312a976830717b"
+uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
+version = "4.0.0"
+
+[[CUDAdrv]]
+deps = ["CEnum", "CUDAapi", "Printf"]
+git-tree-sha1 = "f56bbf18c86bcff7a961a32a4947a5abb2963a29"
+uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
+version = "6.3.0"
+
+[[CUDAnative]]
+deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "ExprTools", "GPUCompiler", "LLVM", "Libdl", "Pkg", "Printf"]
+git-tree-sha1 = "ac86db2b05fdfec96b011e25a504ffe7476e8a68"
+uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
+version = "3.1.0"
+
+[[CodeTracking]]
+deps = ["InteractiveUtils", "UUIDs"]
+git-tree-sha1 = "cab4da992adc0a64f63fa30d2db2fd8bec40cab4"
+uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
+version = "0.5.11"

 [[CodecZlib]]
-deps = ["BinaryProvider", "Libdl", "Test", "TranscodingStreams"]
-git-tree-sha1 = "36bbf5374c661054d41410dc53ff752972583b9b"
+deps = ["TranscodingStreams", "Zlib_jll"]
+git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da"
 uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
-version = "0.5.2"
+version = "0.7.0"

 [[ColorTypes]]
-deps = ["FixedPointNumbers", "Random", "Test"]
-git-tree-sha1 = "f73b0e10f2a5756de7019818a41654686da06b09"
+deps = ["FixedPointNumbers", "Random"]
+git-tree-sha1 = "c73d9cfc2a9d8433dc77f5bff4bddf46b1d78c20"
 uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
-version = "0.7.5"
+version = "0.10.3"

 [[Colors]]
-deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport", "Test"]
-git-tree-sha1 = "9f0a0210450acb91c730b730a994f8eef1d3d543"
+deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"]
+git-tree-sha1 = "1e9bba7984e78aa8cdeea7f9f7cc984ad4e4b1c7"
 uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
-version = "0.9.5"
+version = "0.12.2"

 [[CommonSubexpressions]]
 deps = ["Test"]
@ -51,17 +86,34 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0"
 uuid = "bbf7d656-a473-5ed7-a52c-81e309532950"
 version = "0.2.0"

-[[Compat]]
-deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
-git-tree-sha1 = "195a3ffcb8b0762684b6821de18f83a16455c6ea"
-uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "2.0.0"
+[[CompilerSupportLibraries_jll]]
+deps = ["Libdl", "Pkg"]
+git-tree-sha1 = "7c4f882c41faa72118841185afc58a2eb00ef612"
+uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
+version = "0.3.3+0"
+
+[[Cthulhu]]
+deps = ["CodeTracking", "InteractiveUtils", "REPL", "UUIDs", "Unicode"]
+git-tree-sha1 = "f3643e78353199d3097821e806348bd83f364155"
+uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f"
+version = "1.1.1"
+
+[[CuArrays]]
+deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"]
+git-tree-sha1 = "1582b74d2322df7dd94549d4ac9d095e0f20e884"
+uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
+version = "2.2.1"
+
+[[DataAPI]]
+git-tree-sha1 = "176e23402d80e7743fc26c19c681bfb11246af32"
+uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
+version = "1.3.0"

 [[DataStructures]]
-deps = ["InteractiveUtils", "OrderedCollections", "Random", "Serialization", "Test"]
-git-tree-sha1 = "ca971f03e146cf144a9e2f2ce59674f5bf0e8038"
+deps = ["InteractiveUtils", "OrderedCollections"]
+git-tree-sha1 = "af6d9c86e191c917c2276fbede1137e8ea20157f"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.15.0"
+version = "0.17.17"

 [[Dates]]
 deps = ["Printf"]
@ -72,44 +124,89 @@ deps = ["Mmap"]
 uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"

 [[DiffResults]]
-deps = ["Compat", "StaticArrays"]
-git-tree-sha1 = "34a4a1e8be7bc99bc9c611b895b5baf37a80584c"
+deps = ["StaticArrays"]
+git-tree-sha1 = "da24935df8e0c6cf28de340b958f6aac88eaa0cc"
 uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
-version = "0.0.4"
+version = "1.0.2"

 [[DiffRules]]
-deps = ["Random", "Test"]
-git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7"
+deps = ["NaNMath", "Random", "SpecialFunctions"]
+git-tree-sha1 = "eb0c34204c8410888844ada5359ac8b96292cfd1"
 uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
-version = "0.0.10"
+version = "1.0.1"

 [[Distributed]]
-deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"]
+deps = ["Random", "Serialization", "Sockets"]
 uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"

+[[ExprTools]]
+git-tree-sha1 = "6f0517056812fd6aa3af23d4b70d5325a2ae4e95"
+uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
+version = "0.1.1"
+
+[[FillArrays]]
+deps = ["LinearAlgebra", "Random", "SparseArrays"]
+git-tree-sha1 = "44f561e293987ffc84272cd3d2b14b0b93123d63"
+uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
+version = "0.8.10"
+
 [[FixedPointNumbers]]
-deps = ["Test"]
-git-tree-sha1 = "b8045033701c3b10bf2324d7203404be7aef88ba"
+git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238"
 uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
-version = "0.5.3"
+version = "0.8.0"

 [[ForwardDiff]]
-deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"]
-git-tree-sha1 = "4c4d727f1b7e0092134fabfab6396b8945c1ea5b"
+deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"]
+git-tree-sha1 = "869540e4367122fbffaace383a5bdc34d6e5e5ac"
 uuid = "f6369f11-7733-5829-9624-2563aa707210"
-version = "0.10.3"
+version = "0.10.10"
+
+[[Functors]]
+deps = ["MacroTools"]
+git-tree-sha1 = "f40adc6422f548176bb4351ebd29e4abf773040a"
+uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
+version = "0.1.0"
+
+[[Future]]
+deps = ["Random"]
+uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
+
+[[GPUArrays]]
+deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
+git-tree-sha1 = "d887693eb1bd5e1fd573262a978745481895ec7d"
+uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+version = "3.4.1"
+
+[[GPUCompiler]]
+deps = ["Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "TimerOutputs"]
+git-tree-sha1 = "5275aa268ecd09640b32560e1eae90c78816e4d1"
+uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
+version = "0.2.0"
+
+[[IRTools]]
+deps = ["InteractiveUtils", "MacroTools", "Test"]
+git-tree-sha1 = "90ee39f9beaaa186e4968417ea2b8ed5673c91c0"
+uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
+version = "0.3.3"

 [[InteractiveUtils]]
-deps = ["LinearAlgebra", "Markdown"]
+deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

 [[Juno]]
-deps = ["Base64", "Logging", "Media", "Profile", "Test"]
-git-tree-sha1 = "4e4a8d43aa7ecec66cadaf311fbd1e5c9d7b9175"
+deps = ["Base64", "Logging", "Media", "Profile"]
+git-tree-sha1 = "a686b0cf235fa3e491b79b4783c2d2382292b436"
 uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
-version = "0.7.0"
+version = "0.8.2"
+
+[[LLVM]]
+deps = ["CEnum", "Libdl", "Printf", "Unicode"]
+git-tree-sha1 = "dd3f584c3dbefe39b2a8fbafa1a3b77e31e21255"
+uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
+version = "1.5.1"

 [[LibGit2]]
+deps = ["Printf"]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"

 [[Libdl]]
@ -123,10 +220,10 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

 [[MacroTools]]
-deps = ["Compat"]
-git-tree-sha1 = "3fd1a3022952128935b449c33552eb65895380c1"
+deps = ["Markdown", "Random"]
+git-tree-sha1 = "f7d2e3f654af75f01ec49be82c231c382214223a"
 uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
-version = "0.4.5"
+version = "0.5.5"

 [[Markdown]]
 deps = ["Base64"]
@ -139,34 +236,38 @@ uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27"
 version = "0.5.0"

 [[Missings]]
-deps = ["Dates", "InteractiveUtils", "SparseArrays", "Test"]
-git-tree-sha1 = "d1d2585677f2bd93a97cfeb8faa7a0de0f982042"
+deps = ["DataAPI"]
+git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5"
 uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
-version = "0.4.0"
+version = "0.4.3"

 [[Mmap]]
 uuid = "a63ad114-7e13-5084-954f-fe012c677804"

 [[NNlib]]
-deps = ["Libdl", "LinearAlgebra", "MacroTools", "Requires", "Test"]
-git-tree-sha1 = "d07ac0bfd3c71c3a29bc9c22becbba19227bbeb5"
+deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"]
+git-tree-sha1 = "d9f196d911f55aeaff11b11f681b135980783824"
 uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-version = "0.5.0"
+version = "0.6.6"

 [[NaNMath]]
-deps = ["Compat"]
-git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2"
+git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f"
 uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
-version = "0.3.2"
+version = "0.3.3"
+
+[[OpenSpecFun_jll]]
+deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"]
+git-tree-sha1 = "d51c416559217d974a1113522d5919235ae67a87"
+uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
+version = "0.5.3+3"

 [[OrderedCollections]]
-deps = ["Random", "Serialization", "Test"]
-git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b"
+git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3"
 uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-version = "1.0.2"
+version = "1.2.0"

 [[Pkg]]
-deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
+deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"

 [[Printf]]
@ -192,10 +293,10 @@ uuid = "189a3867-3050-52da-a836-e630ba90ab69"
 version = "0.2.0"

 [[Requires]]
-deps = ["Test"]
-git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1"
+deps = ["UUIDs"]
+git-tree-sha1 = "d37400976e98018ee840e0ca4f9d20baa231dc6b"
 uuid = "ae029012-a4dd-5104-9daa-d747884805df"
-version = "0.5.2"
+version = "1.0.1"

 [[SHA]]
 uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
@ -203,10 +304,6 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 [[Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

-[[SharedArrays]]
-deps = ["Distributed", "Mmap", "Random", "Serialization"]
-uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
-
 [[Sockets]]
 uuid = "6462fe0b-24de-5631-8697-dd941f90decc"

@ -221,58 +318,70 @@ deps = ["LinearAlgebra", "Random"]
 uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

 [[SpecialFunctions]]
-deps = ["BinDeps", "BinaryProvider", "Libdl", "Test"]
-git-tree-sha1 = "0b45dc2e45ed77f445617b99ff2adf0f5b0f23ea"
+deps = ["OpenSpecFun_jll"]
+git-tree-sha1 = "d8d8b8a9f4119829410ecd706da4cc8594a1e020"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "0.7.2"
+version = "0.10.3"

 [[StaticArrays]]
-deps = ["InteractiveUtils", "LinearAlgebra", "Random", "Statistics", "Test"]
-git-tree-sha1 = "3841b39ed5f047db1162627bf5f80a9cd3e39ae2"
+deps = ["LinearAlgebra", "Random", "Statistics"]
+git-tree-sha1 = "5c06c0aeb81bef54aed4b3f446847905eb6cbda0"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "0.10.3"
+version = "0.12.3"

 [[Statistics]]
 deps = ["LinearAlgebra", "SparseArrays"]
 uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

 [[StatsBase]]
-deps = ["DataStructures", "DelimitedFiles", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "Test"]
-git-tree-sha1 = "435707791dc85a67d98d671c1c3fcf1b20b00f94"
+deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"]
+git-tree-sha1 = "a6102b1f364befdb05746f386b67c6b7e3262c45"
 uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
-version = "0.29.0"
+version = "0.33.0"

 [[Test]]
 deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
 uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

-[[Tracker]]
-deps = ["Adapt", "DiffRules", "ForwardDiff", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Printf", "Random", "Requires", "SpecialFunctions", "Statistics", "Test"]
-git-tree-sha1 = "4eeea9f0ef9b8c7d1c5c5b1f8f68cb9b7f45d7df"
-uuid = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
-version = "0.1.0"
+[[TimerOutputs]]
+deps = ["Printf"]
+git-tree-sha1 = "f458ca23ff80e46a630922c555d838303e4b9603"
+uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
+version = "0.5.6"

 [[TranscodingStreams]]
-deps = ["Pkg", "Random", "Test"]
-git-tree-sha1 = "f42956022d8084539f1d7219f632542b0ea686ce"
+deps = ["Random", "Test"]
+git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c"
 uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
-version = "0.9.3"
-
-[[URIParser]]
-deps = ["Test", "Unicode"]
-git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69"
-uuid = "30578b45-9adc-5946-b283-645ec420af67"
-version = "0.4.0"
+version = "0.9.5"

 [[UUIDs]]
-deps = ["Random"]
+deps = ["Random", "SHA"]
 uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

 [[Unicode]]
 uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

 [[ZipFile]]
-deps = ["BinaryProvider", "Libdl", "Printf", "Test"]
-git-tree-sha1 = "4000c633efe994b2e10b31b6d91382c4b7412dac"
+deps = ["Libdl", "Printf", "Zlib_jll"]
+git-tree-sha1 = "254975fef2fc526583bb9b7c9420fe66ffe09f2f"
 uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
-version = "0.8.0"
+version = "0.9.2"
+
+[[Zlib_jll]]
+deps = ["Libdl", "Pkg"]
+git-tree-sha1 = "a2e0d558f6031002e380a90613b199e37a8565bf"
+uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
+version = "1.2.11+10"
+
+[[Zygote]]
+deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "Future", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
+git-tree-sha1 = "707ceea58e2bd0ff3077ab13a92f8355181d3ee4"
+uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
+version = "0.4.20"
+
+[[ZygoteRules]]
+deps = ["MacroTools"]
+git-tree-sha1 = "b3b4882cc9accf6731a08cc39543fbc6b669dca8"
+uuid = "700de1a5-db45-46bc-99cf-38207098b444"
+version = "0.2.0"
--- a/NEWS.md
+++ b/NEWS.md
@ -1,6 +1,42 @@
+# v0.11
+* Change to `DataLoader`'s constructor [https://github.com/FluxML/Flux.jl/pull/1152].
+* Use `DataLoader` with `NamedTuple`s, so that tensors can be accessed by name [https://github.com/FluxML/Flux.jl/pull/1221].
+* Error if Dense layers weights and biases are not arrays [https://github.com/FluxML/Flux.jl/pull/1218].
+
+# v0.10.5
+* Add option for [same padding](https://github.com/FluxML/Flux.jl/pull/901) to conv and pooling layers by setting `pad=SamePad()`.
+* Added option to set `bias` to [Flux.Zeros](https://github.com/FluxML/Flux.jl/pull/873) to eliminate `bias` from being trained.
+* Added `GlobalMaxPool` and `GlobalMeanPool` [layers](https://github.com/FluxML/Flux.jl/pull/950) for performing global pooling operations.
+* Added `ClipValue` and `ClipNorm` in this [pr](https://github.com/FluxML/Flux.jl/pull/1133) to `Flux.Optimise` to provide a cleaner API for gradient clipping.
+* Added new kwarg-only [constructors](https://github.com/FluxML/Flux.jl/pull/873) for the various convolutional layers.
+* Documented the convolutional layer constructors accepting `weight` and `bias` keyword arguments to supply custom arrays for those fields.
+* Testing suite improvements now test for gradients of all layers along with GPU support.
+* Functors have now moved to [Functors.jl](https://github.com/FluxML/Flux.jl/pull/1174) to allow for their use outside of Flux.
+* Added [helper functions](https://github.com/FluxML/Flux.jl/pull/873) `Flux.convfilter` and `Flux.depthwiseconvfilter` to construct weight arrays for convolutions outside of layer constructors so as to not have to depend on the default layers for custom implementations.
+
+# v0.10.0
+* The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669)
+  - The dependency on Tracker.jl has been removed.
+  - This means Flux now does not depend on using a specialised `TrackedArray` type, and can be used with normal Array implementations directly.
+  - Tracker compatibility is maintained in most common cases, but Zygote will be the preferred AD backend for Flux from now on.
+* The CUDNN wrappers have been [moved from Flux into CuArrays](https://github.com/FluxML/Flux.jl/pull/874), to allow for better supporting the CUDA backend, and improve user experience, not to mention making Flux lean.
+* `*crossentropy` functions now [work as expected with CuArrays](https://github.com/FluxML/Flux.jl/pull/926). [PR for binarycrossentropy](https://github.com/FluxML/Flux.jl/pull/940).
+* Added [clearer docs](https://github.com/FluxML/Flux.jl/pull/904) around training and the Optimiser interface.
+* [Layer initialisations](https://github.com/FluxML/Flux.jl/pull/937) have been improved with a clearer API on how to extend it for other purposes.
+* [Better messaging around CUDA availability](https://github.com/FluxML/Flux.jl/pull/924), with hooks to initialize the GPU as default where possible.
+* `@treelike` has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation.
+* `testmode!` is deprecated in favour of [istraining](https://github.com/FluxML/Flux.jl/pull/669)
+
+# v0.9.0
+* [Depthwise convolutional layer API changes](https://github.com/FluxML/Flux.jl/pull/756) from `in => mult` channel specification to `in => out` channel specification, and deprecates implicit `out` constructor.
+* New [SkipConnection](https://github.com/FluxML/Flux.jl/pull/446), which can be used to train residual neural network architectures.
+* New [RADAM](https://github.com/FluxML/Flux.jl/pull/842) optimiser.
+
 # v0.8.0

+* [Dropout now has a `dims` argument for specifying the unbroadcast dimensions.](https://github.com/FluxML/Flux.jl/pull/563)
 * New [ConvTranspose layer](https://github.com/FluxML/Flux.jl/pull/311).
+* New [Maxout layer](https://github.com/FluxML/Flux.jl/pull/647)
 * Datasets are now [hash verified on download](https://github.com/FluxML/Flux.jl/pull/585) to avoid corruption.
 * We now [zero the initial state for RNNs](https://github.com/FluxML/Flux.jl/pull/590/).
 * [Normalisation can now work on arbitrary `dims`.](https://github.com/FluxML/Flux.jl/pull/592)
@ -11,6 +47,8 @@
 * New [AlphaDropout](https://github.com/FluxML/Flux.jl/pull/656).
 * [Data.Iris](https://github.com/FluxML/Flux.jl/pull/652) makes Fisher's Iris dataset available with `Iris.labels` and `Iris.features`.
 * New [InstanceNorm](https://github.com/FluxML/Flux.jl/pull/634), as popularized by [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022).
+* New [GroupNorm](https://github.com/FluxML/Flux.jl/pull/696), as described in [Group Normalization](https://arxiv.org/abs/1803.08494).
+* New [CrossCor](https://github.com/FluxML/Flux.jl/pull/762).

 AD Changes:

--- a/Project.toml
+++ b/Project.toml
@ -1,12 +1,15 @@
 name = "Flux"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+version = "0.11.0-DEV"

 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
+CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
 Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
@ -15,10 +18,34 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
-Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
 ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+
+[compat]
+AbstractTrees = "0.2, 0.3"
+Adapt = "1, 2.0"
+CodecZlib = "0.5, 0.6, 0.7"
+Colors = "0.8, 0.9, 0.10, 0.11, 0.12"
+CuArrays = "2"
+Functors = "0.1"
+Juno = "0.5, 0.6, 0.7, 0.8"
+MacroTools = "0.3, 0.4, 0.5"
+NNlib = "0.6"
+Reexport = "0.2"
+StatsBase = "0"
+ZipFile = "0.7, 0.8, 0.9"
+Zygote = "0.4.13"
+julia = "1.3"
+
+[extras]
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test", "Documenter", "IterTools", "LinearAlgebra"]
--- a/README.md
+++ b/README.md
@ -2,98 +2,14 @@
 <img width="400px" src="https://raw.githubusercontent.com/FluxML/fluxml.github.io/master/logo.png"/>
 </p>

-[![Build Status](https://travis-ci.org/FluxML/Flux.jl.svg?branch=master)](https://travis-ci.org/FluxML/Flux.jl) [![](https://img.shields.io/badge/docs-stable-blue.svg)](https://fluxml.github.io/Flux.jl/stable/) [![](https://img.shields.io/badge/chat-on%20slack-yellow.svg)](https://slackinvite.julialang.org/) [![DOI](http://joss.theoj.org/papers/10.21105/joss.00602/status.svg)](https://doi.org/10.21105/joss.00602)
+[![Build Status](https://travis-ci.org/FluxML/Flux.jl.svg?branch=master)](https://travis-ci.org/FluxML/Flux.jl) [![](https://img.shields.io/badge/docs-stable-blue.svg)](https://fluxml.github.io/Flux.jl/stable/) [![](https://img.shields.io/badge/chat-on%20slack-yellow.svg)](https://slackinvite.julialang.org/) [![DOI](https://joss.theoj.org/papers/10.21105/joss.00602/status.svg)](https://doi.org/10.21105/joss.00602)

 Flux is an elegant approach to machine learning. It's a 100% pure-Julia stack, and provides lightweight abstractions on top of Julia's native GPU and AD support. Flux makes the easy things easy while remaining fully hackable.

 ```julia
-julia> Pkg.add("Flux")
+] add Flux
 ```

-See the [documentation](http://fluxml.github.io/Flux.jl/) or the [model zoo](https://github.com/FluxML/model-zoo/) for examples.
+See the [documentation](https://fluxml.github.io/Flux.jl/) or the [model zoo](https://github.com/FluxML/model-zoo/) for examples.

-If you use Flux in research, please cite the following paper:
-
-```
-@article{innes:2018,
-  author    = {Mike Innes},
-  title     = {Flux: Elegant Machine Learning with Julia},
-  journal   = {Journal of Open Source Software},
-  year      = {2018},
-  doi       = {10.21105/joss.00602},
-}
-```
-
-## Features
-
-Flux has powerful high-level features, and common architectures can be defined in a few lines.
-
-```julia
-model = Chain(
-  Dense(768, 128, σ),
-  LSTM(128, 256),
-  LSTM(256, 128),
-  Dense(128, 10),
-  softmax)
-
-loss(x, y) = crossentropy(model(x), y)
-
-Flux.train!(loss, data, ADAM(...))
-```
-
-Yet you can easily strip away the layers, and directly write the mathematics for your problem. Flux will seamlessly take gradients of any Julia code, so your model looks just like the paper.
-
-```julia
-W = param(randn(2, 10))
-b = param(randn(2))
-
-y(x) = σ.(W * x .+ b)
-```
-
-If that's *still* not enough, you can go as deep as you want, even writing your own CUDA kernels with [CUDAnative](https://github.com/JuliaGPU/CUDAnative.jl)! All this can be freely mixed-and-matched in a single model or script, and it all runs interactively via Jupyter or Juno.
-
-```julia
-function gpu_add(a, b, c)
-  i = (blockIdx().x-1) * blockDim().x + threadIdx().x
-  c[i] = a[i] + b[i]
-  return nothing
-end
-```
-
-Unusual architectures are no problem in Flux, as you can use all the loops, control flow and even macros that you're used to. Here's a Tree RNN in 4 lines.
-
-```julia
-tree() = rand() < 0.5 ? rand(10) : (tree(), tree()) # dummy data
-
-shrink = Dense(20, 10)
-combine(a, b) = shrink([a; b])
-
-model(x) = x
-model(x::Tuple) = combine(model(x[1]), model(x[2]))
-
-model(tree()) # Sample output
-```
-
-Despite this flexibility, Julia's advanced compiler lets us do some powerful optimisations. For example, this definition of `sigmoid` automatically gets fused into a *single* GPU kernel – so it's really fast.
-
-```julia
-sigmoid(xs) = 1 ./ (1 .+ exp.(.-xs))
-```
-
-Similarly, Flux is the first dynamic framework to support [compiling to the browser](https://fluxml.github.io/experiments/) and model import via [formats like ONNX](https://github.com/FluxML/ONNX.jl/), both of which are thinly-veiled compiler problems.
-
-For more on our philosophy on machine learning, check out our article [On Machine Learning & Programming Languages](https://julialang.org/blog/2017/12/ml&pl).
-
-## Contributing & Help
-
-For general questions and help, check out Julia's [community forum](https://discourse.julialang.org/c/domain/ML).
-
-Flux development is carried out via our [GitHub issues](https://github.com/FluxML/Flux.jl/issues), so feel free to open feature requests or PRs here.
-
-For more informal discussions we'd love to have you on the [Julia slack](https://slackinvite.julialang.org/), where we hang out on the #machine-learning channel.
-
-## Related Packages
-
-Check out [Metalhead.jl](https://github.com/FluxML/Metalhead.jl) for common computer vision datasets and trained models.
-
-[MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) provides further common datasets.
+If you use Flux in your research, please [cite](CITATION.bib) our work.
--- a/12
+++ b/12
@ -1,12 +0,0 @@
-julia 1.0
-Juno
-MacroTools 0.3.3
-NNlib
-Requires
-Adapt 0.4
-CodecZlib
-Colors
-ZipFile
-AbstractTrees
-Reexport
-StatsBase
--- a/bors.toml
+++ b/bors.toml
@ -0,0 +1,4 @@
+status = [
+  "ci/gitlab%"
+]
+timeout-sec = 7200
--- a/docs/Manifest.toml
+++ b/docs/Manifest.toml
@ -1,296 +0,0 @@
-# This file is machine-generated - editing it directly is not advised
-
-[[AbstractTrees]]
-deps = ["Markdown", "Test"]
-git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b"
-uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
-version = "0.2.1"
-
-[[Adapt]]
-deps = ["LinearAlgebra", "Test"]
-git-tree-sha1 = "53d8fec4f662088c1202530e338a11a919407f3b"
-uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-version = "0.4.2"
-
-[[Base64]]
-uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
-
-[[BinDeps]]
-deps = ["Compat", "Libdl", "SHA", "URIParser"]
-git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9"
-uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
-version = "0.8.10"
-
-[[BinaryProvider]]
-deps = ["Libdl", "Pkg", "SHA", "Test"]
-git-tree-sha1 = "055eb2690182ebc31087859c3dd8598371d3ef9e"
-uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
-version = "0.5.3"
-
-[[CodecZlib]]
-deps = ["BinaryProvider", "Libdl", "Test", "TranscodingStreams"]
-git-tree-sha1 = "36bbf5374c661054d41410dc53ff752972583b9b"
-uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
-version = "0.5.2"
-
-[[ColorTypes]]
-deps = ["FixedPointNumbers", "Random", "Test"]
-git-tree-sha1 = "f73b0e10f2a5756de7019818a41654686da06b09"
-uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
-version = "0.7.5"
-
-[[Colors]]
-deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport", "Test"]
-git-tree-sha1 = "9f0a0210450acb91c730b730a994f8eef1d3d543"
-uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
-version = "0.9.5"
-
-[[CommonSubexpressions]]
-deps = ["Test"]
-git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0"
-uuid = "bbf7d656-a473-5ed7-a52c-81e309532950"
-version = "0.2.0"
-
-[[Compat]]
-deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
-git-tree-sha1 = "195a3ffcb8b0762684b6821de18f83a16455c6ea"
-uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "2.0.0"
-
-[[DataStructures]]
-deps = ["InteractiveUtils", "OrderedCollections", "Random", "Serialization", "Test"]
-git-tree-sha1 = "ca971f03e146cf144a9e2f2ce59674f5bf0e8038"
-uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.15.0"
-
-[[Dates]]
-deps = ["Printf"]
-uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
-
-[[DelimitedFiles]]
-deps = ["Mmap"]
-uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
-
-[[DiffResults]]
-deps = ["Compat", "StaticArrays"]
-git-tree-sha1 = "34a4a1e8be7bc99bc9c611b895b5baf37a80584c"
-uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
-version = "0.0.4"
-
-[[DiffRules]]
-deps = ["Random", "Test"]
-git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7"
-uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
-version = "0.0.10"
-
-[[Distributed]]
-deps = ["Random", "Serialization", "Sockets"]
-uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
-
-[[DocStringExtensions]]
-deps = ["LibGit2", "Markdown", "Pkg", "Test"]
-git-tree-sha1 = "1df01539a1c952cef21f2d2d1c092c2bcf0177d7"
-uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
-version = "0.6.0"
-
-[[Documenter]]
-deps = ["Base64", "DocStringExtensions", "InteractiveUtils", "LibGit2", "Logging", "Markdown", "Pkg", "REPL", "Random", "Test", "Unicode"]
-git-tree-sha1 = "a8c41ba3d0861240dbec942ee1d0f86c57c37c1c"
-uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
-version = "0.21.5"
-
-[[FixedPointNumbers]]
-deps = ["Test"]
-git-tree-sha1 = "b8045033701c3b10bf2324d7203404be7aef88ba"
-uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
-version = "0.5.3"
-
-[[Flux]]
-deps = ["AbstractTrees", "Adapt", "CodecZlib", "Colors", "Juno", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SHA", "Statistics", "StatsBase", "Test", "Tracker", "ZipFile"]
-path = ".."
-uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.7.3+"
-
-[[ForwardDiff]]
-deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"]
-git-tree-sha1 = "4c4d727f1b7e0092134fabfab6396b8945c1ea5b"
-uuid = "f6369f11-7733-5829-9624-2563aa707210"
-version = "0.10.3"
-
-[[InteractiveUtils]]
-deps = ["Markdown"]
-uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
-
-[[Juno]]
-deps = ["Base64", "Logging", "Media", "Profile", "Test"]
-git-tree-sha1 = "ce6246e19061e36cbdce954caaae717498daeed8"
-uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
-version = "0.5.4"
-
-[[LibGit2]]
-uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
-
-[[Libdl]]
-uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
-
-[[LinearAlgebra]]
-deps = ["Libdl"]
-uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-
-[[Logging]]
-uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
-
-[[MacroTools]]
-deps = ["Compat"]
-git-tree-sha1 = "3fd1a3022952128935b449c33552eb65895380c1"
-uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
-version = "0.4.5"
-
-[[Markdown]]
-deps = ["Base64"]
-uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
-
-[[Media]]
-deps = ["MacroTools", "Test"]
-git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58"
-uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27"
-version = "0.5.0"
-
-[[Missings]]
-deps = ["Dates", "InteractiveUtils", "SparseArrays", "Test"]
-git-tree-sha1 = "d1d2585677f2bd93a97cfeb8faa7a0de0f982042"
-uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
-version = "0.4.0"
-
-[[Mmap]]
-uuid = "a63ad114-7e13-5084-954f-fe012c677804"
-
-[[NNlib]]
-deps = ["Libdl", "LinearAlgebra", "MacroTools", "Requires", "Test"]
-git-tree-sha1 = "51330bb45927379007e089997bf548fbe232589d"
-uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-version = "0.4.3"
-
-[[NaNMath]]
-deps = ["Compat"]
-git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2"
-uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
-version = "0.3.2"
-
-[[OrderedCollections]]
-deps = ["Random", "Serialization", "Test"]
-git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b"
-uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-version = "1.0.2"
-
-[[Pkg]]
-deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
-uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-
-[[Printf]]
-deps = ["Unicode"]
-uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
-
-[[Profile]]
-deps = ["Printf"]
-uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
-
-[[REPL]]
-deps = ["InteractiveUtils", "Markdown", "Sockets"]
-uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
-
-[[Random]]
-deps = ["Serialization"]
-uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-
-[[Reexport]]
-deps = ["Pkg"]
-git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0"
-uuid = "189a3867-3050-52da-a836-e630ba90ab69"
-version = "0.2.0"
-
-[[Requires]]
-deps = ["Test"]
-git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1"
-uuid = "ae029012-a4dd-5104-9daa-d747884805df"
-version = "0.5.2"
-
-[[SHA]]
-uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
-
-[[Serialization]]
-uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
-
-[[SharedArrays]]
-deps = ["Distributed", "Mmap", "Random", "Serialization"]
-uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
-
-[[Sockets]]
-uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
-
-[[SortingAlgorithms]]
-deps = ["DataStructures", "Random", "Test"]
-git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd"
-uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
-version = "0.3.1"
-
-[[SparseArrays]]
-deps = ["LinearAlgebra", "Random"]
-uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
-
-[[SpecialFunctions]]
-deps = ["BinDeps", "BinaryProvider", "Libdl", "Test"]
-git-tree-sha1 = "0b45dc2e45ed77f445617b99ff2adf0f5b0f23ea"
-uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "0.7.2"
-
-[[StaticArrays]]
-deps = ["InteractiveUtils", "LinearAlgebra", "Random", "Statistics", "Test"]
-git-tree-sha1 = "3841b39ed5f047db1162627bf5f80a9cd3e39ae2"
-uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "0.10.3"
-
-[[Statistics]]
-deps = ["LinearAlgebra", "SparseArrays"]
-uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-
-[[StatsBase]]
-deps = ["DataStructures", "DelimitedFiles", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "Test"]
-git-tree-sha1 = "435707791dc85a67d98d671c1c3fcf1b20b00f94"
-uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
-version = "0.29.0"
-
-[[Test]]
-deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
-uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-
-[[Tracker]]
-deps = ["Adapt", "DiffRules", "ForwardDiff", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Printf", "Random", "Requires", "SpecialFunctions", "Statistics", "Test"]
-git-tree-sha1 = "4eeea9f0ef9b8c7d1c5c5b1f8f68cb9b7f45d7df"
-uuid = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
-version = "0.1.0"
-
-[[TranscodingStreams]]
-deps = ["Pkg", "Random", "Test"]
-git-tree-sha1 = "90f845c65c50bc57d6ffc815dbab2a4003ccf75c"
-uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
-version = "0.9.1"
-
-[[URIParser]]
-deps = ["Test", "Unicode"]
-git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69"
-uuid = "30578b45-9adc-5946-b283-645ec420af67"
-version = "0.4.0"
-
-[[UUIDs]]
-deps = ["Random", "SHA"]
-uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
-
-[[Unicode]]
-uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
-
-[[ZipFile]]
-deps = ["BinaryProvider", "Libdl", "Printf", "Test"]
-git-tree-sha1 = "4000c633efe994b2e10b31b6d91382c4b7412dac"
-uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
-version = "0.8.0"
--- a/docs/Project.toml
+++ b/docs/Project.toml
@ -1,4 +1,6 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
-Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
+
+[compat]
+Documenter = "0.24"
--- a/docs/make.jl
+++ b/docs/make.jl
@ -1,27 +1,36 @@
 using Documenter, Flux, NNlib

+DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
 makedocs(modules=[Flux, NNlib],
-         doctest = true,
-         analytics = "UA-36890222-9",
+         doctest = VERSION >= v"1.4",
         sitename = "Flux",
-         # Uncomment below for local build
-         #format = Documenter.HTML(prettyurls = false),
-         assets = ["assets/flux.css"],
         pages = ["Home" => "index.md",
                  "Building Models" =>
                    ["Basics" => "models/basics.md",
                     "Recurrence" => "models/recurrence.md",
                     "Regularisation" => "models/regularisation.md",
-                     "Model Reference" => "models/layers.md"],
+                     "Model Reference" => "models/layers.md",
+                     "Advanced Model Building" => "models/advanced.md",
+                     "NNlib" => "models/nnlib.md"],
+                  "Handling Data" =>
+                    ["One-Hot Encoding" => "data/onehot.md",
+                     "DataLoader" => "data/dataloader.md"],
                  "Training Models" =>
                    ["Optimisers" => "training/optimisers.md",
                     "Training" => "training/training.md"],
-                  "One-Hot Encoding" => "data/onehot.md",
                  "GPU Support" => "gpu.md",
                  "Saving & Loading" => "saving.md",
+                  "The Julia Ecosystem" => "ecosystem.md",
+                  "Utility Functions" => "utilities.md",
                  "Performance Tips" => "performance.md",
-                  "Internals" =>
-                    ["Backpropagation" => "internals/tracker.md"],
-                  "Community" => "community.md"])
+                  "Datasets" => "datasets.md",
+                  "Community" => "community.md"],
+         format = Documenter.HTML(
+             analytics = "UA-36890222-9",
+             assets = ["assets/flux.css"],
+             prettyurls = get(ENV, "CI", nothing) == "true"),
+         )

-deploydocs(repo = "github.com/FluxML/Flux.jl.git")
+deploydocs(repo = "github.com/FluxML/Flux.jl.git",
+           target = "build",
+           push_preview = true)
--- a/docs/src/community.md
+++ b/docs/src/community.md
@ -1,5 +1,5 @@
 # Community

-All Flux users are welcome to join our community on the [Julia forum](https://discourse.julialang.org/), the [slack](https://discourse.julialang.org/t/announcing-a-julia-slack/4866) (channel #machine-learning), or Flux's [Gitter](https://gitter.im/FluxML/Lobby). If you have questions or issues we'll try to help you out.
+All Flux users are welcome to join our community on the [Julia forum](https://discourse.julialang.org/), or the [slack](https://discourse.julialang.org/t/announcing-a-julia-slack/4866) (channel #machine-learning). If you have questions or issues we'll try to help you out.

 If you're interested in hacking on Flux, the [source code](https://github.com/FluxML/Flux.jl) is open and easy to understand -- it's all just the same Julia code you work with normally. You might be interested in our [intro issues](https://github.com/FluxML/Flux.jl/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) to get started.
--- a/docs/src/data/dataloader.md
+++ b/docs/src/data/dataloader.md
@ -0,0 +1,6 @@
+# DataLoader
+Flux provides the `DataLoader` type in the `Flux.Data` module to handle iteration over mini-batches of data. 
+
+```@docs
+Flux.Data.DataLoader
+```
--- a/docs/src/data/onehot.md
+++ b/docs/src/data/onehot.md
@ -7,15 +7,15 @@ julia> using Flux: onehot, onecold

 julia> onehot(:b, [:a, :b, :c])
 3-element Flux.OneHotVector:
- false
-  true
- false
+ 0
+ 1
+ 0

 julia> onehot(:c, [:a, :b, :c])
 3-element Flux.OneHotVector:
- false
- false
-  true
+ 0
+ 0
+ 1
 ```

 The inverse is `onecold` (which can take a general probability distribution, as well as just booleans).
@ -31,6 +31,11 @@ julia> onecold([0.3, 0.2, 0.5], [:a, :b, :c])
 :c
 ```

+```@docs
+Flux.onehot
+Flux.onecold
+```
+
 ## Batches

 `onehotbatch` creates a batch (matrix) of one-hot vectors, and `onecold` treats matrices as batches.
@ -52,3 +57,7 @@ julia> onecold(ans, [:a, :b, :c])
 ```

 Note that these operations returned `OneHotVector` and `OneHotMatrix` rather than `Array`s. `OneHotVector`s behave like normal vectors but avoid any unnecessary cost compared to using an integer index directly. For example, multiplying a matrix with a one-hot vector simply slices out the relevant row of the matrix under the hood.
+
+```@docs
+Flux.onehotbatch
+```
--- a/docs/src/datasets.md
+++ b/docs/src/datasets.md
@ -0,0 +1,20 @@
+# Datasets
+
+Flux includes several standard machine learning datasets.
+
+```@docs
+Flux.Data.Iris.features()
+Flux.Data.Iris.labels()
+Flux.Data.MNIST.images()
+Flux.Data.MNIST.labels()
+Flux.Data.FashionMNIST.images()
+Flux.Data.FashionMNIST.labels()
+Flux.Data.CMUDict.phones()
+Flux.Data.CMUDict.symbols()
+Flux.Data.CMUDict.rawdict()
+Flux.Data.CMUDict.cmudict()
+Flux.Data.Sentiment.train()
+Flux.Data.Sentiment.test()
+Flux.Data.Sentiment.dev()
+```
+
--- a/docs/src/ecosystem.md
+++ b/docs/src/ecosystem.md
@ -0,0 +1,21 @@
+# The Julia Ecosystem
+
+One of the main strengths of Julia lies in an ecosystem of packages 
+globally providing a rich and consistent user experience.
+
+This is a non-exhaustive list of Julia packages, nicely complementing `Flux` in typical
+machine learning and deep learning workflows:
+
+- [ArgParse.jl](https://github.com/carlobaldassi/ArgParse.jl): package for parsing command-line arguments to Julia programs.
+- [Augmentor.jl](https://github.com/Evizero/Augmentor.jl): a fast image augmentation library in Julia for machine learning.
+- [BSON.jl](https://github.com/JuliaIO/BSON.jl): package for working with the Binary JSON serialisation format
+- [DataFrames.jl](https://github.com/JuliaData/DataFrames.jl): in-memory tabular data in Julia
+- [DrWatson.jl](https://github.com/JuliaDynamics/DrWatson.jl): scientific project assistant software
+- [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl): utility package for accessing common machine learning datasets
+- [OnlineStats.jl](https://github.com/joshday/OnlineStats.jl): single-pass algorithms for statistics
+- [Parameters.jl](https://github.com/mauro3/Parameters.jl): types with default field values, keyword constructors and (un-)pack macros
+- [ProgressMeter.jl](https://github.com/timholy/ProgressMeter.jl): progress meters for long-running computations
+- [TensorBoardLogger.jl](https://github.com/PhilipVinc/TensorBoardLogger.jl): easy peasy logging to [tensorboard](https://www.tensorflow.org/tensorboard) in Julia
+
+
+This tight integration among Julia packages is shown in some of the examples in the [model-zoo](https://github.com/FluxML/model-zoo) repository.
--- a/docs/src/gpu.md
+++ b/docs/src/gpu.md
@ -1,14 +1,6 @@
 # GPU Support

-## Installation
-
-To get GPU support for NVIDIA graphics cards, you need to install `CuArrays.jl`
-
-**Steps needed**
-
-1. Install [NVIDIA toolkit](https://developer.nvidia.com/cuda-downloads)
-2. Install [NVIDIA cuDNN library](https://developer.nvidia.com/cudnn)
-3. In Julia's terminal run `]add CuArrays`
+NVIDIA GPU support should work out of the box on systems with CUDA and cuDNN installed. For more details see the [CuArrays](https://github.com/JuliaGPU/CuArrays.jl) readme.

 ## GPU Usage

@ -33,16 +25,16 @@ loss(x, y) # ~ 3

 Note that we convert both the parameters (`W`, `b`) and the data set (`x`, `y`) to cuda arrays. Taking derivatives and training works exactly as before.

-If you define a structured model, like a `Dense` layer or `Chain`, you just need to convert the internal parameters. Flux provides `mapleaves`, which allows you to alter all parameters of a model at once.
+If you define a structured model, like a `Dense` layer or `Chain`, you just need to convert the internal parameters. Flux provides `fmap`, which allows you to alter all parameters of a model at once.

 ```julia
 d = Dense(10, 5, σ)
-d = mapleaves(cu, d)
-d.W # Tracked CuArray
+d = fmap(cu, d)
+d.W # CuArray
 d(cu(rand(10))) # CuArray output

 m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
-m = mapleaves(cu, m)
+m = fmap(cu, m)
 d(cu(rand(10)))
 ```

@ -61,7 +53,7 @@ julia> x = rand(10) |> gpu
 0.511655

 julia> m(x)
-Tracked 5-element CuArray{Float32,1}:
+5-element CuArray{Float32,1}:
 -0.30535
 ⋮
 -0.618002
--- a/docs/src/internals/tracker.md
+++ b/docs/src/internals/tracker.md
@ -1,184 +0,0 @@
-# Flux.Tracker
-
-Backpropagation, or reverse-mode automatic differentiation, is handled by the `Flux.Tracker` module.
-
-```julia
-julia> using Flux.Tracker
-```
-
-Here we discuss some more advanced uses of this module, as well as covering its internals.
-
-## Taking Gradients
-
-In the [basics section](../models/basics.md) we covered basic usage of the `gradient` function.
-
-```julia
-using Flux.Tracker
-
-Tracker.gradient((a, b) -> a*b, 2, 3) # (3.0 (tracked), 2.0 (tracked))
-```
-
-`gradient` is actually just a thin wrapper around the backpropagator-based interface, `forward`.
-
-```julia
-using Flux.Tracker: forward
-
-y, back = forward((a, b) -> a*b, 2, 3) # (6.0 (tracked), Flux.Tracker.#9)
-
-back(1) # (3.0 (tracked), 2.0 (tracked))
-```
-
-The `forward` function returns two results. The first, `y`, is the original value of the function (perhaps with tracking applied). The second, `back`, is a new function which, given a sensitivity, returns the sensitivity of the inputs to `forward` (we call this a "backpropagator"). One use of this interface is to provide custom sensitivities when outputs are not scalar.
-
-```julia
-julia> y, back = forward((a, b) -> a.*b, [1,2,3],[4,5,6])
-(param([4.0, 10.0, 18.0]), Flux.Tracker.#9)
-
-julia> back([1,1,1])
-(param([4.0, 5.0, 6.0]), param([1.0, 2.0, 3.0]))
-```
-
-We can also take gradients in-place. This can be useful if you only care about first-order gradients.
-
-```julia
-a, b = param(2), param(3)
-
-c = a*b # 6.0 (tracked)
-
-Tracker.back!(c)
-
-Tracker.grad(a), Tracker.grad(b) # (3.0, 2.0)
-```
-
-## Tracked Arrays
-
-The `param` function converts a normal Julia array into a new object that, while behaving like an array, tracks extra information that allows us to calculate derivatives. For example, say we multiply two parameters:
-
-```julia
-julia> W = param([1 2; 3 4])
-Tracked 2×2 Array{Float64,2}:
- 1.0  2.0
- 3.0  4.0
-
-julia> x = param([5, 6])
-Tracked 2-element Array{Float64,1}:
- 5.0
- 6.0
-
-julia> y = W*x
-Tracked 2-element Array{Float64,1}:
- 17.0
- 39.0
-```
-
-The output `y` is also a `TrackedArray` object. We can now backpropagate sensitivities to `W` and `x` via the `back!` function, and see the gradients accumulated in the `W` and `x` tracked arrays:
-
-```julia
-julia> Tracker.back!(y, [1, -1])
-
-julia> W.grad
-2×2 Array{Float64,2}:
- 5.0   6.0
-5.0  -6.0
-
-julia> x.grad
-2-element Array{Float64,1}:
- -2.0
- -2.0
-```
-
-You may sometimes want to drop derivative information and just get the plain value back. You can do this by calling `Tracker.data(W)`.
-
-## Custom Gradients
-
-We can hook in to the processes above to implement custom gradients for a function or kernel. For a toy example, imagine a custom implementation of `minus`:
-
-```julia
-minus(a, b) = a - b
-```
-
-Firstly, we must tell the tracker system to stop when it sees a call to `minus`, and record it. We can do this using dispatch:
-
-```julia
-using Flux.Tracker: TrackedArray, track, @grad
-
-minus(a::TrackedArray, b::TrackedArray) = track(minus, a, b)
-```
-
-`track` takes care of building a new `Tracked` object and recording the operation on the tape. We just need to provide a gradient definition.
-
-```julia
-@grad function minus(a, b)
-  return minus(data(a), data(b)), Δ -> (Δ, -Δ)
-end
-```
-
-This is essentially just a way of overloading the `forward` function we saw above. We strip tracking from `a` and `b` so that we are calling the original definition of `minus` (otherwise, we'd just try to track the call again and hit an infinite regress).
-
-Note that in the backpropagator we don't call `data(a)`; we *do* in fact want to track this, since nest AD will take a derivative through the backpropagator itself. For example, the gradient of `*` might look like this.
-
-```julia
-@grad a * b = data(a)*data(b), Δ -> (Δ*b, a*Δ)
-```
-
-We can then calculate the first derivative of `minus` as follows:
-
-```julia
-a = param([1,2,3])
-b = param([3,2,1])
-
-c = minus(a, b)  # [-2.0 (tracked), 0.0 (tracked), 2.0 (tracked)]
-
-Tracker.back!(c, 1)
-Tracker.grad(a)  # [1.00, 1.00, 1.00]
-Tracker.grad(b)  # [-1.00, -1.00, -1.00]
-```
-
-For multi-argument functions with custom gradients, you likely want to catch not just `minus(::TrackedArray, ::TrackedArray)` but also `minus(::Array, TrackedArray)` and so on. To do so, just define those extra signatures as needed:
-
-```julia
-minus(a::AbstractArray, b::TrackedArray) = Tracker.track(minus, a, b)
-minus(a::TrackedArray, b::AbstractArray) = Tracker.track(minus, a, b)
-```
-
-## Tracked Internals
-
-All `Tracked*` objects (`TrackedArray`, `TrackedReal`) are light wrappers around the `Tracked` type, which you can access via the `.tracker` field.
-
-```julia
-julia> x.tracker
-Flux.Tracker.Tracked{Array{Float64,1}}(0x00000000, Flux.Tracker.Call{Nothing,Tuple{}}(nothing, ()), true, [5.0, 6.0], [-2.0, -2.0])
-```
-
-The `Tracker` stores the gradient of a given object, which we've seen before.
-
-```julia
-julia> x.tracker.grad
-2-element Array{Float64,1}:
- -2.0
- -2.0
-```
-
-The tracker also contains a `Call` object, which simply represents a function call that was made at some point during the forward pass. For example, the `+` call would look like this:
-
-```julia
-julia> Tracker.Call(+, 1, 2)
-Flux.Tracker.Call{Base.#+,Tuple{Int64,Int64}}(+, (1, 2))
-```
-
-In the case of the `y` we produced above, we can see that it stores the call that produced it -- that is, `W*x`.
-
-```julia
-julia> y.tracker.f
-Flux.Tracker.Call{...}(*, (param([1.0 2.0; 3.0 4.0]), param([5.0, 6.0])))
-```
-
-Notice that because the arguments to the call may also be tracked arrays, storing their own calls, this means that `Tracker` ends up forming a data structure that records everything that happened during the forward pass (often known as a *tape*).
-
-When we call `back!(y, [1, -1])`, the sensitivities `[1, -1]` simply get forwarded to `y`'s call (`*`), effectively calling
-
-```julia
-Tracker.back(*, [1, -1], W, x)
-```
-
-which in turn calculates the sensitivities of the arguments (`W` and `x`) and back-propagates through their calls. This is recursive, so it will walk the entire program graph and propagate gradients to the original model parameters.
--- a/docs/src/models/advanced.md
+++ b/docs/src/models/advanced.md
@ -0,0 +1,73 @@
+# Advanced Model Building and Customisation
+
+Here we will try and describe usage of some more advanced features that Flux provides to give more control over model building.
+
+## Customising Parameter Collection for a Model
+
+This section builds on the example `Affine` layer from the [basics](basics.md#Building-Layers-1).
+
+By default all the fields in the `Affine` type are collected as its parameters, however, in some cases it may be desired to hold other metadata in our "layers" that may not be needed for training, and are hence supposed to be ignored while the parameters are collected. With Flux, it is possible to mark the fields of our layers that are trainable in two ways.
+
+The first way of achieving this is through overloading the `trainable` function.
+
+```julia-repl
+julia> @functor Affine
+
+julia> a = Affine(rand(3,3), rand(3))
+Affine{Array{Float64,2},Array{Float64,1}}([0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955])
+
+julia> Flux.params(a) # default behavior
+Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]])
+
+julia> Flux.trainable(a::Affine) = (a.W,)
+
+julia> Flux.params(a)
+Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297]])
+```
+
+Only the fields returned by `trainable` will be collected as trainable parameters of the layer when calling `Flux.params`.
+
+Another way of achieving this is through the `@functor` macro directly. Here, we can mark the fields we are interested in by grouping them in the second argument:
+
+```julia
+Flux.@functor Affine (W,)
+```
+
+However, doing this requires the `struct` to have a corresponding constructor that accepts those parameters.
+
+## Freezing Layer Parameters
+
+When it is desired to not include all the model parameters (e.g. for transfer learning), we can simply not pass those layers into our call to `params`.
+
+Consider a simple multi-layer perceptron model where we want to avoid optimising the first two `Dense` layers. We can achieve
+this using the slicing features `Chain` provides:
+
+```julia
+m = Chain(
+      Dense(784, 64, relu),
+      Dense(64, 64, relu),
+      Dense(64, 10)
+    )
+
+ps = Flux.params(m[3:end])
+```
+
+The `Zygote.Params` object `ps` now holds a reference to only the parameters of the layers passed to it.
+
+During training, the gradients will only be computed for (and applied to) the last `Dense` layer, therefore only that would have its parameters changed.
+
+`Flux.params` also takes multiple inputs to make it easy to collect parameters from heterogeneous models with a single call. A simple demonstration would be if we wanted to omit optimising the second `Dense` layer in the previous example. It would look something like this:
+
+```julia
+Flux.params(m[1], m[3:end])
+```
+
+Sometimes, more fine-grained control is needed.
+We can freeze a specific parameter of a specific layer that is already part of a `Params` object `ps`
+by simply deleting it from `ps`:
+
+```julia
+ps = params(m)
+delete!(ps, m[2].b) 
+```
+
--- a/docs/src/models/basics.md
+++ b/docs/src/models/basics.md
@ -5,55 +5,54 @@
 Flux's core feature is taking gradients of Julia code. The `gradient` function takes another Julia function `f` and a set of arguments, and returns the gradient with respect to each argument. (It's a good idea to try pasting these examples in the Julia terminal.)

 ```jldoctest basics
-julia> using Flux.Tracker
+julia> using Flux

 julia> f(x) = 3x^2 + 2x + 1;

-julia> df(x) = Tracker.gradient(f, x; nest = true)[1]; # df/dx = 6x + 2
+julia> df(x) = gradient(f, x)[1]; # df/dx = 6x + 2

 julia> df(2)
-14.0 (tracked)
+14

-julia> d2f(x) = Tracker.gradient(df, x; nest = true)[1]; # d²f/dx² = 6
+julia> d2f(x) = gradient(df, x)[1]; # d²f/dx² = 6

 julia> d2f(2)
-6.0 (tracked)
+6
 ```

-(We'll learn more about why these numbers show up as `(tracked)` below.)
-
-When a function has many parameters, we can pass them all in explicitly:
+When a function has many parameters, we can get gradients of each one at the same time:

 ```jldoctest basics
-julia> f(W, b, x) = W * x + b;
+julia> f(x, y) = sum((x .- y).^2);

-julia> Tracker.gradient(f, 2, 3, 4)
-(4.0 (tracked), 1.0 (tracked), 2.0 (tracked))
+julia> gradient(f, [2, 1], [2, 0])
+([0, 2], [0, -2])
 ```

-But machine learning models can have *hundreds* of parameters! Flux offers a nice way to handle this. We can tell Flux to treat something as a parameter via `param`. Then we can collect these together and tell `gradient` to collect the gradients of all `params` at once.
+But machine learning models can have *hundreds* of parameters! To handle this, Flux lets you work with collections of parameters, via `params`. You can get the gradient of all parameters used in a program without explicitly passing them in.

 ```jldoctest basics
-julia> using Flux
+julia> x = [2, 1];

-julia> W = param(2) 
-2.0 (tracked)
+julia> y = [2, 0];

-julia> b = param(3)
-3.0 (tracked)
+julia> gs = gradient(params(x, y)) do
+         f(x, y)
+       end
+Grads(...)

-julia> f(x) = W * x + b;
+julia> gs[x]
+2-element Array{Int64,1}:
+ 0
+ 2

-julia> grads = Tracker.gradient(() -> f(4), params(W, b));
-
-julia> grads[W]
-4.0
-
-julia> grads[b]
-1.0
+julia> gs[y]
+2-element Array{Int64,1}:
+  0
+ -2
 ```

-There are a few things to notice here. Firstly, `W` and `b` now show up as *tracked*. Tracked things behave like normal numbers or arrays, but keep records of everything you do with them, allowing Flux to calculate their gradients. `gradient` takes a zero-argument function; no arguments are necessary because the `params` tell it what to differentiate.
+Here, `gradient` takes a zero-argument function; no arguments are necessary because the `params` tell it what to differentiate.

 This will come in really handy when dealing with big, complicated models. For now, though, let's start with something simple.

@ -68,34 +67,28 @@ b = rand(2)
 predict(x) = W*x .+ b

 function loss(x, y)
-  ŷ = predict(x)
-  sum((y .- ŷ).^2)
+  ŷ = predict(x)
+  sum((y .- ŷ).^2)
 end

 x, y = rand(5), rand(2) # Dummy data
 loss(x, y) # ~ 3
 ```

-To improve the prediction we can take the gradients of `W` and `b` with respect to the loss and perform gradient descent. Let's tell Flux that `W` and `b` are parameters, just like we did above.
+To improve the prediction we can take the gradients of `W` and `b` with respect to the loss and perform gradient descent.

 ```julia
-using Flux.Tracker
+using Flux

-W = param(W)
-b = param(b)
-
-gs = Tracker.gradient(() -> loss(x, y), params(W, b))
+gs = gradient(() -> loss(x, y), params(W, b))
 ```

-Now that we have gradients, we can pull them out and update `W` to train the model. The `update!(W, Δ)` function applies `W = W + Δ`, which we can use for gradient descent.
+Now that we have gradients, we can pull them out and update `W` to train the model.

 ```julia
-using Flux.Tracker: update!
+W̄ = gs[W]

-Δ = gs[W]
-
-# Update the parameter and reset the gradient
-update!(W, -0.1Δ)
+W .-= 0.1 .* W̄

 loss(x, y) # ~ 2.5
 ```
@ -111,12 +104,12 @@ It's common to create more complex models than the linear regression above. For
 ```julia
 using Flux

-W1 = param(rand(3, 5))
-b1 = param(rand(3))
+W1 = rand(3, 5)
+b1 = rand(3)
 layer1(x) = W1 * x .+ b1

-W2 = param(rand(2, 3))
-b2 = param(rand(2))
+W2 = rand(2, 3)
+b2 = rand(2)
 layer2(x) = W2 * x .+ b2

 model(x) = layer2(σ.(layer1(x)))
@ -128,8 +121,8 @@ This works but is fairly unwieldy, with a lot of repetition – especially as we

 ```julia
 function linear(in, out)
-  W = param(randn(out, in))
-  b = param(randn(out))
+  W = randn(out, in)
+  b = randn(out)
  x -> W * x .+ b
 end

@ -150,7 +143,7 @@ struct Affine
 end

 Affine(in::Integer, out::Integer) =
-  Affine(param(randn(out, in)), param(randn(out)))
+  Affine(randn(out, in), randn(out))

 # Overload call, so the object can be used as a function
 (m::Affine)(x) = m.W * x .+ m.b
@ -220,7 +213,30 @@ m(5) # => 26
 Flux provides a set of helpers for custom layers, which you can enable by calling

 ```julia
-Flux.@treelike Affine
+Flux.@functor Affine
 ```

 This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md).
+
+For some more helpful tricks, including parameter freezing, please check out the [advanced usage guide](advanced.md).
+
+## Utility functions
+
+Flux provides some utility functions to help you generate models in an automated fashion.
+
+`outdims` enables you to calculate the spatial output dimensions of layers like `Conv` when applied to input images of a given size.
+It is currently limited to the following layers:
+- `Chain`
+- `Dense`
+- `Conv`
+- `Diagonal`
+- `Maxout`
+- `ConvTranspose`
+- `DepthwiseConv`
+- `CrossCor`
+- `MaxPool`
+- `MeanPool`
+
+```@docs
+Flux.outdims
+```
--- a/docs/src/models/layers.md
+++ b/docs/src/models/layers.md
@ -14,9 +14,17 @@ These layers are used to build convolutional neural networks (CNNs).
 ```@docs
 Conv
 MaxPool
+GlobalMaxPool
 MeanPool
+GlobalMeanPool
 DepthwiseConv
 ConvTranspose
+CrossCor
+SamePad
+flatten
+Flux.Zeros
+Flux.convfilter
+Flux.depthwiseconvfilter
 ```

 ## Recurrent Layers
@ -28,6 +36,7 @@ RNN
 LSTM
 GRU
 Flux.Recur
+Flux.reset!
 ```

 ## Other General Purpose Layers
@ -36,41 +45,48 @@ But in contrast to the layers described in the other sections are not readily gr

 ```@docs
 Maxout
+SkipConnection
 ```

-# Normalisation & Regularisation
-
-These layers don't affect the structure of the network but may improve training times or reduce overfitting.
-
-```@docs
-Flux.testmode!
-BatchNorm
-Dropout
-LayerNorm
-```
-
-## Activation Functions
-
-Non-linearities that go between layers of your model. Most of these functions are defined in [NNlib](https://github.com/FluxML/NNlib.jl) but are available by default in Flux.
-
-Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on.
-
-```@docs
-σ
-relu
-leakyrelu
-elu
-swish
-```

 ## Normalisation & Regularisation

 These layers don't affect the structure of the network but may improve training times or reduce overfitting.

 ```@docs
-Flux.testmode!
+Flux.normalise
 BatchNorm
+Flux.dropout
 Dropout
 AlphaDropout
 LayerNorm
+InstanceNorm
+GroupNorm
+```
+
+### Testmode
+
+Many normalisation layers behave differently under training and inference (testing). By default, Flux will automatically determine when a layer evaluation is part of training or inference. Still, depending on your use case, it may be helpful to manually specify when these layers should be treated as being trained or not. For this, Flux provides `Flux.testmode!`. When called on a model (e.g. a layer or chain of layers), this function will place the model into the mode specified.
+
+```@docs
+Flux.testmode!
+trainmode!
+```
+
+## Cost Functions
+```@docs
+Flux.mae
+Flux.mse
+Flux.msle
+Flux.huber_loss
+Flux.crossentropy
+Flux.logitcrossentropy
+Flux.binarycrossentropy
+Flux.logitbinarycrossentropy
+Flux.kldivergence
+Flux.poisson
+Flux.hinge
+Flux.squared_hinge
+Flux.dice_coeff_loss
+Flux.tversky_loss
 ```
--- a/docs/src/models/nnlib.md
+++ b/docs/src/models/nnlib.md
@ -0,0 +1,61 @@
+# NNlib
+
+Flux re-exports all of the functions exported by the [NNlib](https://github.com/FluxML/NNlib.jl) package.
+
+## Activation Functions
+
+Non-linearities that go between layers of your model. Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on.
+
+```@docs
+NNlib.celu
+NNlib.elu
+NNlib.gelu
+NNlib.hardsigmoid
+NNlib.hardtanh
+NNlib.leakyrelu
+NNlib.lisht
+NNlib.logcosh
+NNlib.logsigmoid
+NNlib.mish
+NNlib.relu
+NNlib.relu6
+NNlib.rrelu
+NNlib.selu
+NNlib.sigmoid
+NNlib.softplus
+NNlib.softshrink
+NNlib.softsign
+NNlib.swish
+NNlib.tanhshrink
+NNlib.trelu
+```
+
+## Softmax
+
+```@docs
+NNlib.softmax
+NNlib.logsoftmax
+```
+
+## Pooling
+
+```@docs
+NNlib.maxpool
+NNlib.meanpool
+```
+
+## Convolution
+
+```@docs
+NNlib.conv
+NNlib.depthwiseconv
+```
+
+## Batched Operations
+
+```@docs
+NNlib.batched_mul
+NNlib.batched_mul!
+NNlib.batched_adjoint
+NNlib.batched_transpose
+```
--- a/docs/src/models/recurrence.md
+++ b/docs/src/models/recurrence.md
@ -77,7 +77,7 @@ If you use the `RNN(10, 5)` constructor – as opposed to `RNNCell` – you'll s

 ```julia
 julia> RNN(10, 5)
-Recur(RNNCell(Dense(15, 5)))
+Recur(RNNCell(10, 5, tanh))
 ```

 ## Sequences
@ -101,16 +101,4 @@ m = Chain(LSTM(10, 15), Dense(15, 5))
 m.(seq)
 ```

-## Truncating Gradients
-
-By default, calculating the gradients in a recurrent layer involves its entire history. For example, if we call the model on 100 inputs, we'll have to calculate the gradient for those 100 calls. If we then calculate another 10 inputs we have to calculate 110 gradients – this accumulates and quickly becomes expensive.
-
-To avoid this we can *truncate* the gradient calculation, forgetting the history.
-
-```julia
-truncate!(m)
-```
-
-Calling `truncate!` wipes the slate clean, so we can call the model with more inputs without building up an expensive gradient computation.
-
-`truncate!` makes sense when you are working with multiple chunks of a large sequence, but we may also want to work with a set of independent sequences. In this case the hidden state should be completely reset to its original value, throwing away any accumulated information. `reset!` does this for you.
+Finally, we can reset the hidden state of the cell back to its initial value using `Flux.reset!(m)`.
--- a/docs/src/models/regularisation.md
+++ b/docs/src/models/regularisation.md
@ -15,6 +15,8 @@ loss(x, y) = crossentropy(softmax(m(x)), y)
 We can regularise this by taking the (L2) norm of the parameters, `m.W` and `m.b`.

 ```julia
+using LinearAlgebra
+
 penalty() = norm(m.W) + norm(m.b)
 loss(x, y) = crossentropy(softmax(m(x)), y) + penalty()
 ```
@ -29,7 +31,7 @@ julia> params(m)
 param([0.0, 0.0, 0.0, 0.0, 0.0])

 julia> sum(norm, params(m))
-26.01749952921026 (tracked)
+26.01749952921026
 ```

 Here's a larger example with a multi-layer perceptron.
@ -48,15 +50,21 @@ loss(rand(28^2), rand(10))
 One can also easily add per-layer regularisation via the `activations` function:

 ```julia
-julia> c = Chain(Dense(10,5,σ),Dense(5,2),softmax)
-Chain(Dense(10, 5, NNlib.σ), Dense(5, 2), NNlib.softmax)
+julia> using Flux: activations
+
+julia> c = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
+Chain(Dense(10, 5, σ), Dense(5, 2), softmax)

 julia> activations(c, rand(10))
 3-element Array{Any,1}:
- param([0.71068, 0.831145, 0.751219, 0.227116, 0.553074])
- param([0.0330606, -0.456104])
- param([0.61991, 0.38009])
+ Float32[0.84682214, 0.6704139, 0.42177814, 0.257832, 0.36255655]
+ Float32[0.1501253, 0.073269576]                                 
+ Float32[0.5192045, 0.48079553]                                  

 julia> sum(norm, ans)
-2.639678767773633 (tracked)
+2.1166067f0
+```
+
+```@docs
+Flux.activations
 ```
--- a/docs/src/performance.md
+++ b/docs/src/performance.md
@ -4,7 +4,7 @@ All the usual [Julia performance tips apply](https://docs.julialang.org/en/v1/ma
 As always [profiling your code](https://docs.julialang.org/en/v1/manual/profile/#Profiling-1) is generally a useful way of finding bottlenecks.
 Below follow some Flux specific tips/reminders.

-## Don't use more precision than you need.
+## Don't use more precision than you need

 Flux works great with all kinds of number types.
 But often you do not need to be working with say `Float64` (let alone `BigFloat`).
@ -14,11 +14,12 @@ Which means allocations occur much faster.
 And you use less memory.


-## Make sure your custom activation functions preserve the type of their inputs
-Not only should your activation functions be [type-stable](https://docs.julialang.org/en/v1/manual/performance-tips/#Write-%22type-stable%22-functions-1),
+## Preserve inputs' types
+
+Not only should your activation and loss functions be [type-stable](https://docs.julialang.org/en/v1/manual/performance-tips/#Write-%22type-stable%22-functions-1),
 they should also preserve the type of their inputs.

-A very artificial example using an activatioon function like
+A very artificial example using an activation function like

 ```
    my_tanh(x) = Float64(tanh(x))
@ -26,33 +27,32 @@ A very artificial example using an activatioon function like

 will result in performance on `Float32` input orders of magnitude slower than the normal `tanh` would,
 because it results in having to use slow mixed type multiplication in the dense layers.
+Similar situations can occur in the loss function during backpropagation.

 Which means if you change your data say from `Float64` to `Float32` (which should give a speedup: see above),
-you will see a large slow-down
+you will see a large slow-down.

 This can occur sneakily, because you can cause type-promotion by interacting with a numeric literals.
 E.g. the following will have run into the same problem as above:

 ```
-    leaky_tanh(x) = 0.01x + tanh(x)
+    leaky_tanh(x) = 0.01*x + tanh(x)
 ```

-While one could change your activation function (e.g. to use `0.01f0x`) to avoid this when ever your inputs change,
-the idiomatic (and safe way) is to use `oftype`.
-
+While one could change the activation function (e.g. to use `0.01f0*x`), the idiomatic (and safe) way to avoid type casts whenever the inputs change is to use `oftype`:
 ```
-    leaky_tanh(x) = oftype(x/1, 0.01) + tanh(x)
+    leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x)
 ```


-## Evaluate batches as Matrices of features, rather than sequences of Vector features
+## Evaluate batches as Matrices of features

 While it can sometimes be tempting to process your observations (feature vectors) one at a time
 e.g.
 ```julia
 function loss_total(xs::AbstractVector{<:Vector}, ys::AbstractVector{<:Vector})
    sum(zip(xs, ys)) do (x, y_target)
-        y_pred = model(x) #  evaluate the model
+        y_pred = model(x)  # evaluate the model
        return loss(y_pred, y_target)
    end
 end
@ -60,7 +60,7 @@ end

 It is much faster to concatenate them into a matrix,
 as this will hit BLAS matrix-matrix multiplication, which is much faster than the equivalent sequence of matrix-vector multiplications.
-Even though this means allocating new memory to store them contiguously.
+The improvement is enough that it is worthwhile allocating new memory to store them contiguously.

 ```julia
 x_batch = reduce(hcat, xs)
@ -73,4 +73,4 @@ end
 ```

 When doing this kind of concatenation use `reduce(hcat, xs)` rather than `hcat(xs...)`.
-This will avoid the splatting penality, and will hit the optimised `reduce` method.
+This will avoid the splatting penalty, and will hit the optimised `reduce` method.
--- a/docs/src/saving.md
+++ b/docs/src/saving.md
@ -53,7 +53,7 @@ julia> using Flux
 julia> model = Chain(Dense(10,5,relu),Dense(5,2),softmax)
 Chain(Dense(10, 5, NNlib.relu), Dense(5, 2), NNlib.softmax)

-julia> weights = Tracker.data.(params(model));
+julia> weights = params(model);

 julia> using BSON: @save

@ -113,6 +113,6 @@ You can even store optimiser state alongside the model, to resume training
 exactly where you left off.

 ```julia
-opt = ADAM(params(model))
+opt = ADAM()
@save "model-$(now()).bson" model opt
 ```
--- a/docs/src/training/optimisers.md
+++ b/docs/src/training/optimisers.md
@ -3,25 +3,25 @@
 Consider a [simple linear regression](../models/basics.md). We create some dummy data, calculate a loss, and backpropagate to calculate gradients for the parameters `W` and `b`.

 ```julia
-using Flux, Flux.Tracker
+using Flux

-W = param(rand(2, 5))
-b = param(rand(2))
+W = rand(2, 5)
+b = rand(2)

-predict(x) = W*x .+ b
+predict(x) = (W * x) .+ b
 loss(x, y) = sum((predict(x) .- y).^2)

 x, y = rand(5), rand(2) # Dummy data
 l = loss(x, y) # ~ 3

 θ = Params([W, b])
-grads = Tracker.gradient(() -> loss(x, y), θ)
+grads = gradient(() -> loss(x, y), θ)
 ```

 We want to update each parameter, using the gradient, in order to improve (reduce) the loss. Here's one way to do that:

 ```julia
-using Flux.Tracker: grad, update!
+using Flux.Optimise: update!

 η = 0.1 # Learning Rate
 for p in (W, b)
@ -46,8 +46,110 @@ An optimiser `update!` accepts a parameter and a gradient, and updates the param
 All optimisers return an object that, when passed to `train!`, will update the parameters passed to it.

 ```@docs
+Flux.Optimise.update!
 Descent
 Momentum
 Nesterov
+RMSProp
 ADAM
+RADAM
+AdaMax
+ADAGrad
+ADADelta
+AMSGrad
+NADAM
+ADAMW
 ```
+
+## Optimiser Interface
+
+Flux's optimisers are built around a `struct` that holds all the optimiser parameters along with a definition of how to apply the update rule associated with it. We do this via the `apply!` function which takes the optimiser as the first argument followed by the parameter and its corresponding gradient.
+
+In this manner Flux also allows one to create custom optimisers to be used seamlessly. Let's work through this with a simple example.
+
+```julia
+mutable struct Momentum
+  eta
+  rho
+  velocity
+end
+
+Momentum(eta::Real, rho::Real) = Momentum(eta, rho, IdDict())
+```
+
+The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked.
+
+```julia
+function Flux.Optimise.apply!(o::Momentum, x, Δ)
+  η, ρ = o.eta, o.rho
+  v = get!(o.velocity, x, zero(x))::typeof(x)
+  @. v = ρ * v - η * Δ
+  @. Δ = -v
+end
+```
+
+This is the basic definition of a Momentum update rule given by:
+
+```math
+v = ρ * v - η * Δ
+w = w - v
+```
+
+The `apply!` function defines the update rule for an optimiser, given a parameter and its gradient. It returns the updated gradient. Here, the running state `v` for each parameter `x` is retrieved from the optimiser's `velocity` dictionary and then updated in place.
+
+Flux internally calls this function via the `update!` function. It shares the API with `apply!` but ensures that multiple parameters are handled gracefully.
+
+## Composing Optimisers
+
+Flux defines a special kind of optimiser simply called `Optimiser` which takes in arbitrary optimisers as input. Its behaviour is similar to the usual optimisers, but differs in that it acts by calling the optimisers listed in it sequentially. Each optimiser produces a modified gradient
+that will be fed into the next, and the resultant update will be applied to the parameter as usual. A classic use case is where adding decays is desirable. Flux defines some basic decays including `ExpDecay`, `InvDecay` etc.
+
+```julia
+opt = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), Descent())
+```
+
+Here we apply exponential decay to the `Descent` optimiser. The defaults of `ExpDecay` say that its learning rate will be decayed every 1000 steps.
+It is then applied like any optimiser.
+
+```julia
+w = randn(10, 10)
+w1 = randn(10,10)
+ps = Params([w, w1])
+
+loss(x) = Flux.mse(w * x, w1 * x)
+
+loss(rand(10)) # around 9
+
+for t = 1:10^5
+  θ = Params([w, w1])
+  θ̄ = gradient(() -> loss(rand(10)), θ)
+  Flux.Optimise.update!(opt, θ, θ̄)
+end
+
+loss(rand(10)) # around 0.9
+```
+
+In this manner it is possible to compose optimisers for some added flexibility.
+
+## Decays
+
+Similar to optimisers, Flux also defines some simple decays that can be used in conjunction with other optimisers, or standalone.
+
+```@docs
+ExpDecay
+InvDecay
+WeightDecay
+```
+
+## Gradient Clipping
+
+Gradient clipping is useful for training recurrent neural networks, which have a tendency to suffer from the exploding gradient problem. An example usage is
+
+```julia
+opt = Optimiser(ClipValue(1e-3), ADAM(1e-3))
+```
+
+```@docs
+ClipValue
+ClipNorm
+```
--- a/docs/src/training/training.md
+++ b/docs/src/training/training.md
@ -1,15 +1,16 @@
 # Training

-To actually train a model we need three things:
+To actually train a model we need four things:

 * A *objective function*, that evaluates how well a model is doing given some input data.
+* The trainable parameters of the model.
 * A collection of data points that will be provided to the objective function.
 * An [optimiser](optimisers.md) that will update the model parameters appropriately.

-With these we can call `Flux.train!`:
+With these we can call `train!`:

-```julia
-Flux.train!(objective, params, data, opt)
+```@docs
+Flux.Optimise.train!
 ```

 There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo).
@ -31,6 +32,17 @@ Flux.train!(loss, ps, data, opt)
 ```

 The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want.
+For a list of all built-in loss functions, check out the [layer reference](../models/layers.md).
+
+At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However, the target of the optimiser is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimising the calculations.
+
+## Model parameters
+
+The model to be trained must have a set of trainable parameters that are used to calculate the gradients of the objective function. In the [basics](../models/basics.md) section it is explained how to create models with such parameters. The second argument of the function `Flux.train!` must be an object containing those parameters, which can be obtained from a model `m` as `params(m)`.
+
+Such an object contains a reference to the model's parameters, not a copy, such that after their training, the model behaves according to their updated values.
+
+Handling all the parameters on a layer by layer basis is explained in the [Layer Helpers](../models/basics.md) section. Also, for freezing model parameters, see the [Advanced Usage Guide](../models/advanced.md).

 ## Datasets

@ -47,7 +59,8 @@ data = [(x, y)]
 ```julia
 data = [(x, y), (x, y), (x, y)]
 # Or equivalently
-data = Iterators.repeated((x, y), 3)
+using IterTools: ncycle
+data = ncycle([(x, y)], 3)
 ```

 It's common to load the `x`s and `y`s separately. In this case you can use `zip`:
@ -58,6 +71,14 @@ ys = [rand( 10), rand( 10), rand( 10)]
 data = zip(xs, ys)
 ```

+Training data can be conveniently partitioned for mini-batch training using the [`Flux.Data.DataLoader`](@ref) type:
+
+```julia
+X = rand(28, 28, 60000)
+Y = rand(0:9, 60000)
+data = DataLoader(X, Y, batchsize=128) 
+```
+
 Note that, by default, `train!` only loops over the data once (a single "epoch").
 A convenient way to run multiple epochs from the REPL is provided by `@epochs`.

@ -74,6 +95,10 @@ julia> @epochs 2 Flux.train!(...)
 # Train for two epochs
 ```

+```@docs
+Flux.@epochs
+```
+
 ## Callbacks

 `train!` takes an additional argument, `cb`, that's used for callbacks so that you can observe the training process. For example:
@ -93,3 +118,38 @@ evalcb() = @show(loss(test_x, test_y))
 Flux.train!(objective, ps, data, opt,
            cb = throttle(evalcb, 5))
 ```
+
+Calling `Flux.stop()` in a callback will exit the training loop early.
+
+```julia
+cb = function ()
+  accuracy() > 0.9 && Flux.stop()
+end
+```
+
+## Custom Training loops
+
+The `Flux.train!` function can be very convenient, especially for simple problems.
+It's also very flexible with the use of callbacks.
+But for some problems it's much cleaner to write your own custom training loop.
+An example follows that works similarly to the default `Flux.train!` but with no callbacks.
+You don't need callbacks if you just code the calls to your functions directly into the loop,
+e.g. in the places marked with comments.
+
+```julia
+function my_custom_train!(loss, ps, data, opt)
+  ps = Params(ps)
+  for d in data
+    gs = gradient(ps) do
+      training_loss = loss(d...)
+      # Insert whatever code you want here that needs Training loss, e.g. logging
+      return training_loss
+    end
+    # Insert whatever code you want here that needs the gradient
+    # E.g. logging with TensorBoardLogger.jl as histogram so you can see if it is becoming huge
+    update!(opt, ps, gs)
+    # Here you might like to check validation set accuracy, and break out to do early stopping
+  end
+end
+```
+You could simplify this further, for example by hard-coding in the loss function.
--- a/docs/src/utilities.md
+++ b/docs/src/utilities.md
@ -0,0 +1,49 @@
+# Utility Functions
+
+Flux contains some utility functions for working with data; these functions
+help create inputs for your models or batch your dataset.
+Other functions can be used to initialize your layers or to regularly execute
+callback functions.
+
+## Working with Data
+
+```@docs
+Flux.unsqueeze
+Flux.stack
+Flux.unstack
+Flux.chunk
+Flux.frequencies
+Flux.batch
+Flux.batchseq
+Base.rpad(v::AbstractVector, n::Integer, p)
+```
+
+## Layer Initialization
+
+These are primarily useful if you are planning to write your own layers.
+Flux initializes convolutional layers and recurrent cells with `glorot_uniform`
+by default.
+To change the default on an applicable layer, pass the desired function with the
+`init` keyword. For example:
+```jldoctest; setup = :(using Flux)
+julia> conv = Conv((3, 3), 1 => 8, relu; init=Flux.glorot_normal)
+Conv((3, 3), 1=>8, relu)
+```
+
+```@docs
+Flux.glorot_uniform
+Flux.glorot_normal
+```
+
+## Model Abstraction
+
+```@docs
+Flux.destructure
+```
+
+## Callback Helpers
+
+```@docs
+Flux.throttle
+Flux.stop
+```
--- a/paper/paper.bib
+++ b/paper/paper.bib
@ -14,7 +14,7 @@
  journal   = {arXiv},
  volume    = {abs/11712.03112},
  year      = {2017},
-  url       = {http://arxiv.org/abs/1712.03112},
+  url       = {https://arxiv.org/abs/1712.03112},
 }

@online{MLPL,
@ -29,7 +29,7 @@
  author = {Mike Innes and others},
  title = {Generic GPU Kernels},
  year = 2017,
-  url = {http://mikeinnes.github.io/2017/08/24/cudanative.html},
+  url = {https://mikeinnes.github.io/2017/08/24/cudanative.html},
  urldate = {2018-02-16}
 }

--- a/src/Flux.jl
+++ b/src/Flux.jl
@ -3,31 +3,35 @@ module Flux
 # Zero Flux Given

 using Base: tail
-using MacroTools, Juno, Requires, Reexport, Statistics, Random
+using Statistics, Random, LinearAlgebra
+using Zygote, MacroTools, Juno, Reexport
 using MacroTools: @forward
-
-export Chain, Dense, Maxout,
-       RNN, LSTM, GRU,
-       Conv, ConvTranspose, MaxPool, MeanPool, DepthwiseConv,
-       Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm,
-       params, mapleaves, cpu, gpu, f32, f64
-
@reexport using NNlib
+using Zygote: Params, @adjoint, gradient, pullback, @nograd

-using Tracker
-using Tracker: data
-export Tracker, TrackedArray, TrackedVector, TrackedMatrix, param
+export gradient
+
+export Chain, Dense, Maxout, RNN, LSTM, GRU, SamePad, Conv, CrossCor, ConvTranspose,
+       GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, flatten,
+       DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
+       SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode!

 include("optimise/Optimise.jl")
 using .Optimise
 using .Optimise: @epochs
-export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
+export Descent, ADAM, Momentum, Nesterov, RMSProp,
  ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
-  ADAMW, InvDecay, ExpDecay, WeightDecay
+  ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay,
+  ClipValue, ClipNorm
+
+
+using CuArrays
+const use_cuda = Ref(false)

 include("utils.jl")
+include("zeros.jl")
 include("onehot.jl")
-include("treelike.jl")
+include("functor.jl")

 include("layers/stateless.jl")
 include("layers/basic.jl")
@ -37,6 +41,17 @@ include("layers/normalise.jl")

 include("data/Data.jl")

-@init @require CuArrays="3a865a2d-5b23-5a0f-bc46-62713ec82fae" include("cuda/cuda.jl")
+include("deprecations.jl")
+
+include("cuda/cuda.jl")
+
+function __init__()
+  use_cuda[] = CuArrays.functional() # Can be overridden after load with `Flux.use_cuda[] = false`
+  if CuArrays.functional()
+    if !CuArrays.has_cudnn()
+      @warn "CuArrays.jl found cuda, but did not find libcudnn. Some functionality will not be available."
+    end
+  end
+end

 end # module
--- a/src/cuda/cuda.jl
+++ b/src/cuda/cuda.jl
@ -1,38 +1,9 @@
 module CUDA

 using ..CuArrays
-import ..CuArrays.CUDAdrv: CuPtr, CU_NULL
-using Pkg.TOML

-function version_check()
-  major_version = 1
-  project = joinpath(dirname(pathof(CuArrays)), "../Project.toml")
-  project = TOML.parse(String(read(project)))
-  version = VersionNumber(get(project, "version", "0.0.0"))
-  if version.major != major_version
-    @warn """
-    Flux is only supported with CuArrays v$major_version.x.
-    Try running `] pin CuArrays@$major_version`.
-    """
-  end
-end
-
-version_check()
-
-if !applicable(CuArray{UInt8}, undef, 1)
-  (T::Type{<:CuArray})(::UndefInitializer, sz...) = T(sz...)
-end
-
-if CuArrays.libcudnn != nothing
-  if isdefined(CuArrays, :libcudnn_handle)
-    handle() = CuArrays.libcudnn_handle[]
-  else
-    handle() = CuArrays.CUDNN.handle()
-  end
-  include("curnn.jl")
-  include("cudnn.jl")
-else
-  @warn("CUDNN is not installed, some functionality will not be available.")
-end
+using CuArrays: CUDNN
+include("curnn.jl")
+include("cudnn.jl")

 end
--- a/src/cuda/cudnn.jl
+++ b/src/cuda/cudnn.jl
@ -1,228 +1,8 @@
-using .CuArrays.CUDNN: @check, libcudnn, cudnnStatus_t, cudnnTensorDescriptor_t,
-  cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc
 import ..Flux: data
-using LinearAlgebra
+import CuArrays.CUDNN: batchnorm, ∇batchnorm

-mutable struct DropoutDesc
-  ptr::Ptr{Nothing}
-  states::CuVector{UInt8}
-end
+(BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, cache = nothing) where T<:Union{Float32, Float64} =
+  BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, training = Flux.istraining()))

-Base.unsafe_convert(::Type{Ptr{Nothing}}, dd::DropoutDesc) = dd.ptr
-
-function DropoutDesc(ρ::Real; seed::Integer=0)
-  d = [C_NULL]
-  s = Csize_t[0]
-  @check ccall((:cudnnCreateDropoutDescriptor,libcudnn), cudnnStatus_t, (Ptr{Ptr{Nothing}},), d)
-  @check ccall((:cudnnDropoutGetStatesSize,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Csize_t}),handle(),s)
-  states = CuArray{UInt8}(undef, s[]) # TODO: can we drop this when ρ=0?
-  desc = DropoutDesc(d[], states)
-  @check ccall((:cudnnSetDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Nothing},Cfloat,CuPtr{Nothing},Csize_t,Culonglong),
-    desc,handle(),ρ,states,length(states),seed)
-  finalizer(desc) do x
-    @check ccall((:cudnnDestroyDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x)
-  end
-  return desc
-end
-
-const BATCHNORM_SPATIAL = 1
-const BATCHNORM_ACTIVATION = 0
-const BATCHNORM_MIN_EPS = 1e-5
-
-@inline _wsize(y) = (map(_ -> 1, size(y)[1:end-2])..., size(y)[end-1], 1)
-
-@inline _reddims(y) = (collect(1:ndims(y)-2)..., ndims(y))
-
-mutable struct BNCache
-  mean
-  ivar
-end
-
-BNCache() = BNCache(nothing, nothing)
-
-# NOTE: CuDNN supports only 4D and 5D Tensors for BatchNorm Operations
-# so reshape a 2D Tensor into 4D
-batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2},
-          running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
-          cache = nothing, alpha = T(1), beta = T(0),
-          eps = T(1e-5), training = true) where T<:Union{Float32, Float64} =
-  dropdims(batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), running_mean, running_var, momentum,
-            cache = cache, alpha = alpha, beta = beta, eps = eps, training = training), dims = (1, 2))
-
-function batchnorm(g::CuArray{T}, b::CuArray{T}, x::Union{CuArray{T, 4},CuArray{T,5}},
-                   running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
-                   cache = nothing, alpha = T(1), beta = T(0),
-                   eps = T(1e-5), training = true) where T<:Union{Float32, Float64}
-  y = similar(x)
-  cudnnBNForward!(y, g, b, x, running_mean, running_var, momentum, cache = cache,
-      alpha = alpha, beta = beta, eps = eps, training = training)
-  y
-end
-
-function cudnnBNForward!(y::CuArray{T}, g::CuArray{T}, b::CuArray{T}, x::CuArray{T},
-                        running_mean::CuArray{T}, running_var::CuArray{T},
-                        momentum; cache = nothing,
-                        alpha = T(1), beta = T(0),
-                        eps = T(1e-5), training = true) where T<:Union{Float32, Float64}
-  dims = _wsize(x)
-  if eps < BATCHNORM_MIN_EPS
-    # warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", BATCHNORM_MIN_EPS)
-    eps = BATCHNORM_MIN_EPS
-  end
-  xd = TensorDesc(x)
-  yd = TensorDesc(y)
-  gd = TensorDesc(T, dims)
-
-  if training
-
-    if cache !== nothing
-      mean = zeros(CuArray{T}, dims...)
-      ivar = ones(CuArray{T}, dims...)
-    else
-      mean = CU_NULL
-      ivar = CU_NULL
-    end
-
-    @check ccall((:cudnnBatchNormalizationForwardTraining, libcudnn), cudnnStatus_t,
-                 (cudnnHandle_t,cudnnBatchNormMode_t,
-                  Ptr{T}, Ptr{T},
-                  Ptr{Nothing}, CuPtr{T},
-                  Ptr{Nothing}, CuPtr{T},
-                  Ptr{Nothing}, CuPtr{T}, CuPtr{T},
-                  Cdouble, CuPtr{T}, CuPtr{T},
-                  Cdouble, CuPtr{T}, CuPtr{T}),
-                  handle(), BATCHNORM_SPATIAL,
-                  Ref(T(alpha)), Ref(T(beta)),
-                  xd, x,
-                  yd, y,
-                  gd, g, b,
-                  momentum, running_mean, running_var,
-                  eps, mean, ivar)
-
-    if cache !== nothing
-      cache.mean = mean
-      cache.ivar = ivar
-    end
-  else
-    @check ccall((:cudnnBatchNormalizationForwardInference, libcudnn), cudnnStatus_t,
-                 (Ptr{cudnnHandle_t},cudnnBatchNormMode_t,
-                  Ptr{T}, Ptr{T},
-                  Ptr{Nothing}, CuPtr{T},
-                  Ptr{Nothing}, CuPtr{T},
-                  Ptr{Nothing}, CuPtr{T}, CuPtr{T},
-                  CuPtr{T}, CuPtr{T},
-                  Cdouble),
-                  handle(), BATCHNORM_SPATIAL,
-                  Ref(T(alpha)), Ref(T(beta)),
-                  xd, x,
-                  yd, y,
-                  gd, g, b,
-                  running_mean, running_var,
-                  eps)
-  end
-end
-
-function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2}, dy::CuArray{T, 2},
-           running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
-           cache = nothing, eps = T(1e-5), alpha = T(1),
-           beta = T(0), training = true) where T<:Union{Float32, Float64}
-  dg, db, dx = ∇batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), reshape(dy, 1, 1, size(dy, 1),
-                          size(dy, 2)), running_mean, running_var, momentum, cache = cache, eps = eps,
-                          alpha = alpha, beta = beta, training = training)
-  (dg, db, dropdims(dx, dims = (1, 2)))
-end
-
-function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T}, dy::CuArray{T},
-                    running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
-                    cache = nothing, eps = T(1e-5), alpha = T(1),
-                    beta = T(0), training = true) where T<:Union{Float32, Float64}
-  dg = similar(g)
-  db = similar(b)
-  dx = similar(x)
-  cudnnBNBackward!(dg, g, db, dx, x, dy, running_mean, running_var, T(momentum),
-    training = training, cache = cache, eps = eps, alpha = alpha, beta = beta)
-  (dg, db, dx)
-end
-
-function cudnnBNBackward!(dg::CuArray{T}, g::CuArray{T}, db::CuArray{T},
-                          dx::CuArray{T}, x::CuArray{T}, dy::CuArray{T},
-                          running_mean::CuArray{T}, running_var::CuArray{T},
-                          momentum; cache = nothing, eps = T(1e-5),
-                          alpha = T(1), beta = T(0),
-                          dalpha = T(1), dbeta = T(0), training = true) where T<:Union{Float32, Float64}
-  if training
-    xd = TensorDesc(x)
-    dyd = TensorDesc(dy)
-    dxd = TensorDesc(dx)
-    gd = TensorDesc(T, _wsize(x))
-    if cache !== nothing
-      mean, ivar = cache.mean, cache.ivar
-      info("mean and ivar are fetched from the cache")
-    else
-      mean, ivar = CU_NULL, CU_NULL
-    end
-
-    if eps < BATCHNORM_MIN_EPS
-      eps = BATCHNORM_MIN_EPS
-    end
-
-    @check ccall((:cudnnBatchNormalizationBackward, libcudnn), cudnnStatus_t,
-                 (cudnnHandle_t,cudnnBatchNormMode_t,
-                  Ptr{T}, Ptr{T},
-                  Ptr{T}, Ptr{T},
-                  Ptr{Nothing}, CuPtr{T},
-                  Ptr{Nothing}, CuPtr{T},
-                  Ptr{Nothing}, CuPtr{T},
-                  Ptr{Nothing}, CuPtr{T}, CuPtr{T}, CuPtr{T},
-                  Cdouble, CuPtr{T}, CuPtr{T}),
-                  handle(), BATCHNORM_SPATIAL,
-                  Ref(T(alpha)), Ref(T(beta)),
-                  Ref(T(dalpha)), Ref(T(dbeta)),
-                  xd, x,
-                  dyd, dy,
-                  dxd, dx,
-                  gd, g, dg, db,
-                  eps, mean, ivar)
-  else
-    ivar = 1 ./ sqrt.(reshape(running_var, _wsize(x)) .+ eps)
-    dx .= dy .* reshape(g, _wsize(x)) .* ivar
-    dg .= squeeze(sum(dy .* (x .- reshape(running_mean, _wsize(x))) .* ivar, _reddims(dy)), dims = (1,2,4))
-    db .= squeeze(sum(dy, _reddims(dy)), dims = (1,2,4))
-  end
-end
-
-# Flux Interface
-
-(BN::Flux.BatchNorm)(x::Union{CuParam{T,2},CuParam{T,4},CuParam{T,5}}, cache = nothing) where T<:Union{Float32, Float64} =
-  batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, training = BN.active)
-
-batchnorm(g::TrackedArray, b::TrackedArray, x::TrackedArray, running_mean::CuArray{T},
-          running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} =
-  track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...)
-
-batchnorm(g::TrackedArray, b::TrackedArray, x::CuArray{T}, running_mean::CuArray{T},
-          running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} =
-  track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...)
-
-batchnorm(g::TrackedArray, b::CuArray{T}, x::TrackedArray, running_mean::CuArray{T},
-          running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} =
-  track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...)
-
-batchnorm(g::CuArray{T}, b::TrackedArray, x::CuArray{T}, running_mean::CuArray{T},
-          running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} =
-  track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...)
-
-batchnorm(g::CuArray{T}, b::TrackedArray, x::TrackedArray, running_mean::CuArray{T},
-          running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} =
-  track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...)
-
-batchnorm(g::TrackedArray, b::CuArray{T}, x::CuArray{T}, running_mean::CuArray{T},
-          running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} =
-  track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...)
-
-batchnorm(g::CuArray{T}, b::CuArray{T}, x::TrackedArray, running_mean::CuArray{T},
-          running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} =
-  track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...)
-
-@grad batchnorm(g, b, x, running_mean, running_var, momentum; kw...) =
-  batchnorm(data.((g, b, x))..., running_mean, running_var, momentum; kw...), Δ -> (nobacksies(:batchnorm, ∇batchnorm(data.((g, b, x, Δ))..., running_mean, running_var, momentum; kw...))..., nothing, nothing, nothing)
+# Zygote adjoint for `batchnorm`: the forward value comes from `batchnorm`
+# itself and the pullback delegates to `∇batchnorm`.
+@adjoint function batchnorm(g, b, x, running_mean, running_var, momentum; kw...)
+  y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...)
+  function batchnorm_pullback(Δ)
+    grads = ∇batchnorm(g, b, x, Δ, running_mean, running_var, momentum; kw...)
+    # `running_mean`, `running_var` and `momentum` receive no gradient.
+    return (grads..., nothing, nothing, nothing)
+  end
+  return y, batchnorm_pullback
+end
--- a/src/cuda/curnn.jl
+++ b/src/cuda/curnn.jl
@ -1,325 +1,90 @@
-using .CuArrays.CUDNN: @check, libcudnn, cudnnStatus_t, cudnnTensorDescriptor_t,
-  cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc
-using LinearAlgebra
-
-const RNN_RELU = 0 # Stock RNN with ReLu activation
-const RNN_TANH = 1 # Stock RNN with tanh activation
-const LSTM = 2     # LSTM with no peephole connections
-const GRU = 3      # Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1)
-
-const LINEAR_INPUT = 0
-const SKIP_INPUT = 1
-
-const UNIDIRECTIONAL = 0
-const BIDIRECTIONAL = 1
-
-const RNN_ALGO_STANDARD = 0
-const RNN_ALGO_PERSIST_STATIC = 1
-const RNN_ALGO_PERSIST_DYNAMIC = 2
-
-# param layout:
-# RNN: [weight, bias] × [input, hidden]
-# GRU: [weight, bias] × [input, hidden] × [reset, update, newmem]
-# LSTM: [weight, bias] × [input, hidden] × [input, forget, newmem, output]
-
-function params(w::CuVector, input, hidden, n = 1)
-  slice(offset, shape) = reshape(view(w, offset.+(1:prod(shape))), shape)
-  wx = slice(0, (input, hidden*n))
-  wh = slice(length(wx), (hidden, hidden*n))
-  bias = view(w, length(wx)+length(wh) .+ (1:hidden*n))
-  (wx, wh), bias
-end
-
-mutable struct RNNDesc{T}
-  mode::Int
-  input::Int
-  hidden::Int
-  params::CuVector{T}
-  weights::NTuple{2,CuMatrix{T}}
-  bias::CuVector{T}
-  ptr::Ptr{Nothing}
-end
-
-Base.unsafe_convert(::Type{Ptr{Nothing}}, d::RNNDesc) = d.ptr
-
-function rnnParamSize(T, r, input)
-  size = Csize_t[0]
-  @check ccall((:cudnnGetRNNParamsSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Ptr{Nothing},Ptr{Csize_t},Cint),
-    handle(), r, TensorDesc(T, (1,input,1)), size, cudnnDataType(T))
-  return Int(size[])÷sizeof(T)
-end
-
-ngates(mode) = [1, 1, 4, 3][mode+1]
-ngates(r::RNNDesc) = ngates(r.mode)
-
-function RNNDesc{T}(mode::Int, input::Int, hidden::Int; layers = 1) where T
-  d = [C_NULL]
-  @check ccall((:cudnnCreateRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Ptr{Nothing}},),d)
-
-  dropoutDesc = DropoutDesc(0)
-  inputMode = LINEAR_INPUT
-  direction = UNIDIRECTIONAL
-  algo = RNN_ALGO_STANDARD
-  @check ccall((:cudnnSetRNNDescriptor_v6,libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Cint,Ptr{Nothing},Cint,Cint,Cint,Cint,Cint),
-    handle(),d[],hidden,layers,dropoutDesc,inputMode,direction,mode,algo,cudnnDataType(T))
-
-  w = cuzeros(T, rnnParamSize(T, d[], input))
-  # TODO: avoid reserve allocation here
-  rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[])
-  finalizer(rd) do x
-    @check ccall((:cudnnDestroyRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x)
-  end
-  return rd
-end
-
-function rnnWorkspaceSize(r::RNNDesc, seqlen, xdesc)
-  size = Csize_t[0]
-  @check ccall((:cudnnGetRNNWorkspaceSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Ptr{Ptr{Nothing}},Ptr{Csize_t}),
-    handle(), r, seqlen, xdesc, size)
-  return Int(size[])
-end
-
-const workspace = [CuVector{UInt8}(undef, 1)]
-
-getworkspace(bytes) =
-  length(workspace[]) ≥ bytes ?
-    workspace[] :
-    (workspace[] = CuVector{UInt8}(undef, bytes))
-
-getworkspace(r::RNNDesc, seqlen, xdesc) =
-  getworkspace(rnnWorkspaceSize(r, seqlen, xdesc))
-
-function rnnTrainingReserveSize(r::RNNDesc, seqlen, xdesc)
-  size = Csize_t[0]
-  @check ccall((:cudnnGetRNNTrainingReserveSize,libcudnn), cudnnStatus_t, (Ptr{Nothing}, Ptr{Nothing}, Cint, Ptr{Ptr{Nothing}}, Ptr{Csize_t}),
-    handle(), r, seqlen, xdesc, size)
-  return Int(size[])
-end
-
-function cudnnRNNForward(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co,
-                         workspace, reserve=nothing) where T
-  if reserve == nothing
-    @check ccall((:cudnnRNNForwardInference, libcudnn), cudnnStatus_t,
-                 (Ptr{Nothing}, Ptr{Nothing}, Cint,
-                  Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
-                  Ptr{Nothing}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
-                  Ptr{Nothing}, CuPtr{T},
-                  CuPtr{Nothing}, Csize_t),
-                 handle(), rnn, seqlen,
-                 xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co,
-                 workspace, length(workspace))
-  else
-    @check ccall((:cudnnRNNForwardTraining, libcudnn), cudnnStatus_t,
-                 (Ptr{Nothing}, Ptr{Nothing}, Cint,
-                  Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
-                  CuPtr{Nothing}, Csize_t, CuPtr{Nothing}, Csize_t),
-                 handle(), rnn, seqlen,
-                 xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co,
-                 workspace, length(workspace), reserve, length(reserve))
-  end
-end
-
-xDesc(x) = [TensorDesc(eltype(x), (1, size(x, 1), size(x, 2)))]
-
-hDesc(h::Nothing) = C_NULL, CU_NULL
-hDesc(x::Integer) = (@assert x == 0; hDesc(nothing))
-function hDesc(h::CuArray)
-  TensorDesc(eltype(h), (size(h, 1), size(h, 2), 1)), h
-end
-
-# TODO: can we just manipulate strides here?
-# TODO: should use repmat, but this isn't implemented.
-hBatch(x::AbstractVector, h::CuVector) = h
-hBatch(x::AbstractMatrix, h::CuVector) = h .* cuones(1, size(x, 2))
-hBatch(x::AbstractMatrix, h::CuMatrix) = h .* cuones(1, size(h,2) == 1 ? size(x,2) : 1)
-
-function forward(rnn::RNNDesc{T}, x::CuArray{T}, h_::CuArray{T}, c_ = nothing, train = Val{false}) where T
-  h = hBatch(x, h_)
-  c = c_ == nothing ? nothing : hBatch(x, c_)
-  @assert size(x, 1) == rnn.input
-  @assert size(h, 1) == rnn.hidden
-  @assert size(x, 2) == size(h, 2)
-  seqLength = 1
-  xdesc = xDesc(x)
-  y = x isa AbstractVector ? similar(x, rnn.hidden) : similar(x, rnn.hidden, size(x, 2))
-  ho = similar(h)
-  ydesc = xDesc(y)
-  workspace = getworkspace(rnn, seqLength, xdesc)
-  reserve = train == Val{true} ?
-    CuVector{UInt8}(undef, rnnTrainingReserveSize(rnn, seqLength, xdesc)) :
-    nothing
-  co = c == nothing ? c : similar(c)
-  cudnnRNNForward(rnn, seqLength,
-                  xdesc, x,
-                  hDesc(h)...,
-                  hDesc(c)...,
-                  FilterDesc(T, (1, 1, length(rnn.params))), rnn.params,
-                  ydesc, y,
-                  hDesc(ho)...,
-                  hDesc(co)...,
-                  workspace, reserve)
-  result = c == nothing ? (y, ho) : (y, ho, co)
-  return train == Val{true} ? (reserve, result) : result
-end
-
-forwardTrain(rnn::RNNDesc{T}, x::CuArray{T}, h::CuArray{T}, c = nothing) where T =
-  forward(rnn, x, h, c, Val{true})
-
-function cudnnRNNBackwardData(rnn::RNNDesc{T}, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco,
-                              wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, rs) where T
-  @check ccall((:cudnnRNNBackwardData,libcudnn),cudnnStatus_t,
-               (Ptr{Nothing}, Ptr{Nothing}, Cint,
-                Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
-                Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing},
-                CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
-                CuPtr{Nothing}, Csize_t, CuPtr{Nothing}, Csize_t),
-               handle(), rnn, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco,
-               wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, length(ws), rs, length(rs))
-end
-
-function backwardData(rnn::RNNDesc{T}, y, dy_, dho, dco, h, c, reserve) where T
-  # Same as above, any more efficient way?
-  dy = dy_ isa Integer ? zero(y) : dy_
-  yd = xDesc(y)
-  dx = y isa AbstractVector ? similar(dy, rnn.input) : similar(dy, rnn.input, size(dy, 2))
-  dh = similar(h)
-  dc = c == nothing ? nothing : similar(c)
-  cudnnRNNBackwardData(rnn, 1,
-    yd, y, yd, dy, hDesc(dho)..., hDesc(dco)...,
-    FilterDesc(T, (1, 1, length(rnn.params))), rnn.params,
-    hDesc(h)..., hDesc(c)..., xDesc(dx), dx, hDesc(dh)..., hDesc(dc)...,
-    workspace[], reserve)
-  return c == nothing ? (dx, dh) : (dx, dh, dc)
-end
-
-backwardData(rnn, y, dy, dho, hx, reserve) =
-  backwardData(rnn, y, dy, dho, nothing, hx, nothing, reserve)
-
-function cudnnRNNBackwardWeights(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, yd, y, dwd, dw,
-                                 workspace, reserve) where T
-  @check ccall((:cudnnRNNBackwardWeights,libcudnn), cudnnStatus_t,
-               (Ptr{Nothing}, Ptr{Nothing}, Cint,  # handle, rnnDesc, seqLength
-                Ptr{Ptr{Nothing}}, CuPtr{T}, #x
-                Ptr{Nothing}, CuPtr{T}, #hx
-                Ptr{Ptr{Nothing}}, CuPtr{T}, #y
-                CuPtr{Nothing}, Csize_t, #ws
-                Ptr{Nothing}, CuPtr{T}, #dw
-                CuPtr{Nothing}, Csize_t), #rs
-               handle(), rnn, seqlen, xd, x, hd, h, yd, y,
-               workspace, length(workspace), dwd, dw, reserve, length(reserve))
-end
-
-function backwardWeights(rnn::RNNDesc{T}, x, h, y, reserve) where T
-  dw = zero(rnn.params)
-  cudnnRNNBackwardWeights(rnn, 1,
-    xDesc(x), x, hDesc(h)..., xDesc(y), y,
-    FilterDesc(T, (1, 1, length(dw))), dw,
-    workspace[], reserve)
-  return params(dw, rnn.input, rnn.hidden, ngates(rnn))
-end
-
-# Interface
-
 import ..Flux: Flux, relu
-import ..Tracker: TrackedArray
-using .CuArrays.CUDAnative
-using .CuArrays: @cuindex, cudims
+using CuArrays.CUDAnative

-function LinearAlgebra.copy_transpose!(dst::CuArray, src::CuArray)
-  function kernel(dst, src)
-    I = @cuindex dst
-    dst[I...] = src[reverse(I)...]
-    return
-  end
-  blk, thr = cudims(dst)
-  @cuda blocks=blk threads=thr kernel(dst, src)
-  return dst
-end
-
-CuParam{T,N} = Union{CuArray{T,N},TrackedArray{T,N,CuArray{T,N}}}
-CuRNN{T} = Flux.RNNCell{<:Union{typeof(tanh),typeof(relu)},<:CuParam{T,2},<:CuParam{T,1}}
-CuGRU{T} = Flux.GRUCell{<:CuParam{T,2},<:CuParam{T,1}}
-CuLSTM{T} = Flux.LSTMCell{<:CuParam{T,2},<:CuParam{T,1}}
+# Aliases for Flux recurrent cells whose array type parameters are `CuArray`s
+# with element type `T`. `CuRNN` additionally restricts the activation to
+# `tanh` or `relu`.
+CuRNN{T} = Flux.RNNCell{<:Union{typeof(tanh),typeof(relu)},<:CuArray{T,2},<:CuArray{T,1}}
+CuGRU{T} = Flux.GRUCell{<:CuArray{T,2},<:CuArray{T,1}}
+CuLSTM{T} = Flux.LSTMCell{<:CuArray{T,2},<:CuArray{T,1}}
 CuRNNs{T} = Union{CuRNN{T},CuGRU{T},CuLSTM{T}}

-function copyparams!(m::CuRNNs, d::RNNDesc)
-  Wi, Wh = d.weights
-  copy_transpose!(Wi, Flux.data(m.Wi))
-  copy_transpose!(Wh, Flux.data(m.Wh))
-  copy_transpose!(d.bias, Flux.data(m.b))
-  return
-end
-
-function RNNDesc(m::CuRNNs{T}) where T
+function CUDNN.RNNDesc(m::CuRNNs{T}) where T
  h, i = length(m.h), size(m.Wi, 2)
  mode = m isa CuRNN ?
-    (m.σ == tanh ? RNN_TANH : RNN_RELU) :
-    m isa CuGRU ? GRU : LSTM
-  r = RNNDesc{T}(mode, i, h)
+    (m.σ == tanh ? CUDNN.CUDNN_RNN_TANH : CUDNN.CUDNN_RNN_RELU) :
+    m isa CuGRU ? CUDNN.CUDNN_GRU : CUDNN.CUDNN_LSTM
+  r = CUDNN.RNNDesc{T}(mode, i, h)
  return r
 end

 const descs = WeakKeyDict()

 function desc(rnn)
-  d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = RNNDesc(rnn))
-  copyparams!(rnn, d)
+  d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = CUDNN.RNNDesc(rnn))
+  CUDNN.setweights!(d, rnn.Wi, rnn.Wh, rnn.b)
  return d
 end

-import Flux.Tracker
-import Flux.Tracker: data, istracked, track, unbroadcast, @grad, nobacksies
+import Zygote
+using Zygote: @adjoint

-istrain(m::CuRNNs, args...) = any(x -> x isa TrackedArray, (m.Wi, m.Wh, m.b, args...))
-
-function (m::CuRNN{T})(h::CuParam{T}, x::CuParam{T}) where T <: Union{Float32,Float64}
-  result = istrain(m, h, x) ?
-    track(m, x, h, m.Wi, m.Wh, m.b) :
-    forward(desc(m), x, h)
-  return result[2], result[1]
+# Run one step of the cell through `CUDNN.forward` and return
+# `(new hidden state, output)`.
+function (m::CuRNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
+  output, state = CUDNN.forward(desc(m), x, h)
+  return (state, output)
 end

-function (m::CuGRU{T})(h::CuParam{T}, x::CuParam{T}) where T <: Union{Float32,Float64}
-  result = istrain(m, h, x) ?
-    track(m, x, h, m.Wi, m.Wh, m.b) :
-    forward(desc(m), x, h)
-  return result[2], result[1]
+# Run one step of the GRU cell through `CUDNN.forward` and return
+# `(new hidden state, output)`.
+function (m::CuGRU{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
+  output, state = CUDNN.forward(desc(m), x, h)
+  return (state, output)
 end

-function (m::CuLSTM{T})(h::NTuple{2,CuParam{T}}, x::CuParam{T}) where T <: Union{Float32,Float64}
-  result = istrain(m, h, x) ?
-    track(m, x, h[1], h[2], m.Wi, m.Wh, m.b) :
-    forward(desc(m), x, h[1], h[2])
-  return (result[2], result[3]), result[1]
+# Run one step of the LSTM cell through `CUDNN.forward`. `h` holds the hidden
+# and cell state; the result is `((new hidden, new cell), output)`.
+function (m::CuLSTM{T})(h::NTuple{2,CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64}
+  hidden, cell = h
+  output, hidden′, cell′ = CUDNN.forward(desc(m), x, hidden, cell)
+  return ((hidden′, cell′), output)
 end

-(m::CuRNN{T})(h::CuParam{T}, x) where T <: Union{Float32,Float64} = m(h, CuArray{T}(x))
-(m::CuGRU{T})(h::CuParam{T}, x) where T <: Union{Float32,Float64} = m(h, CuArray{T}(x))
-(m::CuLSTM{T})(h::NTuple{2,CuParam{T}}, x) where T <: Union{Float32,Float64} = m(h, CuArray{T}(x))
+# Fallbacks for an input `x` that is not already a `CuArray{T}`: convert it with
+# `CuArray{T}(x)` and call the cell again so the `CuArray{T}` method is used.
+(m::CuRNN{T})(h::CuArray{T}, x) where T <: Union{Float32,Float64} = m(h, CuArray{T}(x))
+(m::CuGRU{T})(h::CuArray{T}, x) where T <: Union{Float32,Float64} = m(h, CuArray{T}(x))
+(m::CuLSTM{T})(h::NTuple{2,CuArray{T}}, x) where T <: Union{Float32,Float64} = m(h, CuArray{T}(x))

-@grad function (m::Union{CuRNN,CuGRU})(x, h, Wi, Wh, b)
-  reserve, result = forwardTrain(desc(m), data(x), data(h))
-  result, function (Δ)
-    y, ho = result
-    dy, dho = Δ
-    h_ = hBatch(x, data(h))
-    dx, dh = backwardData(descs[m], y, dy, dho, h_, reserve)
-    (dWi, dWh), db = backwardWeights(descs[m], data(x), h_, y, reserve)
-    nobacksies(:RNN, (dx, unbroadcast(h, dh), transpose(dWi), transpose(dWh), db))
+# Reshape `Δ` so that it has `ndims(x)` dimensions, taking the extent of each
+# kept dimension from `Δ` itself.
+function trim(x, Δ)
+  dims = ntuple(d -> size(Δ, d), Val(ndims(x)))
+  return reshape(Δ, dims)
+end
+
+# Reduce a gradient `Δ` back to the shape of `x`:
+# - same size: `Δ` is returned as is;
+# - same number of elements: `Δ` is only reshaped via `trim`;
+# - otherwise `Δ` is summed over the dimensions where `x` has extent 1, then
+#   reshaped via `trim`.
+function unbroadcast(x::AbstractArray, Δ)
+  if size(x) == size(Δ)
+    return Δ
+  elseif length(x) == length(Δ)
+    return trim(x, Δ)
+  else
+    # Dimensions that should not be reduced are mapped to `ndims(Δ)+1`, an index
+    # past the last dimension of `Δ`.
+    reduced = ntuple(i -> size(x, i) == 1 ? i : ndims(Δ)+1, Val(ndims(Δ)))
+    return trim(x, sum(Δ, dims = reduced))
+  end
+end
+
+# Normalise an incoming gradient before it is handed to the CUDNN pullback.
+# `CuArray`s and `nothing` pass through untouched, and tuples are handled
+# element by element. Any other array is broadcast-added to
+# `CuArrays.fill(0)`; presumably this materialises it as a `CuArray` (e.g. for
+# FillArrays cotangents) — confirm.
+coerce_cuda(x::Union{CuArray,Nothing}) = x
+coerce_cuda(xs::Tuple) = map(coerce_cuda, xs)
+
+function coerce_cuda(x::AbstractArray)
+  gpu_zero = CuArrays.fill(0)
+  return x .+ gpu_zero
+end
+
+# Record the gradient `x̄` of the struct `x` in the Zygote context `cx`.
+# Each field of `x` is passed to `Zygote.accum_param` together with the
+# same-named field of `x̄`. `x̄` is then accumulated into the slot returned by
+# `Zygote.grad_mut(cx, x)`, and that slot is returned.
+function struct_grad!(cx::Zygote.Context, x, x̄)
+  foreach(fieldnames(typeof(x))) do name
+    Zygote.accum_param(cx, getfield(x, name), getfield(x̄, name))
+  end
+  slot = Zygote.grad_mut(cx, x)
+  slot[] = Zygote.accum(slot[], x̄)
+  return slot
+end
+
+# Zygote adjoints for the single-state cells. One `@adjoint` method is
+# generated per cell type (`CuRNN`, `CuGRU`) via `@eval`.
+for RNN in (CuRNN, CuGRU)
+  @eval @adjoint function (m::$RNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
+    # `CUDNN.pullback` returns the forward results together with `back`.
+    (y, ho), back = CUDNN.pullback(desc(m), x, h)
+    # The primal result is ordered (new state, output).
+    (ho, y), function (Δ)
+      dho, dy = coerce_cuda(Δ) # Support FillArrays etc.
+      m̄ = back(dy, dho)
+      # The weight gradients are transposed before being recorded for the cell.
+      # Fields given as `nothing` carry no gradient.
+      dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(m̄.Wi),Wh=transpose(m̄.Wh),b=m̄.b,h=nothing))
+      # Gradients for (cell, h, x); the state gradient is reduced to `h`'s shape.
+      (dm, unbroadcast(h, m̄.h), m̄.x)
+    end
   end
 end

-@grad function (m::CuLSTM)(x, h, c, Wi, Wh, b)
-  reserve, result = forwardTrain(desc(m), data.((x, h, c))...)
-  result, function (Δ)
-    y, ho = result
-    dy, dho, dco = Δ
-    h_ = hBatch(x, data(h))
-    c_ = hBatch(x, data(c))
-    dx, dh, dc = backwardData(descs[m], y, dy, dho, dco, h_, c_, reserve)
-    (dWi, dWh), db = backwardWeights(descs[m], data(x), h_, y, reserve)
-    nobacksies(:RNN,
-      (dx, unbroadcast(h, dh), unbroadcast(c, dc),
-       transpose(dWi), transpose(dWh), db))
+# Zygote adjoint for the LSTM cell, whose state is the tuple `(h, c)`.
+@adjoint function (m::CuLSTM)((h, c)::Tuple{CuArray{T},CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64}
+  # `CUDNN.pullback` returns the forward results together with `back`.
+  (y, ho, co), back = CUDNN.pullback(desc(m), x, h, c)
+  # The primal result is ordered ((new hidden, new cell), output).
+  ((ho, co), y), function (Δ)
+    dhc, dy = coerce_cuda(Δ) # Support FillArrays etc.
+    # The state gradient may be `nothing` as a whole; expand it to one per state.
+    dho, dco = dhc === nothing ? (nothing, nothing) : dhc
+    m̄ = back(dy, dho, dco)
+    # The weight gradients are transposed before being recorded for the cell.
+    # Fields given as `nothing` carry no gradient.
+    dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(m̄.Wi),Wh=transpose(m̄.Wh),b=m̄.b,h=nothing,c=nothing))
+    # Gradients for (cell, (h, c), x); state gradients are reduced to the input shapes.
+    (dm, (unbroadcast(h, m̄.h), unbroadcast(c, m̄.c)), m̄.x)
   end
 end
--- a/src/data/Data.jl
+++ b/src/data/Data.jl
@ -3,6 +3,9 @@ module Data
 import ..Flux
 import SHA

+using Random: shuffle!
+using Base: @propagate_inbounds
+
 export CMUDict, cmudict

 deps(path...) = joinpath(@__DIR__, "..", "..", "deps", path...)
@ -26,6 +29,9 @@ function __init__()
  mkpath(deps())
 end

+include("dataloader.jl")
+export DataLoader
+
 include("mnist.jl")
 export MNIST

@ -42,4 +48,9 @@ using .Sentiment
 include("iris.jl")
 export Iris

+include("housing.jl")
+export Housing
+
+@deprecate DataLoader(x...; kws...) DataLoader(x; kws...)
+
 end
--- a/src/data/cmudict.jl
+++ b/src/data/cmudict.jl
@ -19,23 +19,40 @@ function load()
  @info "Downloading CMUDict dataset"
  mkpath(deps("cmudict"))
  for (x, hash) in suffixes_and_hashes
-    download_and_verify("$cache_prefix/http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-$version$x",
+    download_and_verify("$cache_prefix/https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-$version$x",
             deps("cmudict", "cmudict$x"), hash)
  end
 end

+"""
+    phones()
+
+Return a `Vector` containing the phones used in the CMU Pronouncing Dictionary.
+"""
 function phones()
  load()
  Symbol.(first.(split.(split(read(deps("cmudict", "cmudict.phones"),String),
                        "\n", keepempty = false), "\t")))
 end

+"""
+    symbols()
+
+Return a `Vector` containing the symbols used in the CMU Pronouncing Dictionary.
+A symbol is a phone with optional auxiliary symbols, indicating for example the
+amount of stress on the phone.
+"""
 function symbols()
  load()
  Symbol.(split(read(deps("cmudict", "cmudict.symbols"),String),
                "\n", keepempty = false))
 end

+"""
+    rawdict()
+
+Return the unfiltered CMU Pronouncing Dictionary.
+"""
 function rawdict()
  load()
  Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in
@ -44,6 +61,14 @@ end

 validword(s) = isascii(s) && occursin(r"^[\w\-\.]+$", s)

+"""
+    cmudict()
+
+Return a filtered CMU Pronouncing Dictionary.
+
+It is filtered so each word contains only ASCII characters and a combination of
+word characters (as determined by the regex engine using `\\w`), '-' and '.'.
+"""
 cmudict() = filter(p -> validword(p.first), rawdict())

 alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.']
--- a/src/data/dataloader.jl
+++ b/src/data/dataloader.jl
@ -0,0 +1,110 @@
+# Adapted from Knet's src/data.jl (author: Deniz Yuret)
+
+# Mini-batch iterator state. Instances are normally built through the keyword
+# constructor `DataLoader(data; batchsize, shuffle, partial)`.
+struct DataLoader{D}
+    data::D                 # the wrapped dataset, stored as given
+    batchsize::Int          # number of observations per mini-batch
+    nobs::Int               # total number of observations in `data`
+    partial::Bool           # whether a final, smaller mini-batch is yielded
+    imax::Int               # iteration stops once the offset reaches this value
+    indices::Vector{Int}    # observation order; shuffled in place by `iterate`
+    shuffle::Bool           # reshuffle `indices` whenever iteration restarts
+end
+
+"""
+    DataLoader(data; batchsize=1, shuffle=false, partial=true)
+
+An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations
+(except possibly the last one).
+
+Takes as input a single data tensor, or a tuple (or a named tuple) of tensors.
+The last dimension in each tensor is considered to be the observation dimension.
+
+If `shuffle=true`, shuffles the observations each time iterations are re-started.
+If `partial=false`, drops the last mini-batch if it is smaller than the batchsize.
+
+If `data` has fewer observations than `batchsize`, the batchsize is reduced to the
+number of observations and a warning is emitted. If `data` has no observations at
+all, the loader simply yields no mini-batches.
+
+Throws an `ArgumentError` if `batchsize` is not positive.
+
+The original data is preserved in the `data` field of the DataLoader.
+
+Usage example:
+
+    Xtrain = rand(10, 100)
+    train_loader = DataLoader(Xtrain, batchsize=2)
+    # iterate over 50 mini-batches of size 2
+    for x in train_loader
+        @assert size(x) == (10, 2)
+        ...
+    end
+
+    train_loader.data   # original dataset
+
+    # similar, but yielding tuples
+    train_loader = DataLoader((Xtrain,), batchsize=2)
+    for (x,) in train_loader
+        @assert size(x) == (10, 2)
+        ...
+    end
+
+    Xtrain = rand(10, 100)
+    Ytrain = rand(100)
+    train_loader = DataLoader((Xtrain, Ytrain), batchsize=2, shuffle=true)
+    for epoch in 1:100
+        for (x, y) in train_loader
+            @assert size(x) == (10, 2)
+            @assert size(y) == (2,)
+            ...
+        end
+    end
+
+    # train for 10 epochs
+    using IterTools: ncycle
+    Flux.train!(loss, ps, ncycle(train_loader, 10), opt)
+
+    # can use NamedTuple to name tensors
+    train_loader = DataLoader((images=Xtrain, labels=Ytrain), batchsize=2, shuffle=true)
+    for datum in train_loader
+        @assert size(datum.images) == (10, 2)
+        @assert size(datum.labels) == (2,)
+    end
+"""
+function DataLoader(data; batchsize=1, shuffle=false, partial=true)
+    batchsize > 0 || throw(ArgumentError("Need positive batchsize"))
+
+    n = _nobs(data)
+    # Shrink the batchsize only for non-empty data. Reducing it to 0 would make
+    # `iterate` yield empty batches forever when `partial=false` (the offset
+    # never advances) and would make `length` divide by zero.
+    if 0 < n < batchsize
+        @warn "Number of observations less than batchsize, decreasing the batchsize to $n"
+        batchsize = n
+    end
+    # Largest offset (exclusive) at which a new mini-batch may still start.
+    imax = partial ? n : n - batchsize + 1
+    DataLoader(data, batchsize, n, partial, imax, [1:n;], shuffle)
+end
+
+# Iteration protocol. The state `i` is the number of observations already
+# served; each step returns the observations `d.indices[i+1:i+batchsize]`
+# (clipped to `d.nobs`) and the new offset.
+@propagate_inbounds function Base.iterate(d::DataLoader, i=0)
+    if i >= d.imax
+        return nothing
+    end
+    # Reshuffle only at the start of a pass over the data.
+    d.shuffle && i == 0 && shuffle!(d.indices)
+    stop = min(i + d.batchsize, d.nobs)
+    batch = _getobs(d.data, d.indices[i+1:stop])
+    return (batch, stop)
+end
+
+# Number of mini-batches produced by one pass over `d`: rounded up when the
+# final partial batch is kept (`d.partial`), rounded down when it is dropped.
+function Base.length(d::DataLoader)
+    # Integer division keeps the count exact and avoids a Float64 round-trip.
+    return d.partial ? cld(d.nobs, d.batchsize) : fld(d.nobs, d.batchsize)
+end
+
+# Number of observations in an array: the extent of its last dimension.
+_nobs(data::AbstractArray) = last(size(data))
+
+# Number of observations shared by all members of a tuple / named tuple.
+# Throws `ArgumentError` for an empty container and `DimensionMismatch` when
+# the members disagree.
+function _nobs(data::Union{Tuple, NamedTuple})
+    isempty(data) && throw(ArgumentError("Need at least one data input"))
+    expected = _nobs(data[1])
+    for x in Base.tail(data)
+        if _nobs(x) != expected
+            throw(DimensionMismatch("All data should contain same number of observations"))
+        end
+    end
+    return expected
+end
+
+# Select the observations `i` along the last dimension of an array, keeping
+# every leading dimension whole.
+function _getobs(data::AbstractArray, i)
+    leading = ntuple(_ -> Colon(), Val(ndims(data) - 1))
+    return data[leading..., i]
+end
+
+# Select the observations `i` from every member of a tuple / named tuple.
+_getobs(data::Union{Tuple, NamedTuple}, i) = map(x -> _getobs(x, i), data)
+
+# Report `D`, the type of the wrapped `data`, as the iterator's element type.
+# NOTE(review): assumes every batch produced by `_getobs` has the same type as
+# `data` — confirm for non-`Array` containers.
+Base.eltype(::DataLoader{D}) where D = D
--- a/src/data/fashion-mnist.jl
+++ b/src/data/fashion-mnist.jl
@ -33,9 +33,10 @@ const TESTLABELS = joinpath(dir, "t10k-labels-idx1-ubyte")

 Load the Fashion-MNIST images.

-Each image is a 28×28 array of `Gray` colour values (see Colors.jl).
+Each image is a 28×28 array of `Gray` colour values
+(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)).

-Returns the 60,000 training images by default; pass `:test` to retreive the
+Return the 60,000 training images by default; pass `:test` to retrieve the
 10,000 test images.
 """
 function images(set = :train)
@ -49,10 +50,10 @@ end
    labels()
    labels(:test)

-Load the labels corresponding to each of the images returned from `images()`.
+Load the labels corresponding to each of the images returned from [`images()`](@ref).
 Each label is a number from 0-9.

-Returns the 60,000 training labels by default; pass `:test` to retreive the
+Return the 60,000 training labels by default; pass `:test` to retrieve the
 10,000 test labels.
 """
 function labels(set = :train)
--- a/src/data/housing.jl
+++ b/src/data/housing.jl
@ -0,0 +1,136 @@
+"""
+1. Title: Boston Housing Data
+
+2. Sources:
+   (a) Origin:  This dataset was taken from the StatLib library which is
+                maintained at Carnegie Mellon University.
+   (b) Creator:  Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the
+                 demand for clean air', J. Environ. Economics & Management,
+                 vol.5, 81-102, 1978.
+   (c) Date: July 7, 1993
+
+3. Number of Instances: 506
+
+4. Number of Attributes: 13 continuous attributes (including "class"
+                            attribute "MEDV"), 1 binary-valued attribute.
+
+5. Attribute Information:
+
+       1. CRIM      per capita crime rate by town
+       2. ZN        proportion of residential land zoned for lots over
+                    25,000 sq.ft.
+       3. INDUS     proportion of non-retail business acres per town
+       4. CHAS      Charles River dummy variable (= 1 if tract bounds
+                    river; 0 otherwise)
+       5. NOX       nitric oxides concentration (parts per 10 million)
+       6. RM        average number of rooms per dwelling
+       7. AGE       proportion of owner-occupied units built prior to 1940
+       8. DIS       weighted distances to five Boston employment centres
+       9. RAD       index of accessibility to radial highways
+       10. TAX      full-value property-tax rate per 10,000 dollars
+       11. PTRATIO  pupil-teacher ratio by town
+       12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks
+                    by town
+       13. LSTAT    % lower status of the population
+       14. MEDV     Median value of owner-occupied homes in 1000's of dollars
+
+       Downloaded From: https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data
+
+"""
+module Housing
+
+using DelimitedFiles
+using ..Data: deps, download_and_verify
+
+# Set to "https://cache.julialang.org/" once the data file is mirrored there.
+const cache_prefix = ""
+
+"""
+    load()
+
+Make sure the comma-separated copy of the dataset exists at
+`deps("housing.data")`, downloading and converting it if necessary.
+Does nothing when the file is already present.
+"""
+function load()
+    target = deps("housing.data")
+    isfile(target) && return
+
+    @info "Downloading the Boston housing Dataset"
+    # Download next to the final location, so that `target` only ever exists in
+    # its converted form, even if this function is interrupted half-way.
+    raw = deps("housing.data.raw")
+    download_and_verify("$(cache_prefix)https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
+                        raw,
+                        "baadf72995725d76efe787b664e1f083388c79ba21ef9a7990d87f774184735a")
+
+    # The upstream file separates columns with runs of spaces; rewrite it with
+    # commas so that `readdlm(..., ',')` can parse it. "w" truncates any
+    # leftover from an earlier, interrupted run instead of appending to it.
+    converted = deps("tempfile.data")
+    open(converted, "w") do fout
+        for line in eachline(raw)
+            println(fout, replace(lstrip(line), r" +" => s","))
+        end
+    end
+    mv(converted, target, force=true)
+    rm(raw, force=true)
+    return nothing
+end
+
+"""
+    targets()
+
+Get the targets for the Boston housing dataset: the last column of the data
+file, returned as a one-column `Float64` matrix with one row per example
+(506×1 for the full dataset).
+
+```julia-repl
+julia> using Flux
+
+julia> target = Flux.Data.Housing.targets();
+
+julia> summary(target)
+"506×1 Array{Float64,2}"
+
+julia> target[1]
+24.0
+```
+"""
+function targets()
+    load()
+    housing = readdlm(deps("housing.data"), ',')
+    # `:` lets the row count follow the file instead of hard-coding 506.
+    return reshape(Vector{Float64}(housing[:, end]), :, 1)
+end
+
+
+"""
+    feature_names()
+
+Get the names of the features provided in the dataset, in column order.
+"""
+function feature_names()
+    return ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"]
+end
+
+
+"""
+    features()
+
+Get the features of the Boston Housing Dataset. This is a 506x13 Matrix of Float64 datatypes.
+The values are in the order ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"].
+It has 506 examples.
+
+```julia-repl
+julia> using Flux
+
+julia> features = Flux.Data.Housing.features();
+
+julia> summary(features)
+"506×13 Array{Float64,2}"
+
+julia> features[1, :]
+13-element Array{Float64,1}:
+   0.00632
+  18.0
+   2.31
+   0.0
+   0.538
+   ⋮
+ 296.0
+  15.3
+ 396.9
+   4.98
+```
+"""
+function features()
+    load()
+    housing = readdlm(deps("housing.data"), ',')
+    return Matrix{Float64}(housing[:, 1:13])
+end
+
+
+end
--- a/src/data/iris.jl
+++ b/src/data/iris.jl
@ -1,28 +1,21 @@
-
 """
-
-    Iris
-
 Fisher's classic iris dataset.

-Measurements from 3 different species of iris: setosa, versicolor and 
-virginica.  There are 50 examples of each species.
+Measurements from 3 different species of iris: setosa, versicolor and
+virginica. There are 50 examples of each species.

-There are 4 measurements for each example: sepal length, sepal width, petal 
-length and petal width.  The measurements are in centimeters.
+There are 4 measurements for each example: sepal length, sepal width,
+petal length and petal width. The measurements are in centimeters.

 The module retrieves the data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/iris).
-
 """
 module Iris

 using DelimitedFiles
 using ..Data: deps, download_and_verify

-const cache_prefix = ""
-
 # Uncomment if the iris.data file is cached to cache.julialang.org.
-# const cache_prefix = "https://cache.julialang.org/"
+const cache_prefix = "https://cache.julialang.org/"

 function load()
    isfile(deps("iris.data")) && return
@ -34,13 +27,12 @@ function load()
 end

 """
-
    labels()

-Get the labels of the iris dataset, a 150 element array of strings listing the 
+Get the labels of the iris dataset, a 150 element array of strings listing the
 species of each example.

-```jldoctest
+```jldoctest; setup = :(Flux.Data.Iris.load())
 julia> labels = Flux.Data.Iris.labels();

 julia> summary(labels)
@ -57,14 +49,13 @@ function labels()
 end

 """
-
    features()

-Get the features of the iris dataset.  This is a 4x150 matrix of Float64 
-elements.  It has a row for each feature (sepal length, sepal width, 
+Get the features of the iris dataset. This is a 4x150 matrix of Float64
+elements. It has a row for each feature (sepal length, sepal width,
 petal length, petal width) and a column for each example.

-```jldoctest
+```jldoctest; setup = :(Flux.Data.Iris.load())
 julia> features = Flux.Data.Iris.features();

 julia> summary(features)
@ -83,6 +74,5 @@ function features()
    iris = readdlm(deps("iris.data"), ',')
    Matrix{Float64}(iris[1:end, 1:4]')
 end
+
 end
-
-
--- a/src/data/mnist.jl
+++ b/src/data/mnist.jl
@ -83,9 +83,10 @@ getfeatures(io::IO, index::Integer) = vec(getimage(io, index))

 Load the MNIST images.

-Each image is a 28×28 array of `Gray` colour values (see Colors.jl).
+Each image is a 28×28 array of `Gray` colour values
+(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)).

-Returns the 60,000 training images by default; pass `:test` to retreive the
+Return the 60,000 training images by default; pass `:test` to retrieve the
 10,000 test images.
 """
 function images(set = :train)
@ -99,10 +100,10 @@ end
    labels()
    labels(:test)

-Load the labels corresponding to each of the images returned from `images()`.
+Load the labels corresponding to each of the images returned from [`images()`](@ref).
 Each label is a number from 0-9.

-Returns the 60,000 training labels by default; pass `:test` to retreive the
+Return the 60,000 training labels by default; pass `:test` to retrieve the
 10,000 test labels.
 """
 function labels(set = :train)
--- a/src/data/sentiment.jl
+++ b/src/data/sentiment.jl
@ -1,3 +1,4 @@
+"Stanford Sentiment Treebank dataset."
 module Sentiment

 using ZipFile
@ -39,8 +40,28 @@ function gettrees(name)
  return parsetree.(ss)
 end

+"""
+    train()
+
+Return the train split of the Stanford Sentiment Treebank.
+The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
+"""
 train() = gettrees("train")
+
+"""
+    test()
+
+Return the test split of the Stanford Sentiment Treebank.
+The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
+"""
 test() = gettrees("test")
+
+"""
+    dev()
+
+Return the dev split of the Stanford Sentiment Treebank.
+The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
+"""
 dev() = gettrees("dev")

 end
--- a/src/deprecations.jl
+++ b/src/deprecations.jl
@ -0,0 +1,2 @@
+@deprecate param(x) x
+@deprecate data(x) x
--- a/src/functor.jl
+++ b/src/functor.jl
@ -0,0 +1,82 @@
+import Adapt: adapt, adapt_storage
+import Functors
+import Functors: @functor, functor, fmap
+using Zygote: IdSet
+
+# Children of `m` that `params!` recurses into when collecting parameters;
+# by default, the first element of `functor(m)`.
+trainable(m) = functor(m)[1]
+
+"""
+    testmode!(m, mode = true)
+
+Set a layer or model's test mode (see below).
+Using `:auto` mode will treat any gradient computation as training.
+
+_Note_: if you manually set a model into test mode, you need to manually place
+it back into train mode during training phase.
+
+Possible values include:
+- `false` for training
+- `true` for testing
+- `:auto` or `nothing` for Flux to detect the mode automatically
+
+The generic method defined here does nothing and returns `m` unchanged.
+"""
+testmode!(m, mode = true) = m
+
+"""
+    trainmode!(m, mode = true)
+
+Set a layer or model's train mode (see below).
+Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`).
+
+_Note_: if you manually set a model into train mode, you need to manually place
+it into test mode during testing phase.
+
+Possible values include:
+- `true` for training
+- `false` for testing
+- `:auto` or `nothing` for Flux to detect the mode automatically
+
+A `Bool` mode is negated before being forwarded to `testmode!`; any other value
+(e.g. `:auto` or `nothing`) is forwarded unchanged.
+"""
+trainmode!(m, mode = true) = mode isa Bool ? testmode!(m, !mode) : testmode!(m, mode)
+
+# Leaf case: a numeric array is itself a parameter, so it is pushed onto `p`.
+params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x)
+
+# Recursive case: walk the `trainable` children of `x`, collecting numeric arrays
+# into `p`. `seen` records the objects already visited so that each one is
+# traversed only once.
+function params!(p::Params, x, seen = IdSet())
+  x in seen && return
+  push!(seen, x)
+  for child in trainable(x)
+    params!(p, child, seen)
+  end
+end
+
+# Collect the parameters reachable from the given objects into a fresh `Params`
+# and return it. The arguments are traversed together as one tuple via `params!`.
+function params(m...)
+  ps = Params()
+  params!(ps, m)
+  return ps
+end
+
+# Deprecated stuff
+
+# `@treelike T` is the old spelling of `@functor T`; it forwards its arguments to
+# the same expansion routine. `functorm` is not among the names imported from
+# Functors at the top of this file, so it is referenced through the module here.
+macro treelike(args...)
+  Functors.functorm(args...)
+end
+mapleaves(f, x) = fmap(f, x)
+
+# Copy the arrays in `xs` into the parameters of `m`, pairing them in order with
+# `params(m)`. Raises an error when a parameter and its replacement differ in size.
+# NOTE(review): `zip` stops at the shorter of the two collections, so a mismatch in
+# the NUMBER of arrays is not reported here — confirm that this is intended.
+function loadparams!(m, xs)
+  for (p, x) in zip(params(m), xs)
+    size(p) == size(x) ||
+      error("Expected param size $(size(p)), got $(size(x))")
+    copyto!(p, x)
+  end
+end
+
+# CPU/GPU movement conveniences
+
+# Apply `adapt(Array, x)` across `m` via `fmap`.
+cpu(m) = fmap(x -> adapt(Array, x), m)
+
+# Apply `CuArrays.cu` across `x` via `fmap` when the `use_cuda[]` flag is set;
+# otherwise return `x` unchanged.
+gpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) : x
+
+# Precision
+
+# Adapting a real-valued array to a real element type `T` converts every element to `T`.
+adapt_storage(T::Type{<:Real}, xs::AbstractArray{<:Real}) = convert.(T, xs)
+
+# Apply `adapt(T, x)` across the model `m` via `fmap`.
+paramtype(T::Type{<:Real}, m) = fmap(x -> adapt(T, x), m)
+
+# Shorthands for `paramtype` with the element type fixed to `Float32` / `Float64`.
+f32(m) = paramtype(Float32, m)
+f64(m) = paramtype(Float64, m)
--- a/src/layers/basic.jl
+++ b/src/layers/basic.jl
@ -4,17 +4,23 @@
 Chain multiple layers / functions together, so that they are called in sequence
 on a given input.

-```julia
-m = Chain(x -> x^2, x -> x+1)
-m(5) == 26
-
-m = Chain(Dense(10, 5), Dense(5, 2))
-x = rand(10)
-m(x) == m[2](m[1](x))
-```
-
 `Chain` also supports indexing and slicing, e.g. `m[2]` or `m[1:end-1]`.
 `m[1:3](x)` will calculate the output of the first three layers.
+
+# Examples
+```jldoctest
+julia> m = Chain(x -> x^2, x -> x+1);
+
+julia> m(5) == 26
+true
+
+julia> m = Chain(Dense(10, 5), Dense(5, 2));
+
+julia> x = rand(10);
+
+julia> m(x) == m[2](m[1](x))
+true
+```
 """
 struct Chain{T<:Tuple}
  layers::T
@ -24,8 +30,7 @@ end
@forward Chain.layers Base.getindex, Base.length, Base.first, Base.last,
  Base.iterate, Base.lastindex

-children(c::Chain) = c.layers
-mapchildren(f, c::Chain) = Chain(f.(c.layers)...)
+functor(::Type{<:Chain}, c) = c.layers, ls -> Chain(ls...)

 applychain(::Tuple{}, x) = x
 applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x))
@ -34,35 +39,70 @@ applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x))

 Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...)

+testmode!(m::Chain, mode = true) = (map(x -> testmode!(x, mode), m.layers); m)
+
 function Base.show(io::IO, c::Chain)
  print(io, "Chain(")
  join(io, c.layers, ", ")
  print(io, ")")
 end

-activations(c::Chain, x) = accumulate((x, m) -> m(x), c.layers, init = x)
+"""
+    outdims(c::Chain, isize)
+
+Calculate the output dimensions given the input dimensions, `isize`.
+
+The size is propagated through the layers in the same order as the forward pass:
+the output size of each layer is the input size of the next one.
+
+```julia
+m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
+outdims(m, (10, 10)) == (6, 6)
+```
+"""
+outdims(c::Chain, isize) = foldl((sz, l) -> outdims(l, sz), c.layers; init = isize)
+
+# This is a temporary and naive implementation
+# it might be replaced in the future for better performance
+# see issue https://github.com/FluxML/Flux.jl/issues/702
+# Johnny Chen -- @johnnychen94
+# only slightly changed to better handle interaction with Zygote @dsweber2
+"""
+    activations(c::Chain, input)
+
+Calculate the forward results of each layer in Chain `c` with `input` as model input.
+"""
+function activations(c::Chain, input)
+    extraChain(c.layers, input)
+end
+
+# Apply the layers in `fs` one after another, starting from `x`, and return a tuple
+# holding the output of every layer, in order.
+function extraChain(fs::Tuple, x)
+    res = first(fs)(x)
+    return (res, extraChain(Base.tail(fs), res)...)
+end
+
+# Base case: no layers left, so there is nothing more to record.
+extraChain(::Tuple{}, x) = ()
+
+

 """
    Dense(in::Integer, out::Integer, σ = identity)

-Creates a traditional `Dense` layer with parameters `W` and `b`.
+Create a traditional `Dense` layer with parameters `W` and `b`.

    y = σ.(W * x .+ b)

 The input `x` must be a vector of length `in`, or a batch of vectors represented
 as an `in × N` matrix. The out `y` will be a vector or batch of length `out`.

-```julia
+# Examples
+```jldoctest; setup = :(using Random; Random.seed!(0))
 julia> d = Dense(5, 2)
 Dense(5, 2)

 julia> d(rand(5))
-Tracked 2-element Array{Float64,1}:
-  0.00257447
-  -0.00449443
-```
+2-element Array{Float32,1}:
+  -0.16210233
+   0.12311903
+```
 """
-struct Dense{F,S,T}
+struct Dense{F,S<:AbstractArray,T<:AbstractArray}
  W::S
  b::T
  σ::F
@ -72,10 +112,10 @@ Dense(W, b) = Dense(W, b, identity)

 function Dense(in::Integer, out::Integer, σ = identity;
               initW = glorot_uniform, initb = zeros)
-  return Dense(param(initW(out, in)), param(initb(out)), σ)
+  return Dense(initW(out, in), initb(out), σ)
 end

-@treelike Dense
+@functor Dense

 function (a::Dense)(x::AbstractArray)
  W, b, σ = a.W, a.b, a.σ
@ -93,13 +133,26 @@ end
 (a::Dense{<:Any,W})(x::AbstractArray{T}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
  invoke(a, Tuple{AbstractArray}, x)

-(a::Dense{<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
+(a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
  a(T.(x))

+"""
+    outdims(l::Dense, isize)
+
+Calculate the output dimensions given the input dimensions, `isize`.
+The result is read from the first dimension of the layer's weight matrix; `isize`
+itself is not inspected.
+
+```julia
+m = Dense(10, 5)
+outdims(m, (10, 2)) == (5,)
+outdims(m, (10,)) == (5,)
+```
+"""
+outdims(l::Dense, isize) = (size(l.W)[1],)
+
 """
    Diagonal(in::Integer)

-Creates an element-wise linear transformation layer with learnable
+Create an element-wise linear transformation layer with learnable
 vectors `α` and `β`:

    y = α .* x .+ β
@ -112,9 +165,9 @@ struct Diagonal{T}
 end

 Diagonal(in::Integer; initα = ones, initβ = zeros) =
-  Diagonal(param(initα(in)), param(initβ(in)))
+  Diagonal(initα(in), initβ(in))

-@treelike Diagonal
+@functor Diagonal

 function (a::Diagonal)(x)
  α, β = a.α, a.β
@ -125,22 +178,16 @@ function Base.show(io::IO, l::Diagonal)
  print(io, "Diagonal(", length(l.α), ")")
 end

+outdims(l::Diagonal, isize) = (length(l.α),)

 """
    Maxout(over)

-`Maxout` is a neural network layer, which has a number of internal layers,
-which all have the same input, and the maxout returns the elementwise maximium
-of the internal layers' outputs.
+The [Maxout](https://arxiv.org/pdf/1302.4389.pdf) layer has a number of
+internal layers which all receive the same input. It returns the elementwise
+maximum of the internal layers' outputs.

 Maxout over linear dense layers satisfies the univeral approximation theorem.
-
-Reference:
-Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville, and Yoshua Bengio.
-2013. Maxout networks.
-In Proceedings of the 30th International Conference on International Conference on Machine Learning - Volume 28 (ICML'13),
-Sanjoy Dasgupta and David McAllester (Eds.), Vol. 28. JMLR.org III-1319-III-1327.
-https://arxiv.org/pdf/1302.4389.pdf
 """
 struct Maxout{FS<:Tuple}
    over::FS
@ -149,17 +196,18 @@ end
 """
    Maxout(f, n_alts)

-Constructs a Maxout layer over `n_alts` instances of  the layer given  by `f`.
-The function takes no arguement and should return some callable layer.
-Conventionally this is a linear dense layer.
+Construct a Maxout layer over `n_alts` instances of the layer given by `f`.
+The function takes no arguments and should return some callable layer.
+Conventionally, this is a linear dense layer.

-For example the following example which
-will construct a `Maxout` layer over 4 internal dense linear layers,
-each identical in structure (784 inputs, 128 outputs).
+# Examples
+
+This constructs a `Maxout` layer over 4 internal dense linear layers, each
+identical in structure (784 inputs, 128 outputs):
 ```julia
-    insize = 784
-    outsie = 128
-    Maxout(()->Dense(insize, outsize), 4)
+insize = 784
+outsize = 128
+Maxout(()->Dense(insize, outsize), 4)
 ```
 """
 function Maxout(f, n_alts)
@ -167,6 +215,46 @@ function Maxout(f, n_alts)
  return Maxout(over)
 end

+@functor Maxout
+
 function (mo::Maxout)(input::AbstractArray)
    mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over)
 end
+
+outdims(l::Maxout, isize) = outdims(first(l.over), isize)
+
+"""
+    SkipConnection(layer, connection)
+
+Create a skip connection which consists of a layer or `Chain` of consecutive
+layers and a shortcut connection linking the block's input to the output
+through a user-supplied 2-argument callable. The first argument to the callable
+will be propagated through the given `layer` while the second is the unchanged,
+"skipped" input.
+
+The simplest "ResNet"-type connection is just `SkipConnection(layer, +)`,
+and requires the output of the layers to be the same shape as the input.
+Here is a more complicated example:
+```julia
+m = Conv((3,3), 4=>7, pad=(1,1))
+x = ones(5,5,4,10);
+size(m(x)) == (5, 5, 7, 10)
+
+sm = SkipConnection(m, (mx, x) -> cat(mx, x, dims=3))
+size(sm(x)) == (5, 5, 11, 10)
+```
+"""
+struct SkipConnection
+  layers
+  connection  #user can pass arbitrary connections here, such as (a,b) -> a + b
+end
+
+@functor SkipConnection
+
+# Run `input` through the wrapped layers, then combine that result with the
+# untouched `input` using the user-supplied `connection` callable.
+function (skip::SkipConnection)(input)
+  skip.connection(skip.layers(input), input)
+end
+
+function Base.show(io::IO, b::SkipConnection)
+  print(io, "SkipConnection(", b.layers, ", ", b.connection, ")")
+end
--- a/src/layers/conv.jl
+++ b/src/layers/conv.jl
@ -1,56 +1,140 @@
-using NNlib: conv, ∇conv_data, depthwiseconv
+using NNlib: conv, ∇conv_data, depthwiseconv, output_size

-@generated sub2(::Val{N}) where N = :(Val($(N-2)))
+# pad dims of x with dims of y until ndims(x) == ndims(y)
+_paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...)
+
+# Per-dimension output size of a transposed convolution:
+#   (isize - 1) * stride + 1 + (ksize - 1) * dilation - total padding
+# `pad` is read as interleaved pairs (two entries per dimension) that are summed.
+_convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+ (ksize .- 1).*dsize .- (pad[1:2:end] .+ pad[2:2:end])

 expand(N, i::Tuple) = i
 expand(N, i::Integer) = ntuple(_ -> i, N)

 """
-    Conv(size, in=>out)
-    Conv(size, in=>out, relu)
+    SamePad

-Standard convolutional layer. `size` should be a tuple like `(2, 2)`.
-`in` and `out` specify the number of input and output channels respectively.
+Padding for convolutional layers will be calculated so that outputshape == inputshape when stride = 1.

-Example: Applying Conv layer to a 1-channel input using a 2x2 window size,
-         giving us a 16-channel output. Output is activated with ReLU.
+For stride > 1 the output shape depends on the type of convolution layer.
+"""
+struct SamePad end

-    size = (2,2)
+# Explicit padding: an integer `pad` is repeated `2N` times (two entries for each of
+# the `N` kernel dimensions); a tuple `pad` is passed through unchanged by `expand`.
+calc_padding(pad, k::NTuple{N,T}, dilation, stride) where {T,N}= expand(Val(2*N), pad)
+# `SamePad`: derive the padding from the kernel size `k` and the `dilation`.
+# `stride` is part of the signature but is not used in the computation below.
+# Returns a tuple with two entries per kernel dimension: the ceiling and then the
+# floor of half the total padding for that dimension.
+function calc_padding(::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T}
+  #Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/pdf/1603.07285
+
+  # Effective kernel size, including dilation
+  k_eff = @. k + (k - 1) * (dilation - 1)
+  # How much total padding needs to be applied?
+  pad_amt = @. k_eff - 1
+  # In case amount of padding is odd we need to apply different amounts to each side.
+  return Tuple(mapfoldl(i -> [ceil(Int, i/2), floor(Int, i/2)], vcat, pad_amt))
+end
+
+"""
+    Conv(filter, in => out, σ = identity; init = glorot_uniform,
+         stride = 1, pad = 0, dilation = 1)
+
+    filter = (2,2)
    in = 1
-    out = 16 
+    out = 16
    Conv((2, 2), 1=>16, relu)

-Data should be stored in WHCN order (width, height, # channels, # batches). 
-In other words, a 100×100 RGB image would be a `100×100×3×1` array, 
+Standard convolutional layer. `filter` should be a tuple like `(2, 2)`.
+`in` and `out` specify the number of input and output channels respectively.
+
+Data should be stored in WHCN order (width, height, # channels, batch size).
+In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.

+Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
+Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+
 Takes the keyword arguments `pad`, `stride` and `dilation`.
+Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
+
+# Examples
+
+Apply a `Conv` layer to a 1-channel input using a 2×2 window filter size, giving us a
+16-channel output. Output is activated with ReLU.
+```julia
+filter = (2,2)
+in = 1
+out = 16
+Conv(filter, in => out, relu)
+```
 """
-struct Conv{N,F,A,V}
+struct Conv{N,M,F,A,V}
  σ::F
  weight::A
  bias::V
  stride::NTuple{N,Int}
-  pad::NTuple{N,Int}
+  pad::NTuple{M,Int}
  dilation::NTuple{N,Int}
 end

-Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
-     stride = 1, pad = 0, dilation = 1) where {T,N} =
-  Conv(σ, w, b, expand.(sub2(Val(N)), (stride, pad, dilation))...)
+"""
+    Conv(weight::AbstractArray, bias::AbstractArray)
+    Conv(weight::AbstractArray, bias::AbstractArray, activation)

-Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-     init = glorot_uniform,  stride = 1, pad = 0, dilation = 1) where N =
-  Conv(param(init(k..., ch...)), param(zeros(ch[2])), σ,
-       stride = stride, pad = pad, dilation = dilation)
+Constructs the convolutional layer with user defined weight and bias arrays.

-@treelike Conv
+Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+
+There is also a keyword-only constructor available for all convolutional
+layers.
+
+```julia
+weight = rand(Float32, 3, 3, 5)
+bias = zeros(Float32, 5)
+Conv(weight = weight,
+    bias = bias,
+    activation = sigmoid)
+```
+"""
+function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+              stride = 1, pad = 0, dilation = 1) where {T,N}
+  stride = expand(Val(N-2), stride)
+  dilation = expand(Val(N-2), dilation)
+  pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
+  return Conv(σ, w, b, stride, pad, dilation)
+end
+
+function Conv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+              activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
+  Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
+end
+
+"""
+    convfilter(filter::Tuple, in=>out)
+
+Constructs a standard convolutional weight matrix with given `filter` and
+channels from `in` to `out`.
+
+Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
+distribution.
+
+See also: [`depthwiseconvfilter`](@ref)
+"""
+convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
+          init = glorot_uniform) where N = init(filter..., ch...)
+
+function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+            init = glorot_uniform,  stride = 1, pad = 0, dilation = 1,
+            weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
+
+  Conv(weight, bias, σ,
+      stride = stride, pad = pad, dilation = dilation)
+end
+
+@functor Conv

 function (c::Conv)(x::AbstractArray)
  # TODO: breaks gpu broadcast :(
  # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
-  σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
-  σ.(conv(x, c.weight, stride = c.stride, pad = c.pad, dilation = c.dilation) .+ b)
+  σ, b = c.σ, reshape(c.bias, ntuple(_->1, length(c.stride))..., :, 1)
+  cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
+  σ.(conv(x, c.weight, cdims) .+ b)
 end

 function Base.show(io::IO, l::Conv)
@ -67,39 +151,106 @@ end
  a(T.(x))

 """
-    ConvTranspose(size, in=>out)
-    ConvTranspose(size, in=>out, relu)
+    outdims(l::Conv, isize::Tuple)

-Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`.
-`in` and `out` specify the number of input and output channels respectively.
-Data should be stored in WHCN order. In other words, a 100×100 RGB image would
-be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array.
-Takes the keyword arguments `pad`, `stride` and `dilation`.
+Calculate the output dimensions given the input dimensions `isize`.
+Batch size and channel size are ignored as per [NNlib.jl](https://github.com/FluxML/NNlib.jl).
+
+```julia
+m = Conv((3, 3), 3 => 16)
+outdims(m, (10, 10)) == (8, 8)
+outdims(m, (10, 10, 1, 3)) == (8, 8)
+```
 """
-struct ConvTranspose{N,F,A,V}
+outdims(l::Conv, isize) =
+  output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
+
+"""
+    ConvTranspose(filter, in=>out)
+    ConvTranspose(filter, in=>out, activation)
+    ConvTranspose(filter, in => out, σ = identity; init = glorot_uniform,
+                  stride = 1, pad = 0, dilation = 1)
+
+Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`.
+`in` and `out` specify the number of input and output channels respectively.
+
+Data should be stored in WHCN order (width, height, # channels, batch size).
+In other words, a 100×100 RGB image would be a `100×100×3×1` array,
+and a batch of 50 would be a `100×100×3×50` array.
+
+Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
+Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+Use `pad=SamePad()` to apply padding so that outputsize == stride * inputsize - stride + 1.
+"""
+struct ConvTranspose{N,M,F,A,V}
  σ::F
  weight::A
  bias::V
  stride::NTuple{N,Int}
-  pad::NTuple{N,Int}
+  pad::NTuple{M,Int}
  dilation::NTuple{N,Int}
 end

-ConvTranspose(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
-              stride = 1, pad = 0, dilation = 1) where {T,N} =
-  ConvTranspose(σ, w, b, expand.(sub2(Val(N)), (stride, pad, dilation))...)
+"""
+    ConvTranspose(weight::AbstractArray, bias::AbstractArray)
+    ConvTranspose(weight::AbstractArray, bias::AbstractArray, activation)

-ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-              init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
-ConvTranspose(param(init(k..., reverse(ch)...)), param(zeros(ch[2])), σ,
+Constructs the convolutional transpose layer with user defined weight and bias arrays
+to be used in the forward pass.
+
+Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+
+For the keyword-only constructor, see also [`Conv`](@ref).
+"""
+function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+                      stride = 1, pad = 0, dilation = 1) where {T,N}
+  stride = expand(Val(N-2), stride)
+  dilation = expand(Val(N-2), dilation)
+  pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
+  return ConvTranspose(σ, w, b, stride, pad, dilation)
+end
+
+function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+                        activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
+  ConvTranspose(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
+end
+
+function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+                      init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
+                      weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N
+
+  ConvTranspose(weight, bias, σ,
              stride = stride, pad = pad, dilation = dilation)
+end

-@treelike ConvTranspose
+@functor ConvTranspose
+
+# Build the `DenseConvDims` that is handed to `∇conv_data` when the transposed
+# convolution `c` is applied to `x`. The spatial size `I` is derived from the spatial
+# size of `x`, the stride, the kernel size, the dilation and the summed padding.
+function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
+    # Calculate size of "input", from ∇conv_data()'s perspective...
+    combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end])
+    I = (size(x)[1:end-2] .- 1).*c.stride .+ 1 .+ (size(c.weight)[1:end-2] .- 1).*c.dilation .- combined_pad
+    C_in = size(c.weight)[end-1]
+    batch_size = size(x)[end]
+    # Create DenseConvDims() that looks like the corresponding conv()
+    return DenseConvDims((I..., C_in, batch_size), size(c.weight);
+                        stride=c.stride,
+                        padding=c.pad,
+                        dilation=c.dilation,
+    )
+end
+
+# TODO: Find proper fix for https://github.com/FluxML/Flux.jl/issues/900
+@nograd conv_transpose_dims

 function (c::ConvTranspose)(x::AbstractArray)
  # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
  σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
-  σ.(∇conv_data(x, c.weight, stride = c.stride, pad = c.pad, dilation = c.dilation) .+ b)
+  cdims = conv_transpose_dims(c, x)
+  σ.(∇conv_data(x, c.weight, cdims) .+ b)
 end

 function Base.show(io::IO, l::ConvTranspose)
@ -114,97 +265,328 @@ end

 (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
  a(T.(x))
+
+# Take the first `N` entries of `isize`, matching the `N` kernel-size entries it is
+# combined with, instead of a hard-coded two.
+# NOTE(review): assumes `N` is the number of spatial dimensions, as in the sibling
+# layer structs; confirm against the `ConvTranspose` definition.
+outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:N], size(l.weight)[1:N], l.stride, l.dilation, l.pad)
+
 """
-    DepthwiseConv(size, in)
-    DepthwiseConv(size, in=>mul)
-    DepthwiseConv(size, in=>mul, relu)
+    DepthwiseConv(filter::Tuple, in=>out)
+    DepthwiseConv(filter::Tuple, in=>out, activation)
+    DepthwiseConv(filter, in => out, σ = identity; init = glorot_uniform,
+                  stride = 1, pad = 0, dilation = 1)

-Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`.
-`in` and `mul` specify the number of input channels and channel multiplier respectively.
-In case the `mul` is not specified it is taken as 1.
+Depthwise convolutional layer. `filter` should be a tuple like `(2, 2)`.
+`in` and `out` specify the number of input and output channels respectively.
+Note that `out` must be an integer multiple of `in`.

-Data should be stored in WHCN order. In other words, a 100×100 RGB image would
-be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array.
+Data should be stored in WHCN order (width, height, # channels, batch size).
+In other words, a 100×100 RGB image would be a `100×100×3×1` array,
+and a batch of 50 would be a `100×100×3×50` array.

-Takes the keyword arguments `pad` and `stride`.
+Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
+Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
 """
-struct DepthwiseConv{N,F,A,V}
+struct DepthwiseConv{N,M,F,A,V}
  σ::F
  weight::A
  bias::V
  stride::NTuple{N,Int}
-  pad::NTuple{N,Int}
+  pad::NTuple{M,Int}
+  dilation::NTuple{N,Int}
 end

-DepthwiseConv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
-       stride = 1, pad = 0) where {T,N} =
-  DepthwiseConv(σ, w, b, expand.(sub2(Val(N)), (stride, pad))...)
+"""
+    DepthwiseConv(weight::AbstractArray, bias::AbstractArray)
+    DepthwiseConv(weight::AbstractArray, bias::AbstractArray, activation)

-DepthwiseConv(k::NTuple{N,Integer}, ch::Integer, σ = identity; init = glorot_uniform,
-     stride = 1, pad = 0) where N =
-  DepthwiseConv(param(init(k..., 1, ch)), param(zeros(ch)), σ,
-       stride = stride, pad = pad)
+Constructs the `DepthwiseConv` layer with user defined weight and bias
+arrays.

-DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform,
-     stride::NTuple{N,Integer} = map(_->1,k),
-     pad::NTuple{N,Integer} = map(_->0,k)) where N =
-  DepthwiseConv(param(init(k..., ch[2], ch[1])), param(zeros(ch[2]*ch[1])), σ,
-       stride = stride, pad = pad)
+Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.

-@treelike DepthwiseConv
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+
+For keyword-only constructor, see also [`Conv`](@ref)
+"""
+function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+                      stride = 1, pad = 0, dilation = 1) where {T,N}
+  stride = expand(Val(N-2), stride)
+  dilation = expand(Val(N-2), dilation)
+  pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
+  return DepthwiseConv(σ, w, b, stride, pad, dilation)
+end
+
+function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+                      activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
+  DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
+end
+
+"""
+    depthwiseconvfilter(filter::Tuple, in=>out)
+
+Constructs a depthwise convolutional weight array defined by `filter` and channels
+from `in` to `out`.
+
+Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
+distribution.
+
+See also: [`convfilter`](@ref)
+"""
+depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
+                    init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1])
+
+function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+                      init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
+                      weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N
+  @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels"
+
+  return DepthwiseConv(
+    weight,
+    bias,
+    σ;
+    stride = stride,
+    pad = pad,
+    dilation = dilation
+  )
+end
+
+@functor DepthwiseConv

 function (c::DepthwiseConv)(x)
  σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
-  σ.(depthwiseconv(x, c.weight, stride = c.stride, pad = c.pad) .+ b)
+  cdims = DepthwiseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
+  σ.(depthwiseconv(x, c.weight, cdims) .+ b)
 end

 function Base.show(io::IO, l::DepthwiseConv)
-  print(io, "DepthwiseConv(", size(l.weight)[1:ndims(l.weight)-2])
-  print(io, ", ", size(l.weight, ndims(l.weight)), "=>", size(l.weight, ndims(l.weight)-1))
+  print(io, "DepthwiseConv(", size(l.weight)[1:end-2])
+  print(io, ", ", size(l.weight)[end], "=>", prod(size(l.weight)[end-1:end]))
  l.σ == identity || print(io, ", ", l.σ)
  print(io, ")")
 end

-"""
-    MaxPool(k)
+# Input element type already matches the weights: defer to the generic method.
+# The weight array type is the 4th parameter of `DepthwiseConv{N,M,F,A,V}`.
+(a::DepthwiseConv{<:Any,<:Any,<:Any,W})(x::AbstractArray{T}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
+  invoke(a, Tuple{AbstractArray}, x)
 
-Max pooling layer. `k` stands for the size of the window for each dimension of the input.
+# Any other real-valued input is first converted to the weights' element type `T`.
+(a::DepthwiseConv{<:Any,<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
+  a(T.(x))
+
+outdims(l::DepthwiseConv, isize) =
+  output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))

-Takes the keyword arguments `pad` and `stride`.
 """
-struct MaxPool{N}
+    CrossCor(filter, in=>out)
+    CrossCor(filter, in=>out, activation)
+    CrossCor(filter, in => out, σ = identity; init = glorot_uniform,
+             stride = 1, pad = 0, dilation = 1)
+
+Standard cross convolutional layer. `filter` should be a tuple like `(2, 2)`.
+`in` and `out` specify the number of input and output channels respectively.
+
+Data should be stored in WHCN order (width, height, # channels, batch size).
+In other words, a 100×100 RGB image would be a `100×100×3×1` array,
+and a batch of 50 would be a `100×100×3×50` array.
+
+Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
+Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
+
+# Examples
+
+Apply a `CrossCor` layer to a 1-channel input using a 2×2 window filter size, giving us a
+16-channel output. Output is activated with ReLU.
+```julia
+filter = (2,2)
+in = 1
+out = 16
+CrossCor((2, 2), 1=>16, relu)
+```
+"""
+struct CrossCor{N,M,F,A,V}
+  σ::F
+  weight::A
+  bias::V
+  stride::NTuple{N,Int}
+  pad::NTuple{M,Int}
+  dilation::NTuple{N,Int}
+end
+
+"""
+    CrossCor(weight::AbstractArray, bias::AbstractArray)
+    CrossCor(weight::AbstractArray, bias::AbstractArray, activation)
+
+Constructs the standard cross convolutional layer with user defined weight and bias
+arrays.
+
+Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+
+For keyword-only constructor, see also [`Conv`](@ref)
+"""
+function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+                  stride = 1, pad = 0, dilation = 1) where {T,N}
+  stride = expand(Val(N-2), stride)
+  dilation = expand(Val(N-2), dilation)
+  pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
+  return CrossCor(σ, w, b, stride, pad, dilation)
+end
+
+function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+                      activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
+  CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
+end
+
+function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+                  init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
+                  weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
+
+  CrossCor(weight, bias, σ,
+       stride = stride, pad = pad, dilation = dilation)
+end
+
+@functor CrossCor
+
+function crosscor(x, w, ddims::DenseConvDims)
+  ddims = DenseConvDims(ddims, F=true)
+  return conv(x, w, ddims)
+end
+
+function (c::CrossCor)(x::AbstractArray)
+  # TODO: breaks gpu broadcast :(
+  # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
+  σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
+  cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
+  σ.(crosscor(x, c.weight, cdims) .+ b)
+end
+
+function Base.show(io::IO, l::CrossCor)
+  print(io, "CrossCor(", size(l.weight)[1:ndims(l.weight)-2])
+  print(io, ", ", size(l.weight, ndims(l.weight)-1), "=>", size(l.weight, ndims(l.weight)))
+  l.σ == identity || print(io, ", ", l.σ)
+  print(io, ")")
+end
+
+# Input element type already matches the weights: defer to the generic method.
+# The weight array type is the 4th parameter of `CrossCor{N,M,F,A,V}`.
+(a::CrossCor{<:Any,<:Any,<:Any,W})(x::AbstractArray{T}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
+  invoke(a, Tuple{AbstractArray}, x)
+
+# Any other real-valued input is first converted to the weights' element type `T`.
+(a::CrossCor{<:Any,<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
+  a(T.(x))
+
+outdims(l::CrossCor, isize) =
+  output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
+
+"""
+    GlobalMaxPool()
+
+Global max pooling layer.
+
+Transforms (w,h,c,b)-shaped input into (1,1,c,b)-shaped output,
+by performing max pooling on the complete (w,h)-shaped feature maps.
+"""
+struct GlobalMaxPool end
+
+function (g::GlobalMaxPool)(x)
+  # The pooling window spans every dimension of `x` except the last two,
+  # so each feature map is reduced in a single step.
+  window = size(x)[1:end-2]
+  return maxpool(x, PoolDims(x, window))
+end
+
+Base.show(io::IO, g::GlobalMaxPool) = print(io, "GlobalMaxPool()")
+
+"""
+    GlobalMeanPool()
+
+Global mean pooling layer.
+
+Transforms (w,h,c,b)-shaped input into (1,1,c,b)-shaped output,
+by performing mean pooling on the complete (w,h)-shaped feature maps.
+"""
+struct GlobalMeanPool end
+
+function (g::GlobalMeanPool)(x)
+  # The pooling window spans every dimension of `x` except the last two,
+  # so each feature map is reduced in a single step.
+  window = size(x)[1:end-2]
+  return meanpool(x, PoolDims(x, window))
+end
+
+Base.show(io::IO, g::GlobalMeanPool) = print(io, "GlobalMeanPool()")
+
+"""
+    MaxPool(k; pad = 0, stride = k)
+
+Max pooling layer. `k` is the size of the window for each dimension of the input.
+
+Use `pad=SamePad()` to apply padding so that
+outputsize == inputsize / stride.
+"""
+struct MaxPool{N,M}
  k::NTuple{N,Int}
-  pad::NTuple{N,Int}
+  pad::NTuple{M,Int}
  stride::NTuple{N,Int}
 end

-MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N =
-  MaxPool(k, expand(Val(N), pad), expand(Val(N), stride))
+function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
+  stride = expand(Val(N), stride)
+  pad = calc_padding(pad, k, 1, stride)
+  return MaxPool(k, pad, stride)
+end

-(m::MaxPool)(x) = maxpool(x, m.k; pad = m.pad, stride = m.stride)
+function (m::MaxPool)(x)
+    pdims = PoolDims(x, m.k; padding=m.pad, stride=m.stride)
+    return maxpool(x, pdims)
+end

 function Base.show(io::IO, m::MaxPool)
  print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")")
 end

-"""
-    MeanPool(k)
+outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))

-Mean pooling layer. `k` stands for the size of the window for each dimension of the input.
-
-Takes the keyword arguments `pad` and `stride`.
 """
-struct MeanPool{N}
+    MeanPool(k; pad = 0, stride = k)
+
+Mean pooling layer. `k` is the size of the window for each dimension of the input.
+
+Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
+"""
+struct MeanPool{N,M}
    k::NTuple{N,Int}
-    pad::NTuple{N,Int}
+    pad::NTuple{M,Int}
    stride::NTuple{N,Int}
 end

-MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N =
-  MeanPool(k, expand(Val(N), pad), expand(Val(N), stride))
+function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
+  stride = expand(Val(N), stride)
+  pad = calc_padding(pad, k, 1, stride)
+  return MeanPool(k, pad, stride)
+end

-(m::MeanPool)(x) = meanpool(x, m.k; pad = m.pad, stride = m.stride)
+function (m::MeanPool)(x)
+    pdims = PoolDims(x, m.k; padding=m.pad, stride=m.stride)
+    return meanpool(x, pdims)
+end

 function Base.show(io::IO, m::MeanPool)
  print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")")
 end
+
+outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@ -1,85 +1,108 @@
-"""
-    testmode!(m)
-    testmode!(m, false)
+istraining() = false

-Put layers like [`Dropout`](@ref) and [`BatchNorm`](@ref) into testing mode
-(or back to training mode with `false`).
-"""
-function testmode!(m, val::Bool=true)
-  prefor(x -> _testmode!(x, val), m)
-  return m
-end
+@adjoint istraining() = true, _ -> nothing

-_testmode!(m, test) = nothing
+_isactive(m) = isnothing(m.active) ? istraining() : m.active

-"""
-    Dropout(p)
-
-A Dropout layer. For each input, either sets that input to `0` (with probability
-`p`) or scales it by `1/(1-p)`. This is used as a regularisation, i.e. it
-reduces overfitting during training.
-
-Does nothing to the input once in [`testmode!`](@ref).
-"""
-mutable struct Dropout{F}
-  p::F
-  active::Bool
-end
-
-function Dropout(p)
-  @assert 0 ≤ p ≤ 1
-  Dropout{typeof(p)}(p, true)
-end
+_dropout_shape(s, ::Colon) = size(s)
+_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...)

 _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0)

-function (a::Dropout)(x)
-  a.active || return x
-  y = similar(x)
-  rand!(y)
-  y .= _dropout_kernel.(y, a.p, 1 - a.p)
-  return x .* y
+"""
+    dropout(x, p; dims = :)
+
+The dropout function. For each input, either sets that input to `0` (with probability
+`p`) or scales it by `1 / (1 - p)`. `dims` specifies the unbroadcasted dimensions,
+e.g. `dims=1` applies dropout along columns and `dims=2` along rows.
+This is used as a regularisation, i.e. it reduces overfitting during training.
+
+See also the [`Dropout`](@ref) layer.
+"""
+dropout(x, p; dims = :) = x
+
+@adjoint function dropout(x, p; dims = :)
+  y = rand!(similar(x, _dropout_shape(x, dims)))
+  y .= _dropout_kernel.(y, p, 1 - p)
+  return x .* y, Δ -> (Δ .* y, nothing)
 end

-_testmode!(a::Dropout, test) = (a.active = !test)
+"""
+    Dropout(p; dims = :)
+
+Dropout layer. In the forward pass, apply the [`Flux.dropout`](@ref) function on the input.
+
+Does nothing to the input once [`Flux.testmode!`](@ref) is `true`.
+"""
+mutable struct Dropout{F,D}
+  p::F
+  dims::D
+  active::Union{Bool, Nothing}
+end
+
+# TODO: deprecate in v0.11
+Dropout(p, dims) = Dropout(p, dims, nothing)
+
+function Dropout(p; dims = :)
+  @assert 0 ≤ p ≤ 1
+  Dropout{typeof(p),typeof(dims)}(p, dims, nothing)
+end
+
+function (a::Dropout)(x)
+  _isactive(a) || return x
+  return dropout(x, a.p; dims = a.dims)
+end
+
+testmode!(m::Dropout, mode = true) =
+  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
+
+function Base.show(io::IO, d::Dropout)
+  print(io, "Dropout(", d.p)
+  d.dims != (:) && print(io, ", dims = $(repr(d.dims))")
+  print(io, ")")
+end

 """
    AlphaDropout(p)
-A dropout layer. It is used in Self-Normalizing Neural Networks. 
-(https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf)
-The AlphaDropout layer ensures that mean and variance of activations remains the same as before.
+
+A dropout layer. Used in
+[Self-Normalizing Neural Networks](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf).
+The AlphaDropout layer ensures that mean and variance of activations
+remain the same as before.
+
+Does nothing to the input once [`testmode!`](@ref) is true.
 """
 mutable struct AlphaDropout{F}
  p::F
-  active::Bool
-end
-
-function AlphaDropout(p)
-  @assert 0 ≤ p ≤ 1
-  AlphaDropout(p,true)
+  active::Union{Bool, Nothing}
+  function AlphaDropout(p, active = nothing)
+    @assert 0 ≤ p ≤ 1
+    new{typeof(p)}(p, active)
+  end
 end

 function (a::AlphaDropout)(x)
-  a.active || return x
+  _isactive(a) || return x
  λ = eltype(x)(1.0507009873554804934193349852946)
  α = eltype(x)(1.6732632423543772848170429916717)
  α1 = eltype(x)(-λ*α)
  noise = randn(eltype(x), size(x))
-  x = @. x*(noise > (1 - a.p)) + α1 * (noise <= (1 - a.p))
+  x = @. x*(noise > (1 - a.p)) + α1 * (noise < (1 - a.p))
  A = (a.p + a.p * (1 - a.p) * α1 ^ 2)^0.5
  B = -A * α1 * (1 - a.p)
  x = @. A * x + B
  return x
 end

-_testmode!(a::AlphaDropout, test) = (a.active = !test)
+testmode!(m::AlphaDropout, mode = true) =
+  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)

 """
    LayerNorm(h::Integer)

 A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be
-used with recurrent hidden states of size `h`. Normalises the mean/stddev of
-each input before applying a per-neuron gain/bias.
+used with recurrent hidden states of size `h`. Normalises the mean and standard
+deviation of each input before applying a per-neuron gain/bias.
 """
 struct LayerNorm{T}
  diag::Diagonal{T}
@ -88,7 +111,7 @@ end
 LayerNorm(h::Integer) =
  LayerNorm(Diagonal(h))

-@treelike LayerNorm
+@functor LayerNorm

 (a::LayerNorm)(x) = a.diag(normalise(x))

@ -101,8 +124,8 @@ end
              initβ = zeros, initγ = ones,
              ϵ = 1e-8, momentum = .1)

-Batch Normalization layer. The `channels` input should be the size of the
-channel dimension in your data (see below).
+[Batch Normalization](https://arxiv.org/pdf/1502.03167.pdf) layer.
+`channels` should be the size of the channel dimension in your data (see below).

 Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For
 a batch of feature vectors this is just the data dimension, for `WHCN` images
@ -112,10 +135,9 @@ it's the usual channel dimension.)
 shifts them to have a new mean and variance (corresponding to the learnable,
 per-channel `bias` and `scale` parameters).

-See [Batch Normalization: Accelerating Deep Network Training by Reducing
-Internal Covariate Shift](https://arxiv.org/pdf/1502.03167.pdf).
+Use [`testmode!`](@ref) during inference.

-Example:
+# Examples
 ```julia
 m = Chain(
  Dense(28^2, 64),
@ -133,25 +155,29 @@ mutable struct BatchNorm{F,V,W,N}
  σ²::W  # moving std
  ϵ::N
  momentum::N
-  active::Bool
+  active::Union{Bool, Nothing}
 end

+# TODO: deprecate in v0.11
+BatchNorm(λ, β, γ, μ, σ², ϵ, momentum) = BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing)
+
 BatchNorm(chs::Integer, λ = identity;
          initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) =
-  BatchNorm(λ, param(initβ(chs)), param(initγ(chs)),
-            zeros(chs), ones(chs), ϵ, momentum, true)
+  BatchNorm(λ, initβ(chs), initγ(chs),
+            zeros(chs), ones(chs), ϵ, momentum, nothing)
+
+trainable(bn::BatchNorm) = (bn.β, bn.γ)

 function (BN::BatchNorm)(x)
  size(x, ndims(x)-1) == length(BN.β) ||
    error("BatchNorm expected $(length(BN.β)) channels, got $(size(x, ndims(x)-1))")
  dims = length(size(x))
  channels = size(x, dims-1)
-  affine_shape = ones(Int, dims)
-  affine_shape[end-1] = channels
-  m = prod(size(x)[1:end-2]) * size(x)[end]
+  affine_shape = ntuple(i->i == ndims(x) - 1 ? size(x, i) : 1, ndims(x))
+  m = div(prod(size(x)), channels)
  γ = reshape(BN.γ, affine_shape...)
  β = reshape(BN.β, affine_shape...)
-  if !BN.active
+  if !_isactive(BN)
    μ = reshape(BN.μ, affine_shape...)
    σ² = reshape(BN.σ², affine_shape...)
    ϵ = BN.ϵ
@ -160,11 +186,12 @@ function (BN::BatchNorm)(x)
    axes = [1:dims-2; dims] # axes to reduce along (all but channels axis)
    μ = mean(x, dims = axes)
    σ² = sum((x .- μ) .^ 2, dims = axes) ./ m
-    ϵ = data(convert(T, BN.ϵ))
+    ϵ = convert(T, BN.ϵ)
    # update moving mean/std
-    mtm = data(convert(T, BN.momentum))
-    BN.μ = (1 - mtm) .* BN.μ .+ mtm .* reshape(data(μ), :)
-    BN.σ² = (1 - mtm) .* BN.σ² .+ (mtm * m / (m - 1)) .* reshape(data(σ²), :)
+    mtm = BN.momentum
+    S = eltype(BN.μ)
+    BN.μ  = (1 - mtm) .* BN.μ .+ mtm .* S.(reshape(μ, :))
+    BN.σ² = (1 - mtm) .* BN.σ² .+ (mtm * m / (m - 1)) .* S.(reshape(σ², :))
  end

  let λ = BN.λ
@ -173,13 +200,10 @@ function (BN::BatchNorm)(x)
  end
 end

-children(BN::BatchNorm) =
-  (BN.λ, BN.β, BN.γ, BN.μ, BN.σ², BN.ϵ, BN.momentum, BN.active)
+@functor BatchNorm

-mapchildren(f, BN::BatchNorm) =  # e.g. mapchildren(cu, BN)
-  BatchNorm(BN.λ, f(BN.β), f(BN.γ), f(BN.μ), f(BN.σ²), BN.ϵ, BN.momentum, BN.active)
-
-_testmode!(BN::BatchNorm, test) = (BN.active = !test)
+testmode!(m::BatchNorm, mode = true) =
+  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)

 function Base.show(io::IO, l::BatchNorm)
  print(io, "BatchNorm($(join(size(l.β), ", "))")
@ -187,35 +211,6 @@ function Base.show(io::IO, l::BatchNorm)
  print(io, ")")
 end

-
-"""
-    InstanceNorm(channels::Integer, σ = identity;
-                 initβ = zeros, initγ = ones,
-                 ϵ = 1e-8, momentum = .1)
-
-Instance Normalization layer. The `channels` input should be the size of the
-channel dimension in your data (see below).
-
-Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For
-a batch of feature vectors this is just the data dimension, for `WHCN` images
-it's the usual channel dimension.)
-
-`InstanceNorm` computes the mean and variance for each each `W×H×1×1` slice and
-shifts them to have a new mean and variance (corresponding to the learnable,
-per-channel `bias` and `scale` parameters).
-
-See [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022).
-
-Example:
-```julia
-m = Chain(
-  Dense(28^2, 64),
-  InstanceNorm(64, relu),
-  Dense(64, 10),
-  InstanceNorm(10),
-  softmax)
-```
-"""
 expand_inst = (x, as) -> reshape(repeat(x, outer=[1, as[length(as)]]), as...)

 mutable struct InstanceNorm{F,V,W,N}
@ -226,13 +221,46 @@ mutable struct InstanceNorm{F,V,W,N}
  σ²::W  # moving std
  ϵ::N
  momentum::N
-  active::Bool
+  active::Union{Bool, Nothing}
 end

+# TODO: deprecate in v0.11
+"""
+    InstanceNorm(channels::Integer, λ = identity;
+                 initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i),
+                 ϵ = 1f-5, momentum = 0.1f0)
+
+[Instance Normalization](https://arxiv.org/abs/1607.08022) layer.
+`channels` should be the size of the channel dimension in your data (see below).
+
+Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For
+a batch of feature vectors this is just the data dimension, for `WHCN` images
+it's the usual channel dimension.)
+
+`InstanceNorm` computes the mean and variance for each `W×H×1×1` slice and
+shifts them to have a new mean and variance (corresponding to the learnable,
+per-channel `bias` and `scale` parameters).
+
+Use [`testmode!`](@ref) during inference.
+
+# Examples
+```julia
+m = Chain(
+  Dense(28^2, 64),
+  InstanceNorm(64, relu),
+  Dense(64, 10),
+  InstanceNorm(10),
+  softmax)
+```
+"""
+InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum) = InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing)
+
 InstanceNorm(chs::Integer, λ = identity;
          initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) =
-  InstanceNorm(λ, param(initβ(chs)), param(initγ(chs)),
-            zeros(chs), ones(chs), ϵ, momentum, true)
+  InstanceNorm(λ, initβ(chs), initγ(chs),
+            zeros(chs), ones(chs), ϵ, momentum, nothing)
+
+trainable(in::InstanceNorm) = (in.β, in.γ)

 function (in::InstanceNorm)(x)
  size(x, ndims(x)-1) == length(in.β) ||
@ -243,28 +271,26 @@ function (in::InstanceNorm)(x)
  dims = length(size(x))
  c = size(x, dims-1)
  bs = size(x, dims)
-  affine_shape = ones(Int, dims)
-  affine_shape[end-1] = c
-  affine_shape[end] = bs
-  m = prod(size(x)[1:end-2])
+  affine_shape = ntuple(i->i == ndims(x) - 1 || i == ndims(x) ? size(x, i) : 1, ndims(x))
+  m = div(prod(size(x)), c*bs)
  γ, β = expand_inst(in.γ, affine_shape), expand_inst(in.β, affine_shape)

-  if !in.active
+  if !_isactive(in)
    μ = expand_inst(in.μ, affine_shape)
    σ² = expand_inst(in.σ², affine_shape)
    ϵ = in.ϵ
  else
    T = eltype(x)

-    ϵ = data(convert(T, in.ϵ))
+    ϵ = convert(T, in.ϵ)
    axes = 1:dims-2 # axes to reduce along (all but channels and batch size axes)
    μ = mean(x, dims = axes)
    σ² = mean((x .- μ) .^ 2, dims = axes)
-
+    S = eltype(in.μ)
    # update moving mean/std
-    mtm = data(convert(T, in.momentum))
-    in.μ = dropdims(mean(repeat((1 - mtm) .* in.μ, outer=[1, bs]) .+ mtm .* reshape(data(μ), (c, bs)), dims = 2), dims=2)
-    in.σ² = dropdims(mean((repeat((1 - mtm) .* in.σ², outer=[1, bs]) .+ (mtm * m / (m - 1)) .* reshape(data(σ²), (c, bs))), dims = 2), dims=2)
+    mtm = in.momentum
+    in.μ = dropdims(mean(repeat((1 - mtm) .* in.μ, outer=[1, bs]) .+ mtm .* S.(reshape(μ, (c, bs))), dims = 2), dims=2)
+    in.σ² = dropdims(mean((repeat((1 - mtm) .* in.σ², outer=[1, bs]) .+ (mtm * m / (m - 1)) .* S.(reshape(σ², (c, bs)))), dims = 2), dims=2)
  end

  let λ = in.λ
@ -273,16 +299,118 @@ function (in::InstanceNorm)(x)
  end
 end

-children(in::InstanceNorm) =
-  (in.λ, in.β, in.γ, in.μ, in.σ², in.ϵ, in.momentum, in.active)
+@functor InstanceNorm

-mapchildren(f, in::InstanceNorm) =  # e.g. mapchildren(cu, in)
-  InstanceNorm(in.λ, f(in.β), f(in.γ), f(in.μ), f(in.σ²), in.ϵ, in.momentum, in.active)
-
-_testmode!(in::InstanceNorm, test) = (in.active = !test)
+testmode!(m::InstanceNorm, mode = true) =
+  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)

 function Base.show(io::IO, l::InstanceNorm)
  print(io, "InstanceNorm($(join(size(l.β), ", "))")
  (l.λ == identity) || print(io, ", λ = $(l.λ)")
  print(io, ")")
 end
+
+"""
+    GroupNorm(chs::Integer, G::Integer, λ = identity;
+              initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i),
+              ϵ = 1f-5, momentum = 0.1f0)
+
+[Group Normalization](https://arxiv.org/pdf/1803.08494.pdf) layer.
+This layer can outperform Batch Normalization and Instance Normalization.
+
+`chs` is the number of channels, the channel dimension of your input.
+For an array of N dimensions, the `N-1`th index is the channel dimension.
+
+`G` is the number of groups along which the statistics are computed.
+The number of channels must be an integer multiple of the number of groups.
+
+Use [`testmode!`](@ref) during inference.
+
+# Examples
+```julia
+m = Chain(Conv((3,3), 1=>32, leakyrelu;pad = 1),
+          GroupNorm(32,16))
+          # 32 channels, 16 groups (G = 16), thus 2 channels per group used
+```
+"""
+mutable struct GroupNorm{F,V,W,N,T}
+  G::T # number of groups
+  λ::F  # activation function
+  β::V  # bias
+  γ::V  # scale
+  μ::W  # moving mean
+  σ²::W  # moving variance
+  ϵ::N  # constant added to the variance before the square root is taken
+  momentum::N  # weight given to the newest statistics when updating μ and σ²
+  active::Union{Bool, Nothing}  # `nothing` defers to `istraining()` (see `_isactive`)
+end
+
+# TODO: deprecate in v0.11
+GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum) = GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum, nothing)
+
+GroupNorm(chs::Integer, G::Integer, λ = identity;
+          initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) =
+  GroupNorm(G, λ, initβ(chs), initγ(chs),
+            zeros(G,1), ones(G,1), ϵ, momentum, nothing)
+
+trainable(gn::GroupNorm) = (gn.β, gn.γ)
+
+# Forward pass of `GroupNorm`: split the channel dimension of `x` into `gn.G` groups,
+# normalise every group of every sample, then apply the per-channel scale `γ`,
+# bias `β` and the activation `λ`. When the layer is active the moving statistics
+# `gn.μ` and `gn.σ²` are updated as a side effect; otherwise they are used as-is.
+function (gn::GroupNorm)(x)
+  # Check the rank first: the channel checks below index dimension `ndims(x) - 1`.
+  ndims(x) > 2 || error("Need to pass at least 3 dimensions for Group Norm to work")
+  size(x,ndims(x)-1) == length(gn.β) || error("Group Norm expected $(length(gn.β)) channels, but got $(size(x,ndims(x)-1)) channels")
+  (size(x,ndims(x) -1))%gn.G == 0 || error("The number of groups ($(gn.G)) must divide the number of channels ($(size(x,ndims(x) -1)))")
+
+  dims = length(size(x))
+  groups = gn.G
+  channels = size(x, dims-1)
+  batches = size(x,dims)
+  channels_per_group = div(channels,groups)
+  og_shape = size(x)
+  # γ and β broadcast along the channel dimension of the original layout
+  affine_shape = ntuple(i->i == ndims(x) - 1 ? size(x, i) : 1, ndims(x))
+
+  # Stored statistics broadcast against the grouped array: (1,1,...,1,G,1)
+  μ_affine_shape = ntuple(i->i == ndims(x) ? groups : 1, ndims(x) + 1)
+
+  # Number of elements each statistic is computed over
+  m = prod(size(x)[1:end-2]) * channels_per_group
+  γ = reshape(gn.γ, affine_shape...)
+  β = reshape(gn.β, affine_shape...)
+
+  # Split the channel dimension: (W,H...,C,N) -> (W,H...,C/G,G,N)
+  y = reshape(x,((size(x))[1:end-2]...,channels_per_group,groups,batches))
+  if !_isactive(gn)
+    # Inactive: normalise with the stored moving statistics
+    μ = reshape(gn.μ, μ_affine_shape...) # Shape : (1,1,...,1,G,1)
+    σ² = reshape(gn.σ², μ_affine_shape...) # Shape : (1,1,...,1,G,1)
+    ϵ = gn.ϵ
+  else
+    T = eltype(x)
+    axes = [(1:ndims(y)-2)...] # axes to reduce along (all but the group and batch axes)
+    μ = mean(y, dims = axes)
+    σ² = mean((y .- μ) .^ 2, dims = axes)
+
+    ϵ = convert(T, gn.ϵ)
+    # update moving mean/variance, averaging the per-sample statistics over the batch
+    mtm = gn.momentum
+    S = eltype(gn.μ)
+    gn.μ = mean((1 - mtm) .* gn.μ .+ mtm .* S.(reshape(μ, (groups,batches))),dims=2)
+    gn.σ² = mean((1 - mtm) .* gn.σ² .+ (mtm * m / (m - 1)) .* S.(reshape(σ², (groups,batches))),dims=2)
+  end
+
+  let λ = gn.λ
+    x̂ = (y .- μ) ./ sqrt.(σ² .+ ϵ)
+
+    # Restore the original layout before the per-channel affine transform
+    x̂ = reshape(x̂,og_shape)
+    λ.(γ .* x̂ .+ β)
+  end
+end
+
+@functor GroupNorm
+
+testmode!(m::GroupNorm, mode = true) =
+  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
+
+function Base.show(io::IO, l::GroupNorm)
+  print(io, "GroupNorm($(join(size(l.β), ", "))")
+  (l.λ == identity) || print(io, ", λ = $(l.λ)")
+  print(io, ")")
+end
--- a/src/layers/recurrent.jl
+++ b/src/layers/recurrent.jl
@ -1,5 +1,5 @@
 gate(h, n) = (1:h) .+ h*(n-1)
-gate(x::AbstractVector, h, n) = x[gate(h,n)]
+gate(x::AbstractVector, h, n) = @view x[gate(h,n)]
 gate(x::AbstractMatrix, h, n) = x[gate(h,n),:]

 # Stateful recurrence
@ -12,16 +12,16 @@ in the background. `cell` should be a model of the form:

    h, y = cell(h, x...)

-For example, here's a recurrent network that keeps a running total of its inputs.
+For example, here's a recurrent network that keeps a running total of its inputs:

 ```julia
-accum(h, x) = (h+x, x)
+accum(h, x) = (h + x, x)
 rnn = Flux.Recur(accum, 0)
-rnn(2) # 2
-rnn(3) # 3
-rnn.state # 5
-rnn.(1:10) # apply to a sequence
-rnn.state # 60
+rnn(2)      # 2
+rnn(3)      # 3
+rnn.state   # 5
+rnn.(1:10)  # apply to a sequence
+rnn.state   # 60
 ```
 """
 mutable struct Recur{T}
@ -38,36 +38,22 @@ function (m::Recur)(xs...)
  return y
 end

-@treelike Recur cell, init
+@functor Recur cell, init

 Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")")

-_truncate(x::AbstractArray) = Tracker.data(x)
-_truncate(x::Tuple) = _truncate.(x)
-
-"""
-    truncate!(rnn)
-
-Truncates the gradient of the hidden state in recurrent layers. The value of the
-state is preserved. See also `reset!`.
-
-Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to
-
-    rnn.state = Tracker.data(rnn.state)
-"""
-truncate!(m) = prefor(x -> x isa Recur && (x.state = _truncate(x.state)), m)
-
 """
    reset!(rnn)

-Reset the hidden state of a recurrent layer back to its original value. See also
-`truncate!`.
+Reset the hidden state of a recurrent layer back to its original value.

-Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to
-
-    rnn.state = hidden(rnn.cell)
+Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to:
+```julia
+rnn.state = hidden(rnn.cell)
+```
 """
-reset!(m) = prefor(x -> x isa Recur && (x.state = x.init), m)
+reset!(m::Recur) = (m.state = m.init)
+reset!(m) = foreach(reset!, functor(m)[1])

 flip(f, xs) = reverse(f.(reverse(xs)))

@ -83,8 +69,8 @@ end

 RNNCell(in::Integer, out::Integer, σ = tanh;
        init = glorot_uniform) =
-  RNNCell(σ, param(init(out, in)), param(init(out, out)),
-          param(init(out)), param(zeros(out)))
+  RNNCell(σ, init(out, in), init(out, out),
+          init(out), zeros(out))

 function (m::RNNCell)(h, x)
  σ, Wi, Wh, b = m.σ, m.Wi, m.Wh, m.b
@ -94,7 +80,7 @@ end

 hidden(m::RNNCell) = m.h

-@treelike RNNCell
+@functor RNNCell

 function Base.show(io::IO, l::RNNCell)
  print(io, "RNNCell(", size(l.Wi, 2), ", ", size(l.Wi, 1))
@ -122,9 +108,9 @@ end

 function LSTMCell(in::Integer, out::Integer;
                  init = glorot_uniform)
-  cell = LSTMCell(param(init(out*4, in)), param(init(out*4, out)), param(init(out*4)),
-                  param(zeros(out)), param(zeros(out)))
-  cell.b.data[gate(out, 2)] .= 1
+  cell = LSTMCell(init(out * 4, in), init(out * 4, out), init(out * 4),
+                  zeros(out), zeros(out))
+  cell.b[gate(out, 2)] .= 1
  return cell
 end

@ -142,7 +128,7 @@ end

 hidden(m::LSTMCell) = (m.h, m.c)

-@treelike LSTMCell
+@functor LSTMCell

 Base.show(io::IO, l::LSTMCell) =
  print(io, "LSTMCell(", size(l.Wi, 2), ", ", size(l.Wi, 1)÷4, ")")
@ -150,10 +136,10 @@ Base.show(io::IO, l::LSTMCell) =
 """
    LSTM(in::Integer, out::Integer)

-Long Short Term Memory recurrent layer. Behaves like an RNN but generally
-exhibits a longer memory span over sequences.
+[Long Short Term Memory](https://www.researchgate.net/publication/13853244_Long_Short-term_Memory)
+recurrent layer. Behaves like an RNN but generally exhibits a longer memory span over sequences.

-See [this article](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)
+See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
 for a good overview of the internals.
 """
 LSTM(a...; ka...) = Recur(LSTMCell(a...; ka...))
@ -168,8 +154,8 @@ mutable struct GRUCell{A,V}
 end

 GRUCell(in, out; init = glorot_uniform) =
-  GRUCell(param(init(out*3, in)), param(init(out*3, out)),
-          param(init(out*3)), param(zeros(out)))
+  GRUCell(init(out * 3, in), init(out * 3, out),
+          init(out * 3), zeros(out))

 function (m::GRUCell)(h, x)
  b, o = m.b, size(h, 1)
@ -183,7 +169,7 @@ end

 hidden(m::GRUCell) = m.h

-@treelike GRUCell
+@functor GRUCell

 Base.show(io::IO, l::GRUCell) =
  print(io, "GRUCell(", size(l.Wi, 2), ", ", size(l.Wi, 1)÷3, ")")
@ -191,10 +177,10 @@ Base.show(io::IO, l::GRUCell) =
 """
    GRU(in::Integer, out::Integer)

-Gated Recurrent Unit layer. Behaves like an RNN but generally
-exhibits a longer memory span over sequences.
+[Gated Recurrent Unit](https://arxiv.org/abs/1406.1078) layer. Behaves like an
+RNN but generally exhibits a longer memory span over sequences.

-See [this article](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)
+See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
 for a good overview of the internals.
 """
 GRU(a...; ka...) = Recur(GRUCell(a...; ka...))
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@ -1,48 +1,181 @@
-using NNlib: logsoftmax, logσ
-
 # Cost functions
+"""
+    mae(ŷ, y)

+Return the mean absolute error; calculated as
+`sum(abs.(ŷ .- y)) / length(y)`.
+"""
+mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y)
+
+
+"""
+    mse(ŷ, y)
+
+Return the mean squared error between ŷ and y; calculated as
+`sum((ŷ .- y).^2) / length(y)`.
+
+# Examples
+```jldoctest
+julia> Flux.mse([0, 2], [1, 1])
+1//1
+```
+"""
 mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)

-function crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1)
-  -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2)
+
+"""
+    msle(ŷ, y; ϵ=eps(eltype(ŷ)))
+
+Return the mean of the squared logarithmic errors; calculated as
+`sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) / length(y)`.
+The `ϵ` term provides numerical stability.
+
+Penalizes an under-estimation more heavily than an over-estimation of the same magnitude.
+"""
+msle(ŷ, y; ϵ=eps(eltype(ŷ))) = sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) * 1 // length(y)
+
+
+
+"""
+    huber_loss(ŷ, y; δ=1.0)
+
+Return the mean of the [Huber loss](https://en.wikipedia.org/wiki/Huber_loss)
+given the prediction `ŷ` and true values `y`.
+
+                 | 0.5 * |ŷ - y|^2,          for |ŷ - y| <= δ
+    Huber loss = |
+                 |  δ * (|ŷ - y| - 0.5 * δ), otherwise
+"""
+#TODO: remove dropgrad when Zygote can handle this function with CuArrays
+function huber_loss(ŷ, y;  δ=eltype(ŷ)(1))
+   abs_error = abs.(ŷ .- y)
+   temp = Zygote.dropgrad(abs_error .<  δ)
+   x = eltype(ŷ)(0.5)
+   hub_loss = sum(((abs_error.^2) .* temp) .* x .+ δ*(abs_error .- x*δ) .* (1 .- temp)) * 1 // length(y)
 end

-function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1)
-  return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2)
+function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
+  return -sum(xlogy.(y, ŷ)) * 1 // size(y, 2)
+end
+
+function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number)
+  return -sum(xlogy.(y, ŷ)) .* weight * 1 // size(y, 2)
+end
+
+function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector)
+  return -sum(xlogy.(y, ŷ) .* weight) * 1 // size(y, 2)
+end
+
+"""
+    crossentropy(ŷ, y; weight = nothing)
+
+Return the cross entropy between the given probability distributions;
+calculated as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`.
+
+`weight` can be `Nothing`, a `Number` or an `AbstractVector`.
+`weight=nothing` acts like `weight=1` but is faster.
+
+See also: [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
+
+# Examples
+```jldoctest
+julia> Flux.crossentropy(softmax([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
+3.085467254747739
+```
+"""
+crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight)
+
+"""
+    logitcrossentropy(ŷ, y; weight = 1)
+
+Return the crossentropy computed after a [`Flux.logsoftmax`](@ref) operation;
+calculated as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`.
+
+`logitcrossentropy(ŷ, y)` is mathematically equivalent to
+[`Flux.crossentropy(softmax(ŷ), y)`](@ref) but it is more numerically stable.
+
+See also: [`Flux.crossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
+
+# Examples
+```jldoctest
+julia> Flux.logitcrossentropy([-1.1491, 0.8619, 0.3127], [1, 1, 0])
+3.085467254747738
+```
+"""
+function logitcrossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1)
+  return -sum(y .* logsoftmax(ŷ) .* weight) * 1 // size(y, 2)
 end

 """
    binarycrossentropy(ŷ, y; ϵ=eps(ŷ))

-Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerical stability.
+Return ``-y*\\log(ŷ + ϵ) - (1-y)*\\log(1-ŷ + ϵ)``. The `ϵ` term provides numerical stability.

-    julia> binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0.])
-    3-element Array{Float64,1}:
-    1.4244
-    0.352317
-    0.86167
+Typically, the prediction `ŷ` is given by the output of a [`sigmoid`](@ref) activation.
+
+See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
+
+# Examples
+```jldoctest
+julia> Flux.binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
+3-element Array{Float64,1}:
+ 1.424397097347566
+ 0.35231664672364077
+ 0.8616703662235441
+```
 """
-binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
+binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ)
+
+# Re-definition to fix interaction with CuArrays.
+CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)

 """
-    logitbinarycrossentropy(logŷ, y)
+    logitbinarycrossentropy(ŷ, y)

-`logitbinarycrossentropy(logŷ, y)` is mathematically equivalent to `binarycrossentropy(σ(logŷ), y)`
-but it is more numerically stable.
+`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to
+[`Flux.binarycrossentropy(σ(ŷ), y)`](@ref) but it is more numerically stable.

-    julia> logitbinarycrossentropy.([-1.1491, 0.8619, 0.3127], [1, 1, 0.])
-    3-element Array{Float64,1}:
-     1.4244
-     0.352317
-     0.86167
+See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref)
+
+# Examples
+```jldoctest
+julia> Flux.logitbinarycrossentropy.([-1.1491, 0.8619, 0.3127], [1, 1, 0])
+3-element Array{Float64,1}:
+ 1.4243970973475661
+ 0.35231664672364094
+ 0.8616703662235443
+```
 """
-logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ)
+logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ)
+
+# Re-definition to fix interaction with CuArrays.
+CuArrays.@cufunc logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ)

 """
-    normalise(x::AbstractArray; dims=1)
+    normalise(x; dims=1)

-    Normalises x to mean 0 and standard deviation 1, across the dimensions given by dims. Defaults to normalising over columns.
+Normalise `x` to mean 0 and standard deviation 1 across the dimensions given by `dims`.
+Defaults to normalising over columns.
+
+```jldoctest
+julia> a = reshape(collect(1:9), 3, 3)
+3×3 Array{Int64,2}:
+ 1  4  7
+ 2  5  8
+ 3  6  9
+
+julia> Flux.normalise(a)
+3×3 Array{Float64,2}:
+ -1.22474  -1.22474  -1.22474
+  0.0       0.0       0.0
+  1.22474   1.22474   1.22474
+
+julia> Flux.normalise(a, dims=2)
+3×3 Array{Float64,2}:
+ -1.22474  0.0  1.22474
+ -1.22474  0.0  1.22474
+ -1.22474  0.0  1.22474
+```
 """
 function normalise(x::AbstractArray; dims=1)
  μ′ = mean(x, dims = dims)
@ -50,7 +183,114 @@ function normalise(x::AbstractArray; dims=1)
  return (x .- μ′) ./ σ′
 end

-function normalise(x::AbstractArray, dims)
-  Base.depwarn("`normalise(x::AbstractArray, dims)` is deprecated, use `normalise(a, dims=dims)` instead.", :normalise)
-  normalise(x, dims = dims)
+"""
+    kldivergence(ŷ, y)
+
+Return the
+[Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence)
+between the given probability distributions.
+
+KL divergence is a measure of how much one probability distribution is different
+from the other.
+It is always non-negative and zero only when both the distributions are equal
+everywhere.
+"""
+function kldivergence(ŷ, y)
+  entropy = sum(xlogx.(y)) * 1 //size(y,2)
+  cross_entropy = crossentropy(ŷ, y)
+  return entropy + cross_entropy
+end
+
+"""
+    poisson(ŷ, y)
+
+Return how much the predicted distribution `ŷ` diverges from the expected Poisson
+distribution `y`; calculated as `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`.
+
+[More information](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
+"""
+poisson(ŷ, y) = sum(ŷ .- xlogy.(y, ŷ)) * 1 // size(y,2)
+
+"""
+    hinge(ŷ, y)
+
+Return the [hinge loss](https://en.wikipedia.org/wiki/Hinge_loss) given the
+prediction `ŷ` and true labels `y` (containing 1 or -1); calculated as
+`sum(max.(0, 1 .- ŷ .* y)) / size(y, 2)`.
+
+See also: [`squared_hinge`](@ref)
+"""
+hinge(ŷ, y) = sum(max.(0, 1 .-  ŷ .* y)) * 1 // size(y, 2)
+
+"""
+    squared_hinge(ŷ, y)
+
+Return the squared hinge loss given the prediction `ŷ` and true labels `y`
+(containing 1 or -1); calculated as `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)`.
+
+See also: [`hinge`](@ref)
+"""
+squared_hinge(ŷ, y) = sum((max.(0, 1 .- ŷ .* y)).^2) * 1 // size(y, 2)
+
+"""
+    dice_coeff_loss(ŷ, y; smooth=1)
+
+Return a loss based on the dice coefficient.
+Used in the [V-Net](https://arxiv.org/pdf/1606.04797v1.pdf) image segmentation
+architecture.
+Similar to the F1_score. Calculated as:
+    1 - (2*sum(|ŷ .* y|) + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth)
+"""
+dice_coeff_loss(ŷ, y; smooth=eltype(ŷ)(1.0)) = 1 - (2*sum(y .* ŷ) + smooth) / (sum(y.^2) + sum(ŷ.^2) + smooth)
+
+"""
+    tversky_loss(ŷ, y; β=0.7)
+
+Return the [Tversky loss](https://arxiv.org/pdf/1706.05721.pdf).
+Used with imbalanced data to give more weight to false negatives.
+Larger β weighs recall higher than precision (by placing more emphasis on false negatives).
+Calculated as:
+    1 - (sum(|y .* ŷ|) + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)
+"""
+tversky_loss(ŷ, y; β=eltype(ŷ)(0.7)) = 1 - (sum(y .* ŷ) + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)
+
+"""
+    flatten(x::AbstractArray)
+
+Transform (w, h, c, b)-shaped input into (w × h × c, b)-shaped output
+by linearizing all values for each element in the batch.
+"""
+function flatten(x::AbstractArray)
+  return reshape(x, :, size(x)[end])
+end
+
+"""
+    xlogx(x)
+Return `x * log(x)` for `x ≥ 0`, handling `x = 0` by taking the downward limit.
+"""
+function xlogx(x)
+  result = x * log(x)
+  ifelse(iszero(x), zero(result), result)
+end
+CuArrays.@cufunc function xlogx(x)
+  result = x * log(x)
+  ifelse(iszero(x), zero(result), result)
+end
+
+"""
+    xlogy(x, y)
+Return `x * log(y)` for `y > 0` with correct limit at `x = 0`.
+"""
+function xlogy(x, y)
+  result = x * log(y)
+  ifelse(iszero(x), zero(result), result)
+end
+CuArrays.@cufunc function xlogy(x, y)
+  result = x * log(y)
+  ifelse(iszero(x), zero(result), result)
+end
+
+@adjoint function broadcasted(::typeof(xlogy), x::Zygote.Numeric, y::Zygote.Numeric)
+  res = xlogy.(x, y)
+  res, Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y))
 end
--- a/src/onehot.jl
+++ b/src/onehot.jl
@ -9,6 +9,8 @@ Base.size(xs::OneHotVector) = (Int64(xs.of),)

 Base.getindex(xs::OneHotVector, i::Integer) = i == xs.ix

+Base.getindex(xs::OneHotVector, ::Colon) = OneHotVector(xs.ix, xs.of)
+
 A::AbstractMatrix * b::OneHotVector = A[:, b.ix]

 struct OneHotMatrix{A<:AbstractVector{OneHotVector}} <: AbstractMatrix{Bool}
@ -18,11 +20,15 @@ end

 Base.size(xs::OneHotMatrix) = (Int64(xs.height),length(xs.data))

-Base.getindex(xs::OneHotMatrix, i::Integer, j::Integer) = xs.data[j][i]
+Base.getindex(xs::OneHotMatrix, i::Union{Integer, AbstractVector}, j::Integer) = xs.data[j][i]
 Base.getindex(xs::OneHotMatrix, ::Colon, i::Integer) = xs.data[i]
 Base.getindex(xs::OneHotMatrix, ::Colon, i::AbstractArray) = OneHotMatrix(xs.height, xs.data[i])
+Base.getindex(xs::OneHotMatrix, ::Colon, ::Colon) = OneHotMatrix(xs.height, copy(xs.data))

-A::AbstractMatrix * B::OneHotMatrix = A[:, map(x->x.ix, B.data)]
+Base.getindex(xs::OneHotMatrix, i::Integer, ::Colon) = map(x -> x[i], xs.data)
+
+# remove workaround when https://github.com/JuliaGPU/CuArrays.jl/issues/676 is fixed
+A::AbstractMatrix * B::OneHotMatrix = A[:, cpu(map(x->x.ix, B.data))]

 Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs...])

@ -32,13 +38,34 @@ import Adapt: adapt, adapt_structure

 adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data))

-@init @require CuArrays="3a865a2d-5b23-5a0f-bc46-62713ec82fae" begin
-  import .CuArrays: CuArray, cudaconvert
-  import Base.Broadcast: BroadcastStyle, ArrayStyle
-  BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}()
-  cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
-end
+import .CuArrays: CuArray, CuArrayStyle, cudaconvert
+import Base.Broadcast: BroadcastStyle, ArrayStyle
+BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = CuArrayStyle{2}()
+cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))

+"""
+    onehot(l, labels[, unk])
+
+Create a `OneHotVector` whose only `true` element is at the position of `l` in the
+possible set of `labels`.
+If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found
+in `labels`; otherwise, it will raise an error.
+
+# Examples
+```jldoctest
+julia> Flux.onehot(:b, [:a, :b, :c])
+3-element Flux.OneHotVector:
+ 0
+ 1
+ 0
+
+julia> Flux.onehot(:c, [:a, :b, :c])
+3-element Flux.OneHotVector:
+ 0
+ 0
+ 1
+```
+"""
 function onehot(l, labels)
  i = something(findfirst(isequal(l), labels), 0)
  i > 0 || error("Value $l is not in labels")
@ -51,23 +78,48 @@ function onehot(l, labels, unk)
  OneHotVector(i, length(labels))
 end

+"""
+    onehotbatch(ls, labels[, unk...])
+
+Create a `OneHotMatrix` with a batch of labels based on the
+possible set of `labels`.
+If `unk` is given, return [`onehot(unk, labels)`](@ref) if one of the input
+labels `ls` is not found in `labels`; otherwise it will error.
+
+# Examples
+```jldoctest
+julia> Flux.onehotbatch([:b, :a, :b], [:a, :b, :c])
+3×3 Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}:
+ 0  1  0
+ 1  0  1
+ 0  0  0
+```
+"""
 onehotbatch(ls, labels, unk...) =
  OneHotMatrix(length(labels), [onehot(l, labels, unk...) for l in ls])

+Base.argmax(xs::OneHotVector) = xs.ix
+
+"""
+    onecold(y[, labels = 1:length(y)])
+
+Inverse operation of [`onehot`](@ref).
+
+# Examples
+```jldoctest
+julia> Flux.onecold([true, false, false], [:a, :b, :c])
+:a
+
+julia> Flux.onecold([0.3, 0.2, 0.5], [:a, :b, :c])
+:c
+```
+"""
 onecold(y::AbstractVector, labels = 1:length(y)) = labels[Base.argmax(y)]

 onecold(y::AbstractMatrix, labels...) =
  dropdims(mapslices(y -> onecold(y, labels...), y, dims=1), dims=1)

-function argmax(xs...)
-  Base.depwarn("`argmax(...) is deprecated, use `onecold(...)` instead.", :argmax)
-  return onecold(xs...)
-end
+# Decode each column independently. A plain `map` is used instead of an
+# `|`-reduction with `init = 0`, which only works for integer labels and throws
+# a MethodError for e.g. Symbol labels (`:a | 0`).
+onecold(y::OneHotMatrix, labels...) =
+  map(x -> Flux.onecold(x, labels...), y.data)

-# Ambiguity hack
-
-a::TrackedMatrix * b::OneHotVector = invoke(*, Tuple{AbstractMatrix,OneHotVector}, a, b)
-a::TrackedMatrix * b::OneHotMatrix = invoke(*, Tuple{AbstractMatrix,OneHotMatrix}, a, b)
-
-onecold(x::TrackedVector, l...) = onecold(data(x), l...)
-onecold(x::TrackedMatrix, l...) = onecold(data(x), l...)
+@nograd onecold, onehot, onehotbatch
--- a/src/optimise/Optimise.jl
+++ b/src/optimise/Optimise.jl
@ -1,12 +1,14 @@
 module Optimise

-export train!,
-	SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
-	ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,
-	InvDecay, ExpDecay, WeightDecay, stop, Optimiser
+using LinearAlgebra
+
+export train!, update!,
+	Descent, ADAM, Momentum, Nesterov, RMSProp,
+	ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM, 
+	InvDecay, ExpDecay, WeightDecay, stop, Optimiser,
+	ClipValue, ClipNorm

 include("optimisers.jl")
 include("train.jl")
-include("deprecations.jl")

 end
--- a/src/optimise/deprecations.jl
+++ b/src/optimise/deprecations.jl
@ -1,126 +0,0 @@
-using Base: depwarn
-using Flux: Params
-
-check_decay(opt, decay) = decay == 0 ? opt : Optimiser(opt, InvDecay(decay))
-
-# legacy update rule
-updaterule(opt, ps) = () -> _update_params!(opt, ps)
-
-function SGD(params::Union{AbstractArray, Params}, η = 0.1; decay = 0.)
-  depwarn("SGD(params) is deprecated; use Descent(η::Float64) instead", :SGD)
-
-  ps = params
-  opt = Descent(η)
-  opt = check_decay(opt, decay)
-  updaterule(opt, ps)
-end
-
-function Momentum(params::Union{AbstractArray, Params}, η = 0.01; ρ = 0.9, decay = 0.)
-  depwarn("Momentum(params) is deprecated; use Momentum(η::Float64) instead", :Momentum)
-
-  ps = params
-  opt = Momentum(η, ρ)
-  opt = check_decay(opt, decay)
-  updaterule(opt, ps)
-end
-
-function Nesterov(params::Union{AbstractArray, Params}, η = 0.001; ρ = 0.9, decay = 0.)
-  depwarn("Nesterov(params) is deprecated; use Nesterov(η::Float64) instead", :Nesterov)
-
-  ps = params
-  opt = Nesterov(η, ρ)
-  opt = check_decay(opt, decay)
-  updaterule(opt, ps)
-end
-
-function RMSProp(params::Union{AbstractArray, Params}, η = 0.001; ρ = 0.9, decay = 0.)
-  depwarn("RMSProp(params) is deprecated; use RMSProp(η::Float64) instead", :RMSProp)
-
-  ps = params
-  opt = RMSProp(η, ρ)
-  opt = check_decay(opt, decay)
-  updaterule(opt, ps)
-end
-
-function ADAM(params::Union{AbstractArray, Params}, η = 0.001; β1 = 0.9, β2 = 0.999, decay = 0.)
-  depwarn("ADAM(params) is deprecated; use ADAM(η::Float64) instead", :ADAM)
-
-  ps = params
-  β = (β1, β2)
-  opt = ADAM(η, β)
-  opt = check_decay(opt, decay)
-  updaterule(opt, ps)
-end
-
-function ADAGrad(params::Union{AbstractArray, Params}, η::Float64 = 0.1; decay = 0.)
-  depwarn("ADAGrad(params) is deprecated; use ADAGrad(η::Float64) instead", :ADAGrad)
-
-  ps = params
-  opt = ADAGrad(η)
-  opt = check_decay(opt, decay)
-  updaterule(opt, ps)
-end
-
-function ADADelta(params::Union{AbstractArray, Params}, ρ::Float64 = 0.9; decay = 0.)
-  depwarn("ADADelta(params) is deprecated; use ADADelta(η::Float64) instead", :ADADelta)
-
-  ps = params
-  opt = ADADelta(ρ)
-  opt = check_decay(opt, decay)
-  updaterule(opt, ps)
-end
-
-function AdaMax(params::Union{AbstractArray, Params}, η = 0.001; β1 = 0.9, β2 = 0.999, decay = 0.)
-  depwarn("AdaMax(params) is deprecated; use AdaMax(η::Float64) instead", :AdaMax)
-
-  ps = params
-  β = (β1, β2)
-  opt = AdaMax(η, β)
-  opt = check_decay(opt, decay)
-  updaterule(opt, ps)
-end
-
-function AMSGrad(params::Union{AbstractArray, Params}, η = 0.001; β1 = 0.9, β2 = 0.999, decay = 0.)
-  depwarn("AMSGrad(params) is deprecated; use AMSGrad(η::Float64) instead", :AMSGrad)
-
-  ps = params
-  β = (β1, β2)
-  opt = AMSGrad(η, β)
-  opt = check_decay(opt, decay)
-  updaterule(opt, ps)
-end
-
-function NADAM(params::Union{AbstractArray, Params}, η = 0.001; β1 = 0.9, β2 = 0.999, decay = 0.)
-  depwarn("NADAM(params) is deprecated; use NADAM(η::Float64) instead", :NADAM)
-
-  ps = params
-  β = (β1, β2)
-  opt = NADAM(η, β)
-  opt = check_decay(opt, decay)
-  updaterule(opt, ps)
-end
-
-function ADAMW(params::Union{AbstractArray, Params}, η = 0.001; β1 = 0.9, β2 = 0.999, decay = 0.)
-  depwarn("ADAMW(params) is deprecated; use ADAMW(η::Float64) instead", :ADAMW)
-
-  ps = params
-  β = (β1, β2)
-  opt = ADAMW(η, β)
-  opt = check_decay(opt, decay)
-  decay != 0 && (opt = Optimiser(opt, WeightDecay(decay)))
-  updaterule(opt, ps)
-end
-
-# Old training loop
-
-struct OldOptimiser
-  func
-end
-
-_update_params!(opt::OldOptimiser, ps) = opt.func()
-
-# Train function
-function train!(loss, data, opt; cb = () -> ())
-  depwarn("train!(loss, data, opt) is deprecated; use train!(loss, params, data, opt) instead", :train!)
-  train!(loss, (), data, OldOptimiser(opt); cb = cb)
-end
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@ -1,5 +1,4 @@
 using Flux
-using Base: @get!
 using MacroTools: @forward

 const ϵ = 1e-8
@ -7,10 +6,29 @@ const ϵ = 1e-8
 # TODO: should use weak refs

 """
-    Descent(η)
+    Descent(η = 0.1)

 Classic gradient descent optimiser with learning rate `η`.
-For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`.
+For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`.
+
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+
+# Examples
+```julia
+opt = Descent()
+
+opt = Descent(0.3)
+
+ps = params(model)
+
+gs = gradient(ps) do
+    loss(x, y)
+end
+
+Flux.Optimise.update!(opt, ps, gs)
+```
 """
 mutable struct Descent
  eta::Float64
@ -23,9 +41,22 @@ function apply!(o::Descent, x, Δ)
 end

 """
-    Momentum(params, η = 0.01; ρ = 0.9)
+    Momentum(η = 0.01, ρ = 0.9)

-Gradient descent with learning rate `η` and momentum `ρ`.
+Gradient descent optimizer with learning rate `η` and momentum `ρ`.
+
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Momentum (`ρ`): Controls the acceleration of gradient descent in the
+                  prominent direction, in effect dampening oscillations.
+
+# Examples
+```julia
+opt = Momentum()
+
+opt = Momentum(0.01, 0.99)
+```
 """
 mutable struct Momentum
  eta::Float64
@ -37,15 +68,28 @@ Momentum(η = 0.01, ρ = 0.9) = Momentum(η, ρ, IdDict())

 function apply!(o::Momentum, x, Δ)
  η, ρ = o.eta, o.rho
-  v = get!(o.velocity, x, zero(x))::typeof(data(x))
+  v = get!(o.velocity, x, zero(x))::typeof(x)
  @. v = ρ * v - η * Δ
  @. Δ = -v
 end

 """
-    Nesterov(eta, ρ = 0.9)
+    Nesterov(η = 0.001, ρ = 0.9)

-Gradient descent with learning rate  `η` and Nesterov momentum `ρ`.
+Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`.
+
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the
+                           prominent direction, in effect dampening oscillations.
+
+# Examples
+```julia
+opt = Nesterov()
+
+opt = Nesterov(0.003, 0.95)
+```
 """
 mutable struct Nesterov
  eta::Float64
@ -57,7 +101,7 @@ Nesterov(η = 0.001, ρ = 0.9) = Nesterov(η, ρ, IdDict())

 function apply!(o::Nesterov, x, Δ)
  η, ρ = o.eta, o.rho
-  v = get!(o.velocity, x, zero(x))::typeof(data(x))
+  v = get!(o.velocity, x, zero(x))::typeof(x)
  d = @. ρ^2 * v - (1+ρ) * η * Δ
  @. v = ρ*v - η*Δ
  @. Δ = -d
@ -66,9 +110,23 @@ end
 """
    RMSProp(η = 0.001, ρ = 0.9)

-[RMSProp](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
-optimiser. Parameters other than learning rate don't need tuning. Often a good
-choice for recurrent networks.
+Optimizer using the
+[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
+algorithm. Often a good choice for recurrent networks. Parameters other than learning rate
+generally don't need tuning.
+
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Momentum (`ρ`): Controls the acceleration of gradient descent in the
+                  prominent direction, in effect dampening oscillations.
+
+# Examples
+```julia
+opt = RMSProp()
+
+opt = RMSProp(0.002, 0.95)
+```
 """
 mutable struct RMSProp
  eta::Float64
@ -80,15 +138,28 @@ RMSProp(η = 0.001, ρ = 0.9) = RMSProp(η, ρ, IdDict())

 function apply!(o::RMSProp, x, Δ)
  η, ρ = o.eta, o.rho
-  acc = get!(o.acc, x, zero(x))::typeof(data(x))
+  acc = get!(o.acc, x, zero(x))::typeof(x)
  @. acc = ρ * acc + (1 - ρ) * Δ^2
  @. Δ *= η / (√acc + ϵ)
 end

 """
-    ADAM(η = 0.001, β = (0.9, 0.999))
+    ADAM(η = 0.001, β::Tuple = (0.9, 0.999))

 [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.
+
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+                                   second (β2) momentum estimate.
+
+# Examples
+```julia
+opt = ADAM()
+
+opt = ADAM(0.001, (0.9, 0.8))
+```
 """
 mutable struct ADAM
  eta::Float64
@ -109,10 +180,65 @@ function apply!(o::ADAM, x, Δ)
 end

 """
-    AdaMax(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08)
+    RADAM(η = 0.001, β::Tuple = (0.9, 0.999))

-[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on
-the ∞-norm.
+[Rectified ADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimizer.
+
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+                                   second (β2) momentum estimate.
+
+# Examples
+```julia
+opt = RADAM()
+
+opt = RADAM(0.001, (0.9, 0.8))
+```
+"""
+mutable struct RADAM
+  eta::Float64
+  beta::Tuple{Float64,Float64}
+  state::IdDict
+end
+
+RADAM(η = 0.001, β = (0.9, 0.999)) = RADAM(η, β, IdDict())
+
+function apply!(o::RADAM, x, Δ)
+  η, β = o.eta, o.beta
+  ρ∞ = 2/(1-β[2])-1
+  mt, vt, βp, t = get!(o.state, x, (zero(x), zero(x), β, 1))
+  @. mt = β[1] * mt + (1 - β[1]) * Δ
+  @. vt = β[2] * vt + (1 - β[2]) * Δ^2
+  ρ = ρ∞ - 2t*βp[2]/(1-βp[2])
+  if ρ > 4
+    r = sqrt((ρ-4)*(ρ-2)*ρ∞/((ρ∞-4)*(ρ∞-2)*ρ))
+    @. Δ =  mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + ϵ) * η * r
+  else
+    @. Δ =  mt / (1 - βp[1]) * η
+  end
+  o.state[x] = (mt, vt, βp .* β, t+1)
+  return Δ
+end
+
+"""
+    AdaMax(η = 0.001, β::Tuple = (0.9, 0.999))
+
+[AdaMax](https://arxiv.org/abs/1412.6980v9) is a variant of ADAM based on the ∞-norm.
+
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+                                   second (β2) momentum estimate.
+
+# Examples
+```julia
+opt = AdaMax()
+
+opt = AdaMax(0.001, (0.9, 0.995))
+```
 """
 mutable struct AdaMax
  eta::Float64
@ -133,10 +259,22 @@ function apply!(o::AdaMax, x, Δ)
 end

 """
-    ADAGrad(η = 0.1; ϵ = 1e-8)
+    ADAGrad(η = 0.1)

-[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser.
+[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has
+parameter specific learning rates based on how frequently it is updated.
 Parameters don't need tuning.
+
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+
+# Examples
+```julia
+opt = ADAGrad()
+
+opt = ADAGrad(0.001)
+```
 """
 mutable struct ADAGrad
  eta::Float64
@ -147,16 +285,27 @@ ADAGrad(η = 0.1) = ADAGrad(η, IdDict())

 function apply!(o::ADAGrad, x, Δ)
  η = o.eta
-  acc = get!(o.acc, x, fill(ϵ, size(x)))::typeof(data(x))
+  acc = get!(o.acc, x, fill!(zero(x), ϵ))::typeof(x)
  @. acc += Δ^2
  @. Δ *= η / (√acc + ϵ)
 end

 """
-    ADADelta(ρ = 0.9, ϵ = 1e-8)
+    ADADelta(ρ = 0.9)

-[ADADelta](http://arxiv.org/abs/1212.5701) optimiser. Parameters don't need
-tuning.
+[ADADelta](https://arxiv.org/abs/1212.5701) is a version of ADAGrad adapting its learning
+rate based on a window of past gradient updates.
+Parameters don't need tuning.
+
+# Parameters
+- Rho (`ρ`): Factor by which the gradient is decayed at each time step.
+
+# Examples
+```julia
+opt = ADADelta()
+
+opt = ADADelta(0.89)
+```
 """
 mutable struct ADADelta
  rho::Float64
@ -175,10 +324,23 @@ function apply!(o::ADADelta, x, Δ)
 end

 """
-    AMSGrad(η = 0.001, β = (0.9, 0.999))
+    AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999))

-[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need
-tuning.
+The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM
+optimiser. Parameters don't need tuning.
+
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+                                   second (β2) momentum estimate.
+
+# Examples
+```julia
+opt = AMSGrad()
+
+opt = AMSGrad(0.001, (0.89, 0.995))
+```
 """
 mutable struct AMSGrad
  eta::Float64
@ -190,18 +352,31 @@ AMSGrad(η = 0.001, β = (0.9, 0.999)) = AMSGrad(η, β, IdDict())

 function apply!(o::AMSGrad, x, Δ)
  η, β = o.eta, o.beta
-  mt, vt, v̂t = get!(o.state, x, (fill(ϵ, size(x)), fill(ϵ, size(x)), fill(ϵ, size(x))))
+  mt, vt, v̂t = get!(o.state, x, (fill!(zero(x), ϵ), fill!(zero(x), ϵ), fill!(zero(x), ϵ)))
  @. mt = β[1] * mt + (1 - β[1]) * Δ
  @. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2
-  @. v̂t = max.(v̂t, vt)
+  @. v̂t = max(v̂t, vt)
  @. Δ = η * mt / (√v̂t + ϵ)
 end

 """
-    NADAM(η = 0.001, β = (0.9, 0.999))
+    NADAM(η = 0.001, β::Tuple = (0.9, 0.999))

-[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. Parameters don't need
-tuning.
+[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) is a Nesterov variant of ADAM.
+Parameters don't need tuning.
+
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+                                   second (β2) momentum estimate.
+
+# Examples
+```julia
+opt = NADAM()
+
+opt = NADAM(0.002, (0.89, 0.995))
+```
 """
 mutable struct NADAM
  eta::Float64
@ -213,8 +388,7 @@ NADAM(η = 0.001, β = (0.9, 0.999)) = NADAM(η, β, IdDict())

 function apply!(o::NADAM, x, Δ)
  η, β = o.eta, o.beta
-  β1p, β2p = o.beta
-  mt, vt = get!(o.state, x, (zero(x), zero(x)))
+  mt, vt, (β1p, β2p) = get!(o.state, x, (zero(x), zero(x), o.beta))
  @. mt = β[1] * mt + (1 - β[1]) * Δ
  @. vt = β[2] * vt + (1 - β[2]) * Δ^2
  @. Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / (√(vt * β[2] / (1 - β2p)) + ϵ) * η
@ -223,9 +397,24 @@ function apply!(o::NADAM, x, Δ)
 end

 """
-    ADAMW((η = 0.001, β = (0.9, 0.999), decay = 0)
+    ADAMW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0)

-[ADAMW](https://arxiv.org/abs/1711.05101) fixing weight decay regularization in Adam.
+[ADAMW](https://arxiv.org/abs/1711.05101) is a variant of ADAM fixing (as in repairing) its
+weight decay regularization.
+
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+                                   second (β2) momentum estimate.
+- `decay`: Decay applied to weights during optimisation.
+
+# Examples
+```julia
+opt = ADAMW()
+
+opt = ADAMW(0.001, (0.89, 0.995), 0.1)
+```
 """
 ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
  Optimiser(ADAM(η, β), WeightDecay(decay))
@ -258,11 +447,15 @@ function apply!(o::Optimiser, x, Δ)
 end

 """
-`InvDecay(γ)`
+    InvDecay(γ = 0.001)

-Apply inverse time decay to an optimiser
+Apply inverse time decay to an optimiser, so that the effective step size at
+iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size.
+The wrapped optimiser's step size is not modified.
+
+# Examples
 ```julia
-  Optimiser(InvDecay(..), Opt(..))
+Optimiser(InvDecay(..), Opt(..))
 ```
 """
 mutable struct InvDecay
@ -281,13 +474,25 @@ function apply!(o::InvDecay, x, Δ)
 end

 """
-`ExpDecay(eta, decay, decay_step, clip)`
+    ExpDecay(η = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4)

-Schedule the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`.
+Discount the learning rate `η` by the factor `decay` every `decay_step` steps till
+a minimum of `clip`.

+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- `decay`: Factor by which the learning rate is discounted.
+- `decay_step`: Schedule decay operations by setting the number of steps between
+                two decay operations.
+- `clip`: Minimum value of learning rate.
+
+# Examples
 To apply exponential decay to an optimiser:
 ```julia
-  Optimiser(ExpDecay(..), Opt(..))
+Optimiser(ExpDecay(..), Opt(..))
+
+opt = Optimiser(ExpDecay(), ADAM())
 ```
 """
 mutable struct ExpDecay
@ -304,16 +509,19 @@ function apply!(o::ExpDecay, x, Δ)
  η, s, decay = o.eta, o.step, o.decay
  n = o.current[x] = get(o.current, x, 0) + 1
  if o.current[x]%s == 0 && count(x -> x%s == 0, values(o.current)) == 1
-    η = max(η * decay^(s / n), o.clip)
+    η = max(η * decay, o.clip)
    o.eta = η
  end
-  @. Δ *= decay
+  @. Δ *= η
 end

 """
-`WeightDecay(wd)`
+    WeightDecay(wd = 0)

-Decay the weight parameter by `wd`
+Decay weights by `wd`.
+
+# Parameters
+- Weight decay (`wd`)
 """
 mutable struct WeightDecay
  wd::Real
@ -323,5 +531,33 @@ WeightDecay() = WeightDecay(0)

 function apply!(o::WeightDecay, x, Δ)
  wd = o.wd
-  @. Δ += wd * data(x)
+  @. Δ += wd * x
 end
+
+"""
+    ClipValue(thresh)
+
+Clip gradients when their absolute value exceeds `thresh`.
+"""
+mutable struct ClipValue{T}
+    thresh::T
+end
+
+apply!(o::ClipValue, x, Δ) = clamp!(Δ, -o.thresh, o.thresh)
+
+"""
+    ClipNorm(thresh)
+
+Clip gradients when their L2 norm exceeds `thresh`.
+"""
+mutable struct ClipNorm{T}
+    thresh::T
+end
+
+function apply!(o::ClipNorm, x, Δ)
+    Δnrm = norm(Δ)
+    if Δnrm > o.thresh
+        rmul!(Δ, o.thresh / Δnrm)
+    end
+    return Δ
+end
--- a/src/optimise/train.jl
+++ b/src/optimise/train.jl
@ -1,41 +1,52 @@
 using Juno
-import Flux.Tracker: Params, gradient, data, update!
-import Base.depwarn
+import Zygote: Params, gradient

+
+
+"""
+    update!(x, x̄)
+
+Update the array `x` according to `x .-= x̄`.
+"""
+function update!(x::AbstractArray, x̄)
+  x .-= x̄
+end
+
+"""
+    update!(opt, p, g)
+    update!(opt, ps::Params, gs)
+
+Perform an update step of the parameters `ps` (or the single parameter `p`)
+according to optimizer `opt` and the gradients `gs` (the gradient `g`).
+
+As a result, the parameters are mutated and the optimizer's internal state may change.
+"""
 function update!(opt, x, x̄)
-  update!(x, -apply!(opt, x, data(x̄)))
+  x .-= apply!(opt, x, x̄)
 end

 function update!(opt, xs::Params, gs)
  for x in xs
+    gs[x] === nothing && continue # no gradient recorded for this parameter: leave it untouched
    update!(opt, x, gs[x])
  end
 end

-# Added as an internal API but everyone started using it.
-function _update_params!(opt, xs)
-  depwarn("`_update_params!` is deprecated, use `update!` instead.", :stop)
-  for x in xs
-    update!(opt, x, Tracker.grad(x))
-    x.tracker.grad = Tracker.zero_grad!(x.tracker.grad)
-  end
-end
-
 # Callback niceties
 call(f, xs...) = f(xs...)
 runall(f) = f
 runall(fs::AbstractVector) = () -> foreach(call, fs)

 struct StopException <: Exception end
+
 """
    stop()

 Call `Flux.stop()` in a callback to indicate when a callback condition is met.
-This would trigger the train loop to stop and exit.
+This will trigger the train loop to stop and exit.

+# Examples
 ```julia
-# Example callback:
-
 cb = function ()
  accuracy() > 0.9 && Flux.stop()
 end
@ -48,18 +59,18 @@ end
 """
    train!(loss, params, data, opt; cb)

-For each datapoint `d` in `data` computes the gradient of `loss(d...)` through
-backpropagation and calls the optimizer `opt`.
+For each datapoint `d` in `data` compute the gradient of `loss(d...)` through
+backpropagation and call the optimizer `opt`.

-Takes a callback as keyword argument `cb`. For example, this will print "training"
-every 10 seconds:
+In case datapoints `d` are of numeric array type, assume no splatting is needed
+and compute the gradient of `loss(d)`.

-```julia
-Flux.train!(loss, params, data, opt,
-            cb = throttle(() -> println("training"), 10))
-```
+A callback is given with the keyword argument `cb`. For example, this will print
+"training" every 10 seconds (using [`Flux.throttle`](@ref)):

-The callback can call `Flux.stop()` to interrupt the training loop.
+    train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10))
+
+The callback can call [`Flux.stop`](@ref) to interrupt the training loop.

 Multiple optimisers and callbacks can be passed to `opt` and `cb` as arrays.
 """
@ -68,14 +79,17 @@ function train!(loss, ps, data, opt; cb = () -> ())
  cb = runall(cb)
  @progress for d in data
    try
-      gs = gradient(ps) do
-        loss(d...)
+      if d isa AbstractArray{<:Number}
+        gs = gradient(ps) do
+          loss(d)
+        end
+      else
+        gs = gradient(ps) do
+          loss(d...)
+        end
      end
      update!(opt, ps, gs)
-      if cb() == :stop
-        depwarn("Use of `:stop` is deprecated; use `Flux.stop()` instead", :stop)
-        break
-      end
+      cb()
    catch ex
      if ex isa StopException
        break
@ -92,11 +106,12 @@ end
 Run `body` `N` times. Mainly useful for quickly doing multiple epochs of
 training in a REPL.

-```julia
-julia> @epochs 2 println("hello")
-INFO: Epoch 1
+# Examples
+```jldoctest
+julia> Flux.@epochs 2 println("hello")
+[ Info: Epoch 1
 hello
-INFO: Epoch 2
+[ Info: Epoch 2
 hello
 ```
 """
--- a/src/treelike.jl
+++ b/src/treelike.jl
@ -1,87 +0,0 @@
-import Adapt: adapt, adapt_storage
-import .Tracker: IdSet
-
-children(x) = ()
-mapchildren(f, x) = x
-
-children(x::Tuple) = x
-children(x::NamedTuple) = x
-mapchildren(f, x::Tuple) = map(f, x)
-mapchildren(f, x::NamedTuple) = map(f, x)
-
-function treelike(m::Module, T, fs = fieldnames(T))
-  @eval m begin
-    Flux.children(x::$T) = ($([:(x.$f) for f in fs]...),)
-    Flux.mapchildren(f, x::$T) = $T(f.($children(x))...)
-  end
-end
-
-macro treelike(T, fs = nothing)
-  fs == nothing || isexpr(fs, :tuple) || error("@treelike T (a, b)")
-  fs = fs == nothing ? [] : [:($(map(QuoteNode, fs.args)...),)]
-  :(treelike(@__MODULE__, $(esc(T)), $(fs...)))
-end
-
-isleaf(x) = isempty(children(x))
-
-function mapleaves(f, x; cache = IdDict())
-  haskey(cache, x) && return cache[x]
-  cache[x] = isleaf(x) ? f(x) : mapchildren(x -> mapleaves(f, x, cache = cache), x)
-end
-
-function prefor(f, x; seen = IdSet())
-  x ∈ seen && return
-  f(x)
-  foreach(x -> prefor(f, x, seen = seen), children(x))
-  return
-end
-
-function params(m)
-  ps = Params()
-  prefor(p ->
-    Tracker.istracked(p) && Tracker.isleaf(p) &&
-      !any(p′ -> p′ === p, ps) && push!(ps, p),
-    m)
-  return ps
-end
-
-params(m...) = params(m)
-
-function loadparams!(m, xs)
-  for (p, x) in zip(params(m), xs)
-    size(p) == size(x) ||
-      error("Expected param size $(size(p)), got $(size(x))")
-    copyto!(data(p), data(x))
-  end
-end
-
-# CPU/GPU movement conveniences
-
-cpu(m) = mapleaves(x -> adapt(Array, x), m)
-
-gpu_adaptor = identity
-
-@init @require CuArrays="3a865a2d-5b23-5a0f-bc46-62713ec82fae" begin
-  global gpu_adaptor = CuArrays.cu
-end
-
-gpu(x) = mapleaves(gpu_adaptor, x)
-
-# Precision
-
-adapt_storage(T::Type{<:Real}, xs::AbstractArray{<:Real}) = convert.(T, xs)
-
-paramtype(T::Type{<:Real}, m) = mapleaves(x -> adapt(T, x), m)
-
-f32(m) = paramtype(Float32, m)
-f64(m) = paramtype(Float64, m)
-
-# General parameter map
-
-function mapparams(f, m)
-  mapleaves(m) do x
-    Tracker.istracked(x) ? param(f(Tracker.data(x))) :
-    x isa Union{AbstractArray,Number} ? f(x) :
-    x
-  end
-end
--- a/src/utils.jl
+++ b/src/utils.jl
@ -1,6 +1,41 @@
 # Arrays
-glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0/sum(dims))
-glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0/sum(dims))
+nfan() = 1, 1 # fan_in, fan_out
+nfan(n) = 1, n # A vector is treated as a n×1 matrix
+nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices
+nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) # In case of convolution kernels
+
+"""
+    glorot_uniform(dims...)
+
+Return an `Array` of size `dims` containing random variables taken from a uniform distribution
+in the interval ``[-x, x]``, where `x = sqrt(24 / sum(nfan(dims...))) / 2` (fan-in plus fan-out).
+
+# Examples
+```jldoctest; setup = :(using Random; Random.seed!(0))
+julia> Flux.glorot_uniform(2, 3)
+2×3 Array{Float32,2}:
+ 0.601094  -0.57414   -0.814925
+ 0.900868   0.805994   0.057514
+```
+"""
+glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...)))
+
+"""
+    glorot_normal(dims...)
+
+Return an `Array` of size `dims` containing random variables taken from a normal distribution
+with mean 0 and standard deviation `sqrt(2 / sum(nfan(dims...)))` (fan-in plus fan-out).
+
+# Examples
+```jldoctest; setup = :(using Random; Random.seed!(0))
+julia> Flux.glorot_normal(3, 2)
+3×2 Array{Float32,2}:
+  0.429505  -0.0852891
+  0.523935   0.371009
+ -0.223261   0.188052
+```
+"""
+glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...)))

 ones(T::Type, dims...) = Base.ones(T, dims...)
 zeros(T::Type, dims...) = Base.zeros(T, dims...)
@ -8,9 +43,81 @@ zeros(T::Type, dims...) = Base.zeros(T, dims...)
 ones(dims...) = Base.ones(Float32, dims...)
 zeros(dims...) = Base.zeros(Float32, dims...)

+"""
+    unsqueeze(xs, dim)
+
+Return `xs` reshaped into an `Array` one dimensionality higher than `xs`,
+where `dim` indicates in which dimension `xs` is extended.
+
+# Examples
+```jldoctest
+julia> xs = [[1, 2], [3, 4], [5, 6]]
+3-element Array{Array{Int64,1},1}:
+ [1, 2]
+ [3, 4]
+ [5, 6]
+
+julia> Flux.unsqueeze(xs, 1)
+1×3 Array{Array{Int64,1},2}:
+ [1, 2]  [3, 4]  [5, 6]
+
+julia> Flux.unsqueeze([1 2; 3 4], 2)
+2×1×2 Array{Int64,3}:
+[:, :, 1] =
+ 1
+ 3
+
+[:, :, 2] =
+ 2
+ 4
+```
+"""
 unsqueeze(xs, dim) = reshape(xs, (size(xs)[1:dim-1]..., 1, size(xs)[dim:end]...))

+"""
+    stack(xs, dim)
+
+Concatenate the given `Array` of `Array`s `xs` into a single `Array` along the
+given dimension `dim`.
+
+# Examples
+```jldoctest
+julia> xs = [[1, 2], [3, 4], [5, 6]]
+3-element Array{Array{Int64,1},1}:
+ [1, 2]
+ [3, 4]
+ [5, 6]
+
+julia> Flux.stack(xs, 1)
+3×2 Array{Int64,2}:
+ 1  2
+ 3  4
+ 5  6
+
+julia> cat(xs, dims=1)
+3-element Array{Array{Int64,1},1}:
+ [1, 2]
+ [3, 4]
+ [5, 6]
+```
+"""
 stack(xs, dim) = cat(unsqueeze.(xs, dim)..., dims=dim)
+
+"""
+    unstack(xs, dim)
+
+Unroll the given `xs` into an `Array` of `Array`s along the given dimension `dim`.
+
+# Examples
+```jldoctest
+julia> Flux.unstack([1 3 5 7; 2 4 6 8], 2)
+4-element Array{Array{Int64,1},1}:
+ [1, 2]
+ [3, 4]
+ [5, 6]
+ [7, 8]
+```
+"""
 unstack(xs, dim) = [copy(selectdim(xs, dim, i)) for i in 1:size(xs, dim)]

 """
@ -18,9 +125,16 @@ unstack(xs, dim) = [copy(selectdim(xs, dim, i)) for i in 1:size(xs, dim)]

 Split `xs` into `n` parts.

-```julia
-julia> chunk(1:10, 3)
-3-element Array{Array{Int64,1},1}:
+# Examples
+```jldoctest
+julia> Flux.chunk(1:10, 3)
+3-element Array{UnitRange{Int64},1}:
+ 1:4
+ 5:8
+ 9:10
+
+julia> Flux.chunk(collect(1:10), 3)
+3-element Array{SubArray{Int64,1,Array{Int64,1},Tuple{UnitRange{Int64}},true},1}:
 [1, 2, 3, 4]
 [5, 6, 7, 8]
 [9, 10]
@ -35,11 +149,12 @@ batchindex(xs, i) = (reverse(Base.tail(reverse(axes(xs))))..., i)

 Count the number of times that each element of `xs` appears.

-```julia
-julia> frequencies(['a','b','b'])
+# Examples
+```jldoctest
+julia> Flux.frequencies(['a','b','b'])
 Dict{Char,Int64} with 2 entries:
-  'b' => 2
  'a' => 1
+  'b' => 2
 ```
 """
 function frequencies(xs)
@ -55,12 +170,13 @@ head(x::Tuple) = reverse(Base.tail(reverse(x)))
 squeezebatch(x) = reshape(x, head(size(x)))

 """
-  batch(xs)
+    batch(xs)

 Batch the arrays in `xs` into a single array.

-```julia
-julia> batch([[1,2,3],[4,5,6]])
+# Examples
+```jldoctest
+julia> Flux.batch([[1,2,3],[4,5,6]])
 3×2 Array{Int64,2}:
 1  4
 2  5
@ -77,6 +193,25 @@ function batch(xs)
  return data
 end

+"""
+Return the given sequence right-padded with `p` to length `n`; sequences with `n` or more elements are not truncated.
+
+# Examples
+```jldoctest
+julia> rpad([1, 2], 4, 0)
+4-element Array{Int64,1}:
+ 1
+ 2
+ 0
+ 0
+
+julia> rpad([1, 2, 3], 2, 0)
+3-element Array{Int64,1}:
+ 1
+ 2
+ 3
+```
+"""
 Base.rpad(v::AbstractVector, n::Integer, p) = [v; fill(p, max(n - length(v), 0))]

 """
@ -85,8 +220,9 @@ Base.rpad(v::AbstractVector, n::Integer, p) = [v; fill(p, max(n - length(v), 0))
 Take a list of `N` sequences, and turn them into a single sequence where each
 item is a batch of `N`. Short sequences will be padded by `pad`.

-```julia
-julia> batchseq([[1, 2, 3], [4, 5]], 0)
+# Examples
+```jldoctest
+julia> Flux.batchseq([[1, 2, 3], [4, 5]], 0)
 3-element Array{Array{Int64,1},1}:
 [1, 4]
 [2, 5]
@ -98,14 +234,64 @@ function batchseq(xs, pad = nothing, n = maximum(length(x) for x in xs))
  [batch([xs_[j][i] for j = 1:length(xs_)]) for i = 1:n]
 end

+# Flattening models to weight vectors, and back
+
+function _restructure(m, xs)
+  i = 0
+  fmap(m) do x
+    x isa AbstractArray || return x
+    x = reshape(xs[i.+(1:length(x))], size(x))
+    i += length(x)
+    return x
+  end
+end
+
+@adjoint function _restructure(m, xs)
+  _restructure(m, xs), dm -> (nothing,destructure(dm)[1])
+end
+
+"""
+    destructure(m)
+
+Flatten a model's parameters into a single weight vector.
+
+    julia> m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
+    Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
+
+    julia> θ, re = destructure(m);
+
+    julia> θ
+    67-element Array{Float32,1}:
+    -0.1407104
+    ...
+
+The second return value `re` allows you to reconstruct the original network after making
+modifications to the weight vector (for example, with a hypernetwork).
+
+    julia> re(θ .* 2)
+    Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
+"""
+function destructure(m)
+  xs = Zygote.Buffer([])
+  fmap(m) do x
+    x isa AbstractArray && push!(xs, x)
+    return x
+  end
+  return vcat(vec.(copy(xs))...), p -> _restructure(m, p)
+end
+
 # Other

 """
-Returns a function that when invoked, will only be triggered at most once
-during `timeout` seconds. Normally, the throttled function will run
-as much as it can, without ever going more than once per `wait` duration;
-but if you'd like to disable the execution on the leading edge, pass
-`leading=false`. To enable execution on the trailing edge, ditto.
+    throttle(f, timeout; leading=true, trailing=false)
+
+Return a function that when invoked, will only be triggered at most once
+during `timeout` seconds.
+
+Normally, the throttled function will run as much as it can, without ever
+going more than once per `timeout` duration; but if you'd like to disable the
+execution on the leading edge, pass `leading=false`. To enable execution on
+the trailing edge, pass `trailing=true`.
 """
 function throttle(f, timeout; leading=true, trailing=false)
  cooldown = true
--- a/src/zeros.jl
+++ b/src/zeros.jl
@ -0,0 +1,106 @@
+import Base: +, -, *, reshape, size
+import Base.Broadcast: broadcasted, Broadcasted, BroadcastStyle
+
+"""
+    Zeros()
+    Zeros(size...)
+    Zeros(Type, size...)
+
+Acts as a stand-in for an array of zeros that can be
+used during training which is ignored by the optimisers.
+
+Useful to turn bias off for a forward pass of a layer.
+
+## Examples
+
+```julia
+julia> Flux.Zeros(3,3)
+3×3 Flux.Zeros{Bool,2}:
+ false  false  false
+ false  false  false
+ false  false  false
+
+julia> Flux.Zeros(Float32, 3,3)
+3×3 Flux.Zeros{Float32,2}:
+ 0.0  0.0  0.0
+ 0.0  0.0  0.0
+ 0.0  0.0  0.0
+
+julia> rand(3,3) .+ Flux.Zeros()
+3×3 Array{Float64,2}:
+ 0.198739  0.490459  0.785386
+ 0.779074  0.39986   0.66383
+ 0.854981  0.447292  0.314497
+
+julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros())
+Conv((2, 2), 1=>3)
+```
+"""
+struct Zeros{T,N} <: AbstractArray{T,N}
+  size::Tuple
+end
+
+Zeros(::Type{T}, sz...) where T = Zeros{T,length(sz)}(sz)
+Zeros(sz::Integer...) = Zeros(Bool, sz...)
+
+Base.size(xs::Zeros) = xs.size
+Base.axes(xs::Zeros) = Base.OneTo.(size(xs))
+
+Base.IndexStyle(::Type{<:Zeros}) = IndexLinear()
+
+Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T)
+Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} =
+              Zeros(T, length(inds))
+
+Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs))
+
+@adjoint reshape(xs::Zeros{T}, dims...) where T =
+                reshape(xs, dims...), _ -> nothing
+
+# Define basic ops
+for f in (:+, :-)
+  @eval @inline function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros)
+    size(a) == size(b) || throw(DimensionMismatch("dimensions must match")) # not `@assert`: it may be disabled
+    a # adding or subtracting zeros leaves `a` unchanged
+  end
+end
+
++(a::Zeros, b::AbstractArray) = b + a
+-(a::Zeros, b::AbstractArray) = -b + a
+
+Base.copy(xs::Zeros{T,N}) where {T,N} = xs
+
+# Define broadcasting behaviour
+for op in (:+, :-)
+  @eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros)
+    bs = Broadcast.broadcast_shape(size(a), size(b))
+    size(a) == bs && return a
+    sz = similar(a, bs)
+    sz .= a
+  end
+end
+
+broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a)
+broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a)
+
+function broadcasted(::typeof(*), a::AbstractArray, b::Zeros)
+  Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
+end
+
+broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a)
+
+for op in (:+, :-, :*)
+  @eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
+end
+
+# Some opportunities to avoid scalar indexing, intermediaries
+# Since it replicates a little of what we expect Base to do,
+# it should be possible to remove in the future, but for now,
+# these help with performance.
+broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a
+broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b
+broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a
+broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b
+broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a)
+broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
+broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
--- a/test/cuda/cuda.jl
+++ b/test/cuda/cuda.jl
@ -1,4 +1,5 @@
-using Flux, Flux.Tracker, CuArrays, Test
+using Flux, Test
+using Flux.CuArrays
 using Flux: gpu

@info "Testing GPU Support"
@ -7,11 +8,11 @@ using Flux: gpu

 CuArrays.allowscalar(false)

-x = param(randn(5, 5))
+x = randn(5, 5)
 cx = gpu(x)
-@test cx isa TrackedArray && cx.data isa CuArray
+@test cx isa CuArray

-@test Flux.onecold(param(gpu([1.,2.,3.]))) == 3
+@test Flux.onecold(gpu([1.0, 2.0, 3.0])) == 3

 x = Flux.onehotbatch([1, 2, 3], 1:3)
 cx = gpu(x)
@ -21,25 +22,54 @@ cx = gpu(x)
 m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
 cm = gpu(m)

-@test all(p isa TrackedArray && p.data isa CuArray for p in params(cm))
-@test cm(gpu(rand(10, 10))) isa TrackedArray{Float32,2,CuArray{Float32,2}}
+@test all(p isa CuArray for p in params(cm))
+@test cm(gpu(rand(10, 10))) isa CuArray{Float32,2}

-x = [1,2,3]
+x = [1.,2.,3.]
 cx = gpu(x)
@test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx)
+@test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0)
+@test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0]))

-xs = param(rand(5,5))
+x = [-1.1491, 0.8619, 0.3127]
+y = [1, 1, 0.]
+@test Flux.binarycrossentropy.(σ.(x),y) ≈ Array(Flux.binarycrossentropy.(cu(σ.(x)),cu(y)))
+@test Flux.logitbinarycrossentropy.(x,y) ≈ Array(Flux.logitbinarycrossentropy.(cu(x),cu(y)))
+
+xs = rand(5, 5)
 ys = Flux.onehotbatch(1:5,1:5)
@test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys)

 c = gpu(Conv((2,2),3=>4))
+x = gpu(rand(10, 10, 3, 2))
 l = c(gpu(rand(10,10,3,2)))
-Flux.back!(sum(l))
+@test gradient(x -> sum(c(x)), x)[1] isa CuArray
+
+c = gpu(CrossCor((2,2),3=>4))
+x = gpu(rand(10, 10, 3, 2))
+l = c(gpu(rand(10,10,3,2)))
+@test gradient(x -> sum(c(x)), x)[1] isa CuArray

 end

-if CuArrays.libcudnn != nothing
-    @info "Testing Flux/CUDNN"
-    include("cudnn.jl")
-    include("curnn.jl")
+@testset "onecold gpu" begin
+  y = Flux.onehotbatch(ones(3), 1:10) |> gpu;
+  @test Flux.onecold(y) isa CuArray
+  @test y[3,:] isa CuArray
+end
+
+@testset "restructure gpu" begin
+  dudt = Dense(1,1) |> gpu
+  p,re = Flux.destructure(dudt)
+  foo(x) = sum(re(p)(x))
+  @test gradient(foo, cu(rand(1)))[1] isa CuArray
+end
+
+if CuArrays.has_cudnn()
+  @info "Testing Flux/CUDNN"
+  include("cudnn.jl")
+  include("curnn.jl")
+  include("layers.jl")
+else
+  @warn "CUDNN unavailable, not testing GPU DNN support"
 end
--- a/test/cuda/cudnn.jl
+++ b/test/cuda/cudnn.jl
@ -1,48 +1,44 @@
-using Flux, Flux.Tracker, CuArrays, Test
-using Flux.Tracker: TrackedArray, data
+using Flux, CuArrays, Test
+using Flux: pullback

@testset "CUDNN BatchNorm" begin
    @testset "4D Input" begin
-        x = TrackedArray(Float64.(collect(reshape(1:12, 2, 2, 3, 1))))
+        x = Float64.(collect(reshape(1:12, 2, 2, 3, 1)))
        m = BatchNorm(3)
        cx = gpu(x)
        cm = gpu(m)

-        y = m(x)
-        cy = cm(cx)
+        y, back = pullback((m, x) -> m(x), m, x)
+        cy, cback = pullback((m, x) -> m(x), cm, cx)

-        @test cy isa TrackedArray{Float32,4,CuArray{Float32,4}}
+        @test cpu(cy) ≈ y

-        @test cpu(data(cy)) ≈ data(y)
+        Δ = randn(size(y))
+        dm, dx = back(Δ)
+        cdm, cdx = cback(gpu(Δ))

-        g = rand(size(y)...)
-        Flux.back!(y, g)
-        Flux.back!(cy, gpu(g))
-
-        @test m.γ.grad ≈ cpu(cm.γ.grad)
-        @test m.β.grad ≈ cpu(cm.β.grad)
-        @test x.grad ≈ cpu(x.grad)
+        @test dm[].γ ≈ cpu(cdm[].γ)
+        @test dm[].β ≈ cpu(cdm[].β)
+        @test dx ≈ cpu(cdx)
    end

    @testset "2D Input" begin
-        x = TrackedArray(Float64.(collect(reshape(1:12, 3, 4))))
+        x = Float64.(collect(reshape(1:12, 3, 4)))
        m = BatchNorm(3)
        cx = gpu(x)
        cm = gpu(m)

-        y = m(x)
-        cy = cm(cx)
+        y, back = pullback((m, x) -> m(x), m, x)
+        cy, cback = pullback((m, x) -> m(x), cm, cx)

-        @test cy isa TrackedArray{Float32,2,CuArray{Float32,2}}
+        @test cpu(cy) ≈ y

-        @test cpu(data(cy)) ≈ data(y)
+        Δ = randn(size(y))
+        dm, dx = back(Δ)
+        cdm, cdx = cback(gpu(Δ))

-        g = rand(size(y)...)
-        Flux.back!(y, g)
-        Flux.back!(cy, gpu(g))
-
-        @test m.γ.grad ≈ cpu(cm.γ.grad)
-        @test m.β.grad ≈ cpu(cm.β.grad)
-        @test x.grad ≈ cpu(x.grad)
+        @test dm[].γ ≈ cpu(cdm[].γ)
+        @test dm[].β ≈ cpu(cdm[].β)
+        @test dx ≈ cpu(cdx)
    end
 end
--- a/test/cuda/curnn.jl
+++ b/test/cuda/curnn.jl
@ -1,46 +1,63 @@
 using Flux, CuArrays, Test
+using Flux: pullback
+
+@testset for R in [RNN, GRU, LSTM]
+  m = R(10, 5) |> gpu
+  x = gpu(rand(10))
+  (m̄,) = gradient(m -> sum(m(x)), m)
+  Flux.reset!(m)
+  θ = gradient(() -> sum(m(x)), params(m))
+  @test collect(m̄[].cell[].Wi) == collect(θ[m.cell.Wi])
+end

@testset "RNN" begin
-  @testset for R in [RNN, GRU, LSTM]
+  @testset for R in [RNN, GRU, LSTM], batch_size in (1, 5)
    rnn = R(10, 5)
-    curnn = mapleaves(gpu, rnn)
-    @testset for batch_size in (1, 5)
-      Flux.reset!(rnn)
-      Flux.reset!(curnn)
-      x = batch_size == 1 ?
-        param(rand(10)) :
-        param(rand(10,batch_size))
-      cux = gpu(x)
-      y = (rnn(x); rnn(x))
-      cuy = (curnn(cux); curnn(cux))
+    curnn = fmap(gpu, rnn)

-      @test y.data ≈ collect(cuy.data)
-      @test haskey(Flux.CUDA.descs, curnn.cell)
+    Flux.reset!(rnn)
+    Flux.reset!(curnn)
+    x = batch_size == 1 ?
+      rand(10) :
+      rand(10, batch_size)
+    cux = gpu(x)

-      Δ = randn(size(y))
+    y, back = pullback((r, x) -> r(x), rnn, x)
+    cuy, cuback = pullback((r, x) -> r(x), curnn, cux)

-      Flux.back!(y, Δ)
-      Flux.back!(cuy, gpu(Δ))
+    @test y ≈ collect(cuy)
+    @test haskey(Flux.CUDA.descs, curnn.cell)

-      @test x.grad ≈ collect(cux.grad)
-      @test rnn.cell.Wi.grad ≈ collect(curnn.cell.Wi.grad)
-      @test rnn.cell.Wh.grad ≈ collect(curnn.cell.Wh.grad)
-      @test rnn.cell.b.grad ≈ collect(curnn.cell.b.grad)
-      @test rnn.cell.h.grad ≈ collect(curnn.cell.h.grad)
-      if isdefined(rnn.cell, :c)
-        @test rnn.cell.c.grad ≈ collect(curnn.cell.c.grad)
+    ȳ = randn(size(y))
+    m̄, x̄ = back(ȳ)
+    cum̄, cux̄ = cuback(gpu(ȳ))
+
+    m̄[].cell[].Wi
+
+    m̄[].state
+    cum̄[].state
+
+    @test x̄ ≈ collect(cux̄)
+    @test m̄[].cell[].Wi ≈ collect(cum̄[].cell[].Wi)
+    @test m̄[].cell[].Wh ≈ collect(cum̄[].cell[].Wh)
+    @test m̄[].cell[].b ≈ collect(cum̄[].cell[].b)
+    if m̄[].state isa Tuple
+      for (x, cx) in zip(m̄[].state, cum̄[].state)
+        @test x ≈ collect(cx)
      end
-
-      Flux.reset!(rnn)
-      Flux.reset!(curnn)
-      ohx = batch_size == 1 ?
-        Flux.onehot(rand(1:10), 1:10) :
-        Flux.onehotbatch(rand(1:10, batch_size), 1:10)
-      cuohx = gpu(ohx)
-      y = (rnn(ohx); rnn(ohx))
-      cuy = (curnn(cuohx); curnn(cuohx))
-
-      @test y.data ≈ collect(cuy.data)
+    else
+      @test m̄[].state ≈ collect(cum̄[].state)
    end
+
+    Flux.reset!(rnn)
+    Flux.reset!(curnn)
+    ohx = batch_size == 1 ?
+      Flux.onehot(rand(1:10), 1:10) :
+      Flux.onehotbatch(rand(1:10, batch_size), 1:10)
+    cuohx = gpu(ohx)
+    y = (rnn(ohx); rnn(ohx))
+    cuy = (curnn(cuohx); curnn(cuohx))
+
+    @test y ≈ collect(cuy)
  end
 end
--- a/test/cuda/layers.jl
+++ b/test/cuda/layers.jl
@ -0,0 +1,98 @@
+# Test layers and data/model movements on and off the GPU
+# Add tests for layers and their gradients on the GPU
+# Most of the forward passes should be fine being applied
+# to bitstype objects, but this gives higher coverage for our use-cases
+# Check that getting the gradients does not throw
+
+# generic movement tests
+@testset "Basic GPU Movement" begin
+  @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple
+  @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple
+end
+
+# TODO: These layers get into scalar indexing
+# `AlphaDropout` throws a compilation error on GPUs,
+# whereas, the rest are scalar indexing issues.
+const BROKEN_LAYERS = [DepthwiseConv,
+                       AlphaDropout,
+                       InstanceNorm,
+                       GroupNorm]
+
+function gradtest(name::String, layers::Vector, xs = nothing, args...)
+  isnothing(xs) && error("Missing input to test the layers against.")
+  # Move the input to the GPU once, under a new name so the closures below do not capture a reassigned variable.
+  cuxs = gpu(xs)
+  @testset "$name GPU grad tests" begin
+    for layer in layers
+      @testset "$layer GPU grad test" begin
+        l = gpu(layer(args...))
+        ps = Flux.params(l)
+        if any(T -> l isa T, BROKEN_LAYERS)
+          @test_broken gradient(() -> sum(l(cuxs)), ps) isa Flux.Zygote.Grads
+        else
+          gs = gradient(() -> sum(l(cuxs)), ps) # computed once and reused by both checks
+          @test gs isa Flux.Zygote.Grads
+
+          # Layers without trainable parameters (e.g. pooling) have no gradient entry to inspect
+          if !isempty(ps)
+            @test gs[first(ps)] isa Flux.CuArrays.CuArray
+          end
+        end
+      end
+    end
+  end
+end
+
+# Repeats from Conv, CrossCor
+
+r = rand(Float32, 28, 28, 1, 1)
+conv_layers = [Conv, ConvTranspose, CrossCor, DepthwiseConv]
+gradtest("Conv", conv_layers, r, (2,2), 1=>3)
+
+pooling_layers = [MaxPool, MeanPool]
+gradtest("Pooling", pooling_layers, r, (2,2))
+
+dropout_layers = [Dropout, AlphaDropout]
+gradtest("Dropout", dropout_layers, r, 0.5f0)
+
+norm_layers = [LayerNorm, BatchNorm]
+gradtest("Normalising", norm_layers, rand(Float32, 28,28,3,1), 1)
+
+instancenorm = [InstanceNorm]
+gradtest("InstanceNorm", instancenorm, r, 1)
+
+groupnorm = [GroupNorm]
+gradtest("GroupNorm", groupnorm, rand(Float32, 28,28,3,1), 3, 1)
+
+const stateless_layers = [Flux.mse,
+                          Flux.crossentropy,
+                          Flux.logitcrossentropy,
+                          Flux.normalise]
+
+const stateless_layers_broadcasted = [Flux.binarycrossentropy,
+                                      Flux.logitbinarycrossentropy]
+
+function stateless_gradtest(f, args...)
+  @test gradient((args...) -> sum(f(args...)), args...)[1] isa CuArray
+end
+
+function stateless_gradtest_broadcasted(f, args...)
+  @test gradient((args...) -> sum(f.(args...)), args...)[1] isa CuArray
+end
+
+@testset "Stateless GPU grad tests" begin
+  x = gpu(rand(3,3))
+  y = gpu(rand(3,3))
+
+  for layer in stateless_layers
+    if layer == Flux.normalise
+      stateless_gradtest(layer, x)
+    else
+      stateless_gradtest(layer, x, y)
+    end
+  end
+
+  for layer in stateless_layers_broadcasted
+    stateless_gradtest_broadcasted(layer, x, y)
+  end
+end
--- a/test/data.jl
+++ b/test/data.jl
@ -1,22 +1,116 @@
-using Flux.Data
-using Test
+@testset "DataLoader" begin
+    X = reshape([1:10;], (2, 5))
+    Y = [1:5;]

-@test cmudict()["CATASTROPHE"] == :[K,AH0,T,AE1,S,T,R,AH0,F,IY0].args
+    d = DataLoader(X, batchsize=2)
+    @inferred first(d)
+    batches = collect(d)
+    @test eltype(batches) == eltype(d) == typeof(X)
+    @test length(batches) == 3
+    @test batches[1] == X[:,1:2]
+    @test batches[2] == X[:,3:4]
+    @test batches[3] == X[:,5:5]

-@test length(CMUDict.phones()) == 39
+    d = DataLoader(X, batchsize=2, partial=false)
+    @inferred first(d)
+    batches = collect(d)
+    @test eltype(batches) == eltype(d) == typeof(X)
+    @test length(batches) == 2
+    @test batches[1] == X[:,1:2]
+    @test batches[2] == X[:,3:4]

-@test length(CMUDict.symbols()) == 84
+    d = DataLoader((X,), batchsize=2, partial=false)
+    @inferred first(d)
+    batches = collect(d)
+    @test eltype(batches) == eltype(d) == Tuple{typeof(X)}
+    @test length(batches) == 2
+    @test batches[1] == (X[:,1:2],)
+    @test batches[2] == (X[:,3:4],)

-@test MNIST.images()[1] isa Matrix
-@test MNIST.labels() isa Vector{Int64}
+    d = DataLoader((X, Y), batchsize=2)
+    @inferred first(d)
+    batches = collect(d)
+    @test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)}
+    @test length(batches) == 3
+    @test length(batches[1]) == 2
+    @test length(batches[2]) == 2
+    @test length(batches[3]) == 2
+    @test batches[1][1] == X[:,1:2]
+    @test batches[1][2] == Y[1:2]
+    @test batches[2][1] == X[:,3:4]
+    @test batches[2][2] == Y[3:4]
+    @test batches[3][1] == X[:,5:5]
+    @test batches[3][2] == Y[5:5]

-@test FashionMNIST.images()[1] isa Matrix
-@test FashionMNIST.labels() isa Vector{Int64}
+    # test with NamedTuple
+    d = DataLoader((x=X, y=Y), batchsize=2)
+    @inferred first(d)
+    batches = collect(d)
+    @test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}}
+    @test length(batches) == 3
+    @test length(batches[1]) == 2
+    @test length(batches[2]) == 2
+    @test length(batches[3]) == 2
+    @test batches[1][1] == batches[1].x == X[:,1:2]
+    @test batches[1][2] == batches[1].y == Y[1:2]
+    @test batches[2][1] == batches[2].x == X[:,3:4]
+    @test batches[2][2] == batches[2].y == Y[3:4]
+    @test batches[3][1] == batches[3].x == X[:,5:5]
+    @test batches[3][2] == batches[3].y == Y[5:5]

-@test Data.Sentiment.train() isa Vector{Data.Tree{Any}}
+    # test interaction with `train!`
+    θ = ones(2)
+    X = zeros(2, 10)
+    loss(x) = sum((x .- θ).^2)
+    d  = DataLoader(X) 
+    Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1))
+    @test norm(θ) < 1e-4

-@test Iris.features() isa Matrix
-@test size(Iris.features()) == (4,150)
+    # test interaction with `train!`
+    θ = zeros(2)
+    X = ones(2, 10)
+    Y = fill(2, 10)
+    loss(x, y) = sum((y - x'*θ).^2)
+    d  = DataLoader((X, Y)) 
+    Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1))
+    @test norm(θ .- 1) < 1e-10
+end

-@test Iris.labels() isa Vector{String}
-@test size(Iris.labels()) == (150,)
+@testset "CMUDict" begin 
+    @test cmudict()["CATASTROPHE"] == :[K,AH0,T,AE1,S,T,R,AH0,F,IY0].args  # `:[...]` quotes the literal, so `.args` is the list of phoneme names as Symbols
+
+    @test length(CMUDict.phones()) == 39
+
+    @test length(CMUDict.symbols()) == 84
+end
+
+@testset "MNIST" begin 
+    @test MNIST.images()[1] isa Matrix
+    @test MNIST.labels() isa Vector{Int64}
+end
+
+@testset "FashionMNIST" begin 
+    @test FashionMNIST.images()[1] isa Matrix
+    @test FashionMNIST.labels() isa Vector{Int64}
+end
+
+@testset "Sentiment" begin 
+    @test Data.Sentiment.train() isa Vector{Data.Tree{Any}}
+end
+
+@testset "Iris" begin 
+    @test Iris.features() isa Matrix
+    @test size(Iris.features()) == (4,150)
+
+    @test Iris.labels() isa Vector{String}
+    @test size(Iris.labels()) == (150,)
+end
+
+
+@testset "Housing" begin
+    @test Housing.features() isa Matrix # NOTE(review): reported broken due to an SSL certificate expiration problem, yet this is a plain `@test` — confirm whether `@test_broken` is intended
+    @test size(Housing.features()) == (506, 13)
+
+    @test Housing.targets() isa Array{Float64}
+    @test size(Housing.targets()) == (506, 1)
+end
--- a/test/layers/basic.jl
+++ b/test/layers/basic.jl
@ -1,57 +1,117 @@
 using Test, Random
+import Flux: activations

@testset "basic" begin
-    @testset "Chain" begin
-        @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10))
-        @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn(10))
-        # numeric test should be put into testset of corresponding layer
+  @testset "helpers" begin
+    @testset "activations" begin
+      dummy_model = Chain(x->x.^2, x->x .- 3, x -> tan.(x))
+      x = randn(10)
+      @test activations(dummy_model, x)[1] == x.^2
+      @test activations(dummy_model, x)[2] == (x.^2 .- 3)
+      @test activations(dummy_model, x)[3] == tan.(x.^2 .- 3)
+
+      @test activations(Chain(), x) == ()
+      @test activations(Chain(identity, x->:foo), x)[2] == :foo # results include `Any` type
+    end
+  end
+
+  @testset "Chain" begin
+    @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10))
+    @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn(10))
+    # numeric test should be put into testset of corresponding layer
+  end
+
+  @testset "Activations" begin
+    c = Chain(Dense(3,5,relu), Dense(5,1,relu))
+    X = Float32.([1.0; 1.0; 1.0])
+    @test_nowarn gradient(()->Flux.activations(c, X)[2][1], params(c))
+  end
+
+  @testset "Dense" begin
+    @testset "constructors" begin
+      @test size(Dense(10, 100).W) == (100, 10)
+      @test Dense(rand(100,10), rand(10)).σ == identity
+
+      @test_throws MethodError Dense(10, 10.5)
+      @test_throws MethodError Dense(10, 10.5, tanh)
    end

-    @testset "Dense" begin
-        @test  length(Dense(10, 5)(randn(10))) == 5
-        @test_throws DimensionMismatch Dense(10, 5)(randn(1))
-        @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting
-        @test_throws MethodError Dense(10, 5).(randn(10)) # avoid broadcasting
+    @test  length(Dense(10, 5)(randn(10))) == 5
+    @test_throws DimensionMismatch Dense(10, 5)(randn(1))
+    @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting
+    @test_throws MethodError Dense(10, 5).(randn(10)) # avoid broadcasting

-        @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(1, 1)
-        @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2)
-        @test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1)
-        @test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]
+    @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(1, 1)
+    @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2)
+    @test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1)
+    @test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]
+  end

+  @testset "Diagonal" begin
+    @test length(Flux.Diagonal(10)(randn(10))) == 10
+    @test length(Flux.Diagonal(10)(1)) == 10
+    @test length(Flux.Diagonal(10)(randn(1))) == 10
+    @test_throws DimensionMismatch Flux.Diagonal(10)(randn(2))
+
+    @test Flux.Diagonal(2)([1 2]) == [1 2; 1 2]
+    @test Flux.Diagonal(2)([1,2]) == [1,2]
+    @test Flux.Diagonal(2)([1 2; 3 4]) == [1 2; 3 4]
+  end
+
+  @testset "Maxout" begin
+    # Note that the normal common usage of Maxout is as per the docstring
+    # These are abnormal constructors used for testing purposes
+
+    @testset "Constructor" begin
+      mo = Maxout(() -> identity, 4)
+      input = rand(40)
+      @test mo(input) == input
    end

-    @testset "Diagonal" begin
-        @test length(Flux.Diagonal(10)(randn(10))) == 10
-        @test length(Flux.Diagonal(10)(1)) == 10
-        @test length(Flux.Diagonal(10)(randn(1))) == 10
-        @test_throws DimensionMismatch Flux.Diagonal(10)(randn(2))
-
-        @test Flux.Diagonal(2)([1 2]) == [1 2; 1 2]
-        @test Flux.Diagonal(2)([1,2]) == [1,2]
-        @test Flux.Diagonal(2)([1 2; 3 4]) == [1 2; 3 4]
+    @testset "simple alternatives" begin
+      mo = Maxout((x -> x, x -> 2x, x -> 0.5x))
+      input = rand(40)
+      @test mo(input) == 2*input
    end

-    @testset "Maxout" begin
-        # Note that the normal common usage of Maxout is as per the docstring
-        # These are abnormal constructors used for testing purposes
-
-        @testset "Constructor" begin
-            mo = Maxout(() -> identity, 4)
-            input = rand(40)
-            @test mo(input) == input
-        end
-
-        @testset "simple alternatives" begin
-            mo = Maxout((x -> x, x -> 2x, x -> 0.5x))
-            input = rand(40)
-            @test mo(input) == 2*input
-        end
-
-        @testset "complex alternatives" begin
-            mo = Maxout((x -> [0.5; 0.1]*x, x -> [0.2; 0.7]*x))
-            input = [3.0 2.0]
-            target = [0.5, 0.7].*input
-            @test mo(input) == target
-        end
+    @testset "complex alternatives" begin
+      mo = Maxout((x -> [0.5; 0.1]*x, x -> [0.2; 0.7]*x))
+      input = [3.0 2.0]
+      target = [0.5, 0.7].*input
+      @test mo(input) == target
    end
+
+    @testset "params" begin
+      mo = Maxout(()->Dense(32, 64), 4)
+      ps = params(mo)
+      @test length(ps) == 8  #4 alts, each with weight and bias
+    end
+  end
+
+  @testset "SkipConnection" begin
+    @testset "zero sum" begin
+      input = randn(10, 10, 10, 10)
+      @test SkipConnection(x -> zeros(size(x)), (a,b) -> a + b)(input) == input
+    end
+
+    @testset "concat size" begin
+      input = randn(10, 2)
+      @test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4)
+    end
+  end
+
+  @testset "output dimensions" begin
+    m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
+    @test Flux.outdims(m, (10, 10)) == (6, 6)
+
+    m = Dense(10, 5)
+    @test Flux.outdims(m, (5, 2)) == (5,)
+    @test Flux.outdims(m, (10,)) == (5,)
+
+    m = Flux.Diagonal(10)
+    @test Flux.outdims(m, (10,)) == (10,)
+
+    m = Maxout(() -> Conv((3, 3), 3 => 16), 2)
+    @test Flux.outdims(m, (10, 10)) == (8, 8)
+  end
 end
--- a/test/layers/conv.jl
+++ b/test/layers/conv.jl
@ -1,12 +1,17 @@
 using Flux, Test
 using Flux: maxpool, meanpool
+using Flux: gradient

@testset "Pooling" begin
  x = randn(Float32, 10, 10, 3, 2)
+  gmp = GlobalMaxPool()
+  @test size(gmp(x)) == (1, 1, 3, 2)
+  gmp = GlobalMeanPool()
+  @test size(gmp(x)) == (1, 1, 3, 2)
  mp = MaxPool((2, 2))
-  @test mp(x) == maxpool(x, (2,2))
+  @test mp(x) == maxpool(x, PoolDims(x, 2))
  mp = MeanPool((2, 2))
-  @test mp(x) == meanpool(x, (2,2))
+  @test mp(x) == meanpool(x, PoolDims(x, 2))
 end

@testset "CNN" begin
@ -20,16 +25,194 @@ end
    Dense(288, 10), softmax)

  @test size(m(r)) == (10, 5)
+
+  # Bias switch, part 1: explicit all-ones weights and bias applied to an all-zero input.
+  bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3))
+  ip = zeros(Float32, 28,28,1,1)
+
+  op = bias(ip)
+  @test sum(op) == prod(size(op))  # zero input, so every output element is just the bias value 1
+
+  bias = Conv((2,2), 1=>3, bias = Flux.Zeros())  # part 2: bias replaced by `Flux.Zeros()`
+  op = bias(ip)
+  @test sum(op) === 0.f0
+  gs = gradient(() -> sum(bias(ip)), Flux.params(bias))
+  @test gs[bias.bias] === nothing  # `===` is the idiomatic comparison against `nothing`
+
+  # Train w/o bias and make sure no convergence happens
+  # when only bias can be converged
+  bias = Conv((2, 2), 1=>3, bias = Flux.Zeros());
+  ip = zeros(Float32, 28,28,1,1)
+  op = zeros(Float32, 27,27,3,1) .+ 2.f0
+  opt = Descent()
+
+  for _ = 1:10^3
+    gs = gradient(params(bias)) do
+      Flux.mse(bias(ip), op)
+    end
+    Flux.Optimise.update!(opt, params(bias), gs)
+  end
+
+  @test Flux.mse(bias(ip), op) ≈ 4.f0
+end
+
+@testset "asymmetric padding" begin
+  r = ones(Float32, 28, 28, 1, 1)  # all-ones input
+  m = Conv((3, 3), 1=>1, relu; pad=(0,1,1,2))  # NOTE(review): expectations below are consistent with pad = (dim-1 low, dim-1 high, dim-2 low, dim-2 high) — confirm against NNlib
+  m.weight[:] .= 1.0  # with unit weights and zero bias, each output counts the real (non-padded) cells under its 3×3 window
+  m.bias[:] .= 0.0
+  y_hat = m(r)[:,:,1,1]
+  @test size(y_hat) == (27, 29)  # 28+0+1-3+1 = 27 and 28+1+2-3+1 = 29
+  @test y_hat[1, 1] ≈ 6.0  # 3 real rows × 2 real columns
+  @test y_hat[2, 2] ≈ 9.0  # window lies fully inside the input
+  @test y_hat[end, 1] ≈ 4.0  # 2 real rows × 2 real columns
+  @test y_hat[1, end] ≈ 3.0  # 3 real rows × 1 real column
+  @test y_hat[1, end-1] ≈ 6.0  # 3 real rows × 2 real columns
+  @test y_hat[end, end] ≈ 2.0  # 2 real rows × 1 real column
 end

@testset "Depthwise Conv" begin
  r = zeros(Float32, 28, 28, 3, 5)
-
-  m1 = DepthwiseConv((2, 2), 3=>5)
-
+  m1 = DepthwiseConv((2, 2), 3=>15)
  @test size(m1(r), 3) == 15

-  m2 = DepthwiseConv((2, 2), 3)
+  m3 = DepthwiseConv((2, 3), 3=>9)
+  @test size(m3(r), 3) == 9

-  @test size(m2(r), 3) == 3
+  # Test that we cannot ask for non-integer multiplication factors
+  @test_throws AssertionError DepthwiseConv((2,2), 3=>10)
+end
+
+@testset "ConvTranspose" begin
+  input = zeros(Float32, 28, 28, 1, 1)
+  # Conv followed by ConvTranspose with the same 3×3 window must give back the input size.
+  restored = input |> Conv((3,3), 1 => 1) |> ConvTranspose((3, 3), 1 => 1)
+  @test size(restored) == size(input)
+
+  layer = ConvTranspose((3,3), 1=>1)
+  # Regression check for #900: the gradient call must not throw.
+  @test gradient(() -> sum(layer(input)), params(layer)) isa Flux.Zygote.Grads
+end
+
+@testset "CrossCor" begin
+  x = rand(Float32, 28, 28, 1, 1)
+  w = rand(2,2,1,1)
+  y = CrossCor(w, [0.0])  # explicit kernel `w` with a zero bias
+
+  @test isapprox(sum(w .* x[1:2, 1:2, :, :]), y(x)[1, 1, 1, 1], rtol=1e-7)  # first output element is the un-flipped kernel times the top-left 2×2 patch
+
+  r = zeros(Float32, 28, 28, 1, 5)
+  m = Chain(
+    CrossCor((2, 2), 1=>16, relu),
+    MaxPool((2,2)),
+    CrossCor((2, 2), 16=>8, relu),
+    MaxPool((2,2)),
+    x -> reshape(x, :, size(x, 4)),
+    Dense(288, 10), softmax)
+
+  @test size(m(r)) == (10, 5)
+  @test y(x) != Conv(w, [0.0])(x)  # with the same kernel, CrossCor and Conv give different outputs
+  @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x) == Conv(w, [0.0])(x)  # reversing the kernel along both spatial dims makes CrossCor match Conv
+end
+
+@testset "Conv with non quadratic window #700" begin
+  data = zeros(Float32, 7,7,1,1)
+  data[4,4,1,1] = 1  # single unit impulse at the centre of the 7×7 input
+
+  l = Conv((3,3), 1=>1)
+  expected = zeros(eltype(l.weight),5,5,1,1)
+  expected[2:end-1,2:end-1,1,1] = l.weight  # expected output: the kernel itself surrounded by zeros
+  @test expected ≈ l(data)
+
+  l = Conv((3,1), 1=>1)  # window that is longer along the first dimension
+  expected = zeros(eltype(l.weight),5,7,1,1)
+  expected[2:end-1,4,1,1] = l.weight
+  @test expected ≈ l(data)
+
+  l = Conv((1,3), 1=>1)  # window that is longer along the second dimension
+  expected = zeros(eltype(l.weight),7,5,1,1)
+  expected[4,2:end-1,1,1] = l.weight
+  @test expected ≈ l(data)
+
+  @test begin
+    # we test that the next expression does not throw
+    randn(Float32, 10,10,1,1) |> Conv((6,1), 1=>1, Flux.σ)
+    true
+  end
+end
+
+@testset "conv output dimensions" begin
+  m = Conv((3, 3), 3 => 16)  # expected sizes below match floor((n + 2*pad - dilation*(k - 1) - 1) / stride) + 1 per spatial dim
+  @test Flux.outdims(m, (10, 10)) == (8, 8)
+  m = Conv((3, 3), 3 => 16; stride = 2)
+  @test Flux.outdims(m, (5, 5)) == (2, 2)
+  m = Conv((3, 3), 3 => 16; stride = 2, pad = 3)
+  @test Flux.outdims(m, (5, 5)) == (5, 5)
+  m = Conv((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2)
+  @test Flux.outdims(m, (5, 5)) == (4, 4)
+
+  m = ConvTranspose((3, 3), 3 => 16)  # expected sizes below match (n - 1)*stride - 2*pad + dilation*(k - 1) + 1 per spatial dim
+  @test Flux.outdims(m, (8, 8)) == (10, 10)
+  m = ConvTranspose((3, 3), 3 => 16; stride = 2)
+  @test Flux.outdims(m, (2, 2)) == (5, 5)
+  m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3)
+  @test Flux.outdims(m, (5, 5)) == (5, 5)
+  m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2)
+  @test Flux.outdims(m, (4, 4)) == (5, 5)
+
+  m = DepthwiseConv((3, 3), 3 => 6)  # same inputs and expected sizes as the Conv cases
+  @test Flux.outdims(m, (10, 10)) == (8, 8)
+  m = DepthwiseConv((3, 3), 3 => 6; stride = 2)
+  @test Flux.outdims(m, (5, 5)) == (2, 2)
+  m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3)
+  @test Flux.outdims(m, (5, 5)) == (5, 5)
+  m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3, dilation = 2)
+  @test Flux.outdims(m, (5, 5)) == (4, 4)
+
+  m = CrossCor((3, 3), 3 => 16)  # same inputs and expected sizes as the Conv cases
+  @test Flux.outdims(m, (10, 10)) == (8, 8)
+  m = CrossCor((3, 3), 3 => 16; stride = 2)
+  @test Flux.outdims(m, (5, 5)) == (2, 2)
+  m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3)
+  @test Flux.outdims(m, (5, 5)) == (5, 5)
+  m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2)
+  @test Flux.outdims(m, (5, 5)) == (4, 4)
+
+  m = MaxPool((2, 2))  # no stride given; the expected (5, 5) corresponds to a stride equal to the window
+  @test Flux.outdims(m, (10, 10)) == (5, 5)
+  m = MaxPool((2, 2); stride = 1)
+  @test Flux.outdims(m, (5, 5)) == (4, 4)
+  m = MaxPool((2, 2); stride = 2, pad = 3)
+  @test Flux.outdims(m, (5, 5)) == (5, 5)
+
+  m = MeanPool((2, 2))  # same inputs and expected sizes as the MaxPool cases
+  @test Flux.outdims(m, (10, 10)) == (5, 5)
+  m = MeanPool((2, 2); stride = 1)
+  @test Flux.outdims(m, (5, 5)) == (4, 4)
+  m = MeanPool((2, 2); stride = 2, pad = 3)
+  @test Flux.outdims(m, (5, 5)) == (5, 5)
+end
+
+@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, CrossCor), k in ( (1,), (2,), (3,), (4,5), (6,7,8))
+  data = ones(Float32, (k .+ 3)..., 1,1)  # spatial size is k .+ 3, followed by two trailing size-1 dims
+  l = ltype(k, 1=>1, pad=SamePad())
+  @test size(l(data)) == size(data)  # with the default stride, SamePad must keep the size unchanged
+
+  l = ltype(k, 1=>1, pad=SamePad(), dilation = k .÷ 2)  # NOTE(review): for k = (1,) this evaluates to a dilation of 0 — confirm that is intended
+  @test size(l(data)) == size(data)
+
+  stride = 3
+  l = ltype(k, 1=>1, pad=SamePad(), stride = stride)
+  if ltype == ConvTranspose
+    @test size(l(data))[1:end-2] == stride .* size(data)[1:end-2] .- stride .+ 1  # i.e. stride*(n - 1) + 1 per spatial dim
+  else
+    @test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ stride)  # i.e. ceil(n / stride) per spatial dim
+  end
+end
+
+@testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), k in ( (1,), (2,), (3,), (4,5), (6,7,8))
+  data = ones(Float32, (k .+ 3)..., 1,1)  # spatial size is k .+ 3, followed by two trailing size-1 dims
+
+  l = ltype(k, pad=SamePad())
+  @test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ k)  # no stride is passed; ceil(n / k) corresponds to a stride equal to the window
 end
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@ -1,46 +1,58 @@
-using Flux: testmode!
-using Flux.Tracker: data
+using Flux, Test, Statistics
+using Zygote: pullback
+
+evalwgrad(f, x...) = pullback(f, x...)[1]

@testset "Dropout" begin
  x = [1.,2.,3.]
-  @test x == testmode!(Dropout(0.1))(x)
-  @test x == Dropout(0)(x)
-  @test zero(x) == Dropout(1)(x)
+  @test x == Dropout(0.1)(x)
+  @test x == evalwgrad(Dropout(0), x)
+  @test zero(x) == evalwgrad(Dropout(1), x)

  x = rand(100)
  m = Dropout(0.9)
-  y = m(x)
+  y = evalwgrad(m, x)
  @test count(a->a==0, y) > 50
-  testmode!(m)
-  y = m(x)
+  testmode!(m, true)
+  y = evalwgrad(m, x) # should override istraining
  @test count(a->a==0, y) == 0
  testmode!(m, false)
-  y = m(x)
+  y = evalwgrad(m, x)
  @test count(a->a==0, y) > 50

-  x = rand(100)
+  x = rand(Float32, 100)
  m = Chain(Dense(100,100),
            Dropout(0.9))
-  y = m(x)
+  y = evalwgrad(m, x)
  @test count(a->a == 0, y) > 50
-  testmode!(m)
-  y = m(x)
+  testmode!(m, true)
+  y = evalwgrad(m, x) # should override istraining
  @test count(a->a == 0, y) == 0
+
+  x = rand(100, 50)
+  m = Dropout(0.5, dims = 2)
+  y = m(x)
+  c = map(i->count(a->a==0, @view y[i, :]), 1:100)
+  @test minimum(c) == maximum(c)
+  m = Dropout(0.5, dims = 1)
+  y = m(x)
+  c = map(i->count(a->a==0, @view y[:, i]), 1:50)
+  @test minimum(c) == maximum(c)
 end

@testset "BatchNorm" begin
-  let m = BatchNorm(2), x = param([1 3 5;
-                                   2 4 6])
+  let m = BatchNorm(2), x = [1.0 3.0 5.0;
+                             2.0 4.0 6.0]

-    @test m.β.data == [0, 0]  # initβ(2)
-    @test m.γ.data == [1, 1]  # initγ(2)
+    @test length(params(m)) == 2
+
+    @test m.β == [0, 0]  # initβ(2)
+    @test m.γ == [1, 1]  # initγ(2)
    # initial m.σ is 1
    # initial m.μ is 0
-    @test m.active
-
-    # @test m(x).data ≈ [-1 -1; 0 0; 1 1]'
-    m(x)

+    y = evalwgrad(m, x)
+    @test isapprox(y, [-1.22474 0 1.22474; -1.22474 0 1.22474], atol = 1.0e-5)
    # julia> x
    #  2×3 Array{Float64,2}:
    #  1.0  3.0  5.0
@ -59,41 +71,32 @@ end
    # 2×1 Array{Float64,2}:
    #  1.3
    #  1.3
-    @test m.σ² ≈ .1 .* var(x.data, dims = 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.]
+    @test m.σ² ≈ .1 .* var(x, dims = 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.]

-    testmode!(m)
-    @test !m.active
-
-    x′ = m(x).data
+    x′ = m(x)
    @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.3), atol = 1.0e-5)
  end

  # with activation function
-  let m = BatchNorm(2, sigmoid), x = param([1 3 5;
-                                            2 4 6])
-    @test m.active
-    m(x)
-
-    testmode!(m)
-    @test !m.active
-
-    y = m(x).data
-    @test isapprox(y, data(sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ))), atol = 1.0e-7)
+  let m = BatchNorm(2, sigmoid), x = [1.0 3.0 5.0;
+                                      2.0 4.0 6.0]
+    y = m(x)
+    @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7)
  end

-  let m = BatchNorm(2), x = param(reshape(1:6, 3, 2, 1))
+  let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1)
    y = reshape(permutedims(x, [2, 1, 3]), 2, :)
    y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3])
    @test m(x) == y
  end

-  let m = BatchNorm(2), x = param(reshape(1:12, 2, 3, 2, 1))
+  let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:12), 2, 3, 2, 1)
    y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :)
    y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4])
    @test m(x) == y
  end

-  let m = BatchNorm(2), x = param(reshape(1:24, 2, 2, 3, 2, 1))
+  let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1)
    y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :)
    y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5])
    @test m(x) == y
@ -105,20 +108,18 @@ end
  end
 end

-
@testset "InstanceNorm" begin
  # helper functions
  expand_inst = (x, as) -> reshape(repeat(x, outer=[1, as[length(as)]]), as...)
  # begin tests
  let m = InstanceNorm(2), sizes = (3, 2, 2),
-      x = param(reshape(collect(1:prod(sizes)), sizes))
+        x = reshape(collect(1:prod(sizes)), sizes)

-      @test m.β.data == [0, 0]  # initβ(2)
-      @test m.γ.data == [1, 1]  # initγ(2)
-
-      @test m.active
-
-      m(x)
+      @test length(params(m)) == 2
+      x = Float64.(x)
+      @test m.β == [0, 0]  # initβ(2)
+      @test m.γ == [1, 1]  # initγ(2)
+      y = evalwgrad(m, x)

      #julia> x
      #[:, :, 1] =
@ -143,37 +144,28 @@ end
      # (1. - .1) * 0 + .1 * (5. + 11.) / 2 = .8
      @test m.μ ≈ [0.5, 0.8]
      # momentum * var * num_items / (num_items - 1) + (1 - momentum) * sigma_sq
-      # julia> reshape(mean(.1 .* var(x.data, dims = 1, corrected=false) .* (3 / 2), dims=3), :) .+ .9 .* 1.
+      # julia> reshape(mean(.1 .* var(x, dims = 1, corrected=false) .* (3 / 2), dims=3), :) .+ .9 .* 1.
      # 2-element Array{Float64,1}:
      #  1.
      #  1.
-      @test m.σ² ≈ reshape(mean(.1 .* var(x.data, dims = 1, corrected=false) .* (3 / 2), dims=3), :) .+ .9 .* 1.
+      @test m.σ² ≈ reshape(mean(.1 .* var(x, dims = 1, corrected=false) .* (3 / 2), dims=3), :) .+ .9 .* 1.

-      testmode!(m)
-      @test !m.active
-
-      x′ = m(x).data
+      x′ = m(x)
      @test isapprox(x′[1], (1 - 0.5) / sqrt(1. + 1f-5), atol = 1.0e-5)
  end
  # with activation function
  let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2),
-      x = param(reshape(collect(1:prod(sizes)), sizes))
-
+      x = reshape(collect(1:prod(sizes)), sizes)
+    x = Float64.(x)
    affine_shape = collect(sizes)
    affine_shape[1] = 1

-    @test m.active
-    m(x)
-
-    testmode!(m)
-    @test !m.active
-
-    y = m(x).data
-    @test isapprox(y, data(sigmoid.((x .- expand_inst(m.μ, affine_shape)) ./ sqrt.(expand_inst(m.σ², affine_shape) .+ m.ϵ))), atol = 1.0e-7)
+    y = m(x)
+    @test isapprox(y, sigmoid.((x .- expand_inst(m.μ, affine_shape)) ./ sqrt.(expand_inst(m.σ², affine_shape) .+ m.ϵ)), atol = 1.0e-7)
  end

-  let m = InstanceNorm(2), sizes = (2, 4, 1, 2, 3),
-      x = param(reshape(collect(1:prod(sizes)), sizes))
+  let m = trainmode!(InstanceNorm(2)), sizes = (2, 4, 1, 2, 3),
+      x = Float32.(reshape(collect(1:prod(sizes)), sizes))
    y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3)
    y = reshape(m(y), sizes...)
    @test m(x) == y
@ -181,16 +173,16 @@ end

  # check that μ, σ², and the output are the correct size for higher rank tensors
  let m = InstanceNorm(2), sizes = (5, 5, 3, 4, 2, 6),
-      x = param(reshape(collect(1:prod(sizes)), sizes))
-    y = m(x)
+      x = reshape(Float32.(collect(1:prod(sizes))), sizes)
+    y = evalwgrad(m, x)
    @test size(m.μ) == (sizes[end - 1], )
    @test size(m.σ²) == (sizes[end - 1], )
    @test size(y) == sizes
  end

  # show that instance norm is equal to batch norm when channel and batch dims are squashed
-  let m_inorm = InstanceNorm(2), m_bnorm = BatchNorm(12), sizes = (5, 5, 3, 4, 2, 6),
-      x = param(reshape(collect(1:prod(sizes)), sizes))
+  let m_inorm = trainmode!(InstanceNorm(2)), m_bnorm = trainmode!(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6),
+      x = reshape(Float32.(collect(1:prod(sizes))), sizes)
    @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:end - 2]..., :, 1))), sizes)
  end

@ -200,3 +192,105 @@ end
  end

 end
+
+if VERSION >= v"1.1"
+@testset "GroupNorm" begin
+  # begin tests
+  squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # Remove all singleton (size-1) dimensions
+
+  let m = GroupNorm(4,2), sizes = (3,4,2),
+        x = reshape(collect(1:prod(sizes)), sizes)
+
+      @test length(params(m)) == 2
+      x = Float64.(x)
+      @test m.β == [0, 0, 0, 0]  # initβ(4)
+      @test m.γ == [1, 1, 1, 1]  # initγ(4)
+
+      y = evalwgrad(m, x)
+
+      #julia> x
+      #[:, :, 1]  =
+      # 1.0  4.0  7.0  10.0
+      # 2.0  5.0  8.0  11.0
+      # 3.0  6.0  9.0  12.0
+      #
+      #[:, :, 2] =
+      # 13.0  16.0  19.0  22.0
+      # 14.0  17.0  20.0  23.0
+      # 15.0  18.0  21.0  24.0
+      #
+      # μ will be
+      # (1. + 2. + 3. + 4. + 5. + 6.) / 6 = 3.5
+      # (7. + 8. + 9. + 10. + 11. + 12.) / 6 = 9.5
+      #
+      # (13. + 14. + 15. + 16. + 17. + 18.) / 6 = 15.5
+      # (19. + 20. + 21. + 22. + 23. + 24.) / 6 = 21.5
+      #
+      # μ =
+      # 3.5   15.5
+      # 9.5   21.5
+      #
+      # ∴ update rule with momentum:
+      # (1. - .1) * 0 + .1 * (3.5 + 15.5) / 2 = 0.95
+      # (1. - .1) * 0 + .1 * (9.5 + 21.5) / 2 = 1.55
+      @test m.μ ≈ [0.95, 1.55]
+
+      # julia> mean(var(reshape(x,3,2,2,2),dims=(1,2)).* .1,dims=2) .+ .9*1.
+      # 2-element Array{Float64,1}:
+      #  1.25
+      #  1.25
+      @test m.σ² ≈ mean(squeeze(var(reshape(x,3,2,2,2),dims=(1,2))).*.1,dims=2) .+ .9*1.
+
+      x′ = m(x)
+      @test isapprox(x′[1], (1 - 0.95) / sqrt(1.25 + 1f-5), atol = 1.0e-5)
+  end
+  # with activation function
+  let m = GroupNorm(4,2, sigmoid), sizes = (3, 4, 2),
+      x = reshape(collect(1:prod(sizes)), sizes)
+    x = Float64.(x)
+    μ_affine_shape = ones(Int,length(sizes) + 1)
+    μ_affine_shape[end-1] = 2 # Number of groups
+
+    affine_shape = ones(Int,length(sizes) + 1)
+    affine_shape[end-2] = 2 # Channels per group
+    affine_shape[end-1] = 2 # Number of groups
+    affine_shape[1] = sizes[1]
+    affine_shape[end] = sizes[end]
+
+    og_shape = size(x)
+
+    y = m(x)
+    x_ = reshape(x,affine_shape...)
+    out = reshape(sigmoid.((x_ .- reshape(m.μ,μ_affine_shape...)) ./ sqrt.(reshape(m.σ²,μ_affine_shape...) .+ m.ϵ)),og_shape)
+    @test isapprox(y, out, atol = 1.0e-7)
+  end
+
+  let m = trainmode!(GroupNorm(2,2)), sizes = (2, 4, 1, 2, 3),
+      x = Float32.(reshape(collect(1:prod(sizes)), sizes))
+    y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3)
+    y = reshape(m(y), sizes...)
+    @test m(x) == y
+  end
+
+  # check that μ, σ², and the output are the correct size for higher rank tensors
+  let m = GroupNorm(4,2), sizes = (5, 5, 3, 4, 4, 6),
+      x = Float32.(reshape(collect(1:prod(sizes)), sizes))
+    y = evalwgrad(m, x)
+    @test size(m.μ) == (m.G,1)
+    @test size(m.σ²) == (m.G,1)
+    @test size(y) == sizes
+  end
+
+  # show that group norm is the same as instance norm when the group size is the same as the number of channels
+  let IN = trainmode!(InstanceNorm(4)), GN = trainmode!(GroupNorm(4,4)), sizes = (2,2,3,4,5),
+      x = Float32.(reshape(collect(1:prod(sizes)), sizes))
+    @test IN(x) ≈ GN(x)
+  end
+
+  # show that group norm is the same as batch norm for a group of size 1 and batch of size 1
+  let BN = trainmode!(BatchNorm(4)), GN = trainmode!(GroupNorm(4,4)), sizes = (2,2,3,4,1),
+      x = Float32.(reshape(collect(1:prod(sizes)), sizes))
+    @test BN(x) ≈ GN(x)
+  end
+end
+end
--- a/test/layers/stateless.jl
+++ b/test/layers/stateless.jl
@ -1,9 +1,26 @@
 using Test
 using Flux: onehotbatch, mse, crossentropy, logitcrossentropy,
-            σ, binarycrossentropy, logitbinarycrossentropy
+            σ, binarycrossentropy, logitbinarycrossentropy, flatten,
+            xlogx, xlogy

 const ϵ = 1e-7

+@testset "xlogx & xlogy" begin
+  @test iszero(xlogx(0))  # must be zero at x = 0 rather than NaN from 0 * log(0)
+  @test isnan(xlogx(NaN))
+  @test xlogx(2) ≈ 2.0 * log(2.0)
+  @inferred xlogx(2)  # return type must be inferrable for integer input
+  @inferred xlogx(0)
+
+  @test iszero(xlogy(0, 1))
+  @test isnan(xlogy(NaN, 1))  # NaN in either argument must propagate
+  @test isnan(xlogy(1, NaN))
+  @test isnan(xlogy(NaN, NaN))
+  @test xlogy(2, 3) ≈ 2.0 * log(3.0)
+  @inferred xlogy(2, 3)  # return type must be inferrable for integer inputs
+  @inferred xlogy(0, 1)
+end
+
@testset "losses" begin
  # First, regression-style y's
  y = [1, 1, 0, 0]
@ -13,6 +30,20 @@ const ϵ = 1e-7
    @test mse(ŷ, y) ≈ (.1^2 + .9^2)/2
  end

+  @testset "mae" begin
+    @test Flux.mae(ŷ, y) ≈ 1/2
+  end
+
+  @testset "huber_loss" begin
+    @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002
+  end
+
+  y = [123.0,456.0,789.0]
+  ŷ = [345.0,332.0,789.0]
+  @testset "msle" begin
+    @test Flux.msle(ŷ, y) ≈ 0.38813985859136585
+  end
+
  # Now onehot y's
  y = onehotbatch([1, 1, 0, 0], 0:1)
  ŷ = [.1 .9; .9 .1; .9 .1; .1 .9]'
@ -21,6 +52,7 @@ const ϵ = 1e-7
  lossvalue = 1.203972804325936

  @testset "crossentropy" begin
+    @test crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ crossentropy([0.1,0.9], [0.1,0.9])
    @test crossentropy(ŷ, y) ≈ lossvalue
  end

@ -50,15 +82,63 @@ const ϵ = 1e-7
    @test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0)
  end

+  y = [1 2 3]
+  ŷ = [4.0 5.0 6.0]
+  @testset "kldivergence" begin
+    @test Flux.kldivergence([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ Flux.kldivergence([0.1,0.9], [0.1,0.9])
+    @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457
+    @test Flux.kldivergence(y, y) ≈ 0
+  end
+
+  y = [1 2 3 4]
+  ŷ = [5.0 6.0 7.0 8.0]
+  @testset "hinge" begin
+    @test Flux.hinge(ŷ, y) ≈ 0
+    @test Flux.hinge(y, 0.5 .* y) ≈ 0.125
+  end
+
+  @testset "squared_hinge" begin
+    @test Flux.squared_hinge(ŷ, y) ≈ 0
+    @test Flux.squared_hinge(y, 0.5 .* y) ≈ 0.0625
+  end
+
+  y = [0.1 0.2 0.3]
+  ŷ = [0.4 0.5 0.6]
+  @testset "poisson" begin
+    @test Flux.poisson(ŷ, y) ≈ 0.6278353988097339
+    @test Flux.poisson(y, y) ≈ 0.5044459776946685
+  end
+
+  y = [1.0 0.5 0.3 2.4]
+  ŷ = [0 1.4 0.5 1.2]
+  @testset "dice_coeff_loss" begin
+    @test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999
+    @test Flux.dice_coeff_loss(y, y) ≈ 0.0
+  end
+
+  @testset "tversky_loss" begin
+    @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383
+    @test Flux.tversky_loss(ŷ, y, β = 0.8) ≈ -0.09490740740740744
+    @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075
+  end
+
  @testset "no spurious promotions" begin
-    for T in (Float16, Float32, Float64)
+    for T in (Float32, Float64)
      y = rand(T, 2)
      ŷ = rand(T, 2)
-      for f in (mse, crossentropy, logitcrossentropy)
-        fwd, back = Flux.Tracker.forward(mse, ŷ, y)
-        @test typeof(fwd) == Flux.Tracker.TrackedReal{T}
-        @test eltype(back(one(T))[1]) == Flux.Tracker.TrackedReal{T}
+      for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson,
+              Flux.mae, Flux.huber_loss, Flux.msle, Flux.squared_hinge, Flux.dice_coeff_loss, Flux.tversky_loss)
+        fwd, back = Flux.pullback(f, ŷ, y)
+        @test fwd isa T
+        @test eltype(back(one(T))[1]) == T
      end
    end
  end
 end
+
+@testset "helpers" begin
+  @testset "flatten" begin
+    x = randn(Float32, 10, 10, 3, 2)
+    @test size(flatten(x)) == (300, 2)  # 10*10*3 = 300: all dims but the last are collapsed, the last is kept
+  end
+end
--- a/test/onehot.jl
+++ b/test/onehot.jl
@ -11,3 +11,9 @@ using Test
  @test onecold(a, labels) == 'C'
  @test onecold(A, labels) == ['C', 'A', 'D']
 end
+
+@testset "onehotbatch indexing" begin
+  y = Flux.onehotbatch(ones(3), 1:10)  # three entries, all equal to 1.0, encoded against the labels 1:10
+  @test y[:,1] isa Flux.OneHotVector  # selecting one column keeps the one-hot vector type
+  @test y[:,:] isa Flux.OneHotMatrix  # a full slice keeps the one-hot matrix type
+end
--- a/test/optimise.jl
+++ b/test/optimise.jl
@ -1,42 +1,44 @@
 using Flux.Optimise
 using Flux.Optimise: runall
-using Flux.Tracker
+using Flux: Params, gradient
 using Test
+
@testset "Optimise" begin
  w = randn(10, 10)
  @testset for opt in [ADAMW(), ADAGrad(0.1), AdaMax(), ADADelta(0.9), AMSGrad(),
-                       NADAM(), Descent(0.1), ADAM(), Nesterov(), RMSProp(),
+                       NADAM(), RADAM(), Descent(0.1), ADAM(), Nesterov(), RMSProp(),
                       Momentum()]
-    w′ = param(randn(10, 10))
+    w′ = randn(10, 10)
    loss(x) = Flux.mse(w*x, w′*x)
    for t = 1: 10^5
      θ = Params([w′])
-      θ̄ = gradient(() -> loss(rand(10)), θ)
+      x = rand(10)
+      θ̄ = gradient(() -> loss(x), θ)
      Optimise.update!(opt, θ, θ̄)
    end
-    @test Flux.mse(w, w′) < 0.01
+    @test loss(rand(10, 10)) < 0.01
  end
 end

@testset "Optimiser" begin
  w = randn(10, 10)
  @testset for Opt in [InvDecay, WeightDecay, ExpDecay]
-    w′ = param(randn(10, 10))
+    w′ = randn(10, 10)
    loss(x) = Flux.mse(w*x, w′*x)
    opt = Optimiser(Opt(), ADAM(0.001))
    for t = 1:10^5
-      l = loss(rand(10))
-      back!(l)
-      delta = Optimise.apply!(opt, w′.data, w′.grad)
-      w′.data .-= delta
+      θ = Params([w′])
+      x = rand(10)
+      θ̄ = gradient(() -> loss(x), θ)
+      Optimise.update!(opt, θ, θ̄)
    end
-    @test Flux.mse(w, w′) < 0.01
+    @test loss(rand(10, 10)) < 0.01
  end
 end

@testset "Training Loop" begin
  i = 0
-  l = param(1)
+  l = 1

  Flux.train!(() -> (sleep(0.1); i += 1; l),
              (),
@ -53,3 +55,59 @@ end
  cbs()
  @test x == 1
 end
+
+@testset "ExpDecay" begin
+
+  # With a unit gradient, the first entry returned by `apply!` is expected to
+  # follow the decayed learning-rate schedule, bounded below by `o.clip`.
+  @testset "Sanity Check" begin
+    o = ExpDecay(0.2, 0.5, 1, 1e-3)
+    p = [0.0]
+    steps = 1:8
+    # `o.eta` is read here, before any call to `apply!`.
+    eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip)
+    eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps]
+    @test eta_actual == eta_expected
+  end
+
+  w = randn(10, 10)
+  o = ExpDecay(0.1, 0.1, 1000, 1e-4)
+  w1 = randn(10, 10)
+  loss(x) = Flux.mse(w*x, w1*x)
+  updates_match = true  # stays true while every update equals eta .* gradient
+  decay_steps = Int[]   # iterations at which `o.eta` was observed to change
+  for t = 1:10^5
+    prev_eta = o.eta
+    θ = Params([w1])
+    x = rand(10)
+    θ̄ = gradient(() -> loss(x), θ)
+    # Snapshot the gradient before it is handed to `apply!`.
+    prev_grad = collect(θ̄[w1])
+    delta = Optimise.apply!(o, w1, θ̄[w1])
+    w1 .-= delta
+    if o.eta != prev_eta
+      push!(decay_steps, t)
+    end
+    # The returned update must be exactly the gradient scaled by the current eta.
+    updates_match &= (o.eta .* prev_grad == delta)
+  end
+  @test updates_match
+  # Check that decay happens exactly at the decay steps. Eta reaches the clip value
+  # (1e-4) after 4000 steps (decay by 0.1 every 1000 steps, starting at 0.1).
+  # NOTE(review): 0.1^4 nominally equals the clip value, so whether step 4000 is
+  # seen as a change in eta presumably depends on Float64 rounding — confirm.
+  ground_truth = collect(1000:1000:4000)  # Expected decay steps for this example.
+  @test decay_steps == ground_truth
+  @test o.eta == o.clip
+end
+
+@testset "Clipping" begin
+  # Gradient of `sum(w * x)` with respect to `w`. The input is scaled by 1000 so
+  # the raw gradient is expected to lie well outside the clipping thresholds.
+  w = randn(10, 10)
+  loss(x) = sum(w * x)
+  θ = Params([w])
+  x = 1000 * randn(10)
+  w̄ = gradient(() -> loss(x), θ)[w]
+  # Each optimiser gets its own copy so the two checks do not affect each other.
+  w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄))
+  # Check magnitudes, not just the upper bound: large negative entries must be clipped too.
+  @test all(g -> abs(g) <= 1, w̄_value)
+  w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄))
+  @test norm(w̄_norm) <= 1
+end
--- a/test/runtests.jl
+++ b/test/runtests.jl
@ -1,33 +1,46 @@
-using Flux, Test, Random, Statistics
-using Random
+using Flux 
+using Flux.Data
+using Test 
+using Random, Statistics, LinearAlgebra
+using IterTools: ncycle

 Random.seed!(0)

-# So we can use the system CuArrays
-insert!(LOAD_PATH, 2, "@v#.#")
-
-@testset "Flux" begin
-
-@info "Testing Basics"
-
-include("utils.jl")
-include("onehot.jl")
-include("optimise.jl")
-include("data.jl")
-
-@info "Testing Layers"
-
-include("layers/basic.jl")
-include("layers/normalisation.jl")
-include("layers/stateless.jl")
-include("layers/conv.jl")
-
-@info "Running Gradient Checks"
-
-include("tracker.jl")
-
-if Base.find_package("CuArrays") != nothing
-  include("cuda/cuda.jl")
+@testset "Utils" begin
+  include("utils.jl")
 end

+@testset "Onehot" begin
+  include("onehot.jl")
+end
+
+@testset "Optimise" begin
+  include("optimise.jl")
+end
+
+@testset "Data" begin
+  include("data.jl")
+end
+
+@testset "Layers" begin
+  include("layers/basic.jl")
+  include("layers/normalisation.jl")
+  include("layers/stateless.jl")
+  include("layers/conv.jl")
+end
+
+@testset "CUDA" begin
+  if Flux.use_cuda[]
+    include("cuda/cuda.jl")
+  else
+    @warn "CUDA unavailable, not testing GPU support"
+  end
+end
+
+@static if VERSION >= v"1.4"
+  using Documenter
+  @testset "Docs" begin
+    DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
+    doctest(Flux)
+  end
 end
--- a/test/tracker.jl
+++ b/test/tracker.jl
@ -1,15 +0,0 @@
-using Flux, Test
-using Tracker: gradcheck
-
-gradtest(f, xs::AbstractArray...) = gradcheck((xs...) -> sum(sin.(f(xs...))), xs...)
-gradtest(f, dims...) = gradtest(f, rand.(Float64, dims)...)
-
-@testset "Tracker" begin
-
-@test gradtest(Flux.mse, rand(5,5), rand(5, 5))
-@test gradtest(Flux.crossentropy, rand(5,5), rand(5, 5))
-
-@test gradtest(x -> Flux.normalise(x), rand(4,3))
-@test gradtest(x -> Flux.normalise(x, dims = 2), rand(3,4))
-
-end
--- a/test/utils.jl
+++ b/test/utils.jl
@ -1,6 +1,6 @@
 using Flux
-using Flux: throttle, jacobian, glorot_uniform, glorot_normal, stack, unstack
-using StatsBase: std
+using Flux: throttle, nfan, glorot_uniform, glorot_normal, stack, unstack
+using StatsBase: var
 using Random
 using Test

@ -52,31 +52,30 @@ using Test
  end
 end

-@testset "Jacobian" begin
-  A = param(randn(2,2))
-  x = randn(2)
-  m(x) = A*x
-  y = m(x)
-  J = jacobian(m,x)
-  @test J ≈ A.data
-end
-
@testset "Initialization" begin
  # Set random seed so that these tests don't fail randomly
  Random.seed!(0)

-  # glorot_uniform should yield a kernel with stddev ~= sqrt(6/(n_in + n_out)),
-  # and glorot_normal should yield a kernel with stddev != 2/(n_in _ n_out)
-  for (n_in, n_out) in [(100, 100), (100, 400)]
-    v = glorot_uniform(n_in, n_out)
-    @test minimum(v) > -1.1*sqrt(6/(n_in + n_out))
-    @test minimum(v) < -0.9*sqrt(6/(n_in + n_out))
-    @test maximum(v) >  0.9*sqrt(6/(n_in + n_out))
-    @test maximum(v) <  1.1*sqrt(6/(n_in + n_out))
+  @testset "Fan in/out" begin
+    @test nfan() == (1, 1) #For a constant
+    @test nfan(100) == (1, 100) #For vector
+    @test nfan(100, 200) == (200, 100) #For Dense layer
+    @test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer
+    @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer
+    @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D Conv layer
+  end

-    v = glorot_normal(n_in, n_out)
-    @test std(v) > 0.9*sqrt(2/(n_in + n_out))
-    @test std(v) < 1.1*sqrt(2/(n_in + n_out))
+  @testset "glorot" begin
+    # glorot_uniform and glorot_normal should both yield a kernel with
+    # variance ≈ 2/(fan_in + fan_out)
+    for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)]
+      for init ∈ [glorot_uniform, glorot_normal]
+        v = init(dims...)
+        fan_in, fan_out = nfan(dims...)
+        σ2 = 2 / (fan_in + fan_out)
+        @test 0.9σ2 < var(v) < 1.1σ2
+      end
+    end
  end
 end

@ -85,6 +84,15 @@ end
  @test size.(params(m)) == [(5, 10), (5,)]
  m = RNN(10, 5)
  @test size.(params(m)) == [(5, 10), (5, 5), (5,), (5,)]
+
+  # A layer that appears twice in the same chain should contribute its params only once.
+  c = Chain(m, m)
+  @test size.(params(c)) == [(5, 10), (5, 5), (5,), (5,)]
+
+  # Self-referential array: collecting params must terminate without a stack overflow and still yield the layer's params.
+  r = Any[nothing,m]
+  r[1] = r
+  @test size.(params(r)) == [(5, 10), (5, 5), (5,), (5,)]
 end

@testset "Basic Stacking" begin
@ -96,12 +104,11 @@ end
@testset "Precision" begin
  m = Chain(Dense(10, 5, relu), Dense(5, 2))
  x = rand(10)
-  @test eltype(m[1].W.data) == Float32
-  @test eltype(m(x).data) == Float32
-  @test eltype(f64(m)(x).data) == Float64
-  @test eltype(f64(m)[1].W.data) == Float64
-  @test eltype(f32(f64(m))[1].W.data) == Float32
-  @test Tracker.isleaf(f32(f64(m))[1].W)
+  @test eltype(m[1].W) == Float32
+  @test eltype(m(x)) == Float32
+  @test eltype(f64(m)(x)) == Float64
+  @test eltype(f64(m)[1].W) == Float64
+  @test eltype(f32(f64(m))[1].W) == Float32
 end

@testset "Stacking" begin
				`@ -0,0 +1 @@`
				`custom: https://numfocus.salsalabs.org/donate-to-julia/index.html`