Merge #1238

1238: Fix inline code block r=dhairyagandhi96 a=harryscholes ### PR Checklist - [ ] Tests are added - [ ] Entry in NEWS.md - [x] Documentation, if applicable - [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes). Co-authored-by: harryscholes <harryscholes@gmail.com>
Fix inline code block
2020-06-19 08:28:41 +00:00 · 2020-06-19 09:24:44 +01:00 · 2020-06-16 17:21:28 +00:00 · 2020-06-16 13:04:20 +00:00 · 2020-06-16 14:02:24 +01:00 · 2020-06-16 13:32:27 +02:00
47 changed files with 1785 additions and 765 deletions
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@ -0,0 +1,12 @@
+[Please delete this text and describe your change here.
+For bugfixes, please detail the bug and include a test case which your patch fixes.
+If you are adding a new feature, please clearly describe the design, its rationale, and the possible alternatives considered.
+It is easiest to merge new features when there is clear precedent in other systems; we need to know we're taking
+the right direction since it can be hard to change later.]
+
+### PR Checklist
+
+- [ ] Tests are added
+- [ ] Entry in NEWS.md
+- [ ] Documentation, if applicable
+- [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes).
--- a/.github/workflows/CompatHelper.yml
+++ b/.github/workflows/CompatHelper.yml
@ -6,16 +6,8 @@ on:

 jobs:
  CompatHelper:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        julia-version: [1.3]
-        julia-arch: [x64]
-        os: [ubuntu-latest]
+    runs-on: ubuntu-latest
    steps:
-      - uses: julia-actions/setup-julia@latest
-        with:
-          version: ${{ matrix.julia-version }}
      - name: Pkg.add("CompatHelper")
        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
      - name: CompatHelper.main()
--- a/.travis.yml
+++ b/.travis.yml
@ -7,6 +7,7 @@ os:

 julia:
  - 1.3
+  - 1
  - nightly

 notifications:
--- a/Manifest.toml
+++ b/Manifest.toml
@ -8,71 +8,77 @@ version = "0.5.0"

 [[AbstractTrees]]
 deps = ["Markdown"]
-git-tree-sha1 = "86d092c2599f1f7bb01668bf8eb3412f98d61e47"
+git-tree-sha1 = "33e450545eaf7699da1a6e755f9ea65f14077a45"
 uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
-version = "0.3.2"
+version = "0.3.3"

 [[Adapt]]
 deps = ["LinearAlgebra"]
-git-tree-sha1 = "c88cfc7f9c1f9f8633cddf0b56e86302b70f64c5"
+git-tree-sha1 = "fd04049c7dd78cfef0b06cdc1f0f181467655712"
 uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-version = "1.0.1"
+version = "1.1.0"

 [[ArrayLayouts]]
 deps = ["FillArrays", "LinearAlgebra"]
-git-tree-sha1 = "bc779df8d73be70e4e05a63727d3a4dfb4c52b1f"
+git-tree-sha1 = "a504dca2ac7eda8761c8f7c1ed52427a1be75a3c"
 uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
-version = "0.1.5"
+version = "0.2.6"

 [[Base64]]
 uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"

 [[BinaryProvider]]
-deps = ["Libdl", "SHA"]
-git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c"
+deps = ["Libdl", "Logging", "SHA"]
+git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
 uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
-version = "0.5.8"
+version = "0.5.10"

 [[CEnum]]
-git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb"
+git-tree-sha1 = "1b77a77c3b28e0b3f413f7567c9bb8dd9bdccd14"
 uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
-version = "0.2.0"
+version = "0.3.0"

 [[CUDAapi]]
 deps = ["Libdl", "Logging"]
-git-tree-sha1 = "d7ceadd8f821177d05b897c0517e94633db535fe"
+git-tree-sha1 = "831b825d10104bd29e28f6da93312a976830717b"
 uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
-version = "3.1.0"
+version = "4.0.0"

 [[CUDAdrv]]
 deps = ["CEnum", "CUDAapi", "Printf"]
-git-tree-sha1 = "01e90fa34e25776bc7c8661183d4519149ebfe59"
+git-tree-sha1 = "f56bbf18c86bcff7a961a32a4947a5abb2963a29"
 uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
-version = "6.0.0"
+version = "6.3.0"

 [[CUDAnative]]
-deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"]
-git-tree-sha1 = "f86269ff60ebe082a2806ecbce51f3cadc68afe9"
+deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "ExprTools", "GPUCompiler", "LLVM", "Libdl", "Pkg", "Printf"]
+git-tree-sha1 = "ac86db2b05fdfec96b011e25a504ffe7476e8a68"
 uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
-version = "2.10.2"
+version = "3.1.0"
+
+[[CodeTracking]]
+deps = ["InteractiveUtils", "UUIDs"]
+git-tree-sha1 = "cab4da992adc0a64f63fa30d2db2fd8bec40cab4"
+uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
+version = "0.5.11"

 [[CodecZlib]]
-deps = ["BinaryProvider", "Libdl", "TranscodingStreams"]
-git-tree-sha1 = "05916673a2627dd91b4969ff8ba6941bc85a960e"
+deps = ["TranscodingStreams", "Zlib_jll"]
+git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da"
 uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
-version = "0.6.0"
+version = "0.7.0"

 [[ColorTypes]]
 deps = ["FixedPointNumbers", "Random"]
-git-tree-sha1 = "b9de8dc6106e09c79f3f776c27c62360d30e5eb8"
+git-tree-sha1 = "c73d9cfc2a9d8433dc77f5bff4bddf46b1d78c20"
 uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
-version = "0.9.1"
+version = "0.10.3"

 [[Colors]]
-deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"]
-git-tree-sha1 = "177d8b959d3c103a6d57574c38ee79c81059c31b"
+deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"]
+git-tree-sha1 = "1e9bba7984e78aa8cdeea7f9f7cc984ad4e4b1c7"
 uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
-version = "0.11.2"
+version = "0.12.2"

 [[CommonSubexpressions]]
 deps = ["Test"]
@ -82,26 +88,32 @@ version = "0.2.0"

 [[CompilerSupportLibraries_jll]]
 deps = ["Libdl", "Pkg"]
-git-tree-sha1 = "b57c5d019367c90f234a7bc7e24ff0a84971da5d"
+git-tree-sha1 = "7c4f882c41faa72118841185afc58a2eb00ef612"
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "0.2.0+1"
+version = "0.3.3+0"
+
+[[Cthulhu]]
+deps = ["CodeTracking", "InteractiveUtils", "REPL", "UUIDs", "Unicode"]
+git-tree-sha1 = "f3643e78353199d3097821e806348bd83f364155"
+uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f"
+version = "1.1.1"

 [[CuArrays]]
-deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"]
-git-tree-sha1 = "7c20c5a45bb245cf248f454d26966ea70255b271"
+deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"]
+git-tree-sha1 = "1582b74d2322df7dd94549d4ac9d095e0f20e884"
 uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
-version = "1.7.2"
+version = "2.2.1"

 [[DataAPI]]
-git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252"
+git-tree-sha1 = "176e23402d80e7743fc26c19c681bfb11246af32"
 uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
-version = "1.1.0"
+version = "1.3.0"

 [[DataStructures]]
 deps = ["InteractiveUtils", "OrderedCollections"]
-git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8"
+git-tree-sha1 = "af6d9c86e191c917c2276fbede1137e8ea20157f"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.17.10"
+version = "0.17.17"

 [[Dates]]
 deps = ["Printf"]
@ -127,52 +139,55 @@ version = "1.0.1"
 deps = ["Random", "Serialization", "Sockets"]
 uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"

-[[FFTW]]
-deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"]
-git-tree-sha1 = "109d82fa4b00429f9afcce873e9f746f11f018d3"
-uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
-version = "1.2.0"
-
-[[FFTW_jll]]
-deps = ["Libdl", "Pkg"]
-git-tree-sha1 = "ddb57f4cf125243b4aa4908c94d73a805f3cbf2c"
-uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a"
-version = "3.3.9+4"
+[[ExprTools]]
+git-tree-sha1 = "6f0517056812fd6aa3af23d4b70d5325a2ae4e95"
+uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
+version = "0.1.1"

 [[FillArrays]]
 deps = ["LinearAlgebra", "Random", "SparseArrays"]
-git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66"
+git-tree-sha1 = "44f561e293987ffc84272cd3d2b14b0b93123d63"
 uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
-version = "0.8.5"
+version = "0.8.10"

 [[FixedPointNumbers]]
-git-tree-sha1 = "4aaea64dd0c30ad79037084f8ca2b94348e65eaa"
+git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238"
 uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
-version = "0.7.1"
+version = "0.8.0"

 [[ForwardDiff]]
 deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"]
-git-tree-sha1 = "88b082d492be6b63f967b6c96b352e25ced1a34c"
+git-tree-sha1 = "869540e4367122fbffaace383a5bdc34d6e5e5ac"
 uuid = "f6369f11-7733-5829-9624-2563aa707210"
-version = "0.10.9"
+version = "0.10.10"
+
+[[Functors]]
+deps = ["MacroTools"]
+git-tree-sha1 = "f40adc6422f548176bb4351ebd29e4abf773040a"
+uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
+version = "0.1.0"
+
+[[Future]]
+deps = ["Random"]
+uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"

 [[GPUArrays]]
 deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
-git-tree-sha1 = "e756da6cee76a5f1436a05827fa8fdf3badc577f"
+git-tree-sha1 = "d887693eb1bd5e1fd573262a978745481895ec7d"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "2.0.1"
+version = "3.4.1"
+
+[[GPUCompiler]]
+deps = ["Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "TimerOutputs"]
+git-tree-sha1 = "5275aa268ecd09640b32560e1eae90c78816e4d1"
+uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
+version = "0.2.0"

 [[IRTools]]
 deps = ["InteractiveUtils", "MacroTools", "Test"]
-git-tree-sha1 = "1a4355e4b5b50be2311ebb644f34f3306dbd0410"
+git-tree-sha1 = "90ee39f9beaaa186e4968417ea2b8ed5673c91c0"
 uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
-version = "0.3.1"
-
-[[IntelOpenMP_jll]]
-deps = ["Libdl", "Pkg"]
-git-tree-sha1 = "fb8e1c7a5594ba56f9011310790e03b5384998d6"
-uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0"
-version = "2018.0.3+0"
+version = "0.3.3"

 [[InteractiveUtils]]
 deps = ["Markdown"]
@ -180,17 +195,18 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

 [[Juno]]
 deps = ["Base64", "Logging", "Media", "Profile"]
-git-tree-sha1 = "4f2249fb58cfb140eeb89428e31791e2f8959d8c"
+git-tree-sha1 = "a686b0cf235fa3e491b79b4783c2d2382292b436"
 uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
-version = "0.8.0"
+version = "0.8.2"

 [[LLVM]]
 deps = ["CEnum", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "1d08d7e4250f452f6cb20e4574daaebfdbee0ff7"
+git-tree-sha1 = "dd3f584c3dbefe39b2a8fbafa1a3b77e31e21255"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "1.3.3"
+version = "1.5.1"

 [[LibGit2]]
+deps = ["Printf"]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"

 [[Libdl]]
@ -203,17 +219,11 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 [[Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

-[[MKL_jll]]
-deps = ["IntelOpenMP_jll", "Libdl", "Pkg"]
-git-tree-sha1 = "720629cc8cbd12c146ca01b661fd1a6cf66e2ff4"
-uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
-version = "2019.0.117+2"
-
 [[MacroTools]]
-deps = ["DataStructures", "Markdown", "Random"]
-git-tree-sha1 = "07ee65e03e28ca88bc9a338a3726ae0c3efaa94b"
+deps = ["Markdown", "Random"]
+git-tree-sha1 = "f7d2e3f654af75f01ec49be82c231c382214223a"
 uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
-version = "0.5.4"
+version = "0.5.5"

 [[Markdown]]
 deps = ["Base64"]
@ -247,18 +257,17 @@ version = "0.3.3"

 [[OpenSpecFun_jll]]
 deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"]
-git-tree-sha1 = "d110040968b9afe95c6bd9c6233570b0fe8abd22"
+git-tree-sha1 = "d51c416559217d974a1113522d5919235ae67a87"
 uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
-version = "0.5.3+2"
+version = "0.5.3+3"

 [[OrderedCollections]]
-deps = ["Random", "Serialization", "Test"]
-git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
+git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3"
 uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-version = "1.1.0"
+version = "1.2.0"

 [[Pkg]]
-deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"]
+deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"

 [[Printf]]
@ -310,15 +319,15 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

 [[SpecialFunctions]]
 deps = ["OpenSpecFun_jll"]
-git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c"
+git-tree-sha1 = "d8d8b8a9f4119829410ecd706da4cc8594a1e020"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "0.10.0"
+version = "0.10.3"

 [[StaticArrays]]
 deps = ["LinearAlgebra", "Random", "Statistics"]
-git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c"
+git-tree-sha1 = "5c06c0aeb81bef54aed4b3f446847905eb6cbda0"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "0.12.1"
+version = "0.12.3"

 [[Statistics]]
 deps = ["LinearAlgebra", "SparseArrays"]
@ -326,9 +335,9 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

 [[StatsBase]]
 deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"]
-git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9"
+git-tree-sha1 = "a6102b1f364befdb05746f386b67c6b7e3262c45"
 uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
-version = "0.32.1"
+version = "0.33.0"

 [[Test]]
 deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
@ -336,9 +345,9 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

 [[TimerOutputs]]
 deps = ["Printf"]
-git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03"
+git-tree-sha1 = "f458ca23ff80e46a630922c555d838303e4b9603"
 uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
-version = "0.5.3"
+version = "0.5.6"

 [[TranscodingStreams]]
 deps = ["Random", "Test"]
@ -355,21 +364,21 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

 [[ZipFile]]
 deps = ["Libdl", "Printf", "Zlib_jll"]
-git-tree-sha1 = "8748302cfdec02c4ae9c97b112cf10003f7f767f"
+git-tree-sha1 = "254975fef2fc526583bb9b7c9420fe66ffe09f2f"
 uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
-version = "0.9.1"
+version = "0.9.2"

 [[Zlib_jll]]
 deps = ["Libdl", "Pkg"]
-git-tree-sha1 = "fd36a6739e256527287c5444960d0266712cd49e"
+git-tree-sha1 = "a2e0d558f6031002e380a90613b199e37a8565bf"
 uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
-version = "1.2.11+8"
+version = "1.2.11+10"

 [[Zygote]]
-deps = ["ArrayLayouts", "DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
-git-tree-sha1 = "7dc5fdb4917ac5a84e199ae654316a01cd4a278b"
+deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "Future", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
+git-tree-sha1 = "707ceea58e2bd0ff3077ab13a92f8355181d3ee4"
 uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
-version = "0.4.9"
+version = "0.4.20"

 [[ZygoteRules]]
 deps = ["MacroTools"]
--- a/NEWS.md
+++ b/NEWS.md
@ -1,3 +1,19 @@
+# v0.11
+* Change to `DataLoader`'s constructor [https://github.com/FluxML/Flux.jl/pull/1152].
+* Use `DataLoader` with `NamedTuple`s, so that tensors can be accessed by name [https://github.com/FluxML/Flux.jl/pull/1221].
+* Error if Dense layers weights and biases are not arrays [https://github.com/FluxML/Flux.jl/pull/1218].
+
+# v0.10.5
+* Add option for [same padding](https://github.com/FluxML/Flux.jl/pull/901) to conv and pooling layers by setting `pad=SamePad()`.
+* Added option to set `bias` to [Flux.Zeros](https://github.com/FluxML/Flux.jl/pull/873) to prevent `bias` from being trained.
+* Added `GlobalMaxPool` and `GlobalMeanPool` [layers](https://github.com/FluxML/Flux.jl/pull/950) for performing global pooling operations.
+* Added `ClipValue` and `ClipNorm` in this [pr](https://github.com/FluxML/Flux.jl/pull/1133) to `Flux.Optimise` to provide a cleaner API for gradient clipping.
+* Added new kwarg-only [constructors](https://github.com/FluxML/Flux.jl/pull/873) for the various convolutional layers.
+* Documented the convolutional layer constructors accepting `weight` and `bias` keyword arguments to supply custom arrays for those fields.
+* Testing suite improvements now test for gradients of all layers along with GPU support.
+* Functors have now moved to [Functors.jl](https://github.com/FluxML/Flux.jl/pull/1174) to allow for their use outside of Flux.
+* Added [helper functions](https://github.com/FluxML/Flux.jl/pull/873) `Flux.convfilter` and `Flux.depthwiseconvfilter` to construct weight arrays for convolutions outside of layer constructors so as to not have to depend on the default layers for custom implementations.
+
 # v0.10.0
 * The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669)
  - The dependency on Tracker.jl has been removed.
--- a/Project.toml
+++ b/Project.toml
@ -1,6 +1,6 @@
 name = "Flux"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.10.3"
+version = "0.11.0-DEV"

 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@ -9,7 +9,9 @@ CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
 Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@ -25,18 +27,19 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

 [compat]
 AbstractTrees = "0.2, 0.3"
-Adapt = "1"
-CodecZlib = "0.5, 0.6"
-Colors = "0.8, 0.9, 0.10, 0.11"
-CuArrays = "1.6"
+Adapt = "1, 2.0"
+CodecZlib = "0.5, 0.6, 0.7"
+Colors = "0.8, 0.9, 0.10, 0.11, 0.12"
+CuArrays = "2"
+Functors = "0.1"
 Juno = "0.5, 0.6, 0.7, 0.8"
 MacroTools = "0.3, 0.4, 0.5"
 NNlib = "0.6"
 Reexport = "0.2"
 StatsBase = "0"
 ZipFile = "0.7, 0.8, 0.9"
-Zygote = "0.4"
-julia = "1"
+Zygote = "0.4.13"
+julia = "1.3"

 [extras]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
--- a/docs/make.jl
+++ b/docs/make.jl
@ -1,6 +1,8 @@
 using Documenter, Flux, NNlib

+DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
 makedocs(modules=[Flux, NNlib],
+         doctest = VERSION >= v"1.4",
         sitename = "Flux",
         pages = ["Home" => "index.md",
                  "Building Models" =>
@ -19,12 +21,16 @@ makedocs(modules=[Flux, NNlib],
                  "GPU Support" => "gpu.md",
                  "Saving & Loading" => "saving.md",
                  "The Julia Ecosystem" => "ecosystem.md",
+                  "Utility Functions" => "utilities.md",
                  "Performance Tips" => "performance.md",
+                  "Datasets" => "datasets.md",
                  "Community" => "community.md"],
-         format = Documenter.HTML(assets = ["assets/flux.css"],
-                                  analytics = "UA-36890222-9",
-                                  prettyurls = haskey(ENV, "CI")))
+         format = Documenter.HTML(
+             analytics = "UA-36890222-9",
+             assets = ["assets/flux.css"],
+             prettyurls = get(ENV, "CI", nothing) == "true"),
+         )

-deploydocs(repo = "github.com/FluxML/Flux.jl.git",    
+deploydocs(repo = "github.com/FluxML/Flux.jl.git",
           target = "build",
           push_preview = true)
--- a/docs/src/data/dataloader.md
+++ b/docs/src/data/dataloader.md
@ -3,4 +3,4 @@ Flux provides the `DataLoader` type in the `Flux.Data` module to handle iteratio

 ```@docs
 Flux.Data.DataLoader
-```
+```
--- a/docs/src/data/onehot.md
+++ b/docs/src/data/onehot.md
@ -7,15 +7,15 @@ julia> using Flux: onehot, onecold

 julia> onehot(:b, [:a, :b, :c])
 3-element Flux.OneHotVector:
- false
-  true
- false
+ 0
+ 1
+ 0

 julia> onehot(:c, [:a, :b, :c])
 3-element Flux.OneHotVector:
- false
- false
-  true
+ 0
+ 0
+ 1
 ```

 The inverse is `onecold` (which can take a general probability distribution, as well as just booleans).
@ -31,6 +31,11 @@ julia> onecold([0.3, 0.2, 0.5], [:a, :b, :c])
 :c
 ```

+```@docs
+Flux.onehot
+Flux.onecold
+```
+
 ## Batches

 `onehotbatch` creates a batch (matrix) of one-hot vectors, and `onecold` treats matrices as batches.
@ -52,3 +57,7 @@ julia> onecold(ans, [:a, :b, :c])
 ```

 Note that these operations returned `OneHotVector` and `OneHotMatrix` rather than `Array`s. `OneHotVector`s behave like normal vectors but avoid any unnecessary cost compared to using an integer index directly. For example, multiplying a matrix with a one-hot vector simply slices out the relevant row of the matrix under the hood.
+
+```@docs
+Flux.onehotbatch
+```
--- a/docs/src/datasets.md
+++ b/docs/src/datasets.md
@ -0,0 +1,20 @@
+# Datasets
+
+Flux includes several standard machine learning datasets.
+
+```@docs
+Flux.Data.Iris.features()
+Flux.Data.Iris.labels()
+Flux.Data.MNIST.images()
+Flux.Data.MNIST.labels()
+Flux.Data.FashionMNIST.images()
+Flux.Data.FashionMNIST.labels()
+Flux.Data.CMUDict.phones()
+Flux.Data.CMUDict.symbols()
+Flux.Data.CMUDict.rawdict()
+Flux.Data.CMUDict.cmudict()
+Flux.Data.Sentiment.train()
+Flux.Data.Sentiment.test()
+Flux.Data.Sentiment.dev()
+```
+
--- a/docs/src/models/advanced.md
+++ b/docs/src/models/advanced.md
@ -19,7 +19,7 @@ Affine{Array{Float64,2},Array{Float64,1}}([0.66722 0.774872 0.249809; 0.843321 0
 julia> Flux.params(a) # default behavior
 Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]])

-julia> Flux.trainable(a::Affine) = (a.W, a.b,)
+julia> Flux.trainable(a::Affine) = (a.W,)

 julia> Flux.params(a)
 Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297]])
--- a/docs/src/models/basics.md
+++ b/docs/src/models/basics.md
@ -32,8 +32,6 @@ julia> gradient(f, [2, 1], [2, 0])
 But machine learning models can have *hundreds* of parameters! To handle this, Flux lets you work with collections of parameters, via `params`. You can get the gradient of all parameters used in a program without explicitly passing them in.

 ```jldoctest basics
-julia> using Flux
-
 julia> x = [2, 1];

 julia> y = [2, 0];
@ -220,7 +218,7 @@ Flux.@functor Affine

 This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md).

-For some more helpful tricks, including parameter freezing, please checkout the [advanced usage guide](advacned.md).
+For some more helpful tricks, including parameter freezing, please check out the [advanced usage guide](advanced.md).

 ## Utility functions

@ -240,5 +238,5 @@ Currently limited to the following layers:
 - `MeanPool`

 ```@docs
-outdims
+Flux.outdims
 ```
--- a/docs/src/models/layers.md
+++ b/docs/src/models/layers.md
@ -14,10 +14,17 @@ These layers are used to build convolutional neural networks (CNNs).
 ```@docs
 Conv
 MaxPool
+GlobalMaxPool
 MeanPool
+GlobalMeanPool
 DepthwiseConv
 ConvTranspose
 CrossCor
+SamePad
+flatten
+Flux.Zeros
+Flux.convfilter
+Flux.depthwiseconvfilter
 ```

 ## Recurrent Layers
@ -29,6 +36,7 @@ RNN
 LSTM
 GRU
 Flux.Recur
+Flux.reset!
 ```

 ## Other General Purpose Layers
@ -46,20 +54,22 @@ SkipConnection
 These layers don't affect the structure of the network but may improve training times or reduce overfitting.

 ```@docs
+Flux.normalise
 BatchNorm
-Dropout
 Flux.dropout
+Dropout
 AlphaDropout
 LayerNorm
+InstanceNorm
 GroupNorm
 ```

 ### Testmode

-Many normalisation layers behave differently under training and inference (testing). By default, Flux will automatically determine when a layer evaluation is part of training or inference. Still, depending on your use case, it may be helpful to manually specify when these layers should be treated as being trained or not. For this, Flux provides `testmode!`. When called on a model (e.g. a layer or chain of layers), this function will place the model into the mode specified.
+Many normalisation layers behave differently under training and inference (testing). By default, Flux will automatically determine when a layer evaluation is part of training or inference. Still, depending on your use case, it may be helpful to manually specify when these layers should be treated as being trained or not. For this, Flux provides `Flux.testmode!`. When called on a model (e.g. a layer or chain of layers), this function will place the model into the mode specified.

 ```@docs
-testmode!
+Flux.testmode!
 trainmode!
 ```

--- a/docs/src/models/regularisation.md
+++ b/docs/src/models/regularisation.md
@ -64,3 +64,7 @@ julia> activations(c, rand(10))
 julia> sum(norm, ans)
 2.1166067f0
 ```
+
+```@docs
+Flux.activations
+```
--- a/docs/src/performance.md
+++ b/docs/src/performance.md
@ -39,7 +39,7 @@ E.g. the following will have run into the same problem as above:
    leaky_tanh(x) = 0.01*x + tanh(x)
 ```

-While one could change the activation function (e.g. to use `0.01f0x`), the idiomatic (and safe way)  to avoid type casts whenever inputs changes is to use `oftype`:
+While one could change the activation function (e.g. to use `0.01f0*x`), the idiomatic (and safe) way to avoid type casts whenever the inputs change is to use `oftype`:
 ```
    leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x)
 ```
@ -52,7 +52,7 @@ e.g.
 ```julia
 function loss_total(xs::AbstractVector{<:Vector}, ys::AbstractVector{<:Vector})
    sum(zip(xs, ys)) do (x, y_target)
-        y_pred = model(x) #  evaluate the model
+        y_pred = model(x)  # evaluate the model
        return loss(y_pred, y_target)
    end
 end
--- a/docs/src/training/optimisers.md
+++ b/docs/src/training/optimisers.md
@ -52,6 +52,7 @@ Momentum
 Nesterov
 RMSProp
 ADAM
+RADAM
 AdaMax
 ADAGrad
 ADADelta
@ -79,7 +80,7 @@ Momentum(eta::Real, rho::Real) = Momentum(eta, rho, IdDict())
 The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked.

 ```julia
-function apply!(o::Momentum, x, Δ)
+function Flux.Optimise.apply!(o::Momentum, x, Δ)
  η, ρ = o.eta, o.rho
  v = get!(o.velocity, x, zero(x))::typeof(x)
  @. v = ρ * v - η * Δ
@ -139,3 +140,16 @@ ExpDecay
 InvDecay
 WeightDecay
 ```
+
+## Gradient Clipping
+
+Gradient clipping is useful for training recurrent neural networks, which have a tendency to suffer from the exploding gradient problem. An example usage is:
+
+```julia
+opt = Optimiser(ClipValue(1e-3), ADAM(1e-3))
+```
+
+```@docs
+ClipValue
+ClipNorm
+```
--- a/docs/src/training/training.md
+++ b/docs/src/training/training.md
@ -32,6 +32,7 @@ Flux.train!(loss, ps, data, opt)
 ```

 The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want.
+For a list of all built-in loss functions, check out the [layer reference](../models/layers.md).

 At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimizing the calculations.

@ -94,6 +95,10 @@ julia> @epochs 2 Flux.train!(...)
 # Train for two epochs
 ```

+```@docs
+Flux.@epochs
+```
+
 ## Callbacks

 `train!` takes an additional argument, `cb`, that's used for callbacks so that you can observe the training process. For example:
@ -137,7 +142,7 @@ function my_custom_train!(loss, ps, data, opt)
  for d in data
    gs = gradient(ps) do
      training_loss = loss(d...)
-      # Insert what ever code you want here that needs Training loss, e.g. logging
+      # Insert whatever code you want here that needs the training loss, e.g. logging
      return training_loss
    end
    # insert what ever code you want here that needs gradient
--- a/docs/src/utilities.md
+++ b/docs/src/utilities.md
@ -0,0 +1,49 @@
+# Utility Functions
+
+Flux contains some utility functions for working with data; these functions
+help create inputs for your models or batch your dataset.
+Other functions can be used to initialize your layers or to regularly execute
+callback functions.
+
+## Working with Data
+
+```@docs
+Flux.unsqueeze
+Flux.stack
+Flux.unstack
+Flux.chunk
+Flux.frequencies
+Flux.batch
+Flux.batchseq
+Base.rpad(v::AbstractVector, n::Integer, p)
+```
+
+## Layer Initialization
+
+These are primarily useful if you are planning to write your own layers.
+Flux initializes convolutional layers and recurrent cells with `glorot_uniform`
+by default.
+To change the default on an applicable layer, pass the desired function with the
+`init` keyword. For example:
+```jldoctest; setup = :(using Flux)
+julia> conv = Conv((3, 3), 1 => 8, relu; init=Flux.glorot_normal)
+Conv((3, 3), 1=>8, relu)
+```
+
+```@docs
+Flux.glorot_uniform
+Flux.glorot_normal
+```
+
+## Model Abstraction
+
+```@docs
+Flux.destructure
+```
+
+## Callback Helpers
+
+```@docs
+Flux.throttle
+Flux.stop
+```
--- a/src/Flux.jl
+++ b/src/Flux.jl
@ -3,29 +3,33 @@ module Flux
 # Zero Flux Given

 using Base: tail
-using Zygote, MacroTools, Juno, Reexport, Statistics, Random
+using Statistics, Random, LinearAlgebra
+using Zygote, MacroTools, Juno, Reexport
 using MacroTools: @forward
@reexport using NNlib
 using Zygote: Params, @adjoint, gradient, pullback, @nograd

 export gradient

-export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool,
+export Chain, Dense, Maxout, RNN, LSTM, GRU, SamePad, Conv, CrossCor, ConvTranspose,
+       GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, flatten,
       DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
       SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode!

 include("optimise/Optimise.jl")
 using .Optimise
 using .Optimise: @epochs
-export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
+export Descent, ADAM, Momentum, Nesterov, RMSProp,
  ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
-  ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay
+  ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay,
+  ClipValue, ClipNorm


 using CuArrays
 const use_cuda = Ref(false)

 include("utils.jl")
+include("zeros.jl")
 include("onehot.jl")
 include("functor.jl")

--- a/src/data/Data.jl
+++ b/src/data/Data.jl
@ -51,4 +51,6 @@ export Iris
 include("housing.jl")
 export Housing

+@deprecate DataLoader(x...; kws...) DataLoader(x; kws...)
+
 end
--- a/src/data/cmudict.jl
+++ b/src/data/cmudict.jl
@ -24,18 +24,35 @@ function load()
  end
 end

+"""
+    phones()
+
+Return a `Vector` containing the phones used in the CMU Pronouncing Dictionary.
+"""
 function phones()
  load()
  Symbol.(first.(split.(split(read(deps("cmudict", "cmudict.phones"),String),
                        "\n", keepempty = false), "\t")))
 end

+"""
+    symbols()
+
+Return a `Vector` containing the symbols used in the CMU Pronouncing Dictionary.
+A symbol is a phone with optional auxiliary symbols, indicating for example the
+amount of stress on the phone.
+"""
 function symbols()
  load()
  Symbol.(split(read(deps("cmudict", "cmudict.symbols"),String),
                "\n", keepempty = false))
 end

+"""
+    rawdict()
+
+Return the unfiltered CMU Pronouncing Dictionary.
+"""
 function rawdict()
  load()
  Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in
@ -44,6 +61,14 @@ end

 validword(s) = isascii(s) && occursin(r"^[\w\-\.]+$", s)

+"""
+    cmudict()
+
+Return a filtered CMU Pronouncing Dictionary.
+
+It is filtered so each word contains only ASCII characters and a combination of
+word characters (as determined by the regex engine using `\\w`), '-' and '.'.
+"""
 cmudict() = filter(p -> validword(p.first), rawdict())

 alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.']
--- a/src/data/dataloader.jl
+++ b/src/data/dataloader.jl
@ -1,7 +1,7 @@
 # Adapted from Knet's src/data.jl (author: Deniz Yuret)

-struct DataLoader
-    data
+struct DataLoader{D}
+    data::D
    batchsize::Int
    nobs::Int
    partial::Bool
@ -11,37 +11,43 @@ struct DataLoader
 end

 """
-    DataLoader(data...; batchsize=1, shuffle=false, partial=true)
+    DataLoader(data; batchsize=1, shuffle=false, partial=true)

 An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations
 (except possibly the last one). 

-Takes as input one or more data tensors, e.g. X in unsupervised learning, X and Y in 
-supervised learning. The last dimension in each tensor is considered to be the observation
-dimension. 
+Takes as input a single data tensor, or a tuple (or a named tuple) of tensors.
+The last dimension in each tensor is considered to be the observation dimension.

 If `shuffle=true`, shuffles the observations each time iterations are re-started.
 If `partial=false`, drops the last mini-batch if it is smaller than the batchsize.

-The original data is preserved as a tuple in the `data` field of the DataLoader. 
+The original data is preserved in the `data` field of the DataLoader. 

-Example usage:
+Usage example:

    Xtrain = rand(10, 100)
    train_loader = DataLoader(Xtrain, batchsize=2) 
    # iterate over 50 mini-batches of size 2
-    for x in train_loader: 
+    for x in train_loader
        @assert size(x) == (10, 2)
        ...
    end

    train_loader.data   # original dataset

+    # similar, but yielding tuples
+    train_loader = DataLoader((Xtrain,), batchsize=2) 
+    for (x,) in train_loader
+        @assert size(x) == (10, 2)
+        ...
+    end
+
    Xtrain = rand(10, 100)
    Ytrain = rand(100)
-    train_loader = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true) 
+    train_loader = DataLoader((Xtrain, Ytrain), batchsize=2, shuffle=true) 
    for epoch in 1:100
-        for (x, y) in train_loader: 
+        for (x, y) in train_loader
            @assert size(x) == (10, 2)
            @assert size(y) == (2,)
            ...
@ -51,26 +57,26 @@ Example usage:
    # train for 10 epochs
    using IterTools: ncycle 
    Flux.train!(loss, ps, ncycle(train_loader, 10), opt)
+
+    # can use NamedTuple to name tensors
+    train_loader = DataLoader((images=Xtrain, labels=Ytrain), batchsize=2, shuffle=true)
+    for datum in train_loader
+        @assert size(datum.images) == (10, 2)
+        @assert size(datum.labels) == (2,)
+    end
 """
-function DataLoader(data...; batchsize=1, shuffle=false, partial=true)
-    length(data) > 0 || throw(ArgumentError("Need at least one data input"))
+function DataLoader(data; batchsize=1, shuffle=false, partial=true)
    batchsize > 0 || throw(ArgumentError("Need positive batchsize"))
    
-    nx = size(data[1])[end]
-    for i=2:length(data)
-        nx != size(data[i])[end] && throw(DimensionMismatch("All data should contain same number of observations"))
+    n = _nobs(data) 
+    if n < batchsize
+        @warn "Number of observations less than batchsize, decreasing the batchsize to $n"
+        batchsize = n
    end
-    if nx < batchsize
-        @warn "Number of data points less than batchsize, decreasing the batchsize to $nx"
-        batchsize = nx
-    end
-    imax = partial ? nx : nx - batchsize + 1
-    ids = 1:min(nx, batchsize)
-    DataLoader(data, batchsize, nx, partial, imax, [1:nx;], shuffle)
+    imax = partial ? n : n - batchsize + 1
+    DataLoader(data, batchsize, n, partial, imax, [1:n;], shuffle)
 end

-getdata(x::AbstractArray, ids) = x[(Base.Colon() for _=1:ndims(x)-1)..., ids]
-
@propagate_inbounds function Base.iterate(d::DataLoader, i=0)     # returns data in d.indices[i+1:i+batchsize]
    i >= d.imax && return nothing
    if d.shuffle && i == 0
@ -78,15 +84,27 @@ getdata(x::AbstractArray, ids) = x[(Base.Colon() for _=1:ndims(x)-1)..., ids]
    end
    nexti = min(i + d.batchsize, d.nobs)
    ids = d.indices[i+1:nexti]
-    if length(d.data) == 1
-        batch = getdata(d.data[1], ids)
-    else
-        batch = ((getdata(x, ids) for x in d.data)...,)
-    end
+    batch = _getobs(d.data, ids)
    return (batch, nexti)
 end

 function Base.length(d::DataLoader)
    n = d.nobs / d.batchsize
    d.partial ? ceil(Int,n) : floor(Int,n)
-end
+end
+
+_nobs(data::AbstractArray) = size(data)[end]
+
+function _nobs(data::Union{Tuple, NamedTuple})
+    length(data) > 0 || throw(ArgumentError("Need at least one data input"))
+    n = _nobs(data[1])
+    if !all(x -> _nobs(x) == n, Base.tail(data))
+        throw(DimensionMismatch("All data should contain same number of observations"))
+    end
+    return n
+end
+
+_getobs(data::AbstractArray, i) = data[ntuple(i -> Colon(), Val(ndims(data) - 1))..., i]
+_getobs(data::Union{Tuple, NamedTuple}, i) = map(Base.Fix2(_getobs, i), data)
+
+Base.eltype(::DataLoader{D}) where D = D
--- a/src/data/fashion-mnist.jl
+++ b/src/data/fashion-mnist.jl
@ -33,9 +33,10 @@ const TESTLABELS = joinpath(dir, "t10k-labels-idx1-ubyte")

 Load the Fashion-MNIST images.

-Each image is a 28×28 array of `Gray` colour values (see Colors.jl).
+Each image is a 28×28 array of `Gray` colour values
+(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)).

-Returns the 60,000 training images by default; pass `:test` to retreive the
+Return the 60,000 training images by default; pass `:test` to retrieve the
 10,000 test images.
 """
 function images(set = :train)
@ -49,10 +50,10 @@ end
    labels()
    labels(:test)

-Load the labels corresponding to each of the images returned from `images()`.
+Load the labels corresponding to each of the images returned from [`images()`](@ref).
 Each label is a number from 0-9.

-Returns the 60,000 training labels by default; pass `:test` to retreive the
+Return the 60,000 training labels by default; pass `:test` to retrieve the
 10,000 test labels.
 """
 function labels(set = :train)
--- a/src/data/housing.jl
+++ b/src/data/housing.jl
@ -50,7 +50,7 @@ function load()
    isfile(deps("housing.data")) && return
    
    @info "Downloading the Boston housing Dataset"
-    download_and_verify("$(cache_prefix)https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
+    download_and_verify("$(cache_prefix)http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
                        deps("housing.data"),
                        "baadf72995725d76efe787b664e1f083388c79ba21ef9a7990d87f774184735a")
    
--- a/src/data/iris.jl
+++ b/src/data/iris.jl
@ -2,13 +2,12 @@
 Fisher's classic iris dataset.

 Measurements from 3 different species of iris: setosa, versicolor and
-virginica.  There are 50 examples of each species.
+virginica. There are 50 examples of each species.

-There are 4 measurements for each example: sepal length, sepal width, petal
-length and petal width.  The measurements are in centimeters.
+There are 4 measurements for each example: sepal length, sepal width,
+petal length and petal width. The measurements are in centimeters.

 The module retrieves the data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/iris).
-
 """
 module Iris

@ -33,9 +32,7 @@ end
 Get the labels of the iris dataset, a 150 element array of strings listing the
 species of each example.

-```jldoctest
-julia> using Flux
-
+```jldoctest; setup = :(Flux.Data.Iris.load())
 julia> labels = Flux.Data.Iris.labels();

 julia> summary(labels)
@ -54,13 +51,11 @@ end
 """
    features()

-Get the features of the iris dataset.  This is a 4x150 matrix of Float64
-elements.  It has a row for each feature (sepal length, sepal width,
+Get the features of the iris dataset. This is a 4x150 matrix of Float64
+elements. It has a row for each feature (sepal length, sepal width,
 petal length, petal width) and a column for each example.

-```jldoctest
-julia> using Flux
-
+```jldoctest; setup = :(Flux.Data.Iris.load())
 julia> features = Flux.Data.Iris.features();

 julia> summary(features)
--- a/src/data/mnist.jl
+++ b/src/data/mnist.jl
@ -83,9 +83,10 @@ getfeatures(io::IO, index::Integer) = vec(getimage(io, index))

 Load the MNIST images.

-Each image is a 28×28 array of `Gray` colour values (see Colors.jl).
+Each image is a 28×28 array of `Gray` colour values
+(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)).

-Returns the 60,000 training images by default; pass `:test` to retreive the
+Return the 60,000 training images by default; pass `:test` to retrieve the
 10,000 test images.
 """
 function images(set = :train)
@ -99,10 +100,10 @@ end
    labels()
    labels(:test)

-Load the labels corresponding to each of the images returned from `images()`.
+Load the labels corresponding to each of the images returned from [`images()`](@ref).
 Each label is a number from 0-9.

-Returns the 60,000 training labels by default; pass `:test` to retreive the
+Return the 60,000 training labels by default; pass `:test` to retrieve the
 10,000 test labels.
 """
 function labels(set = :train)
--- a/src/data/sentiment.jl
+++ b/src/data/sentiment.jl
@ -1,3 +1,4 @@
+"Stanford Sentiment Treebank dataset."
 module Sentiment

 using ZipFile
@ -39,8 +40,28 @@ function gettrees(name)
  return parsetree.(ss)
 end

+"""
+    train()
+
+Return the train split of the Stanford Sentiment Treebank.
+The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
+"""
 train() = gettrees("train")
+
+"""
+    test()
+
+Return the test split of the Stanford Sentiment Treebank.
+The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
+"""
 test() = gettrees("test")
+
+"""
+    dev()
+
+Return the dev split of the Stanford Sentiment Treebank.
+The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
+"""
 dev() = gettrees("dev")

 end
--- a/src/functor.jl
+++ b/src/functor.jl
@ -1,41 +1,6 @@
 import Adapt: adapt, adapt_storage
 using Zygote: IdSet
-
-functor(x) = (), _ -> x
-
-functor(x::Tuple) = x, y -> y
-functor(x::NamedTuple) = x, y -> y
-
-functor(x::AbstractArray) = x, y -> y
-functor(x::AbstractArray{<:Number}) = (), _ -> x
-
-function makefunctor(m::Module, T, fs = fieldnames(T))
-  @eval m begin
-    Flux.functor(x::$T) = ($([:($f=x.$f) for f in fs]...),), y -> $T(y...)
-  end
-end
-
-function functorm(T, fs = nothing)
-  fs == nothing || isexpr(fs, :tuple) || error("@functor T (a, b)")
-  fs = fs == nothing ? [] : [:($(map(QuoteNode, fs.args)...),)]
-  :(makefunctor(@__MODULE__, $(esc(T)), $(fs...)))
-end
-
-macro functor(args...)
-  functorm(args...)
-end
-
-isleaf(x) = functor(x)[1] === ()
-
-function fmap1(f, x)
-  func, re = functor(x)
-  re(map(f, func))
-end
-
-function fmap(f, x; cache = IdDict())
-  haskey(cache, x) && return cache[x]
-  cache[x] = isleaf(x) ? f(x) : fmap1(x -> fmap(f, x, cache = cache), x)
-end
+import Functors: @functor, functor, fmap

 trainable(m) = functor(m)[1]

@ -59,7 +24,7 @@ testmode!(m, mode = true) = m
    trainmode!(m, mode = true)

 Set a layer or model's train mode (see below).
-Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)).
+Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`).

 _Note_: if you manually set a model into train mode, you need to manually place
 it into test mode during testing phase.
--- a/src/layers/basic.jl
+++ b/src/layers/basic.jl
@ -4,17 +4,23 @@
 Chain multiple layers / functions together, so that they are called in sequence
 on a given input.

-```julia
-m = Chain(x -> x^2, x -> x+1)
-m(5) == 26
-
-m = Chain(Dense(10, 5), Dense(5, 2))
-x = rand(10)
-m(x) == m[2](m[1](x))
-```
-
 `Chain` also supports indexing and slicing, e.g. `m[2]` or `m[1:end-1]`.
 `m[1:3](x)` will calculate the output of the first three layers.
+
+# Examples
+```jldoctest
+julia> m = Chain(x -> x^2, x -> x+1);
+
+julia> m(5) == 26
+true
+
+julia> m = Chain(Dense(10, 5), Dense(5, 2));
+
+julia> x = rand(10);
+
+julia> m(x) == m[2](m[1](x))
+true
+```
 """
 struct Chain{T<:Tuple}
  layers::T
@ -24,7 +30,7 @@ end
@forward Chain.layers Base.getindex, Base.length, Base.first, Base.last,
  Base.iterate, Base.lastindex

-functor(c::Chain) = c.layers, ls -> Chain(ls...)
+functor(::Type{<:Chain}, c) = c.layers, ls -> Chain(ls...)

 applychain(::Tuple{}, x) = x
 applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x))
@ -60,6 +66,7 @@ outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers))(
 # only slightly changed to better handle interaction with Zygote @dsweber2
 """
    activations(c::Chain, input)
+
 Calculate the forward results of each layer in Chain `c` with `input` as model input.
 """
 function activations(c::Chain, input)
@ -78,24 +85,24 @@ extraChain(::Tuple{}, x) = ()
 """
    Dense(in::Integer, out::Integer, σ = identity)

-Creates a traditional `Dense` layer with parameters `W` and `b`.
+Create a traditional `Dense` layer with parameters `W` and `b`.

    y = σ.(W * x .+ b)

 The input `x` must be a vector of length `in`, or a batch of vectors represented
 as an `in × N` matrix. The output `y` will be a vector or batch of length `out`.

-```julia
+# Examples
+```jldoctest; setup = :(using Random; Random.seed!(0))
 julia> d = Dense(5, 2)
 Dense(5, 2)

 julia> d(rand(5))
-Tracked 2-element Array{Float64,1}:
-  0.00257447
-  -0.00449443
-```
+2-element Array{Float32,1}:
+  -0.16210233
+   0.12311903```
 """
-struct Dense{F,S,T}
+struct Dense{F,S<:AbstractArray,T<:AbstractArray}
  W::S
  b::T
  σ::F
@ -145,7 +152,7 @@ outdims(l::Dense, isize) = (size(l.W)[1],)
 """
    Diagonal(in::Integer)

-Creates an element-wise linear transformation layer with learnable
+Create an element-wise linear transformation layer with learnable
 vectors `α` and `β`:

    y = α .* x .+ β
@ -176,18 +183,11 @@ outdims(l::Diagonal, isize) = (length(l.α),)
 """
    Maxout(over)

-`Maxout` is a neural network layer, which has a number of internal layers,
-which all have the same input, and the maxout returns the elementwise maximium
-of the internal layers' outputs.
+The [Maxout](https://arxiv.org/pdf/1302.4389.pdf) layer has a number of
+internal layers which all receive the same input. It returns the elementwise
+maximum of the internal layers' outputs.

 Maxout over linear dense layers satisfies the universal approximation theorem.
-
-Reference:
-Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville, and Yoshua Bengio.
-2013. Maxout networks.
-In Proceedings of the 30th International Conference on International Conference on Machine Learning - Volume 28 (ICML'13),
-Sanjoy Dasgupta and David McAllester (Eds.), Vol. 28. JMLR.org III-1319-III-1327.
-https://arxiv.org/pdf/1302.4389.pdf
 """
 struct Maxout{FS<:Tuple}
    over::FS
@ -196,17 +196,18 @@ end
 """
    Maxout(f, n_alts)

-Constructs a Maxout layer over `n_alts` instances of  the layer given  by `f`.
-The function takes no arguement and should return some callable layer.
-Conventionally this is a linear dense layer.
+Construct a Maxout layer over `n_alts` instances of the layer given by `f`.
+The function takes no arguments and should return some callable layer.
+Conventionally, this is a linear dense layer.

-For example the following example which
-will construct a `Maxout` layer over 4 internal dense linear layers,
-each identical in structure (784 inputs, 128 outputs).
+# Examples
+
+This constructs a `Maxout` layer over 4 internal dense linear layers, each
+identical in structure (784 inputs, 128 outputs):
 ```julia
-    insize = 784
-    outsize = 128
-    Maxout(()->Dense(insize, outsize), 4)
+insize = 784
+outsize = 128
+Maxout(()->Dense(insize, outsize), 4)
 ```
 """
 function Maxout(f, n_alts)
@ -223,16 +224,18 @@ end
 outdims(l::Maxout, isize) = outdims(first(l.over), isize)

 """
-    SkipConnection(layers, connection)
+    SkipConnection(layer, connection)

-Creates a Skip Connection, of a layer or `Chain` of consecutive layers
-plus a shortcut connection. The connection function will combine the result of the layers
-with the original input, to give the final output.
+Create a skip connection which consists of a layer or `Chain` of consecutive
+layers and a shortcut connection linking the block's input to the output
+through a user-supplied 2-argument callable. The first argument to the callable
+will be propagated through the given `layer` while the second is the unchanged,
+"skipped" input.

-The simplest 'ResNet'-type connection is just `SkipConnection(layer, +)`,
+The simplest "ResNet"-type connection is just `SkipConnection(layer, +)`,
 and requires the output of the layers to be the same shape as the input.
 Here is a more complicated example:
-```
+```julia
 m = Conv((3,3), 4=>7, pad=(1,1))
 x = ones(5,5,4,10);
 size(m(x)) == (5, 5, 7, 10)
--- a/src/layers/conv.jl
+++ b/src/layers/conv.jl
@ -7,26 +7,60 @@ _convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+

 expand(N, i::Tuple) = i
 expand(N, i::Integer) = ntuple(_ -> i, N)
+
 """
-    Conv(size, in=>out)
-    Conv(size, in=>out, relu)
+    SamePad

-Standard convolutional layer. `size` should be a tuple like `(2, 2)`.
-`in` and `out` specify the number of input and output channels respectively.
+Padding for convolutional layers will be calculated so that outputshape == inputshape when stride = 1.

-Example: Applying Conv layer to a 1-channel input using a 2x2 window size,
-         giving us a 16-channel output. Output is activated with ReLU.
+For stride > 1 the output shape depends on the type of convolution layer.
+"""
+struct SamePad end

-    size = (2,2)
+calc_padding(pad, k::NTuple{N,T}, dilation, stride) where {T,N}= expand(Val(2*N), pad)
+function calc_padding(::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T}
+  #Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/pdf/1603.07285
+
+  # Effective kernel size, including dilation
+  k_eff = @. k + (k - 1) * (dilation - 1)
+  # How much total padding needs to be applied?
+  pad_amt = @. k_eff - 1
+  # In case amount of padding is odd we need to apply different amounts to each side.
+  return Tuple(mapfoldl(i -> [ceil(Int, i/2), floor(Int, i/2)], vcat, pad_amt))
+end
+
+"""
+    Conv(filter, in => out, σ = identity; init = glorot_uniform,
+         stride = 1, pad = 0, dilation = 1)
+
+    filter = (2,2)
    in = 1
    out = 16
    Conv((2, 2), 1=>16, relu)

+Standard convolutional layer. `filter` should be a tuple like `(2, 2)`.
+`in` and `out` specify the number of input and output channels respectively.
+
 Data should be stored in WHCN order (width, height, # channels, batch size).
 In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.

+Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
+Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+
 Takes the keyword arguments `pad`, `stride` and `dilation`.
+Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
+
+# Examples
+
+Apply a `Conv` layer to a 1-channel input using a 2×2 window filter size, giving us a
+16-channel output. Output is activated with ReLU.
+```julia
+filter = (2,2)
+in = 1
+out = 16
+Conv(filter, in => out, relu)
+```
 """
 struct Conv{N,M,F,A,V}
  σ::F
@ -37,25 +71,68 @@ struct Conv{N,M,F,A,V}
  dilation::NTuple{N,Int}
 end

-function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
+"""
+    Conv(weight::AbstractArray, bias::AbstractArray)
+    Conv(weight::AbstractArray, bias::AbstractArray, activation)
+
+Constructs the convolutional layer with user defined weight and bias arrays.
+
+Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+
+There is also a keyword-only constructor available for all convolutional
+layers.
+
+```julia
+weight = rand(Float32, 3, 3, 5)
+bias = zeros(Float32, 5)
+Conv(weight = weight,
+    bias = bias,
+    activation = sigmoid)
+```
+"""
+function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
              stride = 1, pad = 0, dilation = 1) where {T,N}
  stride = expand(Val(N-2), stride)
-  pad = expand(Val(2*(N-2)), pad)
  dilation = expand(Val(N-2), dilation)
+  pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
  return Conv(σ, w, b, stride, pad, dilation)
 end

-Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-     init = glorot_uniform,  stride = 1, pad = 0, dilation = 1) where N =
-  Conv(init(k..., ch...), zeros(ch[2]), σ,
-       stride = stride, pad = pad, dilation = dilation)
+function Conv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+              activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
+  Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
+end
+
+"""
+    convfilter(filter::Tuple, in=>out)
+
+Constructs a standard convolutional weight matrix with given `filter` and
+channels from `in` to `out`.
+
+Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
+distribution.
+
+See also: [`depthwiseconvfilter`](@ref)
+"""
+convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
+          init = glorot_uniform) where N = init(filter..., ch...)
+
+function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+            init = glorot_uniform,  stride = 1, pad = 0, dilation = 1,
+            weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
+
+  Conv(weight, bias, σ,
+      stride = stride, pad = pad, dilation = dilation)
+end

@functor Conv

 function (c::Conv)(x::AbstractArray)
  # TODO: breaks gpu broadcast :(
  # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
-  σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
+  σ, b = c.σ, reshape(c.bias, ntuple(_->1, length(c.stride))..., :, 1)
  cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
  σ.(conv(x, c.weight, cdims) .+ b)
 end
@ -76,8 +153,8 @@ end
 """
    outdims(l::Conv, isize::Tuple)

-Calculate the output dimensions given the input dimensions, `isize`.
-Batch size and channel size are ignored as per `NNlib.jl`.
+Calculate the output dimensions given the input dimensions `isize`.
+Batch size and channel size are ignored as per [NNlib.jl](https://github.com/FluxML/NNlib.jl).

 ```julia
 m = Conv((3, 3), 3 => 16)
@ -89,16 +166,23 @@ outdims(l::Conv, isize) =
  output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))

 """
-    ConvTranspose(size, in=>out)
-    ConvTranspose(size, in=>out, relu)
+    ConvTranspose(filter, in=>out)
+    ConvTranspose(filter, in=>out, activation)
+    ConvTranspose(filter, in => out, σ = identity; init = glorot_uniform,
+                  stride = 1, pad = 0, dilation = 1)

-Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`.
+Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`.
 `in` and `out` specify the number of input and output channels respectively.

-Data should be stored in WHCN order. In other words, a 100×100 RGB image would
-be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array.
+Data should be stored in WHCN order (width, height, # channels, batch size).
+In other words, a 100×100 RGB image would be a `100×100×3×1` array,
+and a batch of 50 would be a `100×100×3×50` array.
+
+Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
+Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.

 Takes the keyword arguments `pad`, `stride` and `dilation`.
+Use `pad=SamePad()` to apply padding so that outputsize == stride * inputsize - stride + 1.
 """
 struct ConvTranspose{N,M,F,A,V}
  σ::F
@ -109,18 +193,39 @@ struct ConvTranspose{N,M,F,A,V}
  dilation::NTuple{N,Int}
 end

-function ConvTranspose(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
-              stride = 1, pad = 0, dilation = 1) where {T,N}
+"""
+    ConvTranspose(weight::AbstractArray, bias::AbstractArray)
+    ConvTranspose(weight::AbstractArray, bias::AbstractArray, activation)
+
+Constructs the convolutional transpose layer with user defined weight and bias arrays
+for use in the forward pass.
+
+Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+
+For the keyword-only constructor, see also [`Conv`](@ref).
+"""
+function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+                      stride = 1, pad = 0, dilation = 1) where {T,N}
  stride = expand(Val(N-2), stride)
-  pad = expand(Val(2*(N-2)), pad)
  dilation = expand(Val(N-2), dilation)
+  pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
  return ConvTranspose(σ, w, b, stride, pad, dilation)
 end

-ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-              init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
-ConvTranspose(init(k..., reverse(ch)...), zeros(ch[2]), σ,
+function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+                        activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
+  ConvTranspose(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
+end
+
+function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+                      init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
+                      weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N
+
+  ConvTranspose(weight, bias, σ,
              stride = stride, pad = pad, dilation = dilation)
+end

@functor ConvTranspose

@ -132,9 +237,9 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
    batch_size = size(x)[end]
    # Create DenseConvDims() that looks like the corresponding conv()
    return DenseConvDims((I..., C_in, batch_size), size(c.weight);
-        stride=c.stride,
-        padding=c.pad,
-        dilation=c.dilation,
+                        stride=c.stride,
+                        padding=c.pad,
+                        dilation=c.dilation,
    )
 end

@ -145,7 +250,7 @@ function (c::ConvTranspose)(x::AbstractArray)
  # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
  σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
  cdims = conv_transpose_dims(c, x)
-  return σ.(∇conv_data(x, c.weight, cdims) .+ b)
+  σ.(∇conv_data(x, c.weight, cdims) .+ b)
 end

 function Base.show(io::IO, l::ConvTranspose)
@ -164,17 +269,24 @@ end
 outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad)

 """
-    DepthwiseConv(size, in=>out)
-    DepthwiseConv(size, in=>out, relu)
+    DepthwiseConv(filter::Tuple, in=>out)
+    DepthwiseConv(filter::Tuple, in=>out, activation)
+    DepthwiseConv(filter, in => out, σ = identity; init = glorot_uniform,
+                  stride = 1, pad = 0, dilation = 1)

-Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`.
+Depthwise convolutional layer. `filter` should be a tuple like `(2, 2)`.
 `in` and `out` specify the number of input and output channels respectively.
 Note that `out` must be an integer multiple of `in`.

-Data should be stored in WHCN order. In other words, a 100×100 RGB image would
-be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array.
+Data should be stored in WHCN order (width, height, # channels, batch size).
+In other words, a 100×100 RGB image would be a `100×100×3×1` array,
+and a batch of 50 would be a `100×100×3×50` array.
+
+Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
+Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.

 Takes the keyword arguments `pad`, `stride` and `dilation`.
+Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
 """
 struct DepthwiseConv{N,M,F,A,V}
  σ::F
@ -185,20 +297,54 @@ struct DepthwiseConv{N,M,F,A,V}
  dilation::NTuple{N,Int}
 end

-function DepthwiseConv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
-                       stride = 1, pad = 0, dilation = 1) where {T,N}
+"""
+    DepthwiseConv(weight::AbstractArray, bias::AbstractArray)
+    DepthwiseConv(weight::AbstractArray, bias::AbstractArray, activation)
+
+Constructs the `DepthwiseConv` layer with user defined weight and bias arrays
+for use in the forward pass.
+
+Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+
+For the keyword-only constructor, see also [`Conv`](@ref).
+"""
+function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+                      stride = 1, pad = 0, dilation = 1) where {T,N}
  stride = expand(Val(N-2), stride)
-  pad = expand(Val(2*(N-2)), pad)
  dilation = expand(Val(N-2), dilation)
+  pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
  return DepthwiseConv(σ, w, b, stride, pad, dilation)
 end

+function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+                      activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
+  DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
+end
+
+"""
+    depthwiseconvfilter(filter::Tuple, in=>out)
+
+Constructs a depthwise convolutional weight array defined by `filter` and channels
+from `in` to `out`.
+
+Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
+distribution.
+
+See also: [`convfilter`](@ref)
+"""
+depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
+                    init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1])
+
 function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-     init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N
+                      init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
+                      weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N
  @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels"
+
  return DepthwiseConv(
-    init(k..., div(ch[2], ch[1]), ch[1]),
-    zeros(ch[2]),
+    weight,
+    bias,
    σ;
    stride = stride,
    pad = pad,
@ -231,25 +377,34 @@ outdims(l::DepthwiseConv, isize) =
  output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))

 """
-    CrossCor(size, in=>out)
-    CrossCor(size, in=>out, relu)
+    CrossCor(filter, in=>out)
+    CrossCor(filter, in=>out, activation)
+    CrossCor(filter, in => out, σ = identity; init = glorot_uniform,
+             stride = 1, pad = 0, dilation = 1)

-Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`.
+Standard cross convolutional layer. `filter` should be a tuple like `(2, 2)`.
 `in` and `out` specify the number of input and output channels respectively.

-Example: Applying CrossCor layer to a 1-channel input using a 2x2 window size,
-         giving us a 16-channel output. Output is activated with ReLU.
-
-    size = (2,2)
-    in = 1
-    out = 16
-    CrossCor((2, 2), 1=>16, relu)
-
-Data should be stored in WHCN order (width, height, # channels, # batches).
+Data should be stored in WHCN order (width, height, # channels, batch size).
 In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.

+Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
+Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+
 Takes the keyword arguments `pad`, `stride` and `dilation`.
+Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
+
+# Examples
+
+Apply a `CrossCor` layer to a 1-channel input using a 2×2 window filter size, giving us a
+16-channel output. Output is activated with ReLU.
+```julia
+filter = (2,2)
+in = 1
+out = 16
+CrossCor((2, 2), 1=>16, relu)
+```
 """
 struct CrossCor{N,M,F,A,V}
  σ::F
@ -260,18 +415,39 @@ struct CrossCor{N,M,F,A,V}
  dilation::NTuple{N,Int}
 end

-function CrossCor(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
-              stride = 1, pad = 0, dilation = 1) where {T,N}
+"""
+    CrossCor(weight::AbstractArray, bias::AbstractArray)
+    CrossCor(weight::AbstractArray, bias::AbstractArray, activation)
+
+Constructs the standard cross convolutional layer with user defined weight and bias
+arrays.
+
+Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+
+For the keyword-only constructor, see also [`Conv`](@ref).
+"""
+function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+                  stride = 1, pad = 0, dilation = 1) where {T,N}
  stride = expand(Val(N-2), stride)
-  pad = expand(Val(2*(N-2)), pad)
  dilation = expand(Val(N-2), dilation)
+  pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
  return CrossCor(σ, w, b, stride, pad, dilation)
 end

-CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-     init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
-  CrossCor(init(k..., ch...), zeros(ch[2]), σ,
+function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},  # keyword-only form: `weight` and `bias` have no defaults, so both are required
+                      activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
+  CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)  # forwards to the positional (weight, bias, activation) method
+end
+
+function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+                  init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
+                  weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N  # `init` is only consulted by the default `weight`; the default `bias` has length ch[2]
+
+  CrossCor(weight, bias, σ,  # delegate to the (weight, bias, σ) method
       stride = stride, pad = pad, dilation = dilation)
+end

@functor CrossCor

@ -305,11 +481,62 @@ outdims(l::CrossCor, isize) =
  output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))

 """
-    MaxPool(k)
+    GlobalMaxPool()

-Max pooling layer. `k` stands for the size of the window for each dimension of the input.
+Global max pooling layer.

-Takes the keyword arguments `pad` and `stride`.
+Transforms (w,h,c,b)-shaped input into (1,1,c,b)-shaped output,
+by performing max pooling on the complete (w,h)-shaped feature maps.
+"""
+struct GlobalMaxPool end
+
+function (g::GlobalMaxPool)(x)  # `g` is not referenced in the body
+  # Input size
+  x_size = size(x)
+  # Kernel size
+  k = x_size[1:end-2]  # every dimension except the last two, so the window covers each of them completely
+  # Pooling dimensions
+  pdims = PoolDims(x, k)
+
+  return maxpool(x, pdims)
+end
+
+function Base.show(io::IO, g::GlobalMaxPool)
+  print(io, "GlobalMaxPool()")
+end
+
+"""
+    GlobalMeanPool()
+
+Global mean pooling layer.
+
+Transforms (w,h,c,b)-shaped input into (1,1,c,b)-shaped output,
+by performing mean pooling on the complete (w,h)-shaped feature maps.
+"""
+struct GlobalMeanPool end
+
+function (g::GlobalMeanPool)(x)  # `g` is not referenced in the body
+  # Input size
+  x_size = size(x)
+  # Kernel size
+  k = x_size[1:end-2]  # every dimension except the last two, so the window covers each of them completely
+  # Pooling dimensions
+  pdims = PoolDims(x, k)
+
+  return meanpool(x, pdims)
+end
+
+function Base.show(io::IO, g::GlobalMeanPool)
+  print(io, "GlobalMeanPool()")
+end
+
+"""
+    MaxPool(k; pad = 0, stride = k)
+
+Max pooling layer. `k` is the size of the window for each dimension of the input.
+
+Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
 """
 struct MaxPool{N,M}
  k::NTuple{N,Int}
@ -319,8 +546,7 @@ end

 function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
  stride = expand(Val(N), stride)
-  pad = expand(Val(2*N), pad)
-
+  pad = calc_padding(pad, k, 1, stride)
  return MaxPool(k, pad, stride)
 end

@ -336,11 +562,11 @@ end
 outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))

 """
-    MeanPool(k)
+    MeanPool(k; pad = 0, stride = k)

-Mean pooling layer. `k` stands for the size of the window for each dimension of the input.
+Mean pooling layer. `k` is the size of the window for each dimension of the input.

-Takes the keyword arguments `pad` and `stride`.
+Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
 """
 struct MeanPool{N,M}
    k::NTuple{N,Int}
@ -350,7 +576,7 @@ end

 function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
  stride = expand(Val(N), stride)
-  pad = expand(Val(2*N), pad)
+  pad = calc_padding(pad, k, 1, stride)
  return MeanPool(k, pad, stride)
 end

@ -363,4 +589,4 @@ function Base.show(io::IO, m::MeanPool)
  print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")")
 end

-outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))
+outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@ -10,14 +10,14 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(s
 _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0)

 """
-    dropout(p, dims = :)
+    dropout(x, p; dims = :)

-Dropout function. For each input, either sets that input to `0` (with probability
-`p`) or scales it by `1/(1-p)`. The `dims` argument is to specify the unbroadcasted
-dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is
-used as a regularisation, i.e. it reduces overfitting during training. 
- 
-See also [`Dropout`](@ref).
+The dropout function. For each input, either sets that input to `0` (with probability
+`p`) or scales it by `1 / (1 - p)`. `dims` specifies the unbroadcasted dimensions,
+e.g. `dims=1` applies dropout along columns and `dims=2` along rows.
+This is used as a regularisation, i.e. it reduces overfitting during training.
+
+See also the [`Dropout`](@ref) layer.
 """
 dropout(x, p; dims = :) = x

@ -30,9 +30,9 @@ end
 """
    Dropout(p, dims = :)

-A Dropout layer. In the forward pass, applies the [`dropout`](@ref) function on the input.
+Dropout layer. In the forward pass, apply the [`Flux.dropout`](@ref) function on the input.

-Does nothing to the input once [`testmode!`](@ref) is false.
+Does nothing to the input once [`Flux.testmode!`](@ref) is `true`.
 """
 mutable struct Dropout{F,D}
  p::F
@ -64,12 +64,13 @@ end

 """
    AlphaDropout(p)
-    
-A dropout layer. It is used in Self-Normalizing Neural Networks.
-(https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf)
-The AlphaDropout layer ensures that mean and variance of activations remains the same as before.

-Does nothing to the input once [`testmode!`](@ref) is false.
+A dropout layer. Used in
+[Self-Normalizing Neural Networks](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf).
+The AlphaDropout layer ensures that mean and variance of activations
+remain the same as before.
+
+Does nothing to the input once [`testmode!`](@ref) is `true`.
 """
 mutable struct AlphaDropout{F}
  p::F
@ -100,8 +101,8 @@ testmode!(m::AlphaDropout, mode = true) =
    LayerNorm(h::Integer)

 A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be
-used with recurrent hidden states of size `h`. Normalises the mean/stddev of
-each input before applying a per-neuron gain/bias.
+used with recurrent hidden states of size `h`. Normalises the mean and standard
+deviation of each input before applying a per-neuron gain/bias.
 """
 struct LayerNorm{T}
  diag::Diagonal{T}
@ -123,8 +124,8 @@ end
              initβ = zeros, initγ = ones,
              ϵ = 1e-8, momentum = .1)

-Batch Normalization layer. The `channels` input should be the size of the
-channel dimension in your data (see below).
+[Batch Normalization](https://arxiv.org/pdf/1502.03167.pdf) layer.
+`channels` should be the size of the channel dimension in your data (see below).

 Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For
 a batch of feature vectors this is just the data dimension, for `WHCN` images
@ -136,10 +137,7 @@ per-channel `bias` and `scale` parameters).

 Use [`testmode!`](@ref) during inference.

-See [Batch Normalization: Accelerating Deep Network Training by Reducing
-Internal Covariate Shift](https://arxiv.org/pdf/1502.03167.pdf).
-
-Example:
+# Examples
 ```julia
 m = Chain(
  Dense(28^2, 64),
@ -213,37 +211,6 @@ function Base.show(io::IO, l::BatchNorm)
  print(io, ")")
 end

-
-"""
-    InstanceNorm(channels::Integer, σ = identity;
-                 initβ = zeros, initγ = ones,
-                 ϵ = 1e-8, momentum = .1)
-
-Instance Normalization layer. The `channels` input should be the size of the
-channel dimension in your data (see below).
-
-Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For
-a batch of feature vectors this is just the data dimension, for `WHCN` images
-it's the usual channel dimension.)
-
-`InstanceNorm` computes the mean and variance for each each `W×H×1×1` slice and
-shifts them to have a new mean and variance (corresponding to the learnable,
-per-channel `bias` and `scale` parameters).
-
-Use [`testmode!`](@ref) during inference.
-
-See [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022).
-
-Example:
-```julia
-m = Chain(
-  Dense(28^2, 64),
-  InstanceNorm(64, relu),
-  Dense(64, 10),
-  InstanceNorm(10),
-  softmax)
-```
-"""
 expand_inst = (x, as) -> reshape(repeat(x, outer=[1, as[length(as)]]), as...)

 mutable struct InstanceNorm{F,V,W,N}
@ -258,6 +225,34 @@ mutable struct InstanceNorm{F,V,W,N}
 end

 # TODO: deprecate in v0.11
+"""
+    InstanceNorm(channels::Integer, σ = identity;
+                 initβ = zeros, initγ = ones,
+                 ϵ = 1e-8, momentum = .1)
+
+[Instance Normalization](https://arxiv.org/abs/1607.08022) layer.
+`channels` should be the size of the channel dimension in your data (see below).
+
+Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For
+a batch of feature vectors this is just the data dimension, for `WHCN` images
+it's the usual channel dimension.)
+
+`InstanceNorm` computes the mean and variance for each `W×H×1×1` slice and
+shifts them to have a new mean and variance (corresponding to the learnable,
+per-channel `bias` and `scale` parameters).
+
+Use [`testmode!`](@ref) during inference.
+
+# Examples
+```julia
+m = Chain(
+  Dense(28^2, 64),
+  InstanceNorm(64, relu),
+  Dense(64, 10),
+  InstanceNorm(10),
+  softmax)
+```
+"""
 InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum) = InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing)

 InstanceNorm(chs::Integer, λ = identity;
@ -316,28 +311,27 @@ function Base.show(io::IO, l::InstanceNorm)
 end

 """
-Group Normalization.
-This layer can outperform Batch-Normalization and Instance-Normalization.
+    GroupNorm(chs::Integer, G::Integer, λ = identity;
+              initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i),
+              ϵ = 1f-5, momentum = 0.1f0)

-	GroupNorm(chs::Integer, G::Integer, λ = identity;
-	          initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i),
-	          ϵ = 1f-5, momentum = 0.1f0)
+[Group Normalization](https://arxiv.org/pdf/1803.08494.pdf) layer.
+This layer can outperform Batch Normalization and Instance Normalization.

-``chs`` is the number of channels, the channel dimension of your input.
-For an array of N dimensions, the (N-1)th index is the channel dimension.
+`chs` is the number of channels, the channel dimension of your input.
+For an array of N dimensions, the `N-1`th index is the channel dimension.

-``G`` is the number of groups along which the statistics would be computed.
+`G` is the number of groups along which the statistics are computed.
 The number of channels must be an integer multiple of the number of groups.

 Use [`testmode!`](@ref) during inference.

-Example:
-```
+# Examples
+```julia
 m = Chain(Conv((3,3), 1=>32, leakyrelu;pad = 1),
-          GroupNorm(32,16)) # 32 channels, 16 groups (G = 16), thus 2 channels per group used
+          GroupNorm(32,16))
+          # 32 channels, 16 groups (G = 16), thus 2 channels per group used
 ```
-
-Link : https://arxiv.org/pdf/1803.08494.pdf
 """
 mutable struct GroupNorm{F,V,W,N,T}
  G::T # number of groups
--- a/src/layers/recurrent.jl
+++ b/src/layers/recurrent.jl
@ -12,16 +12,16 @@ in the background. `cell` should be a model of the form:

    h, y = cell(h, x...)

-For example, here's a recurrent network that keeps a running total of its inputs.
+For example, here's a recurrent network that keeps a running total of its inputs:

 ```julia
-accum(h, x) = (h+x, x)
+accum(h, x) = (h + x, x)
 rnn = Flux.Recur(accum, 0)
-rnn(2) # 2
-rnn(3) # 3
-rnn.state # 5
-rnn.(1:10) # apply to a sequence
-rnn.state # 60
+rnn(2)      # 2
+rnn(3)      # 3
+rnn.state   # 5
+rnn.(1:10)  # apply to a sequence
+rnn.state   # 60
 ```
 """
 mutable struct Recur{T}
@ -47,9 +47,10 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")")

 Reset the hidden state of a recurrent layer back to its original value.

-Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to
-
-    rnn.state = hidden(rnn.cell)
+Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to:
+```julia
+rnn.state = hidden(rnn.cell)
+```
 """
 reset!(m::Recur) = (m.state = m.init)
 reset!(m) = foreach(reset!, functor(m)[1])
@ -135,8 +136,8 @@ Base.show(io::IO, l::LSTMCell) =
 """
    LSTM(in::Integer, out::Integer)

-Long Short Term Memory recurrent layer. Behaves like an RNN but generally
-exhibits a longer memory span over sequences.
+[Long Short Term Memory](https://www.researchgate.net/publication/13853244_Long_Short-term_Memory)
+recurrent layer. Behaves like an RNN but generally exhibits a longer memory span over sequences.

 See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
 for a good overview of the internals.
@ -176,8 +177,8 @@ Base.show(io::IO, l::GRUCell) =
 """
    GRU(in::Integer, out::Integer)

-Gated Recurrent Unit layer. Behaves like an RNN but generally
-exhibits a longer memory span over sequences.
+[Gated Recurrent Unit](https://arxiv.org/abs/1406.1078) layer. Behaves like an
+RNN but generally exhibits a longer memory span over sequences.

 See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
 for a good overview of the internals.
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@ -2,7 +2,8 @@
 """
    mae(ŷ, y)

-Return the mean of absolute error `sum(abs.(ŷ .- y)) / length(y)` 
+Return the mean of absolute error; calculated as
+`sum(abs.(ŷ .- y)) / length(y)`.
 """
 mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y)

@ -10,7 +11,14 @@ mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y)
 """
    mse(ŷ, y)

-Return the mean squared error `sum((ŷ .- y).^2) / length(y)`. 
+Return the mean squared error between ŷ and y; calculated as
+`sum((ŷ .- y).^2) / length(y)`.
+
+# Examples
+```jldoctest
+julia> Flux.mse([0, 2], [1, 1])
+1//1
+```
 """
 mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)

@ -18,10 +26,11 @@ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)
 """
    msle(ŷ, y; ϵ=eps(eltype(ŷ)))

-Returns the mean of the squared logarithmic errors `sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) / length(y)`.
-The `ϵ` term provides numerical stability. 
+Return the mean of the squared logarithmic errors; calculated as
+`sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) / length(y)`.
+The `ϵ` term provides numerical stability.

-This error penalizes an under-predicted estimate greater than an over-predicted estimate.
+Penalizes an under-predicted estimate more than an over-predicted estimate.
 """
 msle(ŷ, y; ϵ=eps(eltype(ŷ))) = sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) * 1 // length(y)

@ -30,50 +39,68 @@ msle(ŷ, y; ϵ=eps(eltype(ŷ))) = sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) *
 """
    huber_loss(ŷ, y; δ=1.0)

-Computes the mean of the Huber loss given the prediction `ŷ` and true values `y`. By default, δ is set to 1.0.
+Return the mean of the [Huber loss](https://en.wikipedia.org/wiki/Huber_loss)
+given the prediction `ŷ` and true values `y`.

-                    | 0.5*|ŷ - y|,   for |ŷ - y| <= δ
-      Hubber loss = |
-                    |  δ*(|ŷ - y| - 0.5*δ),  otherwise
-
-[`Huber Loss`](https://en.wikipedia.org/wiki/Huber_loss).
+                 | 0.5 * |ŷ - y|^2,          for |ŷ - y| <= δ
+    Huber loss = |
+                 |  δ * (|ŷ - y| - 0.5 * δ), otherwise
 """
+#TODO: remove dropgrad when Zygote can handle this function with CuArrays
 function huber_loss(ŷ, y;  δ=eltype(ŷ)(1))
   abs_error = abs.(ŷ .- y)
-   temp = abs_error .<  δ
+   temp = Zygote.dropgrad(abs_error .<  δ)
   x = eltype(ŷ)(0.5)
   hub_loss = sum(((abs_error.^2) .* temp) .* x .+ δ*(abs_error .- x*δ) .* (1 .- temp)) * 1 // length(y)
 end

 function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
-  return -sum(y .* log.(ŷ)) * 1 // size(y, 2)
+  return -sum(xlogy.(y, ŷ)) * 1 // size(y, 2)
 end

 function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number)
-  return -sum(y .* log.(ŷ)) .* weight * 1 // size(y, 2)
+  return -sum(xlogy.(y, ŷ)) .* weight * 1 // size(y, 2)
 end

 function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector)
-  return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2)
+  return -sum(xlogy.(y, ŷ) .* weight) * 1 // size(y, 2)
 end

 """
-    crossentropy(ŷ, y; weight=1)
+    crossentropy(ŷ, y; weight = nothing)

-Return the crossentropy computed as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`. 
+Return the cross entropy between the given probability distributions;
+calculated as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`.

-See also [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref).
+`weight` can be `Nothing`, a `Number` or an `AbstractVector`.
+`weight=nothing` acts like `weight=1` but is faster.
+
+See also: [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
+
+# Examples
+```jldoctest
+julia> Flux.crossentropy(softmax([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
+3.085467254747739
+```
 """
 crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight)

 """
-    logitcrossentropy(ŷ, y; weight=1)
+    logitcrossentropy(ŷ, y; weight = 1)

-Return the crossentropy computed after a [softmax](@ref) operation: 
+Return the crossentropy computed after a [`Flux.logsoftmax`](@ref) operation;
+calculated as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`.

-  -sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)
+`logitcrossentropy(ŷ, y)` is mathematically equivalent to
+[`Flux.crossentropy(softmax(ŷ), y)`](@ref) but it is more numerically stable.

-See also [`crossentropy`](@ref), [`binarycrossentropy`](@ref).
+See also: [`Flux.crossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
+
+# Examples
+```jldoctest
+julia> Flux.logitcrossentropy([-1.1491, 0.8619, 0.3127], [1, 1, 0])
+3.085467254747738
+```
 """
 function logitcrossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1)
  return -sum(y .* logsoftmax(ŷ) .* weight) * 1 // size(y, 2)
@ -82,11 +109,22 @@ end
 """
    binarycrossentropy(ŷ, y; ϵ=eps(ŷ))

-Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerical stability.
+Return ``-y*\\log(ŷ + ϵ) - (1-y)*\\log(1-ŷ + ϵ)``. The `ϵ` term provides numerical stability.

 Typically, the prediction `ŷ` is given by the output of a [`sigmoid`](@ref) activation.
+
+See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
+
+# Examples
+```jldoctest
+julia> Flux.binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
+3-element Array{Float64,1}:
+ 1.424397097347566
+ 0.35231664672364077
+ 0.8616703662235441
+```
 """
-binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
+binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ)

 # Re-definition to fix interaction with CuArrays.
 CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
@ -94,10 +132,19 @@ CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1
 """
    logitbinarycrossentropy(ŷ, y)

-`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to `binarycrossentropy(σ(ŷ), y)`
-but it is more numerically stable.
+`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to
+[`Flux.binarycrossentropy(σ(ŷ), y)`](@ref) but it is more numerically stable.

-See also [`binarycrossentropy`](@ref), [`sigmoid`](@ref), [`logsigmoid`](@ref).  
+See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref)
+
+# Examples
+```jldoctest
+julia> Flux.logitbinarycrossentropy.([-1.1491, 0.8619, 0.3127], [1, 1, 0])
+3-element Array{Float64,1}:
+ 1.4243970973475661
+ 0.35231664672364094
+ 0.8616703662235443
+```
 """
 logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ)

@ -107,26 +154,27 @@ CuArrays.@cufunc logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ)
 """
    normalise(x; dims=1)

-Normalises `x` to mean 0 and standard deviation 1, across the dimensions given by `dims`. Defaults to normalising over columns.
+Normalise `x` to mean 0 and standard deviation 1 across the dimensions given by `dims`.
+Defaults to normalising over columns.

-```julia-repl
+```jldoctest
 julia> a = reshape(collect(1:9), 3, 3)
 3×3 Array{Int64,2}:
-  1  4  7
-  2  5  8
-  3  6  9
+ 1  4  7
+ 2  5  8
+ 3  6  9

-julia> normalise(a)
+julia> Flux.normalise(a)
 3×3 Array{Float64,2}:
-  -1.22474  -1.22474  -1.22474
+ -1.22474  -1.22474  -1.22474
  0.0       0.0       0.0
  1.22474   1.22474   1.22474

-julia> normalise(a, dims=2)
+julia> Flux.normalise(a, dims=2)
 3×3 Array{Float64,2}:
-  -1.22474  0.0  1.22474
-  -1.22474  0.0  1.22474
-  -1.22474  0.0  1.22474
+ -1.22474  0.0  1.22474
+ -1.22474  0.0  1.22474
+ -1.22474  0.0  1.22474
 ```
 """
 function normalise(x::AbstractArray; dims=1)
@ -138,13 +186,17 @@ end
 """
    kldivergence(ŷ, y)

-KLDivergence is a measure of how much one probability distribution is different from the other.
-It is always non-negative and zero only when both the distributions are equal everywhere.
+Return the
+[Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence)
+between the given probability distributions.

-[KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).
+KL divergence is a measure of how much one probability distribution is different
+from the other.
+It is always non-negative and zero only when both the distributions are equal
+everywhere.
 """
 function kldivergence(ŷ, y)
-  entropy = sum(y .* log.(y)) * 1 //size(y,2)
+  entropy = sum(xlogx.(y)) * 1 //size(y,2)
  cross_entropy = crossentropy(ŷ, y)
  return entropy + cross_entropy
 end
@ -152,51 +204,93 @@ end
 """
    poisson(ŷ, y)

-Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution.
-Returns `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`
+Return how much the predicted distribution `ŷ` diverges from the expected Poisson
+distribution `y`; calculated as `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`.

-[Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
+[More information](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
 """
-poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) * 1 // size(y,2)
+poisson(ŷ, y) = sum(ŷ .- xlogy.(y, ŷ)) * 1 // size(y,2)

 """
    hinge(ŷ, y)

-Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1). 
-Returns `sum((max.(0, 1 .- ŷ .* y))) / size(y, 2)`
+Return the [hinge loss](https://en.wikipedia.org/wiki/Hinge_loss) given the
+prediction `ŷ` and true labels `y` (containing 1 or -1); calculated as
+`sum(max.(0, 1 .- ŷ .* y)) / size(y, 2)`.

-[Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss)
-See also [`squared_hinge`](@ref).
+See also: [`squared_hinge`](@ref)
 """
 hinge(ŷ, y) = sum(max.(0, 1 .-  ŷ .* y)) * 1 // size(y, 2)

 """
    squared_hinge(ŷ, y)

-Computes squared hinge loss given the prediction `ŷ` and true labels `y` (conatining 1 or -1).
-Returns `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)`
+Return the squared hinge loss given the prediction `ŷ` and true labels `y`
+(containing 1 or -1); calculated as `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)`.

-See also [`hinge`](@ref).
+See also: [`hinge`](@ref)
 """
 squared_hinge(ŷ, y) = sum((max.(0, 1 .- ŷ .* y)).^2) * 1 // size(y, 2)

 """
    dice_coeff_loss(ŷ, y; smooth=1)

-Loss function used in Image Segmentation. Calculates loss based on dice coefficient. Similar to F1_score.
-Returns `1 - 2*sum(|ŷ .* y| + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth)`
-
-[V-Net: Fully Convolutional Neural Networks forVolumetric Medical Image Segmentation](https://arxiv.org/pdf/1606.04797v1.pdf)
+Return a loss based on the dice coefficient.
+Used in the [V-Net](https://arxiv.org/pdf/1606.04797v1.pdf) image segmentation
+architecture.
+Similar to the F1_score. Calculated as:
+
+    1 - (2*sum(ŷ .* y) + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth)
 """
 dice_coeff_loss(ŷ, y; smooth=eltype(ŷ)(1.0)) = 1 - (2*sum(y .* ŷ) + smooth) / (sum(y.^2) + sum(ŷ.^2) + smooth)

 """
    tversky_loss(ŷ, y; β=0.7)

-Used with imbalanced data to give more weightage to False negatives. 
+Return the [Tversky loss](https://arxiv.org/pdf/1706.05721.pdf).
+Used with imbalanced data to give more weight to false negatives.
 Larger β weigh recall higher than precision (by placing more emphasis on false negatives)
-Returns `1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)`
-
-[Tversky loss function for image segmentation using 3D fully convolutional deep networks](https://arxiv.org/pdf/1706.05721.pdf)
+Calculated as:
+
+    1 - (sum(y .* ŷ) + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)
 """
 tversky_loss(ŷ, y; β=eltype(ŷ)(0.7)) = 1 - (sum(y .* ŷ) + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)
+
+"""
+    flatten(x::AbstractArray)
+
+Transform (w, h, c, b)-shaped input into (w × h × c, b)-shaped output
+by linearizing all values for each element in the batch.
+"""
+function flatten(x::AbstractArray)
+  return reshape(x, :, size(x)[end])  # the last dimension is kept; `:` folds all remaining dimensions into the first
+end
+
+"""
+    xlogx(x)
+Return `x * log(x)` for `x ≥ 0`, handling `x = 0` by taking the downward limit.
+"""
+function xlogx(x)
+  result = x * log(x)  # evaluates to NaN at x == 0 (0 * -Inf); that value is replaced on the next line
+  ifelse(iszero(x), zero(result), result)  # the zero has the same type as `result`
+end
+CuArrays.@cufunc function xlogx(x)  # same body as above, declared again through CuArrays.@cufunc
+  result = x * log(x)
+  ifelse(iszero(x), zero(result), result)
+end
+
+"""
+    xlogy(x, y)
+Return `x * log(y)` for `y > 0` with correct limit at `x = 0`.
+"""
+function xlogy(x, y)
+  result = x * log(y)  # NaN when both x == 0 and y == 0 (0 * -Inf); replaced on the next line
+  ifelse(iszero(x), zero(result), result)  # only `x` is tested; the zero has the same type as `result`
+end
+CuArrays.@cufunc function xlogy(x, y)  # same body as above, declared again through CuArrays.@cufunc
+  result = x * log(y)
+  ifelse(iszero(x), zero(result), result)
+end
+
+@adjoint function broadcasted(::typeof(xlogy), x::Zygote.Numeric, y::Zygote.Numeric)  # custom adjoint for the broadcast form `xlogy.(x, y)`
+  res = xlogy.(x, y)
+  res, Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y))  # pullback: nothing for the function, Δ*log(y) for x, Δ*x/y for y
+end
--- a/src/onehot.jl
+++ b/src/onehot.jl
@ -27,7 +27,8 @@ Base.getindex(xs::OneHotMatrix, ::Colon, ::Colon) = OneHotMatrix(xs.height, copy

 Base.getindex(xs::OneHotMatrix, i::Integer, ::Colon) = map(x -> x[i], xs.data)

-A::AbstractMatrix * B::OneHotMatrix = A[:, map(x->x.ix, B.data)]
+# remove workaround when https://github.com/JuliaGPU/CuArrays.jl/issues/676 is fixed
+A::AbstractMatrix * B::OneHotMatrix = A[:, cpu(map(x->x.ix, B.data))]

 Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs...])

@ -37,30 +38,28 @@ import Adapt: adapt, adapt_structure

 adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data))

-import .CuArrays: CuArray, cudaconvert
+import .CuArrays: CuArray, CuArrayStyle, cudaconvert
 import Base.Broadcast: BroadcastStyle, ArrayStyle
-BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}()
+BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = CuArrayStyle{2}()
 cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))

 """
    onehot(l, labels[, unk])

-Create an [`OneHotVector`](@ref) wtih `l`-th element be `true` based on possible `labels` set.
-If `unk` is given, it retruns `onehot(unk, labels)` if the input label `l` is not find in `labels`; otherwise
-it will error.
-
-## Examples
+Create a `OneHotVector` with its `l`-th element `true` based on the
+possible set of `labels`.
+If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found
+in `labels`; otherwise, it will raise an error.

+# Examples
 ```jldoctest
-julia> using Flux: onehot
-
-julia> onehot(:b, [:a, :b, :c])
+julia> Flux.onehot(:b, [:a, :b, :c])
 3-element Flux.OneHotVector:
 0
 1
 0

-julia> onehot(:c, [:a, :b, :c])
+julia> Flux.onehot(:c, [:a, :b, :c])
 3-element Flux.OneHotVector:
 0
 0
@ -82,15 +81,14 @@ end
 """
    onehotbatch(ls, labels[, unk...])

-Create an [`OneHotMatrix`](@ref) with a batch of labels based on possible `labels` set, returns the
-`onehot(unk, labels)` if given labels `ls` is not found in set `labels`.
-
-## Examples
+Create a `OneHotMatrix` with a batch of labels based on the
+possible set of `labels`.
+If `unk` is given, return [`onehot(unk, labels)`](@ref) if one of the input
+labels `ls` is not found in `labels`; otherwise it will error.

+# Examples
 ```jldoctest
-julia> using Flux: onehotbatch
-
-julia> onehotbatch([:b, :a, :b], [:a, :b, :c])
+julia> Flux.onehotbatch([:b, :a, :b], [:a, :b, :c])
 3×3 Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}:
 0  1  0
 1  0  1
@ -107,13 +105,12 @@ Base.argmax(xs::OneHotVector) = xs.ix

 Inverse operations of [`onehot`](@ref).

+# Examples
 ```jldoctest
-julia> using Flux: onecold
-
-julia> onecold([true, false, false], [:a, :b, :c])
+julia> Flux.onecold([true, false, false], [:a, :b, :c])
 :a

-julia> onecold([0.3, 0.2, 0.5], [:a, :b, :c])
+julia> Flux.onecold([0.3, 0.2, 0.5], [:a, :b, :c])
 :c
 ```
 """
--- a/src/optimise/Optimise.jl
+++ b/src/optimise/Optimise.jl
@ -1,9 +1,12 @@
 module Optimise

+using LinearAlgebra
+
 export train!, update!,
-	SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
+	Descent, ADAM, Momentum, Nesterov, RMSProp,
 	ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM, 
-	InvDecay, ExpDecay, WeightDecay, stop, Optimiser
+	InvDecay, ExpDecay, WeightDecay, stop, Optimiser,
+	ClipValue, ClipNorm

 include("optimisers.jl")
 include("train.jl")
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@ -6,24 +6,25 @@ const ϵ = 1e-8
 # TODO: should use weak refs

 """
-    Descent(η)
+    Descent(η = 0.1)

 Classic gradient descent optimiser with learning rate `η`.
 For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`

-## Parameters
-  - Learning Rate (η): The amount by which the gradients are discounted before updating the weights. Defaults to `0.1`.
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.

-## Example
-```julia-repl
-opt = Descent() # uses default η (0.1)
+# Examples
+```julia
+opt = Descent()

-opt = Descent(0.3) # use provided η
+opt = Descent(0.3)

 ps = params(model)

 gs = gradient(ps) do
-  loss(x, y)
+    loss(x, y)
 end

 Flux.Optimise.update!(opt, ps, gs)
@ -40,17 +41,19 @@ function apply!(o::Descent, x, Δ)
 end

 """
-    Momentum(η, ρ)
+    Momentum(η = 0.01, ρ = 0.9)

-Gradient descent with learning rate `η` and momentum `ρ`.
+Gradient descent optimizer with learning rate `η` and momentum `ρ`.

-## Parameters
-  - Learning Rate (`η`): Amount by which gradients are discounted before updating the weights. Defaults to `0.01`.
-  - Momentum (`ρ`): Parameter that accelerates descent in the relevant direction and dampens oscillations. Defaults to `0.9`.
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Momentum (`ρ`): Controls the acceleration of gradient descent in the
+                  prominent direction, in effect dampening oscillations.

-## Examples
+# Examples
 ```julia
-opt = Momentum() # uses defaults of η = 0.01 and ρ = 0.9
+opt = Momentum()

 opt = Momentum(0.01, 0.99)
 ```
@ -71,17 +74,19 @@ function apply!(o::Momentum, x, Δ)
 end

 """
-    Nesterov(η, ρ)
+    Nesterov(η = 0.001, ρ = 0.9)

-Gradient descent with learning rate  `η` and Nesterov momentum `ρ`.
+Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`.

-## Parameters
-  - Learning Rate (η): Amount by which the gradients are dicsounted berfore updating the weights. Defaults to `0.001`.
-  - Nesterov Momentum (ρ): Parameters controlling the amount of nesterov momentum to be applied. Defaults to `0.9`.
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the
+                           prominent direction, in effect dampening oscillations.

-## Examples
+# Examples
 ```julia
-opt = Nesterov() # uses defaults η = 0.001 and ρ = 0.9
+opt = Nesterov()

 opt = Nesterov(0.003, 0.95)
 ```
@ -103,23 +108,25 @@ function apply!(o::Nesterov, x, Δ)
 end

 """
-    RMSProp(η, ρ)
+    RMSProp(η = 0.001, ρ = 0.9)

-Implements the RMSProp algortihm. Often a good choice for recurrent networks. Parameters other than learning rate generally don't need tuning.
+Optimizer using the
+[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
+algorithm. Often a good choice for recurrent networks. Parameters other than learning rate
+generally don't need tuning.

-## Parameters
-  - Learning Rate (η): Defaults to `0.001`.
-  - Rho (ρ): Defaults to `0.9`.
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Momentum (`ρ`): Controls the acceleration of gradient descent in the
+                  prominent direction, in effect dampening oscillations.

-## Examples
+# Examples
 ```julia
-opt = RMSProp() # uses default η = 0.001 and ρ = 0.9
+opt = RMSProp()

 opt = RMSProp(0.002, 0.95)
 ```
-
-## References
-[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
 """
 mutable struct RMSProp
  eta::Float64
@ -137,23 +144,22 @@ function apply!(o::RMSProp, x, Δ)
 end

 """
-    ADAM(η, β::Tuple)
+    ADAM(η = 0.001, β::Tuple = (0.9, 0.999))

-Implements the ADAM optimiser.
+[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.

-## Paramters
-  - Learning Rate (`η`): Defaults to `0.001`.
-  - Beta (`β::Tuple`): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
-
-## Examples
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+                                   second (β2) momentum estimate.

+# Examples
 ```julia
-opt = ADAM() # uses the default η = 0.001 and β = (0.9, 0.999)
+opt = ADAM()

 opt = ADAM(0.001, (0.9, 0.8))
 ```
-## References
-[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.
 """
 mutable struct ADAM
  eta::Float64
@ -174,24 +180,22 @@ function apply!(o::ADAM, x, Δ)
 end

 """
-    RADAM(η, β::Tuple)
+    RADAM(η = 0.001, β::Tuple = (0.9, 0.999))

-Implements the rectified ADAM optimizer.
+[Rectified ADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimizer.

-## Parameters
-  - Learning Rate (η): Defaults to `0.001`
-  - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
-
-## Examples
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+                                   second (β2) momentum estimate.

+# Examples
 ```julia
-opt = RADAM() # uses the default η = 0.001 and β = (0.9, 0.999)
+opt = RADAM()

 opt = RADAM(0.001, (0.9, 0.8))
 ```
-
-## References
-[RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM).
 """
 mutable struct RADAM
  eta::Float64
@ -219,22 +223,22 @@ function apply!(o::RADAM, x, Δ)
 end

 """
-    AdaMax(η, β::Tuple)
+    AdaMax(η = 0.001, β::Tuple = (0.9, 0.999))

-Variant of ADAM based on ∞-norm.
+[AdaMax](https://arxiv.org/abs/1412.6980v9) is a variant of ADAM based on the ∞-norm.

-## Parameters
-  - Learning Rate (η): Defaults to `0.001`
-  - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+                                   second (β2) momentum estimate.

-## Examples
+# Examples
 ```julia
-opt = AdaMax() # uses default η and β
+opt = AdaMax()

 opt = AdaMax(0.001, (0.9, 0.995))
 ```
-## References
-[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser.
 """
 mutable struct AdaMax
  eta::Float64
@ -255,23 +259,22 @@ function apply!(o::AdaMax, x, Δ)
 end

 """
-    ADAGrad(η)
+    ADAGrad(η = 0.1)

-Implements AdaGrad. It has parameter specific learning rates based on how frequently it is updated.
+[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has
+parameter specific learning rates based on how frequently it is updated.
+Parameters don't need tuning.

-## Parameters
-  - Learning Rate (η): Defaults to `0.1`
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.

-## Examples
+# Examples
 ```julia
-opt = ADAGrad() # uses default η = 0.1
+opt = ADAGrad()

 opt = ADAGrad(0.001)
 ```
-
-## References
-[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser.
-Parameters don't need tuning.
 """
 mutable struct ADAGrad
  eta::Float64
@ -288,21 +291,21 @@ function apply!(o::ADAGrad, x, Δ)
 end

 """
-    ADADelta(ρ)
+    ADADelta(ρ = 0.9)

-Version of ADAGrad that adapts learning rate based on a window of past gradient updates. Parameters don't need tuning.
+[ADADelta](https://arxiv.org/abs/1212.5701) is a version of ADAGrad adapting its learning
+rate based on a window of past gradient updates.
+Parameters don't need tuning.

-## Parameters
-  - Rho (ρ): Factor by which gradient is decayed at each time step. Defaults to `0.9`.
+# Parameters
+- Rho (`ρ`): Factor by which the gradient is decayed at each time step.

-## Examples
+# Examples
 ```julia
-opt = ADADelta() # uses default ρ = 0.9
+opt = ADADelta()
+
 opt = ADADelta(0.89)
 ```
-
-## References
-[ADADelta](https://arxiv.org/abs/1212.5701) optimiser.
 """
 mutable struct ADADelta
  rho::Float64
@ -321,22 +324,23 @@ function apply!(o::ADADelta, x, Δ)
 end

 """
-    AMSGrad(η, β::Tuple)
+    AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999))

-Implements AMSGrad version of the ADAM optimiser. Parameters don't need tuning.
+The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM
+optimiser. Parameters don't need tuning.

-## Parameters
-  - Learning Rate (η): Defaults to `0.001`.
-  - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+                                   second (β2) momentum estimate.

-## Examples
+# Examples
 ```julia
-opt = AMSGrad() # uses default η and β
+opt = AMSGrad()
+
 opt = AMSGrad(0.001, (0.89, 0.995))
 ```
-
-## References
-[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser.
 """
 mutable struct AMSGrad
  eta::Float64
@ -356,22 +360,23 @@ function apply!(o::AMSGrad, x, Δ)
 end

 """
-    NADAM(η, β::Tuple)
+    NADAM(η = 0.001, β::Tuple = (0.9, 0.999))

-Nesterov variant of ADAM. Parameters don't need tuning.
+[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) is a Nesterov variant of ADAM.
+Parameters don't need tuning.

-## Parameters
-  - Learning Rate (η): Defaults to `0.001`.
-  - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+                                   second (β2) momentum estimate.

-## Examples
+# Examples
 ```julia
-opt = NADAM() # uses default η and β
+opt = NADAM()
+
 opt = NADAM(0.002, (0.89, 0.995))
 ```
-
-## References
-[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser.
 """
 mutable struct NADAM
  eta::Float64
@ -392,23 +397,24 @@ function apply!(o::NADAM, x, Δ)
 end

 """
-    ADAMW(η, β::Tuple, decay)
+    ADAMW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0)

-Variant of ADAM defined by fixing weight decay regularization.
+[ADAMW](https://arxiv.org/abs/1711.05101) is a variant of ADAM fixing (as in repairing) its
+weight decay regularization.

-## Parameters
-  - Learning Rate (η): Defaults to `0.001`.
-  - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to (0.9, 0.999).
-  - decay: Decay applied to weights during optimisation. Defaults to 0.
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+                                   second (β2) momentum estimate.
+- `decay`: Decay applied to weights during optimisation.

-## Examples
+# Examples
 ```julia
-opt = ADAMW() # uses default η, β and decay
+opt = ADAMW()
+
 opt = ADAMW(0.001, (0.89, 0.995), 0.1)
 ```
-
-## References
-[ADAMW](https://arxiv.org/abs/1711.05101)
 """
 ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
  Optimiser(ADAM(η, β), WeightDecay(decay))
@ -441,14 +447,13 @@ function apply!(o::Optimiser, x, Δ)
 end

 """
-    InvDecay(γ)
+    InvDecay(γ = 0.001)

-Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified.
+Apply inverse time decay to an optimiser, so that the effective step size at
+iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size.
+The wrapped optimiser's step size is not modified.

-## Parameters
-  - gamma (γ): Defaults to `0.001`
-
-## Example
+# Examples
 ```julia
 Optimiser(InvDecay(..), Opt(..))
 ```
@ -469,20 +474,24 @@ function apply!(o::InvDecay, x, Δ)
 end

 """
-    ExpDecay(eta, decay, decay_step, clip)
+    ExpDecay(η = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4)

-Discount the learning rate `eta` by a multiplicative factor `decay` every `decay_step` till a minimum of `clip`.
+Discount the learning rate `η` by the factor `decay` every `decay_step` steps till
+a minimum of `clip`.

-## Parameters
-  - Learning Rate (eta): Defaults to `0.001`.
-  - decay: Factor by which the learning rate is discounted. Defaults to `0.1`.
-  - decay_step: Schedules decay operations by setting number of steps between two decay operations. Defaults to `1000`.
-  - clip: Minimum value of learning rate. Defaults to `1e-4`.
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+                       the weights.
+- `decay`: Factor by which the learning rate is discounted.
+- `decay_step`: Schedule decay operations by setting the number of steps between
+                two decay operations.
+- `clip`: Minimum value of learning rate.

-## Example
+# Examples
 To apply exponential decay to an optimiser:
 ```julia
 Optimiser(ExpDecay(..), Opt(..))
+
 opt = Optimiser(ExpDecay(), ADAM())
 ```
 """
@ -500,19 +509,19 @@ function apply!(o::ExpDecay, x, Δ)
  η, s, decay = o.eta, o.step, o.decay
  n = o.current[x] = get(o.current, x, 0) + 1
  if o.current[x]%s == 0 && count(x -> x%s == 0, values(o.current)) == 1
-    η = max(η * decay^(s / n), o.clip)
+    η = max(η * decay, o.clip)
    o.eta = η
  end
  @. Δ *= η
 end

 """
-    WeightDecay(wd)
+    WeightDecay(wd = 0)

-Decays the weight by `wd`
+Decay weights by `wd`.

-## Parameters
-  - weight decay (wd): 0
+# Parameters
+- Weight decay (`wd`)
 """
 mutable struct WeightDecay
  wd::Real
@ -524,3 +533,31 @@ function apply!(o::WeightDecay, x, Δ)
  wd = o.wd
  @. Δ += wd * x
 end
+
+"""
+    ClipValue(thresh)
+
+Clip gradients when their absolute value exceeds `thresh`.
+"""
+mutable struct ClipValue{T}
+    thresh::T
+end
+
+apply!(o::ClipValue, x, Δ) = clamp!(Δ, -o.thresh, o.thresh)
+
+"""
+    ClipNorm(thresh)
+
+Clip gradients when their L2 norm exceeds `thresh`.
+"""
+mutable struct ClipNorm{T}
+    thresh::T
+end
+
+function apply!(o::ClipNorm, x, Δ)
+    Δnrm = norm(Δ)
+    if Δnrm > o.thresh
+        rmul!(Δ, o.thresh / Δnrm)
+    end
+    return Δ
+end
--- a/src/optimise/train.jl
+++ b/src/optimise/train.jl
@ -2,23 +2,25 @@ using Juno
 import Zygote: Params, gradient


+
 """
-    update!(opt, p, g)
-    update!(opt, ps::Params, gs)
+    update!(x, x̄)

-Perform an update step of the parameters `ps` (or the single parameter `p`) 
-according to optimizer `opt`  and the gradients `gs` (the gradient `g`).
-
-As a result, the parameters are mutated and the optimizer's internal state may change. 
-
-  update!(x, x̄)
-  
 Update the array `x` according to `x .-= x̄`.
 """
 function update!(x::AbstractArray, x̄)
  x .-= x̄
 end

+"""
+    update!(opt, p, g)
+    update!(opt, ps::Params, gs)
+
+Perform an update step of the parameters `ps` (or the single parameter `p`)
+according to optimizer `opt` and the gradients `gs` (the gradient `g`).
+
+As a result, the parameters are mutated and the optimizer's internal state may change.
+"""
 function update!(opt, x, x̄)
  x .-= apply!(opt, x, x̄)
 end
@ -41,11 +43,10 @@ struct StopException <: Exception end
    stop()

 Call `Flux.stop()` in a callback to indicate when a callback condition is met.
-This would trigger the train loop to stop and exit.
+This will trigger the train loop to stop and exit.

+# Examples
 ```julia
-# Example callback:
-
 cb = function ()
  accuracy() > 0.9 && Flux.stop()
 end
@ -58,19 +59,18 @@ end
 """
    train!(loss, params, data, opt; cb)

-For each datapoint `d` in `data` computes the gradient of `loss(d...)` through
-backpropagation and calls the optimizer `opt`.
+For each datapoint `d` in `data` compute the gradient of `loss(d...)` through
+backpropagation and call the optimizer `opt`.

-In case datapoints `d` are of numeric array type, assumes no splatting is needed 
-and computes the gradient of `loss(d)`.
+In case datapoints `d` are of numeric array type, assume no splatting is needed
+and compute the gradient of `loss(d)`.

-Takes a callback as keyword argument `cb`. For example, this will print "training"
-every 10 seconds:
+A callback is given with the keyword argument `cb`. For example, this will print
+"training" every 10 seconds (using [`Flux.throttle`](@ref)):

-  train!(loss, params, data, opt,
-         cb = throttle(() -> println("training"), 10))
+    train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10))

-The callback can call `Flux.stop()` to interrupt the training loop.
+The callback can call [`Flux.stop`](@ref) to interrupt the training loop.

 Multiple optimisers and callbacks can be passed to `opt` and `cb` as arrays.
 """
@ -106,11 +106,12 @@ end
 Run `body` `N` times. Mainly useful for quickly doing multiple epochs of
 training in a REPL.

-```julia
-julia> @epochs 2 println("hello")
-INFO: Epoch 1
+# Examples
+```jldoctest
+julia> Flux.@epochs 2 println("hello")
+[ Info: Epoch 1
 hello
-INFO: Epoch 2
+[ Info: Epoch 2
 hello
 ```
 """
--- a/src/utils.jl
+++ b/src/utils.jl
@ -1,10 +1,40 @@
 # Arrays
-nfan() = 1, 1 #fan_in, fan_out
-nfan(n) = 1, n #A vector is treated as a n×1 matrix
-nfan(n_out, n_in) = n_in, n_out #In case of Dense kernels: arranged as matrices
-nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of convolution kernels
+nfan() = 1, 1 # fan_in, fan_out
+nfan(n) = 1, n # A vector is treated as a n×1 matrix
+nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices
+nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) # In case of convolution kernels

+"""
+    glorot_uniform(dims...)
+
+Return an `Array` of size `dims` containing random variables taken from a uniform
+distribution in the interval ``[-x, x]``, where `x = sqrt(24 / sum(Flux.nfan(dims...))) / 2`.
+
+# Examples
+```jldoctest; setup = :(using Random; Random.seed!(0))
+julia> Flux.glorot_uniform(2, 3)
+2×3 Array{Float32,2}:
+ 0.601094  -0.57414   -0.814925
+ 0.900868   0.805994   0.057514
+```
+"""
 glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...)))
+
+"""
+    glorot_normal(dims...)
+
+Return an `Array` of size `dims` containing random variables taken from a normal
+distribution with mean 0 and standard deviation `sqrt(2 / sum(Flux.nfan(dims...)))`.
+
+# Examples
+```jldoctest; setup = :(using Random; Random.seed!(0))
+julia> Flux.glorot_normal(3, 2)
+3×2 Array{Float32,2}:
+  0.429505  -0.0852891
+  0.523935   0.371009
+ -0.223261   0.188052
+```
+"""
 glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...)))

 ones(T::Type, dims...) = Base.ones(T, dims...)
@ -13,9 +43,81 @@ zeros(T::Type, dims...) = Base.zeros(T, dims...)
 ones(dims...) = Base.ones(Float32, dims...)
 zeros(dims...) = Base.zeros(Float32, dims...)

+"""
+    unsqueeze(xs, dim)
+
+Return `xs` reshaped into an `Array` one dimensionality higher than `xs`,
+where `dim` indicates in which dimension `xs` is extended.
+
+# Examples
+```jldoctest
+julia> xs = [[1, 2], [3, 4], [5, 6]]
+3-element Array{Array{Int64,1},1}:
+ [1, 2]
+ [3, 4]
+ [5, 6]
+
+julia> Flux.unsqueeze(xs, 1)
+1×3 Array{Array{Int64,1},2}:
+ [1, 2]  [3, 4]  [5, 6]
+
+julia> Flux.unsqueeze([1 2; 3 4], 2)
+2×1×2 Array{Int64,3}:
+[:, :, 1] =
+ 1
+ 3
+
+[:, :, 2] =
+ 2
+ 4
+```
+"""
 unsqueeze(xs, dim) = reshape(xs, (size(xs)[1:dim-1]..., 1, size(xs)[dim:end]...))

+"""
+    stack(xs, dim)
+
+Concatenate the given `Array` of `Array`s `xs` into a single `Array` along the
+given dimension `dim`.
+
+# Examples
+```jldoctest
+julia> xs = [[1, 2], [3, 4], [5, 6]]
+3-element Array{Array{Int64,1},1}:
+ [1, 2]
+ [3, 4]
+ [5, 6]
+
+julia> Flux.stack(xs, 1)
+3×2 Array{Int64,2}:
+ 1  2
+ 3  4
+ 5  6
+
+julia> cat(xs, dims=1)
+3-element Array{Array{Int64,1},1}:
+ [1, 2]
+ [3, 4]
+ [5, 6]
+```
+"""
 stack(xs, dim) = cat(unsqueeze.(xs, dim)..., dims=dim)
+
+"""
+    unstack(xs, dim)
+
+Unroll the given `xs` into an `Array` of `Array`s along the given dimension `dim`.
+
+# Examples
+```jldoctest
+julia> Flux.unstack([1 3 5 7; 2 4 6 8], 2)
+4-element Array{Array{Int64,1},1}:
+ [1, 2]
+ [3, 4]
+ [5, 6]
+ [7, 8]
+```
+"""
 unstack(xs, dim) = [copy(selectdim(xs, dim, i)) for i in 1:size(xs, dim)]

 """
@ -23,9 +125,16 @@ unstack(xs, dim) = [copy(selectdim(xs, dim, i)) for i in 1:size(xs, dim)]

 Split `xs` into `n` parts.

-```julia
-julia> chunk(1:10, 3)
-3-element Array{Array{Int64,1},1}:
+# Examples
+```jldoctest
+julia> Flux.chunk(1:10, 3)
+3-element Array{UnitRange{Int64},1}:
+ 1:4
+ 5:8
+ 9:10
+
+julia> Flux.chunk(collect(1:10), 3)
+3-element Array{SubArray{Int64,1,Array{Int64,1},Tuple{UnitRange{Int64}},true},1}:
 [1, 2, 3, 4]
 [5, 6, 7, 8]
 [9, 10]
@ -40,11 +149,12 @@ batchindex(xs, i) = (reverse(Base.tail(reverse(axes(xs))))..., i)

 Count the number of times that each element of `xs` appears.

-```julia
-julia> frequencies(['a','b','b'])
+# Examples
+```jldoctest
+julia> Flux.frequencies(['a','b','b'])
 Dict{Char,Int64} with 2 entries:
-  'b' => 2
  'a' => 1
+  'b' => 2
 ```
 """
 function frequencies(xs)
@ -64,8 +174,9 @@ squeezebatch(x) = reshape(x, head(size(x)))

 Batch the arrays in `xs` into a single array.

-```julia
-julia> batch([[1,2,3],[4,5,6]])
+# Examples
+```jldoctest
+julia> Flux.batch([[1,2,3],[4,5,6]])
 3×2 Array{Int64,2}:
 1  4
 2  5
@ -82,6 +193,25 @@ function batch(xs)
  return data
 end

+"""
+Return the given sequence padded on the right with `p` until it has at least `n` elements.
+
+# Examples
+```jldoctest
+julia> rpad([1, 2], 4, 0)
+4-element Array{Int64,1}:
+ 1
+ 2
+ 0
+ 0
+
+julia> rpad([1, 2, 3], 2, 0)
+3-element Array{Int64,1}:
+ 1
+ 2
+ 3
+```
+"""
 Base.rpad(v::AbstractVector, n::Integer, p) = [v; fill(p, max(n - length(v), 0))]

 """
@ -90,8 +220,9 @@ Base.rpad(v::AbstractVector, n::Integer, p) = [v; fill(p, max(n - length(v), 0))
 Take a list of `N` sequences, and turn them into a single sequence where each
 item is a batch of `N`. Short sequences will be padded by `pad`.

-```julia
-julia> batchseq([[1, 2, 3], [4, 5]], 0)
+# Examples
+```jldoctest
+julia> Flux.batchseq([[1, 2, 3], [4, 5]], 0)
 3-element Array{Array{Int64,1},1}:
 [1, 4]
 [2, 5]
@ -115,6 +246,10 @@ function _restructure(m, xs)
  end
 end

+@adjoint function _restructure(m, xs)
+  _restructure(m, xs), dm -> (nothing,destructure(dm)[1])
+end
+
 """
    destructure(m)

@ -148,11 +283,15 @@ end
 # Other

 """
-Returns a function that when invoked, will only be triggered at most once
-during `timeout` seconds. Normally, the throttled function will run
-as much as it can, without ever going more than once per `wait` duration;
-but if you'd like to disable the execution on the leading edge, pass
-`leading=false`. To enable execution on the trailing edge, ditto.
+    throttle(f, timeout; leading=true, trailing=false)
+
+Return a function that when invoked, will only be triggered at most once
+during `timeout` seconds.
+
+Normally, the throttled function will run as much as it can, without ever
+going more than once per `timeout` duration; but if you'd like to disable the
+execution on the leading edge, pass `leading=false`. To enable execution on
+the trailing edge, pass `trailing=true`.
 """
 function throttle(f, timeout; leading=true, trailing=false)
  cooldown = true
--- a/src/zeros.jl
+++ b/src/zeros.jl
@ -0,0 +1,106 @@
+import Base: +, -, *, reshape, size
+import Base.Broadcast: broadcasted, Broadcasted, BroadcastStyle
+
+"""
+    Zeros()
+    Zeros(size...)
+    Zeros(Type, size...)
+
+Acts as a stand-in for an array of zeros that can be
+used during training which is ignored by the optimisers.
+
+Useful to turn bias off for a forward pass of a layer.
+
+## Examples
+
+```julia
+julia> Flux.Zeros(3,3)
+3×3 Flux.Zeros{Bool,2}:
+ false  false  false
+ false  false  false
+ false  false  false
+
+julia> Flux.Zeros(Float32, 3,3)
+3×3 Flux.Zeros{Float32,2}:
+ 0.0  0.0  0.0
+ 0.0  0.0  0.0
+ 0.0  0.0  0.0
+
+julia> rand(3,3) .+ Flux.Zeros()
+3×3 Array{Float64,2}:
+ 0.198739  0.490459  0.785386
+ 0.779074  0.39986   0.66383
+ 0.854981  0.447292  0.314497
+
+julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros())
+Conv((2, 2), 1=>3)
+```
+"""
+struct Zeros{T,N} <: AbstractArray{T,N}
+  size::Tuple
+end
+
+Zeros(::Type{T}, sz...) where T = Zeros{T,length(sz)}(sz)
+Zeros(sz::Integer...) = Zeros(Bool, sz...)
+
+Base.size(xs::Zeros) = xs.size
+Base.axes(xs::Zeros) = Base.OneTo.(size(xs))
+
+Base.IndexStyle(::Type{<:Zeros}) = IndexLinear()
+
+Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T)
+Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} =
+              Zeros(T, length(inds))
+
+Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs))
+
+@adjoint reshape(xs::Zeros{T}, dims...) where T =
+                reshape(xs, dims...), _ -> nothing
+
+# `a ± Zeros` returns `a`; sizes are checked by an explicit throw, never by `@assert`.
+for f in (:+, :-)
+  @eval @inline function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros)
+    size(a) == size(b) || throw(DimensionMismatch("dimensions must match"))
+    a
+  end
+end
+
++(a::Zeros, b::AbstractArray) = b + a
+-(a::Zeros, b::AbstractArray) = -b + a
+
+Base.copy(xs::Zeros{T,N}) where {T,N} = xs
+
+# Define broadcasting behaviour
+for op in (:+, :-)
+  @eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros)
+    bs = Broadcast.broadcast_shape(size(a), size(b))
+    size(a) == bs && return a
+    sz = similar(a, bs)
+    sz .= a
+  end
+end
+
+broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a)
+broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a)
+
+function broadcasted(::typeof(*), a::AbstractArray, b::Zeros)
+  Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
+end
+
+broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a)
+
+for op in (:+, :-, :*)
+  @eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
+end
+
+# Some opportunities to avoid scalar indexing, intermediaries
+# Since it replicates a little of what we expect Base to do,
+# it should be possible to remove in the future, but for now,
+# these help with performance.
+broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a
+broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b
+broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a
+broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b
+broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a)
+broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
+broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
--- a/test/cuda/cuda.jl
+++ b/test/cuda/cuda.jl
@ -69,6 +69,7 @@ if CuArrays.has_cudnn()
  @info "Testing Flux/CUDNN"
  include("cudnn.jl")
  include("curnn.jl")
+  include("layers.jl")
 else
  @warn "CUDNN unavailable, not testing GPU DNN support"
 end
--- a/test/cuda/layers.jl
+++ b/test/cuda/layers.jl
@ -0,0 +1,98 @@
+# Test layers and data/model movements on and off the GPU
+# Add tests for layers and their gradients on the GPU
+# Most of the forward passes should be fine being applied
+# to bitstype objects, but this gives higher coverage for our use-cases
+# Check that getting the gradients does not throw
+
+# generic movement tests
+@testset "Basic GPU Movement" begin
+  @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple
+  @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple
+end
+
+# TODO: These layers do not work on the GPU yet.
+# `AlphaDropout` throws a compilation error on GPUs,
+# whereas the rest run into scalar indexing issues.
+const BROKEN_LAYERS = [DepthwiseConv,
+                       AlphaDropout,
+                       InstanceNorm,
+                       GroupNorm]
+
+# Test that the gradient of `sum(layer(args...)(xs))` can be taken on the GPU, per layer.
+function gradtest(name::String, layers::Vector, xs = nothing, args...)
+  isnothing(xs) && error("Missing input to test the layers against.")
+  gxs = gpu(xs)  # move the input to the GPU once instead of once per layer
+  @testset "$name GPU grad tests" begin
+    for layer in layers
+      @testset "$layer GPU grad test" begin
+        l = gpu(layer(args...))
+        ps = Flux.params(l)
+        if any(x -> isa(l, x), BROKEN_LAYERS)
+          @test_broken gradient(() -> sum(l(gxs)), ps) isa Flux.Zygote.Grads
+        else
+          # Differentiate once and reuse the result for both checks below.
+          gs = gradient(() -> sum(l(gxs)), ps)
+          @test gs isa Flux.Zygote.Grads
+          # Pooling layers have no parameters, so only index `gs` when some exist.
+          if !isempty(ps)
+            @test gs[first(ps)] isa Flux.CuArrays.CuArray
+          end
+        end
+      end
+    end
+  end
+end
+
+# Repeats from Conv, CrossCor
+
+r = rand(Float32, 28, 28, 1, 1)
+conv_layers = [Conv, ConvTranspose, CrossCor, DepthwiseConv]
+gradtest("Conv", conv_layers, r, (2,2), 1=>3)
+
+pooling_layers = [MaxPool, MeanPool]
+gradtest("Pooling", pooling_layers, r, (2,2))
+
+dropout_layers = [Dropout, AlphaDropout]
+gradtest("Dropout", dropout_layers, r, 0.5f0)
+
+norm_layers = [LayerNorm, BatchNorm]
+gradtest("Normalising", norm_layers, rand(Float32, 28,28,3,1), 1)
+
+instancenorm = [InstanceNorm]
+gradtest("InstanceNorm", instancenorm, r, 1)
+
+groupnorm = [GroupNorm]
+gradtest("GroupNorm", groupnorm, rand(Float32, 28,28,3,1), 3, 1)
+
+const stateless_layers = [Flux.mse,
+                          Flux.crossentropy,
+                          Flux.logitcrossentropy,
+                          Flux.normalise]
+
+const stateless_layers_broadcasted = [Flux.binarycrossentropy,
+                                      Flux.logitbinarycrossentropy]
+
+function stateless_gradtest(f, args...)
+  @test gradient((args...) -> sum(f(args...)), args...)[1] isa CuArray
+end
+
+function stateless_gradtest_broadcasted(f, args...)
+  @test gradient((args...) -> sum(f.(args...)), args...)[1] isa CuArray
+end
+
+@testset "Stateless GPU grad tests" begin
+  x = gpu(rand(3,3))
+  y = gpu(rand(3,3))
+
+  for layer in stateless_layers
+    if layer == Flux.normalise
+      stateless_gradtest(layer, x)
+    else
+      stateless_gradtest(layer, x, y)
+    end
+  end
+
+  for layer in stateless_layers_broadcasted
+    stateless_gradtest_broadcasted(layer, x, y)
+  end
+end
--- a/test/data.jl
+++ b/test/data.jl
@ -3,20 +3,34 @@
    Y = [1:5;]

    d = DataLoader(X, batchsize=2)
+    @inferred first(d)
    batches = collect(d)
+    @test eltype(batches) == eltype(d) == typeof(X)
    @test length(batches) == 3
    @test batches[1] == X[:,1:2]
    @test batches[2] == X[:,3:4]
    @test batches[3] == X[:,5:5]

    d = DataLoader(X, batchsize=2, partial=false)
+    @inferred first(d)
    batches = collect(d)
+    @test eltype(batches) == eltype(d) == typeof(X)
    @test length(batches) == 2
    @test batches[1] == X[:,1:2]
    @test batches[2] == X[:,3:4]

-    d = DataLoader(X, Y, batchsize=2)
+    d = DataLoader((X,), batchsize=2, partial=false)
+    @inferred first(d)
    batches = collect(d)
+    @test eltype(batches) == eltype(d) == Tuple{typeof(X)}
+    @test length(batches) == 2
+    @test batches[1] == (X[:,1:2],)
+    @test batches[2] == (X[:,3:4],)
+
+    d = DataLoader((X, Y), batchsize=2)
+    @inferred first(d)
+    batches = collect(d)
+    @test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)}
    @test length(batches) == 3
    @test length(batches[1]) == 2
    @test length(batches[2]) == 2
@ -28,6 +42,22 @@
    @test batches[3][1] == X[:,5:5]
    @test batches[3][2] == Y[5:5]

+    # test with NamedTuple
+    d = DataLoader((x=X, y=Y), batchsize=2)
+    @inferred first(d)
+    batches = collect(d)
+    @test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}}
+    @test length(batches) == 3
+    @test length(batches[1]) == 2
+    @test length(batches[2]) == 2
+    @test length(batches[3]) == 2
+    @test batches[1][1] == batches[1].x == X[:,1:2]
+    @test batches[1][2] == batches[1].y == Y[1:2]
+    @test batches[2][1] == batches[2].x == X[:,3:4]
+    @test batches[2][2] == batches[2].y == Y[3:4]
+    @test batches[3][1] == batches[3].x == X[:,5:5]
+    @test batches[3][2] == batches[3].y == Y[5:5]
+
    # test interaction with `train!`
    θ = ones(2)
    X = zeros(2, 10)
@ -41,7 +71,7 @@
    X = ones(2, 10)
    Y = fill(2, 10)
    loss(x, y) = sum((y - x'*θ).^2)
-    d  = DataLoader(X, Y) 
+    d  = DataLoader((X, Y)) 
    Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1))
    @test norm(θ .- 1) < 1e-10
 end
@ -76,8 +106,9 @@ end
    @test size(Iris.labels()) == (150,)
 end

+
@testset "Housing" begin
-    @test Housing.features() isa Matrix
+    @test Housing.features() isa Matrix # test broken due to SSL certificate expiration problem
    @test size(Housing.features()) == (506, 13)

    @test Housing.targets() isa Array{Float64}
--- a/test/layers/basic.jl
+++ b/test/layers/basic.jl
@ -28,6 +28,14 @@ import Flux: activations
  end

  @testset "Dense" begin
+    @testset "constructors" begin
+      # Dense(in, out) stores its weight as an out × in matrix.
+      @test size(Dense(10, 100).W) == (100, 10)
+      # Building from explicit arrays leaves the activation at `identity`.
+      # The bias length matches the 100 rows of the weight matrix.
+      @test Dense(rand(100,10), rand(100)).σ == identity
+
+      # A non-integer output size must not match any constructor method.
+      @test_throws MethodError Dense(10, 10.5)
+      @test_throws MethodError Dense(10, 10.5, tanh)
+    end
+
    @test  length(Dense(10, 5)(randn(10))) == 5
    @test_throws DimensionMismatch Dense(10, 5)(randn(1))
    @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting
@ -37,7 +45,6 @@ import Flux: activations
    @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2)
    @test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1)
    @test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]
-
  end

  @testset "Diagonal" begin
--- a/test/layers/conv.jl
+++ b/test/layers/conv.jl
@ -4,6 +4,10 @@ using Flux: gradient

@testset "Pooling" begin
  x = randn(Float32, 10, 10, 3, 2)
+  gmp = GlobalMaxPool()
+  @test size(gmp(x)) == (1, 1, 3, 2)
+  gmp = GlobalMeanPool()
+  @test size(gmp(x)) == (1, 1, 3, 2)
  mp = MaxPool((2, 2))
  @test mp(x) == maxpool(x, PoolDims(x, 2))
  mp = MeanPool((2, 2))
@ -21,6 +25,35 @@ end
    Dense(288, 10), softmax)

  @test size(m(r)) == (10, 5)
+
+  # Test bias switch
+  bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3))
+  ip = zeros(Float32, 28,28,1,1)
+
+  # With an all-zero input only the bias of ones contributes, so the output
+  # sums to its number of elements.
+  op = bias(ip)
+  @test sum(op) == prod(size(op))
+
+  # With `Flux.Zeros()` as bias the output is exactly zero and no gradient is
+  # recorded for the bias.
+  bias = Conv((2,2), 1=>3, bias = Flux.Zeros())
+  op = bias(ip)
+  @test sum(op) === 0.f0
+  gs = gradient(() -> sum(bias(ip)), Flux.params(bias))
+  # `nothing` is a singleton: compare by identity rather than with `==`.
+  @test gs[bias.bias] === nothing
+
+  # The input is all zeros, so only a bias could move the output towards the
+  # target of 2s. With the bias switched off, training must leave the loss at
+  # (0 - 2)^2 = 4.
+  bias = Conv((2, 2), 1=>3, bias = Flux.Zeros())
+  ip = zeros(Float32, 28,28,1,1)
+  op = fill(2.f0, 27,27,3,1)
+  opt = Descent()
+
+  for _ in 1:10^3
+    ps = params(bias)
+    gs = gradient(() -> Flux.mse(bias(ip), op), ps)
+    Flux.Optimise.update!(opt, ps, gs)
+  end
+
+  @test Flux.mse(bias(ip), op) ≈ 4.f0
 end

@testset "asymmetric padding" begin
@ -158,4 +191,28 @@ end
  @test Flux.outdims(m, (5, 5)) == (4, 4)
  m = MeanPool((2, 2); stride = 2, pad = 3)
  @test Flux.outdims(m, (5, 5)) == (5, 5)
-end
+end
+
+@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, CrossCor), k in ( (1,), (2,), (3,), (4,5), (6,7,8))
+  # Input is 3 larger than the kernel along every kernel dimension, followed
+  # by two trailing singleton dimensions.
+  data = ones(Float32, (k .+ 3)..., 1,1)
+
+  # Without a stride, SamePad must leave the full output size unchanged.
+  plain = ltype(k, 1=>1, pad=SamePad())
+  @test size(plain(data)) == size(data)
+
+  # The same must hold when a dilation is used.
+  dilated = ltype(k, 1=>1, pad=SamePad(), dilation = k .÷ 2)
+  @test size(dilated(data)) == size(data)
+
+  # With a stride, compare all but the last two dimensions with the expected size.
+  stride = 3
+  strided = ltype(k, 1=>1, pad=SamePad(), stride = stride)
+  insize = size(data)[1:end-2]
+  outsize = size(strided(data))[1:end-2]
+  if ltype == ConvTranspose
+    @test outsize == stride .* insize .- stride .+ 1
+  else
+    @test outsize == ceil.(Int, insize ./ stride)
+  end
+end
+
+@testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), k in ( (1,), (2,), (3,), (4,5), (6,7,8))
+  data = ones(Float32, (k .+ 3)..., 1,1)
+  pool = ltype(k, pad=SamePad())
+
+  # Every dimension except the last two is expected to be ceil(input / window).
+  insize = size(data)[1:end-2]
+  outsize = size(pool(data))[1:end-2]
+  @test outsize == ceil.(Int, insize ./ k)
+end
--- a/test/layers/stateless.jl
+++ b/test/layers/stateless.jl
@ -1,9 +1,26 @@
 using Test
 using Flux: onehotbatch, mse, crossentropy, logitcrossentropy,
-            σ, binarycrossentropy, logitbinarycrossentropy
+            σ, binarycrossentropy, logitbinarycrossentropy, flatten,
+            xlogx, xlogy

 const ϵ = 1e-7

+@testset "xlogx & xlogy" begin
+  # xlogx: zero at 0, NaN for NaN, x * log(x) otherwise; return type must be inferrable.
+  @test iszero(xlogx(0))
+  @test isnan(xlogx(NaN))
+  @test xlogx(2) ≈ 2.0 * log(2.0)
+  for a in (2, 0)
+    @inferred xlogx(a)
+  end
+
+  # xlogy: zero when x is 0, NaN when either argument is NaN, x * log(y) otherwise.
+  @test iszero(xlogy(0, 1))
+  for (a, b) in ((NaN, 1), (1, NaN), (NaN, NaN))
+    @test isnan(xlogy(a, b))
+  end
+  @test xlogy(2, 3) ≈ 2.0 * log(3.0)
+  for (a, b) in ((2, 3), (0, 1))
+    @inferred xlogy(a, b)
+  end
+end
+
@testset "losses" begin
  # First, regression-style y's
  y = [1, 1, 0, 0]
@ -12,15 +29,15 @@ const ϵ = 1e-7
  @testset "mse" begin
    @test mse(ŷ, y) ≈ (.1^2 + .9^2)/2
  end
-  
+
  @testset "mae" begin
    @test Flux.mae(ŷ, y) ≈ 1/2
  end
-  
+
  @testset "huber_loss" begin
    @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002
-  end       
-            
+  end
+
  y = [123.0,456.0,789.0]
  ŷ = [345.0,332.0,789.0]
  @testset "msle" begin
@ -35,6 +52,7 @@ const ϵ = 1e-7
  lossvalue = 1.203972804325936

  @testset "crossentropy" begin
+    @test crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ crossentropy([0.1,0.9], [0.1,0.9])
    @test crossentropy(ŷ, y) ≈ lossvalue
  end

@ -63,46 +81,47 @@ const ϵ = 1e-7
  @testset "logitbinarycrossentropy" begin
    @test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0)
  end
-  
+
  y = [1 2 3]
  ŷ = [4.0 5.0 6.0]
  @testset "kldivergence" begin
+    @test Flux.kldivergence([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ Flux.kldivergence([0.1,0.9], [0.1,0.9])
    @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457
-    @test Flux.kldivergence(y, y) ≈ 0 
+    @test Flux.kldivergence(y, y) ≈ 0
  end
-  
+
  y = [1 2 3 4]
  ŷ = [5.0 6.0 7.0 8.0]
  @testset "hinge" begin
    @test Flux.hinge(ŷ, y) ≈ 0
    @test Flux.hinge(y, 0.5 .* y) ≈ 0.125
  end
-  
+
  @testset "squared_hinge" begin
    @test Flux.squared_hinge(ŷ, y) ≈ 0
    @test Flux.squared_hinge(y, 0.5 .* y) ≈ 0.0625
  end
-  
+
  y = [0.1 0.2 0.3]
  ŷ = [0.4 0.5 0.6]
  @testset "poisson" begin
    @test Flux.poisson(ŷ, y) ≈ 0.6278353988097339
    @test Flux.poisson(y, y) ≈ 0.5044459776946685
  end
-  
+
  y = [1.0 0.5 0.3 2.4]
  ŷ = [0 1.4 0.5 1.2]
  @testset "dice_coeff_loss" begin
    @test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999
    @test Flux.dice_coeff_loss(y, y) ≈ 0.0
  end
-            
+
  @testset "tversky_loss" begin
    @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383
    @test Flux.tversky_loss(ŷ, y, β = 0.8) ≈ -0.09490740740740744
    @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075
  end
-            
+
  @testset "no spurious promotions" begin
    for T in (Float32, Float64)
      y = rand(T, 2)
@ -116,3 +135,10 @@ const ϵ = 1e-7
    end
  end
 end
+
+@testset "helpers" begin
+  @testset "flatten" begin
+    # A 10×10×3×2 input is expected to come out as 300×2 (10 * 10 * 3 = 300).
+    input = randn(Float32, 10, 10, 3, 2)
+    flat = flatten(input)
+    @test size(flat) == (300, 2)
+  end
+end
--- a/test/optimise.jl
+++ b/test/optimise.jl
@ -57,35 +57,57 @@ end
 end

@testset "ExpDecay" begin
-    w = randn(10, 10)
-    o = ExpDecay(0.1, 0.1, 1000, 1e-4)
-    w1 = randn(10,10)
-    loss(x) = Flux.mse(w*x, w1*x)
-    flag = 1
-    decay_steps = []
-    for t = 1:10^5
-      prev_eta = o.eta
-      θ = Params([w1])
-      x = rand(10)
-      θ̄ = gradient(() -> loss(x), θ)
-      prev_grad = collect(θ̄[w1])
-      delta = Optimise.apply!(o, w1, θ̄[w1])
-      w1 .-= delta
-      new_eta = o.eta
-      if new_eta != prev_eta
-        push!(decay_steps, t)
-      end
-      array = fill(o.eta, size(prev_grad))
-      if array .* prev_grad != delta
-        flag = 0
-      end
+
+  # Applying the optimiser to a unit gradient returns the step actually used,
+  # which is expected to be max(0.2 * 0.5^k, clip) after k calls.
+  @testset "Sanity Check" begin
+    o = ExpDecay(0.2, 0.5, 1, 1e-3)
+    p = [0.0]
+    steps = 1:8
+    eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip)
+    eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps]
+    @test eta_actual == eta_expected
+  end
+
+  w = randn(10, 10)
+  o = ExpDecay(0.1, 0.1, 1000, 1e-4)
+  w1 = randn(10,10)
+  loss(x) = Flux.mse(w*x, w1*x)
+  flag = 1
+  # Iterations at which `o.eta` changed; typed so the vector is not `Any[]`.
+  decay_steps = Int[]
+  for t = 1:10^5
+    prev_eta = o.eta
+    θ = Params([w1])
+    x = rand(10)
+    θ̄ = gradient(() -> loss(x), θ)
+    # Copy the gradient before `apply!`, so it can be compared with the returned update.
+    prev_grad = collect(θ̄[w1])
+    delta = Optimise.apply!(o, w1, θ̄[w1])
+    w1 .-= delta
+    new_eta = o.eta
+    if new_eta != prev_eta
+      push!(decay_steps, t)
    end
-    @test flag == 1
-    # Test to check if decay happens at decay steps. Eta reaches clip value eventually.
-    ground_truth = []
-    for i in 1:11
-      push!(ground_truth, 1000*i)  # Expected decay steps for this example.
+    # The returned update must equal the current eta times the copied gradient.
+    array = fill(o.eta, size(prev_grad))
+    if array .* prev_grad != delta
+      flag = 0
    end
-    @test decay_steps == ground_truth
-    @test o.eta == o.clip
+  end
+  @test flag == 1
+  # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1).
+  ground_truth = collect(1000:1000:4000)  # Expected decay steps for this example.
+  @test decay_steps == ground_truth
+  @test o.eta == o.clip
 end
+
+@testset "Clipping" begin
+    w = randn(10, 10)
+    loss(x) = sum(w * x)
+    θ = Params([w])
+    # Scale the input by 1000 so the gradient entries are far outside the clipping threshold.
+    x = 1000 * randn(10)
+    w̄ = gradient(() -> loss(x), θ)[w]
+    # Each optimiser gets its own copy of the gradient.
+    w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄))
+    # Check the magnitude, so large negative entries are caught as well as large positive ones.
+    @test all(abs.(w̄_value) .<= 1)
+    w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄))
+    @test norm(w̄_norm) <= 1
+end
--- a/test/runtests.jl
+++ b/test/runtests.jl
@ -2,48 +2,45 @@ using Flux
 using Flux.Data
 using Test 
 using Random, Statistics, LinearAlgebra
-using Documenter
 using IterTools: ncycle

 Random.seed!(0)

-@testset "Flux" begin
+# Tests from utils.jl, run in their own top-level testset.
+@testset "Utils" begin
+  include("utils.jl")
+end

-  @testset "Utils" begin
-    include("utils.jl")
-  end
-
-  @testset "Onehot" begin
-    include("onehot.jl")
-  end
-
-  @testset "Optimise" begin
-    include("optimise.jl")
-  end
-
-  @testset "Data" begin
-    include("data.jl")
-  end
-
-  @testset "Layers" begin
-    include("layers/basic.jl")
-    include("layers/normalisation.jl")
-    include("layers/stateless.jl")
-    include("layers/conv.jl")
-  end
-
-  @testset "CUDA" begin
-    if Flux.use_cuda[]
-      include("cuda/cuda.jl")
-    else
-      @warn "CUDA unavailable, not testing GPU support"
-    end
+# Each test file is included inside its own top-level testset.
+@testset "Onehot" begin
+  include("onehot.jl")
+end
+
+@testset "Optimise" begin
+  include("optimise.jl")
+end
+
+@testset "Data" begin
+  include("data.jl")
+end
+
+# The layer tests are split across several files, grouped under one testset.
+@testset "Layers" begin
+  include("layers/basic.jl")
+  include("layers/normalisation.jl")
+  include("layers/stateless.jl")
+  include("layers/conv.jl")
+end
+
+# The GPU tests are included only when `Flux.use_cuda[]` is true; otherwise a
+# warning is logged and nothing is tested.
+@testset "CUDA" begin
+  if Flux.use_cuda[]
+    include("cuda/cuda.jl")
+  else
+    @warn "CUDA unavailable, not testing GPU support"
  end
+end

+# Documenter is loaded, and the doctests are run, only on Julia 1.4 or newer.
+@static if VERSION >= v"1.4"
+  using Documenter
  @testset "Docs" begin
-    if VERSION >= v"1.2"
-      doctest(Flux)
-    end
+    # Set `using Flux` as the doctest setup for Flux, recursively.
+    DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
+    doctest(Flux)
  end
-
-end # testset Flux
+end