diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 00000000..9777033d --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,24 @@ +name: CompatHelper + +on: + schedule: + - cron: '00 00 * * *' + +jobs: + CompatHelper: + runs-on: ${{ matrix.os }} + strategy: + matrix: + julia-version: [1.3] + julia-arch: [x64] + os: [ubuntu-latest] + steps: + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.julia-version }} + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: julia -e 'using CompatHelper; CompatHelper.main()' diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 00000000..d77d3a0c --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,11 @@ +name: TagBot +on: + schedule: + - cron: 0 * * * * +jobs: + TagBot: + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9af14c6a..cf9c6013 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,51 +1,41 @@ -before_script: - - export CI_DISABLE_CURNN_TEST=true - -variables: - CI_IMAGE_TAG: 'cuda' - include: - - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/common.yml' + - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v6.yml' -.flux: - extends: .test - script: - - julia -e 'using InteractiveUtils; - versioninfo()' - - mkdir $JULIA_DEPOT_PATH # Pkg3.jl#325 - - julia --project -e 'using Pkg; - Pkg.instantiate(); - Pkg.build(); - Pkg.test(; coverage=true);' +image: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 -test:v1.0: - extends: .flux - variables: - CI_VERSION_TAG: 'v1.0' -test:v1.1: - extends: .flux - variables: - CI_VERSION_TAG: 'v1.1' +# julia:1.0: +# extends: +# - .julia:1.0 +# - .test +# tags: +# - nvidia +# +# julia:1.1: +# extends: +# - .julia:1.1 +# - .test +# tags: +# - nvidia +# +# julia:1.2: +# extends: +# - .julia:1.2 +# - .test +# tags: +# - nvidia -test:v1.2: - extends: .flux - variables: - CI_VERSION_TAG: 'v1.2' - -test:v1.3: - extends: .flux - variables: - CI_VERSION_TAG: 'v1.3' - -test:v1.0: - extends: .flux - variables: - CI_VERSION_TAG: 'v1.0' - -test:dev: - extends: .flux - variables: - CI_VERSION_TAG: 'dev' +julia:1.3: + extends: + - .julia:1.3 + - .test + tags: + - nvidia +julia:nightly: + extends: + - .julia:nightly + - .test + tags: + - nvidia allow_failure: true diff --git a/.travis.yml b/.travis.yml index a9cd86ea..e02f470f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ os: # - osx julia: - - 1.1 + - 1.3 - nightly matrix: @@ -16,7 +16,7 @@ matrix: jobs: include: - stage: "Documentation" - julia: 1.0 + julia: 1.3 os: linux script: - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); diff --git a/Manifest.toml b/Manifest.toml index c524a684..12986ccd 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,15 +2,15 @@ [[AbstractFFTs]] deps = ["LinearAlgebra"] -git-tree-sha1 = "380e36c66edfa099cd90116b24c1ce8cafccac40" +git-tree-sha1 = "051c95d6836228d120f5f4b984dd5aba1624f716" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "0.4.1" +version = "0.5.0" [[AbstractTrees]] -deps = ["Markdown", "Test"] -git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b" +deps = ["Markdown"] +git-tree-sha1 = "8201f932428d25a2e2903300764515754847d87d" uuid = 
"1520ce14-60c1-5f80-bbc7-55ef81b5835c" -version = "0.2.1" +version = "0.3.0" [[Adapt]] deps = ["LinearAlgebra"] @@ -21,46 +21,34 @@ version = "1.0.0" [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[BinDeps]] -deps = ["Compat", "Libdl", "SHA", "URIParser"] -git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9" -uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee" -version = "0.8.10" - [[BinaryProvider]] -deps = ["Libdl", "Logging", "SHA"] -git-tree-sha1 = "c7361ce8a2129f20b0e05a89f7070820cfed6648" +deps = ["Libdl", "SHA"] +git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.6" +version = "0.5.8" [[CEnum]] git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" version = "0.2.0" -[[CSTParser]] -deps = ["Tokenize"] -git-tree-sha1 = "c69698c3d4a7255bc1b4bc2afc09f59db910243b" -uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f" -version = "0.6.2" - [[CUDAapi]] deps = ["Libdl", "Logging"] -git-tree-sha1 = "e063efb91cfefd7e6afd92c435d01398107a500b" +git-tree-sha1 = "56a813440ac98a1aa64672ab460a1512552211a7" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" -version = "1.2.0" +version = "2.1.0" [[CUDAdrv]] -deps = ["CUDAapi", "Libdl", "Printf"] -git-tree-sha1 = "9ce99b5732c70e06ed97c042187baed876fb1698" +deps = ["CEnum", "CUDAapi", "Printf"] +git-tree-sha1 = "1fce616fa0806c67c133eb1d2f68f0f1a7504665" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "3.1.0" +version = "5.0.1" [[CUDAnative]] -deps = ["Adapt", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Printf", "TimerOutputs"] -git-tree-sha1 = "52ae1ce10ebfa686e227655c47b19add89308623" +deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] +git-tree-sha1 = "6e11d5c2c91fc623952e94c4fb73f9c4db74795a" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "2.3.1" +version = "2.7.0" [[CodecZlib]] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] @@ -70,9 +58,9 @@ version = "0.6.0" [[ColorTypes]] deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "10050a24b09e8e41b951e9976b109871ce98d965" +git-tree-sha1 = "7b62b728a5f3dd6ee3b23910303ccf27e82fad5e" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.8.0" +version = "0.8.1" [[Colors]] deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"] @@ -86,40 +74,22 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" version = "0.2.0" -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "84aa74986c5b9b898b0d1acaf3258741ee64754f" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "2.1.0" - -[[Conda]] -deps = ["JSON", "VersionParsing"] -git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032" -uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d" -version = "1.3.0" - -[[Crayons]] -deps = ["Test"] -git-tree-sha1 = "f621b8ef51fd2004c7cf157ea47f027fdeac5523" -uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" -version = "4.0.0" - [[CuArrays]] -deps = ["AbstractFFTs", "Adapt", "CUDAapi", "CUDAdrv", "CUDAnative", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] 
-git-tree-sha1 = "46b48742a84bb839e74215b7e468a4a1c6ba30f9" +deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] +git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.2.1" +version = "1.6.0" [[DataAPI]] -git-tree-sha1 = "8903f0219d3472543fc4b2f5ebaf675a07f817c0" +git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.0.1" +version = "1.1.0" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "0809951a1774dc724da22d26e4289bbaab77809a" +git-tree-sha1 = "f784254f428fb8fd7ac15982e5862a38a44523d3" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.0" +version = "0.17.7" [[Dates]] deps = ["Printf"] @@ -130,32 +100,38 @@ deps = ["Mmap"] uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" [[DiffResults]] -deps = ["Compat", "StaticArrays"] -git-tree-sha1 = "34a4a1e8be7bc99bc9c611b895b5baf37a80584c" +deps = ["StaticArrays"] +git-tree-sha1 = "da24935df8e0c6cf28de340b958f6aac88eaa0cc" uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" -version = "0.0.4" +version = "1.0.2" [[DiffRules]] -deps = ["Random", "Test"] -git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7" +deps = ["NaNMath", "Random", "SpecialFunctions"] +git-tree-sha1 = "10dca52cf6d4a62d82528262921daf63b99704a2" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "0.0.10" +version = "1.0.0" [[Distributed]] deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" [[FFTW]] -deps = ["AbstractFFTs", "BinaryProvider", "Conda", "Libdl", "LinearAlgebra", "Reexport", "Test"] -git-tree-sha1 = "6c5b420da0b8c12098048561b8d58f81adea506f" +deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"] +git-tree-sha1 = "109d82fa4b00429f9afcce873e9f746f11f018d3" uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" -version = "1.0.1" +version = "1.2.0" + +[[FFTW_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "05674f209a6e3387dd103a945b0113eeb64b1a58" +uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a" +version = "3.3.9+3" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "8fba6ddaf66b45dec830233cea0aae43eb1261ad" +git-tree-sha1 = "fec413d4fc547992eb62a5c544cedb6d7853c1f5" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.6.4" +version = "0.8.4" [[FixedPointNumbers]] git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" @@ -163,33 +139,33 @@ uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" version = "0.6.1" [[ForwardDiff]] -deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"] -git-tree-sha1 = "4c4d727f1b7e0092134fabfab6396b8945c1ea5b" +deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "840700059391d36e2498d89c2e82c08f261f2a2a" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.3" +version = "0.10.8" [[GPUArrays]] -deps = ["Adapt", "FFTW", "FillArrays", "LinearAlgebra", "Printf", "Random", "Serialization", "StaticArrays", "Test"] -git-tree-sha1 = "77e27264276fe97a7e7fb928bf8999a145abc018" +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] +git-tree-sha1 = 
"e756da6cee76a5f1436a05827fa8fdf3badc577f" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "1.0.3" +version = "2.0.1" [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] -git-tree-sha1 = "e23faa71b8f54c3fdc99b230b9c2906cafdddca5" +git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306" uuid = "7869d1d1-7146-5819-86e3-90919afe41df" -version = "0.2.3" +version = "0.3.0" + +[[IntelOpenMP_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "fb8e1c7a5594ba56f9011310790e03b5384998d6" +uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0" +version = "2018.0.3+0" [[InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.0" - [[Juno]] deps = ["Base64", "Logging", "Media", "Profile", "Test"] git-tree-sha1 = "30d94657a422d09cb97b6f86f04f750fa9c50df8" @@ -198,9 +174,9 @@ version = "0.7.2" [[LLVM]] deps = ["CEnum", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "4a05f742837779a00bd8c9a18da6817367c4245d" +git-tree-sha1 = "1d08d7e4250f452f6cb20e4574daaebfdbee0ff7" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "1.3.0" +version = "1.3.3" [[LibGit2]] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" @@ -215,11 +191,17 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" +[[MKL_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "61069ae718b8ab1e325bbfb4e5268902e7ea08e3" +uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7" +version = "2019.0.117+0" + [[MacroTools]] -deps = ["CSTParser", "Compat", "DataStructures", "Test", "Tokenize"] -git-tree-sha1 = "d6e9dedb8c92c3465575442da456aec15a89ff76" +deps = ["DataStructures", "Markdown", "Random"] +git-tree-sha1 = "e2fc7a55bb2224e203bbd8b59f72b91323233458" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.1" +version = "0.5.3" [[Markdown]] deps = ["Base64"] @@ -232,24 +214,30 @@ uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27" version = "0.5.0" [[Missings]] -git-tree-sha1 = "29858ce6c8ae629cf2d733bffa329619a1c843d0" +deps = ["DataAPI"] +git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "0.4.2" +version = "0.4.3" [[Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[NNlib]] -deps = ["Libdl", "LinearAlgebra", "Requires", "Statistics", "TimerOutputs"] -git-tree-sha1 = "0c667371391fc6bb31f7f12f96a56a17098b3de8" +deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"] +git-tree-sha1 = "135c0de4794d5e214b06f1fb4787af4a72896e61" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.6.0" +version = "0.6.2" [[NaNMath]] -deps = ["Compat"] -git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2" +git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f" uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" -version = "0.3.2" +version = "0.3.3" + +[[OpenSpecFun_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.3+1" [[OrderedCollections]] deps = ["Random", "Serialization", "Test"] @@ -257,14 +245,8 @@ git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.1.0" -[[Parsers]] -deps = ["Dates", "Test"] -git-tree-sha1 = "ef0af6c8601db18c282d092ccbd2f01f3f0cd70b" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "0.3.7" - [[Pkg]] -deps = 
["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] @@ -290,10 +272,10 @@ uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "0.2.0" [[Requires]] -deps = ["Test"] -git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1" +deps = ["UUIDs"] +git-tree-sha1 = "999513b7dea8ac17359ed50ae8ea089e4464e35e" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "0.5.2" +version = "1.0.0" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -301,10 +283,6 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" @@ -319,16 +297,16 @@ deps = ["LinearAlgebra", "Random"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[SpecialFunctions]] -deps = ["BinDeps", "BinaryProvider", "Libdl"] -git-tree-sha1 = "3bdd374b6fd78faf0119b8c5d538788dbf910c6e" +deps = ["OpenSpecFun_jll"] +git-tree-sha1 = "268052ee908b2c086cc0011f528694f02f3e2408" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "0.8.0" +version = "0.9.0" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "db23bbf50064c582b6f2b9b043c8e7e98ea8c0c6" +git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "0.11.0" +version = "0.12.1" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] @@ -345,15 +323,10 @@ deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[TimerOutputs]] -deps = ["Crayons", "Printf", "Test", "Unicode"] -git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c" +deps = ["Printf"] +git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.0" - -[[Tokenize]] -git-tree-sha1 = "dfcdbbfb2d0370716c815cbd6f8a364efb6f42cf" -uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624" -version = "0.5.6" +version = "0.5.3" [[TranscodingStreams]] deps = ["Random", "Test"] @@ -361,12 +334,6 @@ git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" version = "0.9.5" -[[URIParser]] -deps = ["Test", "Unicode"] -git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69" -uuid = "30578b45-9adc-5946-b283-645ec420af67" -version = "0.4.0" - [[UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" @@ -374,30 +341,26 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[VersionParsing]] -deps = ["Compat"] -git-tree-sha1 = "c9d5aa108588b978bd859554660c8a5c4f2f7669" -uuid = "81def892-9a0e-5fdd-b105-ffc91e053289" -version = "1.1.3" - [[ZipFile]] -deps = ["BinaryProvider", "Libdl", "Printf"] -git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" +deps = ["Libdl", "Printf", "Zlib_jll"] +git-tree-sha1 = "5de8320a46812da1a8ca98b16a8a4546d44efa62" uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" -version = "0.8.3" +version = "0.9.0" + +[[Zlib_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "5618a43055eb09377edca21d19d0e99bce24a9c3" +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.11+7" [[Zygote]] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", 
"LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "38241b40ebd8748bcacad5e6c7ba3ab3cc7a15c9" -repo-rev = "master" -repo-url = "https://github.com/FluxML/Zygote.jl.git" +git-tree-sha1 = "74382bcc4c1e8075e14554da67d75565f8fb7827" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.3.4" +version = "0.4.5" [[ZygoteRules]] deps = ["MacroTools"] -git-tree-sha1 = "c4c29b30b8ff3be13d4244e78be7df2a42bc54d0" -repo-rev = "master" -repo-url = "https://github.com/FluxML/ZygoteRules.jl.git" +git-tree-sha1 = "b3b4882cc9accf6731a08cc39543fbc6b669dca8" uuid = "700de1a5-db45-46bc-99cf-38207098b444" version = "0.2.0" diff --git a/NEWS.md b/NEWS.md index 26853df3..4023c7f2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,16 @@ +# v0.10.0 +* The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669) + - The dependency on Tracker.jl has been removed. + - This means Flux now does not depend on using a specialised `TrackedArray` type, and can be used with normal Array implementations directly. + - Tracker compatibility is maintained in most common cases, but Zygote will be the preferred AD backend for Flux from now on. +* The CUDNN wrappers have been [moved from Flux into CuArrays](https://github.com/FluxML/Flux.jl/pull/874), to allow for better supporting the CUDA backend, and improve user experience, not to mention making Flux lean. +* `*crossentropy` functions now [work as expected with CuArrays](https://github.com/FluxML/Flux.jl/pull/926). [PR for binarycrossentropy](https://github.com/FluxML/Flux.jl/pull/940). +* Added [clearer docs](https://github.com/FluxML/Flux.jl/pull/904) around training and the Optimiser interface. +* [Layer initialisations](https://github.com/FluxML/Flux.jl/pull/937) have been improved with a clearer API on how to extend it for other purposes. +* [Better messaging around CUDA availability](https://github.com/FluxML/Flux.jl/pull/924), with hooks to initialize the GPU as default where possible. +* `@treelike` has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation. +* `testmode!` is deprecated in favour of [istraining](https://github.com/FluxML/Flux.jl/pull/669) + # v0.9.0 * [Depthwise convolutional layer API changes](https://github.com/FluxML/Flux.jl/pull/756) from `in => mult` channel specification to `in => out` channel specification, and deprecates implicit `out` constructor. * New [SkipConnection](https://github.com/FluxML/Flux.jl/pull/446), which can be used to train residual neural network architectures. 
diff --git a/Project.toml b/Project.toml index 609af949..71282a10 100644 --- a/Project.toml +++ b/Project.toml @@ -1,17 +1,15 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.9.0" +version = "0.10.2" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" @@ -24,13 +22,20 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" -ZygoteRules = "700de1a5-db45-46bc-99cf-38207098b444" [compat] -CUDAapi = "1.1" -CuArrays = "1.2" +AbstractTrees = "0.2, 0.3" +Adapt = "1" +CodecZlib = "0.5, 0.6" +Colors = "0.8, 0.9, 0.10, 0.11" +CuArrays = "1.6" +Juno = "0.5, 0.6, 0.7, 0.8" +MacroTools = "0.3, 0.4, 0.5" NNlib = "0.6" -Zygote = "0.3" +Reexport = "0.2" +StatsBase = "0" +ZipFile = "0.7, 0.8, 0.9" +Zygote = "0.4" julia = "1" [extras] diff --git a/README.md b/README.md index 4e06793e..730e2801 100644 --- a/README.md +++ b/README.md @@ -7,93 +7,9 @@ Flux is an elegant approach to machine learning. It's a 100% pure-Julia stack, and provides lightweight abstractions on top of Julia's native GPU and AD support. Flux makes the easy things easy while remaining fully hackable. ```julia -julia> Pkg.add("Flux") +] add Flux ``` See the [documentation](https://fluxml.github.io/Flux.jl/) or the [model zoo](https://github.com/FluxML/model-zoo/) for examples. -If you use Flux in research, please cite the following paper: - -``` -@article{innes:2018, - author = {Mike Innes}, - title = {Flux: Elegant Machine Learning with Julia}, - journal = {Journal of Open Source Software}, - year = {2018}, - doi = {10.21105/joss.00602}, -} -``` - -## Features - -Flux has powerful high-level features, and common architectures can be defined in a few lines. - -```julia -model = Chain( - Dense(768, 128, σ), - LSTM(128, 256), - LSTM(256, 128), - Dense(128, 10), - softmax) - -loss(x, y) = crossentropy(model(x), y) - -Flux.train!(loss, data, ADAM(...)) -``` - -Yet you can easily strip away the layers, and directly write the mathematics for your problem. Flux will seamlessly take gradients of any Julia code, so your model looks just like the paper. - -```julia -W = param(randn(2, 10)) -b = param(randn(2)) - -y(x) = σ.(W * x .+ b) -``` - -If that's *still* not enough, you can go as deep as you want, even writing your own CUDA kernels with [CUDAnative](https://github.com/JuliaGPU/CUDAnative.jl)! All this can be freely mixed-and-matched in a single model or script, and it all runs interactively via Jupyter or Juno. - -```julia -function gpu_add(a, b, c) - i = (blockIdx().x-1) * blockDim().x + threadIdx().x - c[i] = a[i] + b[i] - return nothing -end -``` - -Unusual architectures are no problem in Flux, as you can use all the loops, control flow and even macros that you're used to. Here's a Tree RNN in 4 lines. - -```julia -tree() = rand() < 0.5 ? 
rand(10) : (tree(), tree()) # dummy data - -shrink = Dense(20, 10) -combine(a, b) = shrink([a; b]) - -model(x) = x -model(x::Tuple) = combine(model(x[1]), model(x[2])) - -model(tree()) # Sample output -``` - -Despite this flexibility, Julia's advanced compiler lets us do some powerful optimisations. For example, this definition of `sigmoid` automatically gets fused into a *single* GPU kernel – so it's really fast. - -```julia -sigmoid(xs) = 1 ./ (1 .+ exp.(.-xs)) -``` - -Similarly, Flux is the first dynamic framework to support [compiling to the browser](https://fluxml.github.io/experiments/) and model import via [formats like ONNX](https://github.com/FluxML/ONNX.jl/), both of which are thinly-veiled compiler problems. - -For more on our philosophy on machine learning, check out our article [On Machine Learning & Programming Languages](https://julialang.org/blog/2017/12/ml&pl). - -## Contributing & Help - -For general questions and help, check out Julia's [community forum](https://discourse.julialang.org/c/domain/ML). - -Flux development is carried out via our [GitHub issues](https://github.com/FluxML/Flux.jl/issues), so feel free to open feature requests or PRs here. - -For more informal discussions we'd love to have you on the [Julia slack](https://slackinvite.julialang.org/), where we hang out on the #machine-learning channel. - -## Related Packages - -Check out [Metalhead.jl](https://github.com/FluxML/Metalhead.jl) for common computer vision datasets and trained models. - -[MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) provides further common datasets. +If you use Flux in your research, please [cite](CITATION.bib) our work. diff --git a/bors.toml b/bors.toml index 4e6e5d26..4390d210 100644 --- a/bors.toml +++ b/bors.toml @@ -1,4 +1,4 @@ status = [ - "ci/gitlab/%" + "ci/gitlab%" ] -timeout-sec = 14400 +timeout-sec = 7200 diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index d83fc462..6e8d0b76 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -219,3 +219,24 @@ Flux.@functor Affine ``` This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md). + +## Utility functions + +Flux provides some utility functions to help you generate models in an automated fashion. + +`outdims` enables you to calculate the spatial output dimensions of layers like `Conv` when applied to input images of a given size. +Currently limited to the following layers: +- `Chain` +- `Dense` +- `Conv` +- `Diagonal` +- `Maxout` +- `ConvTranspose` +- `DepthwiseConv` +- `CrossCor` +- `MaxPool` +- `MeanPool` + +```@docs +outdims +``` diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 8b725bfb..5f2ab3ce 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -65,3 +65,15 @@ AlphaDropout LayerNorm GroupNorm ``` + +## Cost Functions +```@docs +mse +crossentropy +logitcrossentropy +binarycrossentropy +logitbinarycrossentropy +kldivergence +poisson +hinge +``` diff --git a/docs/src/saving.md b/docs/src/saving.md index f71c4350..8e795298 100644 --- a/docs/src/saving.md +++ b/docs/src/saving.md @@ -113,6 +113,6 @@ You can even store optimiser state alongside the model, to resume training exactly where you left off. 
```julia
-opt = ADAM(params(model))
+opt = ADAM()
 @save "model-$(now()).bson" model opt
 ```
diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md
index 9eb659c4..5e8b95de 100644
--- a/docs/src/training/optimisers.md
+++ b/docs/src/training/optimisers.md
@@ -58,3 +58,83 @@ AMSGrad
 NADAM
 ADAMW
 ```
+
+## Optimiser Interface
+
+Flux's optimisers are built around a `struct` that holds all the optimiser's parameters along with a definition of how to apply the update rule associated with it. We do this via the `apply!` function, which takes the optimiser as the first argument, followed by the parameter and its corresponding gradient.
+
+In this manner Flux also allows one to create custom optimisers that can be used seamlessly. Let's work through a simple example.
+
+```julia
+mutable struct Momentum
+  eta
+  rho
+  velocity
+end
+
+Momentum(eta::Real, rho::Real) = Momentum(eta, rho, IdDict())
+```
+
+The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity, which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked.
+
+```julia
+function apply!(o::Momentum, x, Δ)
+  η, ρ = o.eta, o.rho
+  v = get!(o.velocity, x, zero(x))::typeof(x)
+  @. v = ρ * v - η * Δ
+  @. Δ = -v
+end
+```
+
+This is the basic definition of the Momentum update rule:
+
+```math
+v = ρ * v - η * Δ
+w = w - v
+```
+
+`apply!` defines the update rule for an optimiser `opt`, given a parameter and its gradient, and returns the updated gradient. Here, the running state `v` for each parameter `x` is fetched from the optimiser's state dictionary and updated alongside the gradient.
+
+Flux internally calls this function via `update!`, which shares the API of `apply!` but ensures that multiple parameters are handled gracefully.
+
+## Composing Optimisers
+
+Flux defines a special kind of optimiser simply called `Optimiser`, which takes an arbitrary list of optimisers as input. It behaves like the usual optimisers, but differs in that it acts by calling the optimisers listed in it sequentially. Each optimiser produces a modified gradient
+that will be fed into the next, and the resulting update will be applied to the parameter as usual. A classic use case is when adding decays is desirable. Flux defines some basic decays including `ExpDecay`, `InvDecay` etc.
+
+```julia
+opt = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), Descent())
+```
+
+Here we apply exponential decay to the `Descent` optimiser. The defaults of `ExpDecay` mean that the learning rate is decayed every 1000 steps.
+It is then applied like any other optimiser.
+
+```julia
+w = randn(10, 10)
+w1 = randn(10, 10)
+ps = Params([w, w1])
+
+loss(x) = Flux.mse(w * x, w1 * x)
+
+loss(rand(10)) # around 9
+
+for t = 1:10^5
+  θ = Params([w, w1])
+  θ̄ = gradient(() -> loss(rand(10)), θ)
+  Flux.Optimise.update!(opt, θ, θ̄)
+end
+
+loss(rand(10)) # around 0.9
+```
+
+In this manner it is possible to compose optimisers for some added flexibility.
+
+## Decays
+
+Similar to optimisers, Flux also defines some simple decays that can be used in conjunction with other optimisers, or standalone.
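As a small aside, not part of the diff: one of these decays can be composed with an optimiser exactly like `ExpDecay` above. The decay coefficient, model and data below are illustrative assumptions:

```julia
using Flux
using Flux.Optimise: Optimiser, WeightDecay

# Compose an L2-style weight decay with ADAM; 1e-4 is an arbitrary coefficient.
opt = Optimiser(WeightDecay(1e-4), ADAM(0.001))

m = Dense(10, 2)
loss(x, y) = Flux.mse(m(x), y)
data = [(rand(Float32, 10), rand(Float32, 2))]

Flux.train!(loss, params(m), data, opt)
```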
+
+```@docs
+ExpDecay
+InvDecay
+WeightDecay
+```
diff --git a/docs/src/training/training.md b/docs/src/training/training.md
index 679bbd0b..b42db7c9 100644
--- a/docs/src/training/training.md
+++ b/docs/src/training/training.md
@@ -1,8 +1,9 @@
 # Training
-To actually train a model we need three things:
+To actually train a model we need four things:
 * A *objective function*, that evaluates how well a model is doing given some input data.
+* The trainable parameters of the model.
 * A collection of data points that will be provided to the objective function.
 * An [optimiser](optimisers.md) that will update the model parameters appropriately.
@@ -32,6 +33,14 @@ Flux.train!(loss, ps, data, opt)
 The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want.
+At first glance it may seem strange that the model we want to train is not among the arguments of `Flux.train!`. However, the target of the optimiser is not the model itself, but the objective function, which measures the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to pass it explicitly. Passing the objective function, rather than the model and a cost function separately, provides more flexibility and the possibility of optimising the computation.
+
+## Model parameters
+
+The model to be trained must have a set of trainable parameters that are used to calculate the gradients of the objective function. The [basics](../models/basics.md) section explains how to create models with such parameters. The second argument of `Flux.train!` must be an object containing those parameters, which can be obtained from a model `m` as `params(m)`.
+
+Such an object holds a reference to the model's parameters, not a copy, so that after training the model behaves according to the updated values.
+
 ## Datasets
 The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point:
@@ -101,3 +110,30 @@ cb = function ()
   accuracy() > 0.9 && Flux.stop()
 end
 ```
+
+## Custom Training Loops
+
+The `Flux.train!` function can be very convenient, especially for simple problems.
+It is also very flexible thanks to callbacks.
+But for some problems it is much cleaner to write your own custom training loop.
+The example below works similarly to the default `Flux.train!`, but with no callbacks.
+You don't need callbacks if you simply call the functions you need directly inside the loop,
+e.g. at the places marked with comments.
+
+```julia
+function my_custom_train!(loss, ps, data, opt)
+  ps = Params(ps)
+  for d in data
+    gs = gradient(ps) do
+      training_loss = loss(d...)
+      # Insert whatever code needs the training loss here, e.g. logging
+      return training_loss
+    end
+    # Insert whatever code needs the gradient here,
+    # e.g. logging it with TensorBoardLogger.jl as a histogram to watch for exploding gradients
+    update!(opt, ps, gs)
+    # Here you might check validation-set accuracy and break out for early stopping
+  end
+end
+```
+You could simplify this further, for example by hard-coding the loss function.
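Picking up that last suggestion, a minimal sketch of what a hard-coded variant might look like (editorial, not part of the diff; the model `m`, the `mse` loss and the dummy data are assumptions):

```julia
using Flux
using Flux.Optimise: update!

# Assumed setup, for illustration only.
m = Dense(10, 2)
data = [(rand(Float32, 10), rand(Float32, 2)) for _ in 1:16]
opt = ADAM()

function my_simple_train!(m, data, opt)
  ps = params(m)
  for (x, y) in data
    # The loss is hard-coded into the loop instead of being passed in.
    gs = gradient(ps) do
      Flux.mse(m(x), y)
    end
    update!(opt, ps, gs)
  end
end

my_simple_train!(m, data, opt)
```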
diff --git a/src/Flux.jl b/src/Flux.jl index 0b57f81d..9969b323 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -6,7 +6,7 @@ using Base: tail using Zygote, MacroTools, Juno, Reexport, Statistics, Random using MacroTools: @forward @reexport using NNlib -using Zygote: Params, @adjoint, gradient, pullback +using Zygote: Params, @adjoint, gradient, pullback, @nograd export gradient export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, @@ -20,18 +20,9 @@ export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay -using CUDAapi -if has_cuda() - try - using CuArrays - @eval has_cuarrays() = true - catch ex - @warn "CUDA is installed, but CuArrays.jl fails to load" exception=(ex,catch_backtrace()) - @eval has_cuarrays() = false - end -else - has_cuarrays() = false -end + +using CuArrays +const use_cuda = Ref(false) include("utils.jl") include("onehot.jl") @@ -47,8 +38,26 @@ include("data/Data.jl") include("deprecations.jl") -if has_cuarrays() - include("cuda/cuda.jl") +function __init__() + precompiling = ccall(:jl_generating_output, Cint, ()) != 0 + + # we don't want to include the CUDA module when precompiling, + # or we could end up replacing it at run time (triggering a warning) + precompiling && return + + if !CuArrays.functional() + # nothing to do here, and either CuArrays or one of its dependencies will have warned + else + use_cuda[] = true + + # FIXME: this functionality should be conditional at run time by checking `use_cuda` + # (or even better, get moved to CuArrays.jl as much as possible) + if CuArrays.has_cudnn() + include(joinpath(@__DIR__, "cuda/cuda.jl")) + else + @warn "CuArrays.jl did not find libcudnn. Some functionality will not be available." + end + end end end # module diff --git a/src/cuda/cuda.jl b/src/cuda/cuda.jl index 028a0f8b..20aae69c 100644 --- a/src/cuda/cuda.jl +++ b/src/cuda/cuda.jl @@ -2,11 +2,8 @@ module CUDA using ..CuArrays -if CuArrays.libcudnn !== nothing # TODO: use CuArrays.has_cudnn() - include("curnn.jl") - include("cudnn.jl") -else - @warn "CUDNN is not installed, some functionality will not be available." -end +using CuArrays: CUDNN +include("curnn.jl") +include("cudnn.jl") end diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl index 448ea140..d394182e 100644 --- a/src/cuda/cudnn.jl +++ b/src/cuda/cudnn.jl @@ -1,199 +1,5 @@ -using CuArrays: libcudnn -using CuArrays.CUDNN: @check, handle, cudnnStatus_t, cudnnTensorDescriptor_t, - cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc -import CuArrays.CUDAdrv: CuPtr, CU_NULL - -using LinearAlgebra - -mutable struct DropoutDesc - ptr::Ptr{Nothing} - states::CuVector{UInt8} -end - -Base.unsafe_convert(::Type{Ptr{Nothing}}, dd::DropoutDesc) = dd.ptr - -function DropoutDesc(ρ::Real; seed::Integer=0) - d = [C_NULL] - s = Csize_t[0] - @check ccall((:cudnnCreateDropoutDescriptor,libcudnn), cudnnStatus_t, (Ptr{Ptr{Nothing}},), d) - @check ccall((:cudnnDropoutGetStatesSize,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Csize_t}),handle(),s) - states = CuArray{UInt8}(undef, s[]) # TODO: can we drop this when ρ=0? 
- desc = DropoutDesc(d[], states) - @check ccall((:cudnnSetDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Nothing},Cfloat,CuPtr{Nothing},Csize_t,Culonglong), - desc,handle(),ρ,states,length(states),seed) - finalizer(desc) do x - @check ccall((:cudnnDestroyDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x) - end - return desc -end - -const BATCHNORM_SPATIAL = 1 -const BATCHNORM_ACTIVATION = 0 -const BATCHNORM_MIN_EPS = 1e-5 - -@inline _wsize(y) = (map(_ -> 1, size(y)[1:end-2])..., size(y)[end-1], 1) - -@inline _reddims(y) = (collect(1:ndims(y)-2)..., ndims(y)) - -mutable struct BNCache - mean - ivar -end - -BNCache() = BNCache(nothing, nothing) - -# NOTE: CuDNN supports only 4D and 5D Tensors for BatchNorm Operations -# so reshape a 2D Tensor into 4D -batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2}, - running_mean::CuArray{T}, running_var::CuArray{T}, momentum; - cache = nothing, alpha = T(1), beta = T(0), - eps = T(1e-5), training = true) where T<:Union{Float32, Float64} = - dropdims(batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), running_mean, running_var, momentum, - cache = cache, alpha = alpha, beta = beta, eps = eps, training = training), dims = (1, 2)) - -function batchnorm(g::CuArray{T}, b::CuArray{T}, x::Union{CuArray{T, 4},CuArray{T,5}}, - running_mean::CuArray{T}, running_var::CuArray{T}, momentum; - cache = nothing, alpha = T(1), beta = T(0), - eps = T(1e-5), training = true) where T<:Union{Float32, Float64} - y = similar(x) - cudnnBNForward!(y, g, b, x, running_mean, running_var, momentum, cache = cache, - alpha = alpha, beta = beta, eps = eps, training = training) - y -end - -function cudnnBNForward!(y::CuArray{T}, g::CuArray{T}, b::CuArray{T}, x::CuArray{T}, - running_mean::CuArray{T}, running_var::CuArray{T}, - momentum; cache = nothing, - alpha = T(1), beta = T(0), - eps = T(1e-5), training = true) where T<:Union{Float32, Float64} - dims = _wsize(x) - if eps < BATCHNORM_MIN_EPS - # warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", BATCHNORM_MIN_EPS) - eps = BATCHNORM_MIN_EPS - end - xd = TensorDesc(x) - yd = TensorDesc(y) - gd = TensorDesc(T, dims) - - if training - - if cache !== nothing - mean = zeros(CuArray{T}, dims...) - ivar = ones(CuArray{T}, dims...) 
- else - mean = CU_NULL - ivar = CU_NULL - end - - @check ccall((:cudnnBatchNormalizationForwardTraining, libcudnn), cudnnStatus_t, - (cudnnHandle_t,cudnnBatchNormMode_t, - Ptr{T}, Ptr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, CuPtr{T}, - Cdouble, CuPtr{T}, CuPtr{T}, - Cdouble, CuPtr{T}, CuPtr{T}), - handle(), BATCHNORM_SPATIAL, - Ref(T(alpha)), Ref(T(beta)), - xd, x, - yd, y, - gd, g, b, - momentum, running_mean, running_var, - eps, mean, ivar) - - if cache !== nothing - cache.mean = mean - cache.ivar = ivar - end - else - @check ccall((:cudnnBatchNormalizationForwardInference, libcudnn), cudnnStatus_t, - (Ptr{cudnnHandle_t},cudnnBatchNormMode_t, - Ptr{T}, Ptr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, CuPtr{T}, - CuPtr{T}, CuPtr{T}, - Cdouble), - handle(), BATCHNORM_SPATIAL, - Ref(T(alpha)), Ref(T(beta)), - xd, x, - yd, y, - gd, g, b, - running_mean, running_var, - eps) - end -end - -function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2}, dy::CuArray{T, 2}, - running_mean::CuArray{T}, running_var::CuArray{T}, momentum; - cache = nothing, eps = T(1e-5), alpha = T(1), - beta = T(0), training = true) where T<:Union{Float32, Float64} - dg, db, dx = ∇batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), reshape(dy, 1, 1, size(dy, 1), - size(dy, 2)), running_mean, running_var, momentum, cache = cache, eps = eps, - alpha = alpha, beta = beta, training = training) - (dg, db, dropdims(dx, dims = (1, 2))) -end - -function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T}, dy::CuArray{T}, - running_mean::CuArray{T}, running_var::CuArray{T}, momentum; - cache = nothing, eps = T(1e-5), alpha = T(1), - beta = T(0), training = true) where T<:Union{Float32, Float64} - dg = similar(g) - db = similar(b) - dx = similar(x) - cudnnBNBackward!(dg, g, db, dx, x, dy, running_mean, running_var, T(momentum), - training = training, cache = cache, eps = eps, alpha = alpha, beta = beta) - (dg, db, dx) -end - -function cudnnBNBackward!(dg::CuArray{T}, g::CuArray{T}, db::CuArray{T}, - dx::CuArray{T}, x::CuArray{T}, dy::CuArray{T}, - running_mean::CuArray{T}, running_var::CuArray{T}, - momentum; cache = nothing, eps = T(1e-5), - alpha = T(1), beta = T(0), - dalpha = T(1), dbeta = T(0), training = true) where T<:Union{Float32, Float64} - if training - xd = TensorDesc(x) - dyd = TensorDesc(dy) - dxd = TensorDesc(dx) - gd = TensorDesc(T, _wsize(x)) - if cache !== nothing - mean, ivar = cache.mean, cache.ivar - info("mean and ivar are fetched from the cache") - else - mean, ivar = CU_NULL, CU_NULL - end - - if eps < BATCHNORM_MIN_EPS - eps = BATCHNORM_MIN_EPS - end - - @check ccall((:cudnnBatchNormalizationBackward, libcudnn), cudnnStatus_t, - (cudnnHandle_t,cudnnBatchNormMode_t, - Ptr{T}, Ptr{T}, - Ptr{T}, Ptr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, CuPtr{T}, CuPtr{T}, - Cdouble, CuPtr{T}, CuPtr{T}), - handle(), BATCHNORM_SPATIAL, - Ref(T(alpha)), Ref(T(beta)), - Ref(T(dalpha)), Ref(T(dbeta)), - xd, x, - dyd, dy, - dxd, dx, - gd, g, dg, db, - eps, mean, ivar) - else - ivar = 1 ./ sqrt.(reshape(running_var, _wsize(x)) .+ eps) - dx .= dy .* reshape(g, _wsize(x)) .* ivar - dg .= squeeze(sum(dy .* (x .- reshape(running_mean, _wsize(x))) .* ivar, _reddims(dy)), dims = (1,2,4)) - db .= squeeze(sum(dy, _reddims(dy)), dims = (1,2,4)) - end -end - -# Flux Interface +import ..Flux: data +import CuArrays.CUDNN: batchnorm, ∇batchnorm 
(BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, cache = nothing) where T<:Union{Float32, Float64} = BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, training = Flux.istraining())) diff --git a/src/cuda/curnn.jl b/src/cuda/curnn.jl index 17896775..51e26a3e 100644 --- a/src/cuda/curnn.jl +++ b/src/cuda/curnn.jl @@ -1,273 +1,25 @@ -using CuArrays: libcudnn -using CuArrays.CUDNN: @check, cudnnStatus_t, cudnnTensorDescriptor_t, - cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc - -import CuArrays.CUDAdrv: CuPtr, CU_NULL - -using LinearAlgebra - -const RNN_RELU = 0 # Stock RNN with ReLu activation -const RNN_TANH = 1 # Stock RNN with tanh activation -const LSTM = 2 # LSTM with no peephole connections -const GRU = 3 # Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1) - -const LINEAR_INPUT = 0 -const SKIP_INPUT = 1 - -const UNIDIRECTIONAL = 0 -const BIDIRECTIONAL = 1 - -const RNN_ALGO_STANDARD = 0 -const RNN_ALGO_PERSIST_STATIC = 1 -const RNN_ALGO_PERSIST_DYNAMIC = 2 - -# param layout: -# RNN: [weight, bias] × [input, hidden] -# GRU: [weight, bias] × [input, hidden] × [reset, update, newmem] -# LSTM: [weight, bias] × [input, hidden] × [input, forget, newmem, output] - -function params(w::CuVector, input, hidden, n = 1) - slice(offset, shape) = reshape(view(w, offset.+(1:prod(shape))), shape) - wx = slice(0, (input, hidden*n)) - wh = slice(length(wx), (hidden, hidden*n)) - bias = view(w, length(wx)+length(wh) .+ (1:hidden*n)) - (wx, wh), bias -end - -mutable struct RNNDesc{T} - mode::Int - input::Int - hidden::Int - params::CuVector{T} - weights::NTuple{2,CuMatrix{T}} - bias::CuVector{T} - ptr::Ptr{Nothing} -end - -Base.unsafe_convert(::Type{Ptr{Nothing}}, d::RNNDesc) = d.ptr - -function rnnParamSize(T, r, input) - size = Csize_t[0] - @check ccall((:cudnnGetRNNParamsSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Ptr{Nothing},Ptr{Csize_t},Cint), - handle(), r, TensorDesc(T, (1,input,1)), size, cudnnDataType(T)) - return Int(size[])÷sizeof(T) -end - -ngates(mode) = [1, 1, 4, 3][mode+1] -ngates(r::RNNDesc) = ngates(r.mode) - -function RNNDesc{T}(mode::Int, input::Int, hidden::Int; layers = 1) where T - d = [C_NULL] - @check ccall((:cudnnCreateRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Ptr{Nothing}},),d) - - dropoutDesc = DropoutDesc(0) - inputMode = LINEAR_INPUT - direction = UNIDIRECTIONAL - algo = RNN_ALGO_STANDARD - @check ccall((:cudnnSetRNNDescriptor_v6,libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Cint,Ptr{Nothing},Cint,Cint,Cint,Cint,Cint), - handle(),d[],hidden,layers,dropoutDesc,inputMode,direction,mode,algo,cudnnDataType(T)) - - w = CuArrays.zeros(T, rnnParamSize(T, d[], input)) - # TODO: avoid reserve allocation here - rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[]) - finalizer(rd) do x - @check ccall((:cudnnDestroyRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x) - end - return rd -end - -function rnnWorkspaceSize(r::RNNDesc, seqlen, xdesc) - size = Csize_t[0] - @check ccall((:cudnnGetRNNWorkspaceSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Ptr{Ptr{Nothing}},Ptr{Csize_t}), - handle(), r, seqlen, xdesc, size) - return Int(size[]) -end - -const workspace = Ref{Union{Nothing,CuVector{UInt8}}}(nothing) - -function getworkspace(bytes) - if workspace[] === nothing || length(workspace[]) < bytes - workspace[] = CuVector{UInt8}(undef, bytes) - end - workspace[] -end - 
-getworkspace(r::RNNDesc, seqlen, xdesc) = - getworkspace(rnnWorkspaceSize(r, seqlen, xdesc)) - -function rnnTrainingReserveSize(r::RNNDesc, seqlen, xdesc) - size = Csize_t[0] - @check ccall((:cudnnGetRNNTrainingReserveSize,libcudnn), cudnnStatus_t, (Ptr{Nothing}, Ptr{Nothing}, Cint, Ptr{Ptr{Nothing}}, Ptr{Csize_t}), - handle(), r, seqlen, xdesc, size) - return Int(size[]) -end - -function cudnnRNNForward(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, - workspace, reserve=nothing) where T - if reserve == nothing - @check ccall((:cudnnRNNForwardInference, libcudnn), cudnnStatus_t, - (Ptr{Nothing}, Ptr{Nothing}, Cint, - Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, - CuPtr{Nothing}, Csize_t), - handle(), rnn, seqlen, - xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, - workspace, length(workspace)) - else - @check ccall((:cudnnRNNForwardTraining, libcudnn), cudnnStatus_t, - (Ptr{Nothing}, Ptr{Nothing}, Cint, - Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, - CuPtr{Nothing}, Csize_t, CuPtr{Nothing}, Csize_t), - handle(), rnn, seqlen, - xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, - workspace, length(workspace), reserve, length(reserve)) - end -end - -xDesc(x) = [TensorDesc(eltype(x), (1, size(x, 1), size(x, 2)))] - -hDesc(h::Nothing) = C_NULL, CU_NULL -hDesc(x::Integer) = (@assert x == 0; hDesc(nothing)) -function hDesc(h::CuArray) - TensorDesc(eltype(h), (size(h, 1), size(h, 2), 1)), h -end - -# TODO: can we just manipulate strides here? -# TODO: should use repmat, but this isn't implemented. -hBatch(x::AbstractVector, h::CuVector) = h -hBatch(x::AbstractMatrix, h::CuVector) = h .* CuArrays.ones(1, size(x, 2)) -hBatch(x::AbstractMatrix, h::CuMatrix) = h .* CuArrays.ones(1, size(h,2) == 1 ? size(x,2) : 1) - -function forward(rnn::RNNDesc{T}, x::CuArray{T}, h_::CuArray{T}, c_ = nothing, train = Val{false}) where T - h = hBatch(x, h_) - c = c_ == nothing ? nothing : hBatch(x, c_) - @assert size(x, 1) == rnn.input - @assert size(h, 1) == rnn.hidden - @assert size(x, 2) == size(h, 2) - seqLength = 1 - xdesc = xDesc(x) - y = x isa AbstractVector ? similar(x, rnn.hidden) : similar(x, rnn.hidden, size(x, 2)) - ho = similar(h) - ydesc = xDesc(y) - workspace = getworkspace(rnn, seqLength, xdesc) - reserve = train == Val{true} ? - CuVector{UInt8}(undef, rnnTrainingReserveSize(rnn, seqLength, xdesc)) : - nothing - co = c == nothing ? c : similar(c) - cudnnRNNForward(rnn, seqLength, - xdesc, x, - hDesc(h)..., - hDesc(c)..., - FilterDesc(T, (1, 1, length(rnn.params))), rnn.params, - ydesc, y, - hDesc(ho)..., - hDesc(co)..., - workspace, reserve) - result = c == nothing ? (y, ho) : (y, ho, co) - return train == Val{true} ? 
(reserve, result) : result -end - -forwardTrain(rnn::RNNDesc{T}, x::CuArray{T}, h::CuArray{T}, c = nothing) where T = - forward(rnn, x, h, c, Val{true}) - -function cudnnRNNBackwardData(rnn::RNNDesc{T}, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco, - wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, rs) where T - @check ccall((:cudnnRNNBackwardData,libcudnn),cudnnStatus_t, - (Ptr{Nothing}, Ptr{Nothing}, Cint, - Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, - Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, - CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, - CuPtr{Nothing}, Csize_t, CuPtr{Nothing}, Csize_t), - handle(), rnn, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco, - wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, length(ws), rs, length(rs)) -end - -function backwardData(rnn::RNNDesc{T}, y, dy_, dho, dco, h, c, reserve) where T - # Same as above, any more efficient way? - dy = dy_ isa Integer ? zero(y) : dy_ - yd = xDesc(y) - dx = y isa AbstractVector ? similar(dy, rnn.input) : similar(dy, rnn.input, size(dy, 2)) - dh = similar(h) - dc = c == nothing ? nothing : similar(c) - cudnnRNNBackwardData(rnn, 1, - yd, y, yd, dy, hDesc(dho)..., hDesc(dco)..., - FilterDesc(T, (1, 1, length(rnn.params))), rnn.params, - hDesc(h)..., hDesc(c)..., xDesc(dx), dx, hDesc(dh)..., hDesc(dc)..., - workspace[], reserve) - return c == nothing ? (dx, dh) : (dx, dh, dc) -end - -backwardData(rnn, y, dy, dho, hx, reserve) = - backwardData(rnn, y, dy, dho, nothing, hx, nothing, reserve) - -function cudnnRNNBackwardWeights(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, yd, y, dwd, dw, - workspace, reserve) where T - @check ccall((:cudnnRNNBackwardWeights,libcudnn), cudnnStatus_t, - (Ptr{Nothing}, Ptr{Nothing}, Cint, # handle, rnnDesc, seqLength - Ptr{Ptr{Nothing}}, CuPtr{T}, #x - Ptr{Nothing}, CuPtr{T}, #hx - Ptr{Ptr{Nothing}}, CuPtr{T}, #y - CuPtr{Nothing}, Csize_t, #ws - Ptr{Nothing}, CuPtr{T}, #dw - CuPtr{Nothing}, Csize_t), #rs - handle(), rnn, seqlen, xd, x, hd, h, yd, y, - workspace, length(workspace), dwd, dw, reserve, length(reserve)) -end - -function backwardWeights(rnn::RNNDesc{T}, x, h, y, reserve) where T - dw = zero(rnn.params) - cudnnRNNBackwardWeights(rnn, 1, - xDesc(x), x, hDesc(h)..., xDesc(y), y, - FilterDesc(T, (1, 1, length(dw))), dw, - workspace[], reserve) - return params(dw, rnn.input, rnn.hidden, ngates(rnn)) -end - -# Interface - import ..Flux: Flux, relu using CuArrays.CUDAnative -using CuArrays: @cuindex, cudims - -function LinearAlgebra.copy_transpose!(dst::CuArray, src::CuArray) - function kernel(dst, src) - I = @cuindex dst - dst[I...] = src[reverse(I)...] - return - end - blk, thr = cudims(dst) - @cuda blocks=blk threads=thr kernel(dst, src) - return dst -end CuRNN{T} = Flux.RNNCell{<:Union{typeof(tanh),typeof(relu)},<:CuArray{T,2},<:CuArray{T,1}} CuGRU{T} = Flux.GRUCell{<:CuArray{T,2},<:CuArray{T,1}} CuLSTM{T} = Flux.LSTMCell{<:CuArray{T,2},<:CuArray{T,1}} CuRNNs{T} = Union{CuRNN{T},CuGRU{T},CuLSTM{T}} -function copyparams!(m::CuRNNs, d::RNNDesc) - Wi, Wh = d.weights - copy_transpose!(Wi, m.Wi) - copy_transpose!(Wh, m.Wh) - copy_transpose!(d.bias, m.b) - return -end - -function RNNDesc(m::CuRNNs{T}) where T +function CUDNN.RNNDesc(m::CuRNNs{T}) where T h, i = length(m.h), size(m.Wi, 2) mode = m isa CuRNN ? - (m.σ == tanh ? RNN_TANH : RNN_RELU) : - m isa CuGRU ? GRU : LSTM - r = RNNDesc{T}(mode, i, h) + (m.σ == tanh ? 
CUDNN.CUDNN_RNN_TANH : CUDNN.CUDNN_RNN_RELU) : + m isa CuGRU ? CUDNN.CUDNN_GRU : CUDNN.CUDNN_LSTM + r = CUDNN.RNNDesc{T}(mode, i, h) return r end const descs = WeakKeyDict() function desc(rnn) - d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = RNNDesc(rnn)) - copyparams!(rnn, d) + d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = CUDNN.RNNDesc(rnn)) + CUDNN.setweights!(d, rnn.Wi, rnn.Wh, rnn.b) return d end @@ -275,17 +27,17 @@ import Zygote using Zygote: @adjoint function (m::CuRNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} - y, h′ = forward(desc(m), x, h) + y, h′ = CUDNN.forward(desc(m), x, h) return h′, y end function (m::CuGRU{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} - y, h′ = forward(desc(m), x, h) + y, h′ = CUDNN.forward(desc(m), x, h) return h′, y end function (m::CuLSTM{T})(h::NTuple{2,CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64} - y, h′, c′ = forward(desc(m), x, h[1], h[2]) + y, h′, c′ = CUDNN.forward(desc(m), x, h[1], h[2]) return (h′, c′), y end @@ -303,7 +55,7 @@ unbroadcast(x::AbstractArray, Δ) = coerce_cuda(x::Union{CuArray,Nothing}) = x coerce_cuda(x::Tuple) = coerce_cuda.(x) -coerce_cuda(x) = x .+ CuArrays.fill(0) +coerce_cuda(x::AbstractArray) = x .+ CuArrays.fill(0) function struct_grad!(cx::Zygote.Context, x, x̄) for f in fieldnames(typeof(x)) @@ -316,28 +68,23 @@ end for RNN in (CuRNN, CuGRU) @eval @adjoint function (m::$RNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} - reserve, (y, ho) = forwardTrain(desc(m), x, h) + (y, ho), back = CUDNN.pullback(desc(m), x, h) (ho, y), function (Δ) - dho, dy = coerce_cuda(Δ) - h_ = hBatch(x, h) - dx, dh = backwardData(descs[m], y, dy, dho, h_, reserve) - (dWi, dWh), db = backwardWeights(descs[m], x, h_, y, reserve) - dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing)) - (dm, unbroadcast(h, dh), dx) + dho, dy = coerce_cuda(Δ) # Support FillArrays etc. + m̄ = back(dy, dho) + dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(m̄.Wi),Wh=transpose(m̄.Wh),b=m̄.b,h=nothing)) + (dm, unbroadcast(h, m̄.h), m̄.x) end end end @adjoint function (m::CuLSTM)((h, c)::Tuple{CuArray{T},CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64} - reserve, (y, ho, co) = forwardTrain(desc(m), x, h, c) + (y, ho, co), back = CUDNN.pullback(desc(m), x, h, c) ((ho, co), y), function (Δ) - dhc, dy = coerce_cuda(Δ) + dhc, dy = coerce_cuda(Δ) # Support FillArrays etc. dho, dco = dhc === nothing ? 
(nothing, nothing) : dhc - h_ = hBatch(x, h) - c_ = hBatch(x, c) - dx, dh, dc = backwardData(descs[m], y, dy, dho, dco, h_, c_, reserve) - (dWi, dWh), db = backwardWeights(descs[m], x, h_, y, reserve) - dm = struct_grad!(__context__, m, (Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing,c=nothing)) - (dm, (unbroadcast(h, dh), unbroadcast(c, dc)), dx) + m̄ = back(dy, dho, dco) + dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(m̄.Wi),Wh=transpose(m̄.Wh),b=m̄.b,h=nothing,c=nothing)) + (dm, (unbroadcast(h, m̄.h), unbroadcast(c, m̄.c)), m̄.x) end end diff --git a/src/functor.jl b/src/functor.jl index 73483ab9..a36b5765 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -39,7 +39,7 @@ end trainable(m) = functor(m)[1] -params!(p::Params, x::AbstractArray{<:Real}, seen = IdSet()) = push!(p, x) +params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x) function params!(p::Params, x, seen = IdSet()) x in seen && return @@ -73,13 +73,7 @@ end cpu(m) = fmap(x -> adapt(Array, x), m) -const gpu_adaptor = if has_cuarrays() - CuArrays.cu -else - identity -end - -gpu(x) = fmap(gpu_adaptor, x) +gpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) : x # Precision diff --git a/src/layers/basic.jl b/src/layers/basic.jl index f42a9619..6f056429 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -39,24 +39,39 @@ function Base.show(io::IO, c::Chain) print(io, ")") end +""" + outdims(c::Chain, isize) + +Calculate the output dimensions given the input dimensions, `isize`. + +```julia +m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) +outdims(m, (10, 10)) == (6, 6) +``` +""" +outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers))(isize) # This is a temporary and naive implementation # it might be replaced in the future for better performance # see issue https://github.com/FluxML/Flux.jl/issues/702 # Johnny Chen -- @johnnychen94 +# only slightly changed to better handle interaction with Zygote @dsweber2 """ activations(c::Chain, input) Calculate the forward results of each layers in Chain `c` with `input` as model input. """ function activations(c::Chain, input) - rst = [] - for l in c - x = get(rst, length(rst), input) - push!(rst, l(x)) - end - return rst + extraChain(c.layers, input) end +function extraChain(fs::Tuple, x) + res = first(fs)(x) + return (res, extraChain(Base.tail(fs), res)...) +end + +extraChain(::Tuple{}, x) = () + + """ Dense(in::Integer, out::Integer, σ = identity) @@ -112,6 +127,19 @@ end (a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +""" + outdims(l::Dense, isize) + +Calculate the output dimensions given the input dimensions, `isize`. 
+ +```julia +m = Dense(10, 5) +outdims(m, (5, 2)) == (5,) +outdims(m, (10,)) == (5,) +``` +""" +outdims(l::Dense, isize) = (size(l.W)[1],) + """ Diagonal(in::Integer) @@ -141,6 +169,7 @@ function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", length(l.α), ")") end +outdims(l::Diagonal, isize) = (length(l.α),) """ Maxout(over) @@ -189,6 +218,8 @@ function (mo::Maxout)(input::AbstractArray) mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over) end +outdims(l::Maxout, isize) = outdims(first(l.over), isize) + """ SkipConnection(layers, connection) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 5b19269a..94a10606 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,4 +1,9 @@ -using NNlib: conv, ∇conv_data, depthwiseconv +using NNlib: conv, ∇conv_data, depthwiseconv, output_size + +# pad dims of x with dims of y until ndims(x) == ndims(y) +_paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...) + +_convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+ (ksize .- 1).*dsize .- (pad[1:2:end] .+ pad[2:2:end]) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -17,7 +22,7 @@ Example: Applying Conv layer to a 1-channel input using a 2x2 window size, out = 16 Conv((2, 2), 1=>16, relu) -Data should be stored in WHCN order (width, height, # channels, # batches). +Data should be stored in WHCN order (width, height, # channels, batch size). In other words, a 100×100 RGB image would be a `100×100×3×1` array, and a batch of 50 would be a `100×100×3×50` array. @@ -106,8 +111,23 @@ end a(T.(x)) """ - ConvTranspose(filter::Tuple, in=>out) - ConvTranspose(filter::Tuple, in=>out, activation) + outdims(l::Conv, isize::Tuple) + +Calculate the output dimensions given the input dimensions, `isize`. +Batch size and channel size are ignored as per `NNlib.jl`. + +```julia +m = Conv((3, 3), 3 => 16) +outdims(m, (10, 10)) == (8, 8) +outdims(m, (10, 10, 1, 3)) == (8, 8) +``` +""" +outdims(l::Conv, isize) = + output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) + +""" + ConvTranspose(size, in=>out) + ConvTranspose(size, in=>out, relu) Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. 
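A common use of the new `outdims` methods is sizing the dense head of a convolutional model without working out the post-convolution spatial size by hand. A minimal sketch, assuming the `outdims` definitions added in this diff; the 28×28×3 input, the 10-class head, and the reshape closure are illustrative choices rather than anything prescribed by the API:

```julia
using Flux

convs = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
h, w = Flux.outdims(convs, (28, 28))          # (24, 24): each valid 3×3 conv shrinks both dims by 2

model = Chain(convs,
              x -> reshape(x, :, size(x, 4)), # flatten W×H×C×N to (W*H*C)×N
              Dense(h * w * 32, 10))

size(model(rand(Float32, 28, 28, 3, 5)))      # (10, 5)
```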
@@ -178,6 +198,9 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray) ) end +# TODO: Find proper fix for https://github.com/FluxML/Flux.jl/issues/900 +@nograd conv_transpose_dims + function (c::ConvTranspose)(x::AbstractArray) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) @@ -198,6 +221,8 @@ end (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad) + """ DepthwiseConv(filter::Tuple, in=>out) DepthwiseConv(filter::Tuple, in=>out, activation) @@ -298,9 +323,12 @@ end (a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +outdims(l::DepthwiseConv, isize) = + output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) + """ CrossCor(size, in=>out) - CrossCor(size, in=>out, relu) + CrossCor(size, in=>out, activation) Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`. `in` and `out` specify the number of input and output channels respectively. @@ -351,6 +379,11 @@ function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ return CrossCor(σ, w, b, stride, pad, dilation) end +function CrossCor(;weight::AbstractArray, bias::Union{Zeros, AbstractVector{T}}, + activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} + CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) +end + function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N @@ -387,6 +420,9 @@ end (a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = a(T.(x)) +outdims(l::CrossCor, isize) = + output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) + """ MaxPool(k) @@ -416,6 +452,8 @@ function Base.show(io::IO, m::MaxPool) print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end +outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) + """ MeanPool(k) @@ -443,3 +481,5 @@ end function Base.show(io::IO, m::MeanPool) print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") end + +outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) \ No newline at end of file diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index f2344af8..647dda25 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -1,5 +1,5 @@ gate(h, n) = (1:h) .+ h*(n-1) -gate(x::AbstractVector, h, n) = x[gate(h,n)] +gate(x::AbstractVector, h, n) = @view x[gate(h,n)] gate(x::AbstractMatrix, h, n) = x[gate(h,n),:] # Stateful recurrence @@ -45,8 +45,7 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")") """ reset!(rnn) -Reset the hidden state of a recurrent layer back to its original value. See also -`truncate!`. +Reset the hidden state of a recurrent layer back to its original value. 
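The `gate` change above returns a `@view` for vector arguments, so slicing a stacked gate vector (for example, the four gate biases of an LSTM cell) no longer allocates a copy. A self-contained sketch of the pattern; `gate_range` is a local stand-in for Flux's internal helper:

```julia
# n-th block of length h in a stacked parameter vector.
gate_range(h, n) = (1:h) .+ h * (n - 1)

b = collect(1.0:8.0)                  # pretend h = 2, so four gate blocks of length 2
forget = view(b, gate_range(2, 2))    # second block, without copying
forget == [3.0, 4.0]                  # true
forget .= 0                           # a view writes through to the parent vector
b[3:4] == [0.0, 0.0]                  # true
```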
Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 4c216672..159a8385 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -1,13 +1,24 @@ +using CuArrays using NNlib: logsoftmax, logσ # Cost functions mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) -function crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) - -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2) +function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing) + return -sum(y .* log.(ŷ)) * 1 // size(y, 2) end +function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number) + return -sum(y .* log.(ŷ)) .* weight * 1 // size(y, 2) +end + +function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector) + return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2) +end + +crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight) + function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2) end @@ -25,6 +36,9 @@ Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerica """ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) +# Re-definition to fix interaction with CuArrays. +CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) + """ logitbinarycrossentropy(logŷ, y) @@ -39,13 +53,60 @@ but it is more numerically stable. """ logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) +# Re-definition to fix interaction with CuArrays. +CuArrays.@cufunc logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) + """ normalise(x::AbstractArray; dims=1) - Normalises x to mean 0 and standard deviation 1, across the dimensions given by dims. Defaults to normalising over columns. +Normalises `x` to mean 0 and standard deviation 1, across the dimensions given by `dims`. Defaults to normalising over columns. + + julia> a = reshape(collect(1:9), 3, 3) + 3×3 Array{Int64,2}: + 1 4 7 + 2 5 8 + 3 6 9 + + julia> normalise(a) + 3×3 Array{Float64,2}: + -1.22474 -1.22474 -1.22474 + 0.0 0.0 0.0 + 1.22474 1.22474 1.22474 + + julia> normalise(a, dims=2) + 3×3 Array{Float64,2}: + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 + -1.22474 0.0 1.22474 """ function normalise(x::AbstractArray; dims=1) μ′ = mean(x, dims = dims) σ′ = std(x, dims = dims, mean = μ′, corrected=false) return (x .- μ′) ./ σ′ end + +""" + kldivergence(ŷ, y) +KLDivergence is a measure of how much one probability distribution is different from the other. +It is always non-negative and zero only when both the distributions are equal everywhere. +[KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence). +""" +function kldivergence(ŷ, y) + entropy = sum(y .* log.(y)) *1 //size(y,2) + cross_entropy = crossentropy(ŷ, y) + return entropy + cross_entropy +end + +""" + poisson(ŷ, y) +Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. +[Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). +""" +poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) + +""" + hinge(ŷ, y) +Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). +[Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss). 
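The `weight` keyword of `crossentropy` now dispatches on `nothing` (unweighted), a scalar, or a per-class vector via the `_crossentropy` helpers above. A short sketch of the three call forms; the probabilities and weights are made-up values:

```julia
using Flux

ŷ = Flux.softmax(rand(Float32, 3, 4))              # 3 classes × 4 samples
y = Flux.onehotbatch([1, 2, 3, 1], 1:3)

Flux.crossentropy(ŷ, y)                            # weight = nothing (default)
Flux.crossentropy(ŷ, y, weight = 2f0)              # one scalar weight for every class
Flux.crossentropy(ŷ, y, weight = [1f0, 2f0, 3f0])  # one weight per class (per row of ŷ)
```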
+""" +hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) diff --git a/src/onehot.jl b/src/onehot.jl index fe93c5c5..7a3123ec 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -37,12 +37,10 @@ import Adapt: adapt, adapt_structure adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data)) -if has_cuarrays() - import .CuArrays: CuArray, cudaconvert - import Base.Broadcast: BroadcastStyle, ArrayStyle - BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}() - cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data)) -end +import .CuArrays: CuArray, cudaconvert +import Base.Broadcast: BroadcastStyle, ArrayStyle +BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}() +cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data)) """ onehot(l, labels[, unk]) @@ -127,6 +125,4 @@ onecold(y::AbstractMatrix, labels...) = onecold(y::OneHotMatrix, labels...) = mapreduce(x -> Flux.onecold(x, labels...), |, y.data, dims = 2, init = 0) -# TODO probably still want this as a custom adjoint Zygote -# onecold(x::TrackedVector, l...) = onecold(data(x), l...) -# onecold(x::TrackedMatrix, l...) = onecold(data(x), l...) +@nograd onecold, onehot, onehotbatch diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 58cd5ff7..cf4496f4 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -1,5 +1,4 @@ using Flux -using Base: @get! using MacroTools: @forward const ϵ = 1e-8 @@ -7,10 +6,28 @@ const ϵ = 1e-8 # TODO: should use weak refs """ - Descent(η) + Descent(η) Classic gradient descent optimiser with learning rate `η`. -For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`. +For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` + +## Parameters + - Learning Rate (η): The amount by which the gradients are discounted before updating the weights. Defaults to `0.1`. + +## Example +```julia-repl +opt = Descent() # uses default η (0.1) + +opt = Descent(0.3) # use provided η + +ps = params(model) + +gs = gradient(ps) do + loss(x, y) +end + +Flux.Optimise.update!(opt, ps, gs) +``` """ mutable struct Descent eta::Float64 @@ -23,9 +40,20 @@ function apply!(o::Descent, x, Δ) end """ - Momentum(η = 0.01; ρ = 0.9) + Momentum(η, ρ) Gradient descent with learning rate `η` and momentum `ρ`. + +## Parameters + - Learning Rate (`η`): Amount by which gradients are discounted before updating the weights. Defaults to `0.01`. + - Momentum (`ρ`): Parameter that accelerates descent in the relevant direction and dampens oscillations. Defaults to `0.9`. + +## Examples +```julia +opt = Momentum() # uses defaults of η = 0.01 and ρ = 0.9 + +opt = Momentum(0.01, 0.99) +``` """ mutable struct Momentum eta::Float64 @@ -43,9 +71,20 @@ function apply!(o::Momentum, x, Δ) end """ - Nesterov(eta, ρ = 0.9) + Nesterov(η, ρ) Gradient descent with learning rate `η` and Nesterov momentum `ρ`. + +## Parameters + - Learning Rate (η): Amount by which the gradients are dicsounted berfore updating the weights. Defaults to `0.001`. + - Nesterov Momentum (ρ): Paramters controlling the amount of nesterov momentum to be applied. Defaults to `0.9`. + +## Examples +```julia +opt = Nesterov() # uses defaults η = 0.001 and ρ = 0.9 + +opt = Nesterov(0.003, 0.95) +``` """ mutable struct Nesterov eta::Float64 @@ -64,11 +103,23 @@ function apply!(o::Nesterov, x, Δ) end """ - RMSProp(η = 0.001, ρ = 0.9) + RMSProp(η, ρ) +Implements the RMSProp algortihm. 
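Marking `onecold`, `onehot`, and `onehotbatch` with `@nograd` means metrics built on them can sit next to differentiated code without Zygote trying to trace the integer argmax. A hedged sketch; the `accuracy` helper is illustrative, not a Flux API:

```julia
using Flux, Statistics

# Fraction of columns whose predicted class matches the one-hot target.
accuracy(m, x, y) = mean(Flux.onecold(m(x)) .== Flux.onecold(y))

m = Dense(4, 3)
x = rand(Float32, 4, 10)
y = Flux.onehotbatch(rand(1:3, 10), 1:3)
accuracy(m, x, y)   # a Float64 in [0, 1], safe to log while training
```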
Often a good choice for recurrent networks. Paramters other than learning rate generally don't need tuning. + +## Parameters + - Learning Rate (η): Defaults to `0.001`. + - Rho (ρ): Defaults to `0.9`. + +## Examples +```julia +opt = RMSProp() # uses default η = 0.001 and ρ = 0.9 + +opt = RMSProp(0.002, 0.95) +``` + +## References [RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) -optimiser. Parameters other than learning rate don't need tuning. Often a good -choice for recurrent networks. """ mutable struct RMSProp eta::Float64 @@ -86,8 +137,22 @@ function apply!(o::RMSProp, x, Δ) end """ - ADAM(η = 0.001, β = (0.9, 0.999)) + ADAM(η, β::Tuple) +Implements the ADAM optimiser. + +## Paramters + - Learning Rate (`η`): Defaults to `0.001`. + - Beta (`β::Tuple`): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. + +## Examples + +```julia +opt = ADAM() # uses the default η = 0.001 and β = (0.9, 0.999) + +opt = ADAM(0.001, (0.9, 0.8)) +``` +## References [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. """ mutable struct ADAM @@ -109,8 +174,23 @@ function apply!(o::ADAM, x, Δ) end """ - RADAM(η = 0.001, β = (0.9, 0.999)) + RADAM(η, β::Tuple) +Implements the rectified ADAM optimizer. + +## Parameters + - Learning Rate (η): Defaults to `0.001` + - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. + +## Examples + +```julia +opt = RADAM() # uses the default η = 0.001 and β = (0.9, 0.999) + +opt = RADAM(0.001, (0.9, 0.8)) +``` + +## References [RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM). """ mutable struct RADAM @@ -139,10 +219,22 @@ function apply!(o::RADAM, x, Δ) end """ - AdaMax(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08) + AdaMax(η, β::Tuple) -[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on -the ∞-norm. +Variant of ADAM based on ∞-norm. + +## Parameters + - Learning Rate (η): Defaults to `0.001` + - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. + +## Examples +```julia +opt = AdaMax() # uses default η and β + +opt = AdaMax(0.001, (0.9, 0.995)) +``` +## References +[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. """ mutable struct AdaMax eta::Float64 @@ -163,8 +255,21 @@ function apply!(o::AdaMax, x, Δ) end """ - ADAGrad(η = 0.1; ϵ = 1e-8) + ADAGrad(η) +Implements AdaGrad. It has parameter specific learning rates based on how frequently it is updated. + +## Parameters + - Learning Rate (η): Defaults to `0.1` + +## Examples +```julia +opt = ADAGrad() # uses default η = 0.1 + +opt = ADAGrad(0.001) +``` + +## References [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. Parameters don't need tuning. """ @@ -177,16 +282,27 @@ ADAGrad(η = 0.1) = ADAGrad(η, IdDict()) function apply!(o::ADAGrad, x, Δ) η = o.eta - acc = get!(o.acc, x, fill(ϵ, size(x)))::typeof(x) + acc = get!(o.acc, x, fill!(zero(x), ϵ))::typeof(x) @. acc += Δ^2 @. Δ *= η / (√acc + ϵ) end """ - ADADelta(ρ = 0.9, ϵ = 1e-8) + ADADelta(ρ) -[ADADelta](https://arxiv.org/abs/1212.5701) optimiser. Parameters don't need -tuning. +Version of ADAGrad that adapts learning rate based on a window of past gradient updates. Parameters don't need tuning. + +## Parameters + - Rho (ρ): Factor by which gradient is decayed at each time step. Defaults to `0.9`. 
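All of these optimisers are driven the same way; the docstring examples above show construction, and the snippet below strings one of them (ADAM) into a complete parameter update, mirroring the `params`/`gradient`/`update!` pattern from the `Descent` example. The toy model, data, and loss are illustrative only:

```julia
using Flux

m = Dense(2, 1)
x, y = rand(Float32, 2, 16), rand(Float32, 1, 16)
loss(x, y) = Flux.mse(m(x), y)

opt = ADAM(1e-3, (0.9, 0.999))        # defaults written out explicitly
ps  = Flux.params(m)

gs = gradient(ps) do
  loss(x, y)
end

Flux.Optimise.update!(opt, ps, gs)    # one in-place update of every parameter in ps
```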
+ +## Examples +```julia +opt = ADADelta() # uses default ρ = 0.9 +opt = ADADelta(0.89) +``` + +## References +[ADADelta](https://arxiv.org/abs/1212.5701) optimiser. """ mutable struct ADADelta rho::Float64 @@ -205,10 +321,22 @@ function apply!(o::ADADelta, x, Δ) end """ - AMSGrad(η = 0.001, β = (0.9, 0.999)) + AMSGrad(η, β::Tuple) -[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need -tuning. +Implements AMSGrad version of the ADAM optimiser. Parameters don't need tuning. + +## Parameters + - Learning Rate (η): Defaults to `0.001`. + - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. + +## Examples +```julia +opt = AMSGrad() # uses default η and β +opt = AMSGrad(0.001, (0.89, 0.995)) +``` + +## References +[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. """ mutable struct AMSGrad eta::Float64 @@ -220,18 +348,30 @@ AMSGrad(η = 0.001, β = (0.9, 0.999)) = AMSGrad(η, β, IdDict()) function apply!(o::AMSGrad, x, Δ) η, β = o.eta, o.beta - mt, vt, v̂t = get!(o.state, x, (fill(ϵ, size(x)), fill(ϵ, size(x)), fill(ϵ, size(x)))) + mt, vt, v̂t = get!(o.state, x, (fill!(zero(x), ϵ), fill!(zero(x), ϵ), fill!(zero(x), ϵ))) @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2 - @. v̂t = max.(v̂t, vt) + @. v̂t = max(v̂t, vt) @. Δ = η * mt / (√v̂t + ϵ) end """ - NADAM(η = 0.001, β = (0.9, 0.999)) + NADAM(η, β::Tuple) -[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. Parameters don't need -tuning. +Nesterov variant of ADAM. Parameters don't need tuning. + +## Parameters + - Learning Rate (η): Defaults to `0.001`. + - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`. + +## Examples +```julia +opt = NADAM() # uses default η and β +opt = NADAM(0.002, (0.89, 0.995)) +``` + +## References +[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. """ mutable struct NADAM eta::Float64 @@ -252,9 +392,23 @@ function apply!(o::NADAM, x, Δ) end """ - ADAMW((η = 0.001, β = (0.9, 0.999), decay = 0) + ADAMW(η, β::Tuple, decay) -[ADAMW](https://arxiv.org/abs/1711.05101) fixing weight decay regularization in Adam. +Variant of ADAM defined by fixing weight decay regularization. + +## Parameters + - Learning Rate (η): Defaults to `0.001`. + - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to (0.9, 0.999). + - decay: Decay applied to weights during optimisation. Defaults to 0. + +## Examples +```julia +opt = ADAMW() # uses default η, β and decay +opt = ADAMW(0.001, (0.89, 0.995), 0.1) +``` + +## References +[ADAMW](https://arxiv.org/abs/1711.05101) """ ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) = Optimiser(ADAM(η, β), WeightDecay(decay)) @@ -287,9 +441,15 @@ function apply!(o::Optimiser, x, Δ) end """ -`InvDecay(γ)` + InvDecay(γ) -Apply inverse time decay to an optimiser +Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified. +``` + +## Parameters + - gamma (γ): Defaults to `0.001` + +## Example ```julia Optimiser(InvDecay(..), Opt(..)) ``` @@ -310,13 +470,22 @@ function apply!(o::InvDecay, x, Δ) end """ -`ExpDecay(eta, decay, decay_step, clip)` + ExpDecay(eta, decay, decay_step, clip) -Schedule the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`. 
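`ADAMW` above is defined as a composition, `Optimiser(ADAM(η, β), WeightDecay(decay))`, and `apply!(o::Optimiser, x, Δ)` runs the wrapped rules on the gradient in order. The same mechanism chains a schedule around any base optimiser; a small sketch with illustrative hyperparameters:

```julia
using Flux
using Flux.Optimise: Optimiser, ExpDecay, WeightDecay

my_adamw  = Optimiser(ADAM(0.001, (0.9, 0.999)), WeightDecay(0.1))   # hand-rolled ADAMW
scheduled = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), ADAM())      # eta, decay, decay_step, clip
```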
+Discount the learning rate `eta` by a multiplicative factor `decay` every `decay_step` till a minimum of `clip`. +## Parameters + - Learning Rate (eta): Defaults to `0.001`. + - decay: Factor by which the learning rate is discounted. Defaults to `0.1`. + - decay_step: Schedules decay operations by setting number of steps between two decay operations. Defaults to `1000`. + - clip: Minimum value of learning rate. Defaults to `1e-4`. + +## Example To apply exponential decay to an optimiser: ```julia Optimiser(ExpDecay(..), Opt(..)) + + opt = Optimiser(ExpDecay(), ADAM()) ``` """ mutable struct ExpDecay @@ -340,9 +509,12 @@ function apply!(o::ExpDecay, x, Δ) end """ -`WeightDecay(wd)` + WeightDecay(wd) -Decay the weight parameter by `wd` +Decays the weight by `wd` + +## Parameters + - weight decay (wd): 0 """ mutable struct WeightDecay wd::Real diff --git a/src/utils.jl b/src/utils.jl index 36bab5a9..dbf85c95 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,6 +1,11 @@ # Arrays -glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0/sum(dims)) -glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0/sum(dims)) +nfan() = 1, 1 #fan_in, fan_out +nfan(n) = 1, n #A vector is treated as a n×1 matrix +nfan(n_out, n_in) = n_in, n_out #In case of Dense kernels: arranged as matrices +nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of convolution kernels + +glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...))) +glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...))) ones(T::Type, dims...) = Base.ones(T, dims...) zeros(T::Type, dims...) = Base.zeros(T, dims...) @@ -98,6 +103,48 @@ function batchseq(xs, pad = nothing, n = maximum(length(x) for x in xs)) [batch([xs_[j][i] for j = 1:length(xs_)]) for i = 1:n] end +# Flattening models to weight vectors, and back + +function _restructure(m, xs) + i = 0 + fmap(m) do x + x isa AbstractArray || return x + x = reshape(xs[i.+(1:length(x))], size(x)) + i += length(x) + return x + end +end + +""" + destructure(m) + +Flatten a model's parameters into a single weight vector. + + julia> m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) + Chain(Dense(10, 5, σ), Dense(5, 2), softmax) + + julia> θ, re = destructure(m); + + julia> θ + 67-element Array{Float32,1}: + -0.1407104 + ... + +The second return value `re` allows you to reconstruct the original network after making +modifications to the weight vector (for example, with a hypernetwork). + + julia> re(θ .* 2) + Chain(Dense(10, 5, σ), Dense(5, 2), softmax) +""" +function destructure(m) + xs = Zygote.Buffer([]) + fmap(m) do x + x isa AbstractArray && push!(xs, x) + return x + end + return vcat(vec.(copy(xs))...), p -> _restructure(m, p) +end + # Other """ diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 0bb7b2ef..c75cfb4e 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -25,9 +25,16 @@ cm = gpu(m) @test all(p isa CuArray for p in params(cm)) @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} -x = [1,2,3] +x = [1.,2.,3.] cx = gpu(x) @test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx) +@test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0) +@test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0])) + +x = [-1.1491, 0.8619, 0.3127] +y = [1, 1, 0.] 
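`destructure` flattens every array leaf of a model into one vector and returns a closure that rebuilds the model from such a vector. A quick round-trip check, assuming the `destructure`/`_restructure` pair added above; the model shape mirrors the docstring example:

```julia
using Flux

m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
θ, re = Flux.destructure(m)

length(θ) == sum(length, Flux.params(m))   # 67 = 10*5 + 5 + 5*2 + 2

x = rand(Float32, 10)
re(θ)(x) ≈ m(x)                            # rebuilding from θ reproduces the model
zeroed = re(θ .* 0)                        # e.g. a copy of the network with all weights zeroed
```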
+@test Flux.binarycrossentropy.(σ.(x),y) ≈ Array(Flux.binarycrossentropy.(cu(σ.(x)),cu(y))) +@test Flux.logitbinarycrossentropy.(x,y) ≈ Array(Flux.logitbinarycrossentropy.(cu(x),cu(y))) xs = rand(5, 5) ys = Flux.onehotbatch(1:5,1:5) @@ -51,10 +58,10 @@ end @test y[3,:] isa CuArray end -if CuArrays.libcudnn != nothing - @info "Testing Flux/CUDNN" - include("cudnn.jl") - if !haskey(ENV, "CI_DISABLE_CURNN_TEST") - include("curnn.jl") - end +if CuArrays.has_cudnn() + @info "Testing Flux/CUDNN" + include("cudnn.jl") + include("curnn.jl") +else + @warn "CUDNN unavailable, not testing GPU DNN support" end diff --git a/test/cuda/curnn.jl b/test/cuda/curnn.jl index ee5753d6..7753837a 100644 --- a/test/cuda/curnn.jl +++ b/test/cuda/curnn.jl @@ -22,8 +22,8 @@ end rand(10, batch_size) cux = gpu(x) - y, back = pullback((r, x) -> (r(x)), rnn, x) - cuy, cuback = pullback((r, x) -> (r(x)), curnn, cux) + y, back = pullback((r, x) -> r(x), rnn, x) + cuy, cuback = pullback((r, x) -> r(x), curnn, cux) @test y ≈ collect(cuy) @test haskey(Flux.CUDA.descs, curnn.cell) diff --git a/test/layers/basic.jl b/test/layers/basic.jl index cbe250fc..421c7721 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -4,11 +4,13 @@ import Flux: activations @testset "basic" begin @testset "helpers" begin @testset "activations" begin - dummy_model = Chain(Dense(10,5,σ),Dense(5,2),softmax) - x = rand(10) - @test activations(Chain(), x) == [] - @test activations(dummy_model, x)[1] == dummy_model[1](x) - @test activations(dummy_model, x)[2] == x |> dummy_model[1] |> dummy_model[2] + dummy_model = Chain(x->x.^2, x->x .- 3, x -> tan.(x)) + x = randn(10) + @test activations(dummy_model, x)[1] == x.^2 + @test activations(dummy_model, x)[2] == (x.^2 .- 3) + @test activations(dummy_model, x)[3] == tan.(x.^2 .- 3) + + @test activations(Chain(), x) == () @test activations(Chain(identity, x->:foo), x)[2] == :foo # results include `Any` type end end @@ -19,6 +21,12 @@ import Flux: activations # numeric test should be put into testset of corresponding layer end + @testset "Activations" begin + c = Chain(Dense(3,5,relu), Dense(5,1,relu)) + X = Float32.([1.0; 1.0; 1.0]) + @test_nowarn gradient(()->Flux.activations(c, X)[2][1], params(c)) + end + @testset "Dense" begin @test length(Dense(10, 5)(randn(10))) == 5 @test_throws DimensionMismatch Dense(10, 5)(randn(1)) @@ -84,4 +92,19 @@ import Flux: activations @test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4) end end + + @testset "output dimensions" begin + m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) + @test Flux.outdims(m, (10, 10)) == (6, 6) + + m = Dense(10, 5) + @test Flux.outdims(m, (5, 2)) == (5,) + @test Flux.outdims(m, (10,)) == (5,) + + m = Flux.Diagonal(10) + @test Flux.outdims(m, (10,)) == (10,) + + m = Maxout(() -> Conv((3, 3), 3 => 16), 2) + @test Flux.outdims(m, (10, 10)) == (8, 8) + end end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index d3345929..1ba92c9f 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -1,5 +1,6 @@ using Flux, Test using Flux: maxpool, meanpool +using Flux: gradient @testset "Pooling" begin x = randn(Float32, 10, 10, 3, 2) @@ -83,6 +84,10 @@ end y = Conv((3,3), 1 => 1)(x) x_hat = ConvTranspose((3, 3), 1 => 1)(y) @test size(x_hat) == size(x) + + m = ConvTranspose((3,3), 1=>1) + # Test that the gradient call does not throw: #900 + @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads end @testset "CrossCor" begin @@ -90,7 +95,7 @@ end w = rand(2,2,1,1) y = CrossCor(w, 
[0.0]) - @test sum(w .* x[1:2, 1:2, :, :]) == y(x)[1, 1, 1, 1] + @test isapprox(sum(w .* x[1:2, 1:2, :, :]), y(x)[1, 1, 1, 1], rtol=1e-7) r = zeros(Float32, 28, 28, 1, 5) m = Chain( @@ -113,17 +118,17 @@ end l = Conv((3,3), 1=>1) expected = zeros(eltype(l.weight),5,5,1,1) expected[2:end-1,2:end-1,1,1] = l.weight - @test expected == l(data) + @test expected ≈ l(data) l = Conv((3,1), 1=>1) expected = zeros(eltype(l.weight),5,7,1,1) expected[2:end-1,4,1,1] = l.weight - @test expected == l(data) + @test expected ≈ l(data) l = Conv((1,3), 1=>1) expected = zeros(eltype(l.weight),7,5,1,1) expected[4,2:end-1,1,1] = l.weight - @test expected == l(data) + @test expected ≈ l(data) @test begin # we test that the next expression does not throw @@ -131,3 +136,55 @@ end true end end + +@testset "conv output dimensions" begin + m = Conv((3, 3), 3 => 16) + @test Flux.outdims(m, (10, 10)) == (8, 8) + m = Conv((3, 3), 3 => 16; stride = 2) + @test Flux.outdims(m, (5, 5)) == (2, 2) + m = Conv((3, 3), 3 => 16; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = Conv((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (5, 5)) == (4, 4) + + m = ConvTranspose((3, 3), 3 => 16) + @test Flux.outdims(m, (8, 8)) == (10, 10) + m = ConvTranspose((3, 3), 3 => 16; stride = 2) + @test Flux.outdims(m, (2, 2)) == (5, 5) + m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (4, 4)) == (5, 5) + + m = DepthwiseConv((3, 3), 3 => 6) + @test Flux.outdims(m, (10, 10)) == (8, 8) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2) + @test Flux.outdims(m, (5, 5)) == (2, 2) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (5, 5)) == (4, 4) + + m = CrossCor((3, 3), 3 => 16) + @test Flux.outdims(m, (10, 10)) == (8, 8) + m = CrossCor((3, 3), 3 => 16; stride = 2) + @test Flux.outdims(m, (5, 5)) == (2, 2) + m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test Flux.outdims(m, (5, 5)) == (4, 4) + + m = MaxPool((2, 2)) + @test Flux.outdims(m, (10, 10)) == (5, 5) + m = MaxPool((2, 2); stride = 1) + @test Flux.outdims(m, (5, 5)) == (4, 4) + m = MaxPool((2, 2); stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) + + m = MeanPool((2, 2)) + @test Flux.outdims(m, (10, 10)) == (5, 5) + m = MeanPool((2, 2); stride = 1) + @test Flux.outdims(m, (5, 5)) == (4, 4) + m = MeanPool((2, 2); stride = 2, pad = 3) + @test Flux.outdims(m, (5, 5)) == (5, 5) +end \ No newline at end of file diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 22a5d283..4399a256 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -191,6 +191,7 @@ end end +if VERSION >= v"1.1" @testset "GroupNorm" begin # begin tests squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions @@ -289,5 +290,5 @@ end x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test BN(x) ≈ GN(x) end - +end end diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 9e01af07..7cb8ed2e 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -49,12 +49,33 @@ const ϵ = 1e-7 @testset "logitbinarycrossentropy" begin @test 
logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0) end - + + y = [1 2 3] + y1 = [4.0 5.0 6.0] + @testset "kldivergence" begin + @test Flux.kldivergence(y, y1) ≈ 4.761838062403337 + @test Flux.kldivergence(y, y) ≈ 0 + end + + y = [1 2 3 4] + y1 = [5.0 6.0 7.0 8.0] + @testset "hinge" begin + @test Flux.hinge(y, y1) ≈ 0 + @test Flux.hinge(y, 0.5 .* y) ≈ 0.125 + end + + y = [0.1 0.2 0.3] + y1 = [0.4 0.5 0.6] + @testset "poisson" begin + @test Flux.poisson(y, y1) ≈ 1.0160455586700767 + @test Flux.poisson(y, y) ≈ 0.5044459776946685 + end + @testset "no spurious promotions" begin for T in (Float32, Float64) y = rand(T, 2) ŷ = rand(T, 2) - for f in (mse, crossentropy, logitcrossentropy) + for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson) fwd, back = Flux.pullback(f, ŷ, y) @test fwd isa T @test eltype(back(one(T))[1]) == T diff --git a/test/runtests.jl b/test/runtests.jl index 61def2b1..1505e96a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -19,7 +19,7 @@ include("layers/normalisation.jl") include("layers/stateless.jl") include("layers/conv.jl") -if isdefined(Flux, :CUDA) +if Flux.use_cuda[] include("cuda/cuda.jl") else @warn "CUDA unavailable, not testing GPU support" diff --git a/test/utils.jl b/test/utils.jl index 18a57139..1c275e85 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,6 +1,6 @@ using Flux -using Flux: throttle, glorot_uniform, glorot_normal, stack, unstack -using StatsBase: std +using Flux: throttle, nfan, glorot_uniform, glorot_normal, stack, unstack +using StatsBase: var using Random using Test @@ -56,18 +56,26 @@ end # Set random seed so that these tests don't fail randomly Random.seed!(0) - # glorot_uniform should yield a kernel with stddev ~= sqrt(6/(n_in + n_out)), - # and glorot_normal should yield a kernel with stddev != 2/(n_in _ n_out) - for (n_in, n_out) in [(100, 100), (100, 400)] - v = glorot_uniform(n_in, n_out) - @test minimum(v) > -1.1*sqrt(6/(n_in + n_out)) - @test minimum(v) < -0.9*sqrt(6/(n_in + n_out)) - @test maximum(v) > 0.9*sqrt(6/(n_in + n_out)) - @test maximum(v) < 1.1*sqrt(6/(n_in + n_out)) + @testset "Fan in/out" begin + @test nfan() == (1, 1) #For a constant + @test nfan(100) == (1, 100) #For vector + @test nfan(100, 200) == (200, 100) #For Dense layer + @test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer + @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer + @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D Conv layer + end - v = glorot_normal(n_in, n_out) - @test std(v) > 0.9*sqrt(2/(n_in + n_out)) - @test std(v) < 1.1*sqrt(2/(n_in + n_out)) + @testset "glorot" begin + # glorot_uniform and glorot_normal should both yield a kernel with + # variance ≈ 2/(fan_in + fan_out) + for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] + for init ∈ [glorot_uniform, glorot_normal] + v = init(dims...) + fan_in, fan_out = nfan(dims...) + σ2 = 2 / (fan_in + fan_out) + @test 0.9σ2 < var(v) < 1.1σ2 + end + end end end
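The new `glorot_uniform`/`glorot_normal` tests above check the fan-based variance rule, variance ≈ 2 / (fan_in + fan_out), with `nfan` supplying the fans for dense and convolutional shapes. A REPL-style spot check for a conv-shaped kernel, using the same 10% tolerance as the test (`nfan` is unexported, hence the explicit import):

```julia
using Flux: glorot_uniform, nfan
using Statistics: var

w = glorot_uniform(3, 3, 16, 32)                      # 3×3 kernel, 16 => 32 channels
fan_in, fan_out = nfan(3, 3, 16, 32)                  # (3*3*16, 3*3*32) = (144, 288)
isapprox(var(w), 2 / (fan_in + fan_out); rtol = 0.1)  # true with high probability
```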