more docs and constructors

Author: Dhairya Gandhi
Date: 2020-02-26 22:29:14 +05:30
Commit: cd931793ef
34 changed files with 967 additions and 880 deletions

.github/workflows/CompatHelper.yml (new file)

@@ -0,0 +1,24 @@
name: CompatHelper
on:
  schedule:
    - cron: '00 00 * * *'
jobs:
  CompatHelper:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        julia-version: [1.3]
        julia-arch: [x64]
        os: [ubuntu-latest]
    steps:
      - uses: julia-actions/setup-julia@latest
        with:
          version: ${{ matrix.julia-version }}
      - name: Pkg.add("CompatHelper")
        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
      - name: CompatHelper.main()
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: julia -e 'using CompatHelper; CompatHelper.main()'

.github/workflows/TagBot.yml (new file)

@@ -0,0 +1,11 @@
name: TagBot
on:
  schedule:
    - cron: 0 * * * *
jobs:
  TagBot:
    runs-on: ubuntu-latest
    steps:
      - uses: JuliaRegistries/TagBot@v1
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

.gitlab-ci.yml

@@ -1,51 +1,41 @@
before_script:
- export CI_DISABLE_CURNN_TEST=true
variables:
CI_IMAGE_TAG: 'cuda'
include:
-  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/common.yml'
+  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v6.yml'
+image: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
-.flux:
extends: .test
script:
- julia -e 'using InteractiveUtils;
versioninfo()'
- mkdir $JULIA_DEPOT_PATH # Pkg3.jl#325
- julia --project -e 'using Pkg;
Pkg.instantiate();
Pkg.build();
Pkg.test(; coverage=true);'
test:v1.0:
extends: .flux
variables:
CI_VERSION_TAG: 'v1.0'
-test:v1.1:
-  extends: .flux
-  variables:
-    CI_VERSION_TAG: 'v1.1'
+# julia:1.0:
+#   extends:
+#     - .julia:1.0
+#     - .test
# tags:
# - nvidia
#
# julia:1.1:
# extends:
# - .julia:1.1
# - .test
# tags:
# - nvidia
#
# julia:1.2:
# extends:
# - .julia:1.2
# - .test
# tags:
# - nvidia
-test:v1.2:
-  extends: .flux
-  variables:
-    CI_VERSION_TAG: 'v1.2'
+julia:1.3:
+  extends:
+    - .julia:1.3
+    - .test
+  tags:
+    - nvidia
-test:v1.3:
extends: .flux
variables:
CI_VERSION_TAG: 'v1.3'
test:v1.0:
extends: .flux
variables:
CI_VERSION_TAG: 'v1.0'
test:dev:
extends: .flux
variables:
CI_VERSION_TAG: 'dev'
julia:nightly:
extends:
- .julia:nightly
- .test
tags:
- nvidia
  allow_failure: true

.travis.yml

@@ -6,7 +6,7 @@ os:
  # - osx
julia:
-  - 1.1
+  - 1.3
  - nightly

matrix:
@@ -16,7 +16,7 @@ matrix:
jobs:
  include:
    - stage: "Documentation"
-      julia: 1.0
+      julia: 1.3
      os: linux
      script:
        - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd()));

Manifest.toml

@@ -2,15 +2,15 @@
[[AbstractFFTs]] [[AbstractFFTs]]
deps = ["LinearAlgebra"] deps = ["LinearAlgebra"]
git-tree-sha1 = "380e36c66edfa099cd90116b24c1ce8cafccac40" git-tree-sha1 = "051c95d6836228d120f5f4b984dd5aba1624f716"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "0.4.1" version = "0.5.0"
[[AbstractTrees]] [[AbstractTrees]]
deps = ["Markdown", "Test"] deps = ["Markdown"]
git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b" git-tree-sha1 = "8201f932428d25a2e2903300764515754847d87d"
uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
version = "0.2.1" version = "0.3.0"
[[Adapt]] [[Adapt]]
deps = ["LinearAlgebra"] deps = ["LinearAlgebra"]
@ -21,46 +21,34 @@ version = "1.0.0"
[[Base64]] [[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinDeps]]
deps = ["Compat", "Libdl", "SHA", "URIParser"]
git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9"
uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
version = "0.8.10"
[[BinaryProvider]] [[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"] deps = ["Libdl", "SHA"]
git-tree-sha1 = "c7361ce8a2129f20b0e05a89f7070820cfed6648" git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.6" version = "0.5.8"
[[CEnum]] [[CEnum]]
git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb" git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.2.0" version = "0.2.0"
[[CSTParser]]
deps = ["Tokenize"]
git-tree-sha1 = "c69698c3d4a7255bc1b4bc2afc09f59db910243b"
uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f"
version = "0.6.2"
[[CUDAapi]] [[CUDAapi]]
deps = ["Libdl", "Logging"] deps = ["Libdl", "Logging"]
git-tree-sha1 = "e063efb91cfefd7e6afd92c435d01398107a500b" git-tree-sha1 = "56a813440ac98a1aa64672ab460a1512552211a7"
uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
version = "1.2.0" version = "2.1.0"
[[CUDAdrv]] [[CUDAdrv]]
deps = ["CUDAapi", "Libdl", "Printf"] deps = ["CEnum", "CUDAapi", "Printf"]
git-tree-sha1 = "9ce99b5732c70e06ed97c042187baed876fb1698" git-tree-sha1 = "1fce616fa0806c67c133eb1d2f68f0f1a7504665"
uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
version = "3.1.0" version = "5.0.1"
[[CUDAnative]] [[CUDAnative]]
deps = ["Adapt", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Printf", "TimerOutputs"] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"]
git-tree-sha1 = "52ae1ce10ebfa686e227655c47b19add89308623" git-tree-sha1 = "6e11d5c2c91fc623952e94c4fb73f9c4db74795a"
uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
version = "2.3.1" version = "2.7.0"
[[CodecZlib]] [[CodecZlib]]
deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"]
@ -70,9 +58,9 @@ version = "0.6.0"
[[ColorTypes]] [[ColorTypes]]
deps = ["FixedPointNumbers", "Random"] deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "10050a24b09e8e41b951e9976b109871ce98d965" git-tree-sha1 = "7b62b728a5f3dd6ee3b23910303ccf27e82fad5e"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.8.0" version = "0.8.1"
[[Colors]] [[Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"] deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"]
@ -86,40 +74,22 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0"
uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" uuid = "bbf7d656-a473-5ed7-a52c-81e309532950"
version = "0.2.0" version = "0.2.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "84aa74986c5b9b898b0d1acaf3258741ee64754f"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "2.1.0"
[[Conda]]
deps = ["JSON", "VersionParsing"]
git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032"
uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d"
version = "1.3.0"
[[Crayons]]
deps = ["Test"]
git-tree-sha1 = "f621b8ef51fd2004c7cf157ea47f027fdeac5523"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "4.0.0"
[[CuArrays]] [[CuArrays]]
deps = ["AbstractFFTs", "Adapt", "CUDAapi", "CUDAdrv", "CUDAnative", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"]
git-tree-sha1 = "46b48742a84bb839e74215b7e468a4a1c6ba30f9" git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b"
uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
version = "1.2.1" version = "1.6.0"
[[DataAPI]] [[DataAPI]]
git-tree-sha1 = "8903f0219d3472543fc4b2f5ebaf675a07f817c0" git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.0.1" version = "1.1.0"
[[DataStructures]] [[DataStructures]]
deps = ["InteractiveUtils", "OrderedCollections"] deps = ["InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "0809951a1774dc724da22d26e4289bbaab77809a" git-tree-sha1 = "f784254f428fb8fd7ac15982e5862a38a44523d3"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.17.0" version = "0.17.7"
[[Dates]] [[Dates]]
deps = ["Printf"] deps = ["Printf"]
@ -130,32 +100,38 @@ deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[DiffResults]] [[DiffResults]]
deps = ["Compat", "StaticArrays"] deps = ["StaticArrays"]
git-tree-sha1 = "34a4a1e8be7bc99bc9c611b895b5baf37a80584c" git-tree-sha1 = "da24935df8e0c6cf28de340b958f6aac88eaa0cc"
uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
version = "0.0.4" version = "1.0.2"
[[DiffRules]] [[DiffRules]]
deps = ["Random", "Test"] deps = ["NaNMath", "Random", "SpecialFunctions"]
git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7" git-tree-sha1 = "10dca52cf6d4a62d82528262921daf63b99704a2"
uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
version = "0.0.10" version = "1.0.0"
[[Distributed]] [[Distributed]]
deps = ["Random", "Serialization", "Sockets"] deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[FFTW]] [[FFTW]]
deps = ["AbstractFFTs", "BinaryProvider", "Conda", "Libdl", "LinearAlgebra", "Reexport", "Test"] deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"]
git-tree-sha1 = "6c5b420da0b8c12098048561b8d58f81adea506f" git-tree-sha1 = "109d82fa4b00429f9afcce873e9f746f11f018d3"
uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
version = "1.0.1" version = "1.2.0"
[[FFTW_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "05674f209a6e3387dd103a945b0113eeb64b1a58"
uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a"
version = "3.3.9+3"
[[FillArrays]] [[FillArrays]]
deps = ["LinearAlgebra", "Random", "SparseArrays"] deps = ["LinearAlgebra", "Random", "SparseArrays"]
git-tree-sha1 = "8fba6ddaf66b45dec830233cea0aae43eb1261ad" git-tree-sha1 = "fec413d4fc547992eb62a5c544cedb6d7853c1f5"
uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
version = "0.6.4" version = "0.8.4"
[[FixedPointNumbers]] [[FixedPointNumbers]]
git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b"
@ -163,33 +139,33 @@ uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
version = "0.6.1" version = "0.6.1"
[[ForwardDiff]] [[ForwardDiff]]
deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "4c4d727f1b7e0092134fabfab6396b8945c1ea5b" git-tree-sha1 = "840700059391d36e2498d89c2e82c08f261f2a2a"
uuid = "f6369f11-7733-5829-9624-2563aa707210" uuid = "f6369f11-7733-5829-9624-2563aa707210"
version = "0.10.3" version = "0.10.8"
[[GPUArrays]] [[GPUArrays]]
deps = ["Adapt", "FFTW", "FillArrays", "LinearAlgebra", "Printf", "Random", "Serialization", "StaticArrays", "Test"] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
git-tree-sha1 = "77e27264276fe97a7e7fb928bf8999a145abc018" git-tree-sha1 = "e756da6cee76a5f1436a05827fa8fdf3badc577f"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "1.0.3" version = "2.0.1"
[[IRTools]] [[IRTools]]
deps = ["InteractiveUtils", "MacroTools", "Test"] deps = ["InteractiveUtils", "MacroTools", "Test"]
git-tree-sha1 = "e23faa71b8f54c3fdc99b230b9c2906cafdddca5" git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306"
uuid = "7869d1d1-7146-5819-86e3-90919afe41df" uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
version = "0.2.3" version = "0.3.0"
[[IntelOpenMP_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "fb8e1c7a5594ba56f9011310790e03b5384998d6"
uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0"
version = "2018.0.3+0"
[[InteractiveUtils]] [[InteractiveUtils]]
deps = ["Markdown"] deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[JSON]]
deps = ["Dates", "Mmap", "Parsers", "Unicode"]
git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e"
uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
version = "0.21.0"
[[Juno]] [[Juno]]
deps = ["Base64", "Logging", "Media", "Profile", "Test"] deps = ["Base64", "Logging", "Media", "Profile", "Test"]
git-tree-sha1 = "30d94657a422d09cb97b6f86f04f750fa9c50df8" git-tree-sha1 = "30d94657a422d09cb97b6f86f04f750fa9c50df8"
@ -198,9 +174,9 @@ version = "0.7.2"
[[LLVM]] [[LLVM]]
deps = ["CEnum", "Libdl", "Printf", "Unicode"] deps = ["CEnum", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "4a05f742837779a00bd8c9a18da6817367c4245d" git-tree-sha1 = "1d08d7e4250f452f6cb20e4574daaebfdbee0ff7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0" uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "1.3.0" version = "1.3.3"
[[LibGit2]] [[LibGit2]]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
@ -215,11 +191,17 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[Logging]] [[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MKL_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "61069ae718b8ab1e325bbfb4e5268902e7ea08e3"
uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
version = "2019.0.117+0"
[[MacroTools]] [[MacroTools]]
deps = ["CSTParser", "Compat", "DataStructures", "Test", "Tokenize"] deps = ["DataStructures", "Markdown", "Random"]
git-tree-sha1 = "d6e9dedb8c92c3465575442da456aec15a89ff76" git-tree-sha1 = "e2fc7a55bb2224e203bbd8b59f72b91323233458"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.1" version = "0.5.3"
[[Markdown]] [[Markdown]]
deps = ["Base64"] deps = ["Base64"]
@ -232,24 +214,30 @@ uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27"
version = "0.5.0" version = "0.5.0"
[[Missings]] [[Missings]]
git-tree-sha1 = "29858ce6c8ae629cf2d733bffa329619a1c843d0" deps = ["DataAPI"]
git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5"
uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
version = "0.4.2" version = "0.4.3"
[[Mmap]] [[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804" uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[NNlib]] [[NNlib]]
deps = ["Libdl", "LinearAlgebra", "Requires", "Statistics", "TimerOutputs"] deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"]
git-tree-sha1 = "0c667371391fc6bb31f7f12f96a56a17098b3de8" git-tree-sha1 = "135c0de4794d5e214b06f1fb4787af4a72896e61"
uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
version = "0.6.0" version = "0.6.2"
[[NaNMath]] [[NaNMath]]
deps = ["Compat"] git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f"
git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2"
uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
version = "0.3.2" version = "0.3.3"
[[OpenSpecFun_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.3+1"
[[OrderedCollections]] [[OrderedCollections]]
deps = ["Random", "Serialization", "Test"] deps = ["Random", "Serialization", "Test"]
@ -257,14 +245,8 @@ git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.1.0" version = "1.1.0"
[[Parsers]]
deps = ["Dates", "Test"]
git-tree-sha1 = "ef0af6c8601db18c282d092ccbd2f01f3f0cd70b"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
version = "0.3.7"
[[Pkg]] [[Pkg]]
deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Printf]] [[Printf]]
@ -290,10 +272,10 @@ uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "0.2.0" version = "0.2.0"
[[Requires]] [[Requires]]
deps = ["Test"] deps = ["UUIDs"]
git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1" git-tree-sha1 = "999513b7dea8ac17359ed50ae8ea089e4464e35e"
uuid = "ae029012-a4dd-5104-9daa-d747884805df" uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "0.5.2" version = "1.0.0"
[[SHA]] [[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
@ -301,10 +283,6 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]] [[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]] [[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc" uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
@ -319,16 +297,16 @@ deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]] [[SpecialFunctions]]
deps = ["BinDeps", "BinaryProvider", "Libdl"] deps = ["OpenSpecFun_jll"]
git-tree-sha1 = "3bdd374b6fd78faf0119b8c5d538788dbf910c6e" git-tree-sha1 = "268052ee908b2c086cc0011f528694f02f3e2408"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b" uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "0.8.0" version = "0.9.0"
[[StaticArrays]] [[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"] deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "db23bbf50064c582b6f2b9b043c8e7e98ea8c0c6" git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c"
uuid = "90137ffa-7385-5640-81b9-e52037218182" uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "0.11.0" version = "0.12.1"
[[Statistics]] [[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"] deps = ["LinearAlgebra", "SparseArrays"]
@ -345,15 +323,10 @@ deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TimerOutputs]] [[TimerOutputs]]
deps = ["Crayons", "Printf", "Test", "Unicode"] deps = ["Printf"]
git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c" git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.0" version = "0.5.3"
[[Tokenize]]
git-tree-sha1 = "dfcdbbfb2d0370716c815cbd6f8a364efb6f42cf"
uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624"
version = "0.5.6"
[[TranscodingStreams]] [[TranscodingStreams]]
deps = ["Random", "Test"] deps = ["Random", "Test"]
@ -361,12 +334,6 @@ git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c"
uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
version = "0.9.5" version = "0.9.5"
[[URIParser]]
deps = ["Test", "Unicode"]
git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69"
uuid = "30578b45-9adc-5946-b283-645ec420af67"
version = "0.4.0"
[[UUIDs]] [[UUIDs]]
deps = ["Random", "SHA"] deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
@ -374,30 +341,26 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[Unicode]] [[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[VersionParsing]]
deps = ["Compat"]
git-tree-sha1 = "c9d5aa108588b978bd859554660c8a5c4f2f7669"
uuid = "81def892-9a0e-5fdd-b105-ffc91e053289"
version = "1.1.3"
[[ZipFile]] [[ZipFile]]
deps = ["BinaryProvider", "Libdl", "Printf"] deps = ["Libdl", "Printf", "Zlib_jll"]
git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" git-tree-sha1 = "5de8320a46812da1a8ca98b16a8a4546d44efa62"
uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
version = "0.8.3" version = "0.9.0"
[[Zlib_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "5618a43055eb09377edca21d19d0e99bce24a9c3"
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.11+7"
[[Zygote]] [[Zygote]]
deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
git-tree-sha1 = "38241b40ebd8748bcacad5e6c7ba3ab3cc7a15c9" git-tree-sha1 = "74382bcc4c1e8075e14554da67d75565f8fb7827"
repo-rev = "master"
repo-url = "https://github.com/FluxML/Zygote.jl.git"
uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
version = "0.3.4" version = "0.4.5"
[[ZygoteRules]] [[ZygoteRules]]
deps = ["MacroTools"] deps = ["MacroTools"]
git-tree-sha1 = "c4c29b30b8ff3be13d4244e78be7df2a42bc54d0" git-tree-sha1 = "b3b4882cc9accf6731a08cc39543fbc6b669dca8"
repo-rev = "master"
repo-url = "https://github.com/FluxML/ZygoteRules.jl.git"
uuid = "700de1a5-db45-46bc-99cf-38207098b444" uuid = "700de1a5-db45-46bc-99cf-38207098b444"
version = "0.2.0" version = "0.2.0"

NEWS.md

@@ -1,3 +1,16 @@
# v0.10.0
* The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669)
- The dependency on Tracker.jl has been removed.
- This means Flux no longer depends on a specialised `TrackedArray` type and can be used directly with ordinary `Array`s.
- Tracker compatibility is maintained in most common cases, but Zygote will be the preferred AD backend for Flux from now on.
* The CUDNN wrappers have been [moved from Flux into CuArrays](https://github.com/FluxML/Flux.jl/pull/874), to allow for better support of the CUDA backend, improve the user experience, and make Flux leaner.
* `*crossentropy` functions now [work as expected with CuArrays](https://github.com/FluxML/Flux.jl/pull/926). [PR for binarycrossentropy](https://github.com/FluxML/Flux.jl/pull/940).
* Added [clearer docs](https://github.com/FluxML/Flux.jl/pull/904) around training and the Optimiser interface.
* [Layer initialisations](https://github.com/FluxML/Flux.jl/pull/937) have been improved with a clearer API on how to extend it for other purposes.
* [Better messaging around CUDA availability](https://github.com/FluxML/Flux.jl/pull/924), with hooks to initialize the GPU as default where possible.
* `@treelike` has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation.
* `testmode!` is deprecated in favour of [`istraining`](https://github.com/FluxML/Flux.jl/pull/669).

# v0.9.0
* [Depthwise convolutional layer API changes](https://github.com/FluxML/Flux.jl/pull/756) from `in => mult` channel specification to `in => out` channel specification, and deprecates implicit `out` constructor.
* New [SkipConnection](https://github.com/FluxML/Flux.jl/pull/446), which can be used to train residual neural network architectures.

Project.toml

@@ -1,17 +1,15 @@
name = "Flux"
uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.9.0"
+version = "0.10.2"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@@ -24,13 +22,20 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
ZygoteRules = "700de1a5-db45-46bc-99cf-38207098b444"

[compat]
-CUDAapi = "1.1"
-CuArrays = "1.2"
+AbstractTrees = "0.2, 0.3"
+Adapt = "1"
+CodecZlib = "0.5, 0.6"
+Colors = "0.8, 0.9, 0.10, 0.11"
+CuArrays = "1.6"
+Juno = "0.5, 0.6, 0.7, 0.8"
+MacroTools = "0.3, 0.4, 0.5"
NNlib = "0.6"
-Zygote = "0.3"
+Reexport = "0.2"
+StatsBase = "0"
+ZipFile = "0.7, 0.8, 0.9"
+Zygote = "0.4"
julia = "1"

[extras]

README.md

@@ -7,93 +7,9 @@
Flux is an elegant approach to machine learning. It's a 100% pure-Julia stack, and provides lightweight abstractions on top of Julia's native GPU and AD support. Flux makes the easy things easy while remaining fully hackable.

```julia
-julia> Pkg.add("Flux")
+] add Flux
```

See the [documentation](https://fluxml.github.io/Flux.jl/) or the [model zoo](https://github.com/FluxML/model-zoo/) for examples.

-If you use Flux in research, please cite the following paper:
+If you use Flux in your research, please [cite](CITATION.bib) our work.
```
@article{innes:2018,
author = {Mike Innes},
title = {Flux: Elegant Machine Learning with Julia},
journal = {Journal of Open Source Software},
year = {2018},
doi = {10.21105/joss.00602},
}
```
## Features
Flux has powerful high-level features, and common architectures can be defined in a few lines.
```julia
model = Chain(
Dense(768, 128, σ),
LSTM(128, 256),
LSTM(256, 128),
Dense(128, 10),
softmax)
loss(x, y) = crossentropy(model(x), y)
Flux.train!(loss, data, ADAM(...))
```
Yet you can easily strip away the layers, and directly write the mathematics for your problem. Flux will seamlessly take gradients of any Julia code, so your model looks just like the paper.
```julia
W = param(randn(2, 10))
b = param(randn(2))
y(x) = σ.(W * x .+ b)
```
If that's *still* not enough, you can go as deep as you want, even writing your own CUDA kernels with [CUDAnative](https://github.com/JuliaGPU/CUDAnative.jl)! All this can be freely mixed-and-matched in a single model or script, and it all runs interactively via Jupyter or Juno.
```julia
function gpu_add(a, b, c)
i = (blockIdx().x-1) * blockDim().x + threadIdx().x
c[i] = a[i] + b[i]
return nothing
end
```
Unusual architectures are no problem in Flux, as you can use all the loops, control flow and even macros that you're used to. Here's a Tree RNN in 4 lines.
```julia
tree() = rand() < 0.5 ? rand(10) : (tree(), tree()) # dummy data
shrink = Dense(20, 10)
combine(a, b) = shrink([a; b])
model(x) = x
model(x::Tuple) = combine(model(x[1]), model(x[2]))
model(tree()) # Sample output
```
Despite this flexibility, Julia's advanced compiler lets us do some powerful optimisations. For example, this definition of `sigmoid` automatically gets fused into a *single* GPU kernel so it's really fast.
```julia
sigmoid(xs) = 1 ./ (1 .+ exp.(.-xs))
```
Similarly, Flux is the first dynamic framework to support [compiling to the browser](https://fluxml.github.io/experiments/) and model import via [formats like ONNX](https://github.com/FluxML/ONNX.jl/), both of which are thinly-veiled compiler problems.
For more on our philosophy on machine learning, check out our article [On Machine Learning & Programming Languages](https://julialang.org/blog/2017/12/ml&pl).
## Contributing & Help
For general questions and help, check out Julia's [community forum](https://discourse.julialang.org/c/domain/ML).
Flux development is carried out via our [GitHub issues](https://github.com/FluxML/Flux.jl/issues), so feel free to open feature requests or PRs here.
For more informal discussions we'd love to have you on the [Julia slack](https://slackinvite.julialang.org/), where we hang out on the #machine-learning channel.
## Related Packages
Check out [Metalhead.jl](https://github.com/FluxML/Metalhead.jl) for common computer vision datasets and trained models.
[MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) provides further common datasets.

bors.toml

@@ -1,4 +1,4 @@
status = [
-  "ci/gitlab/%"
+  "ci/gitlab%"
]
-timeout-sec = 14400
+timeout-sec = 7200

docs/src/models/basics.md

@@ -219,3 +219,24 @@ Flux.@functor Affine
```

This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md).
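For instance (a small sketch, assuming the `Affine(in, out)` constructor defined earlier on this page):

```julia
a = Affine(10, 5)       # the example layer from this page (hypothetical sizes)

Flux.params(a)          # now collects a.W and a.b
# a_gpu = gpu(a)        # and the whole layer can be moved to the GPU when CuArrays is available
```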
## Utility functions
Flux provides some utility functions to help you generate models in an automated fashion.
`outdims` enables you to calculate the spatial output dimensions of layers like `Conv` when applied to input images of a given size.
Currently limited to the following layers:
- `Chain`
- `Dense`
- `Conv`
- `Diagonal`
- `Maxout`
- `ConvTranspose`
- `DepthwiseConv`
- `CrossCor`
- `MaxPool`
- `MeanPool`
```@docs
outdims
```
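For example (a small sketch; the sizes below assume the default stride of 1 and no padding):

```julia
using Flux

m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
Flux.outdims(m, (10, 10))          # (6, 6): each 3×3 Conv trims two pixels per dimension

Flux.outdims(Dense(10, 5), (10,))  # (5,)
```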

docs/src/models/layers.md

@@ -65,3 +65,15 @@ AlphaDropout
LayerNorm
GroupNorm
```
## Cost Functions
```@docs
mse
crossentropy
logitcrossentropy
binarycrossentropy
logitbinarycrossentropy
kldivergence
poisson
hinge
```
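For instance, a couple of these in action (a small sketch; arrays are laid out as features × batch):

```julia
using Flux

ŷ = softmax(rand(Float32, 10, 4))         # predicted probabilities for a batch of 4
y = Flux.onehotbatch([1, 3, 5, 7], 1:10)  # one-hot targets

Flux.mse(ŷ, y)                   # mean squared error
Flux.crossentropy(ŷ, y)          # expects probabilities (e.g. after softmax)
Flux.logitcrossentropy(rand(Float32, 10, 4), y)  # expects raw logits instead
```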

docs/src/saving.md

@@ -113,6 +113,6 @@ You can even store optimiser state alongside the model, to resume training
exactly where you left off.

```julia
-opt = ADAM(params(model))
+opt = ADAM()
@save "model-$(now()).bson" model opt
```
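To pick training back up, the same file can be loaded again; a minimal sketch (the filename and the `loss`/`data` objects are placeholders for whatever was used above):

```julia
using Flux, BSON

BSON.@load "model-checkpoint.bson" model opt   # hypothetical filename written earlier with @save

# Resume training with the restored model and optimiser state
# (assumes `loss` and `data` are defined as in the training docs).
Flux.train!(loss, Flux.params(model), data, opt)
```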

docs/src/training/optimisers.md

@@ -58,3 +58,83 @@ AMSGrad
NADAM
ADAMW
```
## Optimiser Interface
Flux's optimisers are built around a `struct` that holds all the optimiser parameters along with a definition of how to apply the update rule associated with it. We do this via the `apply!` function, which takes the optimiser as the first argument, followed by the parameter and its corresponding gradient.
In this manner Flux also allows one to create custom optimisers that can be used seamlessly. Let's work through a simple example.
```julia
mutable struct Momentum
eta
rho
velocity
end
Momentum(eta::Real, rho::Real) = Momentum(eta, rho, IdDict())
```
The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked.
```julia
function apply!(o::Momentum, x, Δ)
η, ρ = o.eta, o.rho
v = get!(o.velocity, x, zero(x))::typeof(x)
@. v = ρ * v - η * Δ
@. Δ = -v
end
```
This is the basic definition of a Momentum update rule given by:
```math
v = ρ * v - η * Δ
w = w - v
```
`apply!` defines the update rule for an optimiser `opt`, given the parameters and gradients, and returns the updated gradients. Here, the velocity associated with each parameter `x` is looked up in the optimiser's running state and updated in place.
Flux internally calls this function via `update!`, which shares the API with `apply!` but ensures that multiple parameters are handled gracefully.
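For instance, the custom `Momentum` above can be exercised directly; a minimal sketch (the gradient here is made up rather than computed):

```julia
opt = Momentum(0.01, 0.9)   # the custom optimiser defined above

W  = rand(3, 3)             # a parameter array
ΔW = ones(3, 3)             # a made-up "gradient" for W

# apply! rescales the gradient in place using the stored velocity;
# subtracting the result from the parameter is what update! does internally.
W .-= apply!(opt, W, ΔW)
```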
## Composing Optimisers
Flux defines a special kind of optimiser, simply called `Optimiser`, which takes arbitrary optimisers as input. Its behaviour is similar to the usual optimisers, but it acts by calling the optimisers listed in it sequentially. Each optimiser produces a modified gradient
that is fed into the next one, and the resulting update is applied to the parameter as usual. A classic use case is adding decays: Flux defines some basic decays, including `ExpDecay`, `InvDecay` etc.
```julia
opt = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), Descent())
```
Here we apply exponential decay to the `Descent` optimiser. With these arguments, `ExpDecay` decays the learning rate by a factor of `0.1` every 1000 steps, down to a minimum of `1e-4`.
It is then used like any other optimiser.
```julia
w = randn(10, 10)
w1 = randn(10,10)
ps = Params([w, w1])
loss(x) = Flux.mse(w * x, w1 * x)
loss(rand(10)) # around 9
for t = 1:10^5
θ = Params([w, w1])
θ̄ = gradient(() -> loss(rand(10)), θ)
Flux.Optimise.update!(opt, θ, θ̄)
end
loss(rand(10)) # around 0.9
```
In this manner it is possible to compose optimisers for some added flexibility.
## Decays
Similar to optimisers, Flux also defines some simple decays that can be used in conjunction with other optimisers, or standalone.
```@docs
ExpDecay
InvDecay
WeightDecay
```
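For example, a decay is usually just composed with another optimiser via `Optimiser` (a small sketch; the constants are arbitrary):

```julia
# Apply a weight-decay penalty to the gradients, then an exponentially
# decaying learning rate (factor 0.1 every 1000 steps, floor 1e-4).
opt = Optimiser(WeightDecay(1e-4), ExpDecay(0.001, 0.1, 1000, 1e-4), Descent())
```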

docs/src/training/training.md

@@ -1,8 +1,9 @@
# Training

-To actually train a model we need three things:
+To actually train a model we need four things:

* An *objective function* that evaluates how well a model is doing given some input data.
* The trainable parameters of the model.
* A collection of data points that will be provided to the objective function.
* An [optimiser](optimisers.md) that will update the model parameters appropriately.
@@ -32,6 +33,14 @@ Flux.train!(loss, ps, data, opt)
The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want.

At first glance it may seem strange that the model we want to train is not among the input arguments of `Flux.train!`. However, the target of the optimiser is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimising the calculations.
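Concretely, the model only enters through the objective closure; a minimal sketch (the model, data and loss here are made up):

```julia
m = Dense(10, 2)
loss(x, y) = Flux.logitcrossentropy(m(x), y)   # the model m is captured by the closure

x, y = rand(Float32, 10, 4), Flux.onehotbatch([1, 2, 1, 2], 1:2)
Flux.train!(loss, Flux.params(m), [(x, y)], ADAM())   # train! never sees m directly
```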
## Model parameters
The model to be trained must have a set of tracked parameters that are used to calculate the gradients of the objective function. In the [basics](../models/basics.md) section it is explained how to create models with such parameters. The second argument of the function `Flux.train!` must be an object containing those parameters, which can be obtained from a model `m` as `params(m)`.
Such an object contains a reference to the model's parameters, not a copy, so that after training the model behaves according to the updated values of those parameters.
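For example (a small sketch):

```julia
m  = Chain(Dense(10, 5, relu), Dense(5, 2))
ps = Flux.params(m)   # references (not copies) of each layer's weights and biases

length(ps)            # 4 parameter arrays: two weight matrices and two bias vectors
```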
## Datasets

The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point:
@@ -101,3 +110,30 @@ cb = function ()
  accuracy() > 0.9 && Flux.stop()
end
```
## Custom Training loops
The `Flux.train!` function can be very convenient, especially for simple problems.
It is also very flexible through the use of callbacks.
But for some problems it is much cleaner to write your own custom training loop.
The example below works much like the default `Flux.train!`, but without callbacks.
You don't need callbacks if you simply code the calls to your functions directly into the loop,
e.g. in the places marked with comments.
```julia
using Flux

function my_custom_train!(loss, ps, data, opt)
  ps = Flux.Params(ps)
  for d in data
    gs = gradient(ps) do
      training_loss = loss(d...)
      # Insert whatever code needs the training loss here, e.g. logging.
      return training_loss
    end
    # Insert whatever code needs the gradients here, e.g. logging them as a
    # histogram with TensorBoardLogger.jl to check that they are not blowing up.
    Flux.Optimise.update!(opt, ps, gs)
    # Here you might like to check validation set accuracy, and break out to do early stopping.
  end
end
```
You could simplify this further, for example by hard-coding in the loss function.
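For instance, the loop above could be driven like this (a minimal sketch; the model, loss and data are made up):

```julia
using Flux

m    = Chain(Dense(2, 1))
loss(x, y) = Flux.mse(m(x), y)
data = [(rand(Float32, 2, 10), rand(Float32, 1, 10)) for _ in 1:100]

my_custom_train!(loss, Flux.params(m), data, Descent(0.1))
```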

src/Flux.jl

@@ -6,7 +6,7 @@ using Base: tail
using Zygote, MacroTools, Juno, Reexport, Statistics, Random
using MacroTools: @forward
@reexport using NNlib
-using Zygote: Params, @adjoint, gradient, pullback
+using Zygote: Params, @adjoint, gradient, pullback, @nograd
export gradient

export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool,
@@ -20,18 +20,9 @@ export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
  ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
  ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay
using CUDAapi
+using CuArrays
+const use_cuda = Ref(false)
-if has_cuda()
-  try
using CuArrays
@eval has_cuarrays() = true
catch ex
@warn "CUDA is installed, but CuArrays.jl fails to load" exception=(ex,catch_backtrace())
@eval has_cuarrays() = false
end
else
has_cuarrays() = false
end
include("utils.jl")
include("onehot.jl")
@@ -47,8 +38,26 @@ include("data/Data.jl")
include("deprecations.jl")

-if has_cuarrays()
-  include("cuda/cuda.jl")
+function __init__()
+  precompiling = ccall(:jl_generating_output, Cint, ()) != 0
# we don't want to include the CUDA module when precompiling,
# or we could end up replacing it at run time (triggering a warning)
precompiling && return
if !CuArrays.functional()
# nothing to do here, and either CuArrays or one of its dependencies will have warned
else
use_cuda[] = true
# FIXME: this functionality should be conditional at run time by checking `use_cuda`
# (or even better, get moved to CuArrays.jl as much as possible)
if CuArrays.has_cudnn()
include(joinpath(@__DIR__, "cuda/cuda.jl"))
else
@warn "CuArrays.jl did not find libcudnn. Some functionality will not be available."
end
end
end

end # module

src/cuda/cuda.jl

@@ -2,11 +2,8 @@ module CUDA
using ..CuArrays

-if CuArrays.libcudnn !== nothing # TODO: use CuArrays.has_cudnn()
+using CuArrays: CUDNN

include("curnn.jl")
include("cudnn.jl")
-else
-  @warn "CUDNN is not installed, some functionality will not be available."
-end

end

src/cuda/cudnn.jl

@@ -1,199 +1,5 @@
+import ..Flux: data
+import CuArrays.CUDNN: batchnorm, ∇batchnorm
-using CuArrays: libcudnn
-using CuArrays.CUDNN: @check, handle, cudnnStatus_t, cudnnTensorDescriptor_t,
-  cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc
import CuArrays.CUDAdrv: CuPtr, CU_NULL
using LinearAlgebra
mutable struct DropoutDesc
ptr::Ptr{Nothing}
states::CuVector{UInt8}
end
Base.unsafe_convert(::Type{Ptr{Nothing}}, dd::DropoutDesc) = dd.ptr
function DropoutDesc(ρ::Real; seed::Integer=0)
d = [C_NULL]
s = Csize_t[0]
@check ccall((:cudnnCreateDropoutDescriptor,libcudnn), cudnnStatus_t, (Ptr{Ptr{Nothing}},), d)
@check ccall((:cudnnDropoutGetStatesSize,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Csize_t}),handle(),s)
states = CuArray{UInt8}(undef, s[]) # TODO: can we drop this when ρ=0?
desc = DropoutDesc(d[], states)
@check ccall((:cudnnSetDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Nothing},Cfloat,CuPtr{Nothing},Csize_t,Culonglong),
desc,handle(),ρ,states,length(states),seed)
finalizer(desc) do x
@check ccall((:cudnnDestroyDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x)
end
return desc
end
const BATCHNORM_SPATIAL = 1
const BATCHNORM_ACTIVATION = 0
const BATCHNORM_MIN_EPS = 1e-5
@inline _wsize(y) = (map(_ -> 1, size(y)[1:end-2])..., size(y)[end-1], 1)
@inline _reddims(y) = (collect(1:ndims(y)-2)..., ndims(y))
mutable struct BNCache
mean
ivar
end
BNCache() = BNCache(nothing, nothing)
# NOTE: CuDNN supports only 4D and 5D Tensors for BatchNorm Operations
# so reshape a 2D Tensor into 4D
batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2},
running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
cache = nothing, alpha = T(1), beta = T(0),
eps = T(1e-5), training = true) where T<:Union{Float32, Float64} =
dropdims(batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), running_mean, running_var, momentum,
cache = cache, alpha = alpha, beta = beta, eps = eps, training = training), dims = (1, 2))
function batchnorm(g::CuArray{T}, b::CuArray{T}, x::Union{CuArray{T, 4},CuArray{T,5}},
running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
cache = nothing, alpha = T(1), beta = T(0),
eps = T(1e-5), training = true) where T<:Union{Float32, Float64}
y = similar(x)
cudnnBNForward!(y, g, b, x, running_mean, running_var, momentum, cache = cache,
alpha = alpha, beta = beta, eps = eps, training = training)
y
end
function cudnnBNForward!(y::CuArray{T}, g::CuArray{T}, b::CuArray{T}, x::CuArray{T},
running_mean::CuArray{T}, running_var::CuArray{T},
momentum; cache = nothing,
alpha = T(1), beta = T(0),
eps = T(1e-5), training = true) where T<:Union{Float32, Float64}
dims = _wsize(x)
if eps < BATCHNORM_MIN_EPS
# warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", BATCHNORM_MIN_EPS)
eps = BATCHNORM_MIN_EPS
end
xd = TensorDesc(x)
yd = TensorDesc(y)
gd = TensorDesc(T, dims)
if training
if cache !== nothing
mean = zeros(CuArray{T}, dims...)
ivar = ones(CuArray{T}, dims...)
else
mean = CU_NULL
ivar = CU_NULL
end
@check ccall((:cudnnBatchNormalizationForwardTraining, libcudnn), cudnnStatus_t,
(cudnnHandle_t,cudnnBatchNormMode_t,
Ptr{T}, Ptr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, CuPtr{T},
Cdouble, CuPtr{T}, CuPtr{T},
Cdouble, CuPtr{T}, CuPtr{T}),
handle(), BATCHNORM_SPATIAL,
Ref(T(alpha)), Ref(T(beta)),
xd, x,
yd, y,
gd, g, b,
momentum, running_mean, running_var,
eps, mean, ivar)
if cache !== nothing
cache.mean = mean
cache.ivar = ivar
end
else
@check ccall((:cudnnBatchNormalizationForwardInference, libcudnn), cudnnStatus_t,
(Ptr{cudnnHandle_t},cudnnBatchNormMode_t,
Ptr{T}, Ptr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, CuPtr{T},
CuPtr{T}, CuPtr{T},
Cdouble),
handle(), BATCHNORM_SPATIAL,
Ref(T(alpha)), Ref(T(beta)),
xd, x,
yd, y,
gd, g, b,
running_mean, running_var,
eps)
end
end
function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2}, dy::CuArray{T, 2},
running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
cache = nothing, eps = T(1e-5), alpha = T(1),
beta = T(0), training = true) where T<:Union{Float32, Float64}
dg, db, dx = ∇batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), reshape(dy, 1, 1, size(dy, 1),
size(dy, 2)), running_mean, running_var, momentum, cache = cache, eps = eps,
alpha = alpha, beta = beta, training = training)
(dg, db, dropdims(dx, dims = (1, 2)))
end
function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T}, dy::CuArray{T},
running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
cache = nothing, eps = T(1e-5), alpha = T(1),
beta = T(0), training = true) where T<:Union{Float32, Float64}
dg = similar(g)
db = similar(b)
dx = similar(x)
cudnnBNBackward!(dg, g, db, dx, x, dy, running_mean, running_var, T(momentum),
training = training, cache = cache, eps = eps, alpha = alpha, beta = beta)
(dg, db, dx)
end
function cudnnBNBackward!(dg::CuArray{T}, g::CuArray{T}, db::CuArray{T},
dx::CuArray{T}, x::CuArray{T}, dy::CuArray{T},
running_mean::CuArray{T}, running_var::CuArray{T},
momentum; cache = nothing, eps = T(1e-5),
alpha = T(1), beta = T(0),
dalpha = T(1), dbeta = T(0), training = true) where T<:Union{Float32, Float64}
if training
xd = TensorDesc(x)
dyd = TensorDesc(dy)
dxd = TensorDesc(dx)
gd = TensorDesc(T, _wsize(x))
if cache !== nothing
mean, ivar = cache.mean, cache.ivar
info("mean and ivar are fetched from the cache")
else
mean, ivar = CU_NULL, CU_NULL
end
if eps < BATCHNORM_MIN_EPS
eps = BATCHNORM_MIN_EPS
end
@check ccall((:cudnnBatchNormalizationBackward, libcudnn), cudnnStatus_t,
(cudnnHandle_t,cudnnBatchNormMode_t,
Ptr{T}, Ptr{T},
Ptr{T}, Ptr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, CuPtr{T}, CuPtr{T},
Cdouble, CuPtr{T}, CuPtr{T}),
handle(), BATCHNORM_SPATIAL,
Ref(T(alpha)), Ref(T(beta)),
Ref(T(dalpha)), Ref(T(dbeta)),
xd, x,
dyd, dy,
dxd, dx,
gd, g, dg, db,
eps, mean, ivar)
else
ivar = 1 ./ sqrt.(reshape(running_var, _wsize(x)) .+ eps)
dx .= dy .* reshape(g, _wsize(x)) .* ivar
dg .= squeeze(sum(dy .* (x .- reshape(running_mean, _wsize(x))) .* ivar, _reddims(dy)), dims = (1,2,4))
db .= squeeze(sum(dy, _reddims(dy)), dims = (1,2,4))
end
end
# Flux Interface
(BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, cache = nothing) where T<:Union{Float32, Float64} =
  BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, training = Flux.istraining()))

src/cuda/curnn.jl

@@ -1,273 +1,25 @@
using CuArrays: libcudnn
using CuArrays.CUDNN: @check, cudnnStatus_t, cudnnTensorDescriptor_t,
cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc
import CuArrays.CUDAdrv: CuPtr, CU_NULL
using LinearAlgebra
const RNN_RELU = 0 # Stock RNN with ReLu activation
const RNN_TANH = 1 # Stock RNN with tanh activation
const LSTM = 2 # LSTM with no peephole connections
const GRU = 3 # Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1)
const LINEAR_INPUT = 0
const SKIP_INPUT = 1
const UNIDIRECTIONAL = 0
const BIDIRECTIONAL = 1
const RNN_ALGO_STANDARD = 0
const RNN_ALGO_PERSIST_STATIC = 1
const RNN_ALGO_PERSIST_DYNAMIC = 2
# param layout:
# RNN: [weight, bias] × [input, hidden]
# GRU: [weight, bias] × [input, hidden] × [reset, update, newmem]
# LSTM: [weight, bias] × [input, hidden] × [input, forget, newmem, output]
function params(w::CuVector, input, hidden, n = 1)
slice(offset, shape) = reshape(view(w, offset.+(1:prod(shape))), shape)
wx = slice(0, (input, hidden*n))
wh = slice(length(wx), (hidden, hidden*n))
bias = view(w, length(wx)+length(wh) .+ (1:hidden*n))
(wx, wh), bias
end
mutable struct RNNDesc{T}
mode::Int
input::Int
hidden::Int
params::CuVector{T}
weights::NTuple{2,CuMatrix{T}}
bias::CuVector{T}
ptr::Ptr{Nothing}
end
Base.unsafe_convert(::Type{Ptr{Nothing}}, d::RNNDesc) = d.ptr
function rnnParamSize(T, r, input)
size = Csize_t[0]
@check ccall((:cudnnGetRNNParamsSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Ptr{Nothing},Ptr{Csize_t},Cint),
handle(), r, TensorDesc(T, (1,input,1)), size, cudnnDataType(T))
return Int(size[])÷sizeof(T)
end
ngates(mode) = [1, 1, 4, 3][mode+1]
ngates(r::RNNDesc) = ngates(r.mode)
function RNNDesc{T}(mode::Int, input::Int, hidden::Int; layers = 1) where T
d = [C_NULL]
@check ccall((:cudnnCreateRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Ptr{Nothing}},),d)
dropoutDesc = DropoutDesc(0)
inputMode = LINEAR_INPUT
direction = UNIDIRECTIONAL
algo = RNN_ALGO_STANDARD
@check ccall((:cudnnSetRNNDescriptor_v6,libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Cint,Ptr{Nothing},Cint,Cint,Cint,Cint,Cint),
handle(),d[],hidden,layers,dropoutDesc,inputMode,direction,mode,algo,cudnnDataType(T))
w = CuArrays.zeros(T, rnnParamSize(T, d[], input))
# TODO: avoid reserve allocation here
rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[])
finalizer(rd) do x
@check ccall((:cudnnDestroyRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x)
end
return rd
end
function rnnWorkspaceSize(r::RNNDesc, seqlen, xdesc)
size = Csize_t[0]
@check ccall((:cudnnGetRNNWorkspaceSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Ptr{Ptr{Nothing}},Ptr{Csize_t}),
handle(), r, seqlen, xdesc, size)
return Int(size[])
end
const workspace = Ref{Union{Nothing,CuVector{UInt8}}}(nothing)
function getworkspace(bytes)
if workspace[] === nothing || length(workspace[]) < bytes
workspace[] = CuVector{UInt8}(undef, bytes)
end
workspace[]
end
getworkspace(r::RNNDesc, seqlen, xdesc) =
getworkspace(rnnWorkspaceSize(r, seqlen, xdesc))
function rnnTrainingReserveSize(r::RNNDesc, seqlen, xdesc)
size = Csize_t[0]
@check ccall((:cudnnGetRNNTrainingReserveSize,libcudnn), cudnnStatus_t, (Ptr{Nothing}, Ptr{Nothing}, Cint, Ptr{Ptr{Nothing}}, Ptr{Csize_t}),
handle(), r, seqlen, xdesc, size)
return Int(size[])
end
function cudnnRNNForward(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co,
workspace, reserve=nothing) where T
if reserve == nothing
@check ccall((:cudnnRNNForwardInference, libcudnn), cudnnStatus_t,
(Ptr{Nothing}, Ptr{Nothing}, Cint,
Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
CuPtr{Nothing}, Csize_t),
handle(), rnn, seqlen,
xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co,
workspace, length(workspace))
else
@check ccall((:cudnnRNNForwardTraining, libcudnn), cudnnStatus_t,
(Ptr{Nothing}, Ptr{Nothing}, Cint,
Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
CuPtr{Nothing}, Csize_t, CuPtr{Nothing}, Csize_t),
handle(), rnn, seqlen,
xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co,
workspace, length(workspace), reserve, length(reserve))
end
end
xDesc(x) = [TensorDesc(eltype(x), (1, size(x, 1), size(x, 2)))]
hDesc(h::Nothing) = C_NULL, CU_NULL
hDesc(x::Integer) = (@assert x == 0; hDesc(nothing))
function hDesc(h::CuArray)
TensorDesc(eltype(h), (size(h, 1), size(h, 2), 1)), h
end
# TODO: can we just manipulate strides here?
# TODO: should use repmat, but this isn't implemented.
hBatch(x::AbstractVector, h::CuVector) = h
hBatch(x::AbstractMatrix, h::CuVector) = h .* CuArrays.ones(1, size(x, 2))
hBatch(x::AbstractMatrix, h::CuMatrix) = h .* CuArrays.ones(1, size(h,2) == 1 ? size(x,2) : 1)
function forward(rnn::RNNDesc{T}, x::CuArray{T}, h_::CuArray{T}, c_ = nothing, train = Val{false}) where T
h = hBatch(x, h_)
c = c_ == nothing ? nothing : hBatch(x, c_)
@assert size(x, 1) == rnn.input
@assert size(h, 1) == rnn.hidden
@assert size(x, 2) == size(h, 2)
seqLength = 1
xdesc = xDesc(x)
y = x isa AbstractVector ? similar(x, rnn.hidden) : similar(x, rnn.hidden, size(x, 2))
ho = similar(h)
ydesc = xDesc(y)
workspace = getworkspace(rnn, seqLength, xdesc)
reserve = train == Val{true} ?
CuVector{UInt8}(undef, rnnTrainingReserveSize(rnn, seqLength, xdesc)) :
nothing
co = c == nothing ? c : similar(c)
cudnnRNNForward(rnn, seqLength,
xdesc, x,
hDesc(h)...,
hDesc(c)...,
FilterDesc(T, (1, 1, length(rnn.params))), rnn.params,
ydesc, y,
hDesc(ho)...,
hDesc(co)...,
workspace, reserve)
result = c == nothing ? (y, ho) : (y, ho, co)
return train == Val{true} ? (reserve, result) : result
end
forwardTrain(rnn::RNNDesc{T}, x::CuArray{T}, h::CuArray{T}, c = nothing) where T =
forward(rnn, x, h, c, Val{true})
function cudnnRNNBackwardData(rnn::RNNDesc{T}, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco,
wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, rs) where T
@check ccall((:cudnnRNNBackwardData,libcudnn),cudnnStatus_t,
(Ptr{Nothing}, Ptr{Nothing}, Cint,
Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing},
CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
CuPtr{Nothing}, Csize_t, CuPtr{Nothing}, Csize_t),
handle(), rnn, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco,
wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, length(ws), rs, length(rs))
end
function backwardData(rnn::RNNDesc{T}, y, dy_, dho, dco, h, c, reserve) where T
# Same as above, any more efficient way?
dy = dy_ isa Integer ? zero(y) : dy_
yd = xDesc(y)
dx = y isa AbstractVector ? similar(dy, rnn.input) : similar(dy, rnn.input, size(dy, 2))
dh = similar(h)
dc = c == nothing ? nothing : similar(c)
cudnnRNNBackwardData(rnn, 1,
yd, y, yd, dy, hDesc(dho)..., hDesc(dco)...,
FilterDesc(T, (1, 1, length(rnn.params))), rnn.params,
hDesc(h)..., hDesc(c)..., xDesc(dx), dx, hDesc(dh)..., hDesc(dc)...,
workspace[], reserve)
return c == nothing ? (dx, dh) : (dx, dh, dc)
end
backwardData(rnn, y, dy, dho, hx, reserve) =
backwardData(rnn, y, dy, dho, nothing, hx, nothing, reserve)
function cudnnRNNBackwardWeights(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, yd, y, dwd, dw,
workspace, reserve) where T
@check ccall((:cudnnRNNBackwardWeights,libcudnn), cudnnStatus_t,
(Ptr{Nothing}, Ptr{Nothing}, Cint, # handle, rnnDesc, seqLength
Ptr{Ptr{Nothing}}, CuPtr{T}, #x
Ptr{Nothing}, CuPtr{T}, #hx
Ptr{Ptr{Nothing}}, CuPtr{T}, #y
CuPtr{Nothing}, Csize_t, #ws
Ptr{Nothing}, CuPtr{T}, #dw
CuPtr{Nothing}, Csize_t), #rs
handle(), rnn, seqlen, xd, x, hd, h, yd, y,
workspace, length(workspace), dwd, dw, reserve, length(reserve))
end
function backwardWeights(rnn::RNNDesc{T}, x, h, y, reserve) where T
dw = zero(rnn.params)
cudnnRNNBackwardWeights(rnn, 1,
xDesc(x), x, hDesc(h)..., xDesc(y), y,
FilterDesc(T, (1, 1, length(dw))), dw,
workspace[], reserve)
return params(dw, rnn.input, rnn.hidden, ngates(rnn))
end
# Interface
import ..Flux: Flux, relu
using CuArrays.CUDAnative
using CuArrays: @cuindex, cudims
function LinearAlgebra.copy_transpose!(dst::CuArray, src::CuArray)
function kernel(dst, src)
I = @cuindex dst
dst[I...] = src[reverse(I)...]
return
end
blk, thr = cudims(dst)
@cuda blocks=blk threads=thr kernel(dst, src)
return dst
end
CuRNN{T} = Flux.RNNCell{<:Union{typeof(tanh),typeof(relu)},<:CuArray{T,2},<:CuArray{T,1}}
CuGRU{T} = Flux.GRUCell{<:CuArray{T,2},<:CuArray{T,1}}
CuLSTM{T} = Flux.LSTMCell{<:CuArray{T,2},<:CuArray{T,1}}
CuRNNs{T} = Union{CuRNN{T},CuGRU{T},CuLSTM{T}}

+function CUDNN.RNNDesc(m::CuRNNs{T}) where T
-function copyparams!(m::CuRNNs, d::RNNDesc)
Wi, Wh = d.weights
copy_transpose!(Wi, m.Wi)
copy_transpose!(Wh, m.Wh)
copy_transpose!(d.bias, m.b)
return
end
function RNNDesc(m::CuRNNs{T}) where T
h, i = length(m.h), size(m.Wi, 2) h, i = length(m.h), size(m.Wi, 2)
mode = m isa CuRNN ? mode = m isa CuRNN ?
(m.σ == tanh ? RNN_TANH : RNN_RELU) : (m.σ == tanh ? CUDNN.CUDNN_RNN_TANH : CUDNN.CUDNN_RNN_RELU) :
m isa CuGRU ? GRU : LSTM m isa CuGRU ? CUDNN.CUDNN_GRU : CUDNN.CUDNN_LSTM
r = RNNDesc{T}(mode, i, h) r = CUDNN.RNNDesc{T}(mode, i, h)
return r return r
end end
const descs = WeakKeyDict() const descs = WeakKeyDict()
function desc(rnn) function desc(rnn)
d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = RNNDesc(rnn)) d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = CUDNN.RNNDesc(rnn))
copyparams!(rnn, d) CUDNN.setweights!(d, rnn.Wi, rnn.Wh, rnn.b)
return d return d
end end
@ -275,17 +27,17 @@ import Zygote
using Zygote: @adjoint using Zygote: @adjoint
function (m::CuRNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} function (m::CuRNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
y, h = forward(desc(m), x, h) y, h = CUDNN.forward(desc(m), x, h)
return h, y return h, y
end end
function (m::CuGRU{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} function (m::CuGRU{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
y, h = forward(desc(m), x, h) y, h = CUDNN.forward(desc(m), x, h)
return h, y return h, y
end end
function (m::CuLSTM{T})(h::NTuple{2,CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64} function (m::CuLSTM{T})(h::NTuple{2,CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64}
y, h, c = forward(desc(m), x, h[1], h[2]) y, h, c = CUDNN.forward(desc(m), x, h[1], h[2])
return (h, c), y return (h, c), y
end end
@ -303,7 +55,7 @@ unbroadcast(x::AbstractArray, Δ) =
coerce_cuda(x::Union{CuArray,Nothing}) = x coerce_cuda(x::Union{CuArray,Nothing}) = x
coerce_cuda(x::Tuple) = coerce_cuda.(x) coerce_cuda(x::Tuple) = coerce_cuda.(x)
coerce_cuda(x) = x .+ CuArrays.fill(0) coerce_cuda(x::AbstractArray) = x .+ CuArrays.fill(0)
function struct_grad!(cx::Zygote.Context, x, x̄) function struct_grad!(cx::Zygote.Context, x, x̄)
for f in fieldnames(typeof(x)) for f in fieldnames(typeof(x))
@ -316,28 +68,23 @@ end
for RNN in (CuRNN, CuGRU) for RNN in (CuRNN, CuGRU)
@eval @adjoint function (m::$RNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} @eval @adjoint function (m::$RNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
reserve, (y, ho) = forwardTrain(desc(m), x, h) (y, ho), back = CUDNN.pullback(desc(m), x, h)
(ho, y), function (Δ) (ho, y), function (Δ)
dho, dy = coerce_cuda(Δ) dho, dy = coerce_cuda(Δ) # Support FillArrays etc.
h_ = hBatch(x, h) m̄ = back(dy, dho)
dx, dh = backwardData(descs[m], y, dy, dho, h_, reserve) dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(m̄.Wi),Wh=transpose(m̄.Wh),b=m̄.b,h=nothing))
(dWi, dWh), db = backwardWeights(descs[m], x, h_, y, reserve) (dm, unbroadcast(h, m̄.h), m̄.x)
dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing))
(dm, unbroadcast(h, dh), dx)
end end
end end
end end
@adjoint function (m::CuLSTM)((h, c)::Tuple{CuArray{T},CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64} @adjoint function (m::CuLSTM)((h, c)::Tuple{CuArray{T},CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64}
reserve, (y, ho, co) = forwardTrain(desc(m), x, h, c) (y, ho, co), back = CUDNN.pullback(desc(m), x, h, c)
((ho, co), y), function (Δ) ((ho, co), y), function (Δ)
dhc, dy = coerce_cuda(Δ) dhc, dy = coerce_cuda(Δ) # Support FillArrays etc.
dho, dco = dhc === nothing ? (nothing, nothing) : dhc dho, dco = dhc === nothing ? (nothing, nothing) : dhc
h_ = hBatch(x, h) m̄ = back(dy, dho, dco)
c_ = hBatch(x, c) dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(m̄.Wi),Wh=transpose(m̄.Wh),b=m̄.b,h=nothing,c=nothing))
dx, dh, dc = backwardData(descs[m], y, dy, dho, dco, h_, c_, reserve) (dm, (unbroadcast(h, m̄.h), unbroadcast(c, m̄.c)), m̄.x)
(dWi, dWh), db = backwardWeights(descs[m], x, h_, y, reserve)
dm = struct_grad!(__context__, m, (Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing,c=nothing))
(dm, (unbroadcast(h, dh), unbroadcast(c, dc)), dx)
end end
end end


@ -39,7 +39,7 @@ end
trainable(m) = functor(m)[1] trainable(m) = functor(m)[1]
params!(p::Params, x::AbstractArray{<:Real}, seen = IdSet()) = push!(p, x) params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x)
function params!(p::Params, x, seen = IdSet()) function params!(p::Params, x, seen = IdSet())
x in seen && return x in seen && return
@ -73,13 +73,7 @@ end
cpu(m) = fmap(x -> adapt(Array, x), m) cpu(m) = fmap(x -> adapt(Array, x), m)
const gpu_adaptor = if has_cuarrays() gpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) : x
CuArrays.cu
else
identity
end
gpu(x) = fmap(gpu_adaptor, x)
# Precision # Precision


@ -39,24 +39,39 @@ function Base.show(io::IO, c::Chain)
print(io, ")") print(io, ")")
end end
"""
outdims(c::Chain, isize)
Calculate the output dimensions given the input dimensions, `isize`.
```julia
m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
outdims(m, (10, 10)) == (6, 6)
```
"""
outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers))(isize)
# This is a temporary and naive implementation # This is a temporary and naive implementation
# it might be replaced in the future for better performance # it might be replaced in the future for better performance
# see issue https://github.com/FluxML/Flux.jl/issues/702 # see issue https://github.com/FluxML/Flux.jl/issues/702
# Johnny Chen -- @johnnychen94 # Johnny Chen -- @johnnychen94
# only slightly changed to better handle interaction with Zygote @dsweber2
""" """
activations(c::Chain, input) activations(c::Chain, input)
Calculate the forward results of each layer in Chain `c` with `input` as model input. Calculate the forward results of each layer in Chain `c` with `input` as model input.
""" """
function activations(c::Chain, input) function activations(c::Chain, input)
rst = [] extraChain(c.layers, input)
for l in c
x = get(rst, length(rst), input)
push!(rst, l(x))
end
return rst
end end
function extraChain(fs::Tuple, x)
res = first(fs)(x)
return (res, extraChain(Base.tail(fs), res)...)
end
extraChain(::Tuple{}, x) = ()
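The tuple recursion in `extraChain` returns one entry per layer. A minimal usage sketch (model and input invented for illustration):

```julia
using Flux

m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
x = rand(Float32, 10)

acts = Flux.activations(m, x)  # one entry per layer, returned as a tuple
length(acts)                   # 3
size(acts[1])                  # (5,) -- output of the first Dense layer
acts[end] == m(x)              # the last entry equals the model output
```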
""" """
Dense(in::Integer, out::Integer, σ = identity) Dense(in::Integer, out::Integer, σ = identity)
@ -112,6 +127,19 @@ end
(a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = (a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
a(T.(x)) a(T.(x))
"""
outdims(l::Dense, isize)
Calculate the output dimensions given the input dimensions, `isize`.
```julia
m = Dense(10, 5)
outdims(m, (5, 2)) == (5,)
outdims(m, (10,)) == (5,)
```
"""
outdims(l::Dense, isize) = (size(l.W)[1],)
""" """
Diagonal(in::Integer) Diagonal(in::Integer)
@ -141,6 +169,7 @@ function Base.show(io::IO, l::Diagonal)
print(io, "Diagonal(", length(l.α), ")") print(io, "Diagonal(", length(l.α), ")")
end end
outdims(l::Diagonal, isize) = (length(l.α),)
""" """
Maxout(over) Maxout(over)
@ -189,6 +218,8 @@ function (mo::Maxout)(input::AbstractArray)
mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over) mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over)
end end
outdims(l::Maxout, isize) = outdims(first(l.over), isize)
""" """
SkipConnection(layers, connection) SkipConnection(layers, connection)


@ -1,4 +1,9 @@
using NNlib: conv, ∇conv_data, depthwiseconv using NNlib: conv, ∇conv_data, depthwiseconv, output_size
# pad dims of x with dims of y until ndims(x) == ndims(y)
_paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...)
_convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+ (ksize .- 1).*dsize .- (pad[1:2:end] .+ pad[2:2:end])
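As a quick sanity check of `_convtransoutdims` (values invented): a 2×2 input through a 3×3 transposed convolution with stride 2, dilation 1 and no padding should give 5×5.

```julia
# Hypothetical check of the formula above; pad stores both edges of each
# spatial dimension, hence the 1:2:end / 2:2:end split.
isize, k, stride, dilation, pad = (2, 2), (3, 3), (2, 2), (1, 1), (0, 0, 0, 0)
(isize .- 1).*stride .+ 1 .+ (k .- 1).*dilation .- (pad[1:2:end] .+ pad[2:2:end])  # (5, 5)
```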
expand(N, i::Tuple) = i expand(N, i::Tuple) = i
expand(N, i::Integer) = ntuple(_ -> i, N) expand(N, i::Integer) = ntuple(_ -> i, N)
@ -17,7 +22,7 @@ Example: Applying Conv layer to a 1-channel input using a 2x2 window size,
out = 16 out = 16
Conv((2, 2), 1=>16, relu) Conv((2, 2), 1=>16, relu)
Data should be stored in WHCN order (width, height, # channels, # batches). Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array, In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array. and a batch of 50 would be a `100×100×3×50` array.
@ -106,8 +111,23 @@ end
a(T.(x)) a(T.(x))
""" """
ConvTranspose(filter::Tuple, in=>out) outdims(l::Conv, isize::Tuple)
ConvTranspose(filter::Tuple, in=>out, activation)
Calculate the output dimensions given the input dimensions, `isize`.
Batch size and channel size are ignored as per `NNlib.jl`.
```julia
m = Conv((3, 3), 3 => 16)
outdims(m, (10, 10)) == (8, 8)
outdims(m, (10, 10, 1, 3)) == (8, 8)
```
"""
outdims(l::Conv, isize) =
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
"""
ConvTranspose(size, in=>out)
ConvTranspose(size, in=>out, relu)
Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`. Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
@ -178,6 +198,9 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
) )
end end
# TODO: Find proper fix for https://github.com/FluxML/Flux.jl/issues/900
@nograd conv_transpose_dims
function (c::ConvTranspose)(x::AbstractArray) function (c::ConvTranspose)(x::AbstractArray)
# ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
@ -198,6 +221,8 @@ end
(a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
a(T.(x)) a(T.(x))
outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad)
""" """
DepthwiseConv(filter::Tuple, in=>out) DepthwiseConv(filter::Tuple, in=>out)
DepthwiseConv(filter::Tuple, in=>out, activation) DepthwiseConv(filter::Tuple, in=>out, activation)
@ -298,9 +323,12 @@ end
(a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = (a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
a(T.(x)) a(T.(x))
outdims(l::DepthwiseConv, isize) =
output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
""" """
CrossCor(size, in=>out) CrossCor(size, in=>out)
CrossCor(size, in=>out, relu) CrossCor(size, in=>out, activation)
Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`. Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
@ -351,6 +379,11 @@ function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ
return CrossCor(σ, w, b, stride, pad, dilation) return CrossCor(σ, w, b, stride, pad, dilation)
end end
function CrossCor(;weight::AbstractArray, bias::Union{Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
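For illustration, one way to build a `CrossCor` from explicit weight and bias arrays; the keyword method added here simply forwards to the positional constructor shown above (shapes invented):

```julia
using Flux

w = randn(Float32, 2, 2, 1, 4)        # hypothetical 2×2 kernel, 1 => 4 channels
b = zeros(Float32, 4)
c = CrossCor(w, b, relu)              # positional constructor from the context above
size(c(rand(Float32, 10, 10, 1, 1)))  # (9, 9, 4, 1)
```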
function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1, init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
@ -387,6 +420,9 @@ end
(a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = (a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
a(T.(x)) a(T.(x))
outdims(l::CrossCor, isize) =
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
""" """
MaxPool(k) MaxPool(k)
@ -416,6 +452,8 @@ function Base.show(io::IO, m::MaxPool)
print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")")
end end
outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))
""" """
MeanPool(k) MeanPool(k)
@ -443,3 +481,5 @@ end
function Base.show(io::IO, m::MeanPool) function Base.show(io::IO, m::MeanPool)
print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")")
end end
outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))


@ -1,5 +1,5 @@
gate(h, n) = (1:h) .+ h*(n-1) gate(h, n) = (1:h) .+ h*(n-1)
gate(x::AbstractVector, h, n) = x[gate(h,n)] gate(x::AbstractVector, h, n) = @view x[gate(h,n)]
gate(x::AbstractMatrix, h, n) = x[gate(h,n),:] gate(x::AbstractMatrix, h, n) = x[gate(h,n),:]
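For reference, `gate(h, n)` selects the row block of the `n`-th gate, and the vector method now returns a view instead of a copy. An illustrative check (not part of the diff):

```julia
using Flux

Flux.gate(2, 3)                    # 5:6 -- row indices of the third gate block
x = collect(1.0:8.0)
Flux.gate(x, 2, 3) == [5.0, 6.0]   # true; the result is now a view into x
```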
# Stateful recurrence # Stateful recurrence
@ -45,8 +45,7 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")")
""" """
reset!(rnn) reset!(rnn)
Reset the hidden state of a recurrent layer back to its original value. See also Reset the hidden state of a recurrent layer back to its original value.
`truncate!`.
Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to


@ -1,13 +1,24 @@
using CuArrays
using NNlib: logsoftmax, logσ using NNlib: logsoftmax, logσ
# Cost functions # Cost functions
mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)
function crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
-sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2) return -sum(y .* log.(ŷ)) * 1 // size(y, 2)
end end
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number)
return -sum(y .* log.(ŷ)) .* weight * 1 // size(y, 2)
end
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector)
return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2)
end
crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight)
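A brief illustration of the three `weight` cases dispatched above (values invented):

```julia
using Flux

ŷ = softmax(rand(Float32, 3, 5))          # predicted class probabilities
y = Flux.onehotbatch(rand(1:3, 5), 1:3)   # one-hot targets
Flux.crossentropy(ŷ, y)                             # weight == nothing
Flux.crossentropy(ŷ, y, weight = 2f0)               # scalar weight
Flux.crossentropy(ŷ, y, weight = [1f0, 2f0, 3f0])   # per-class weight vector
```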
function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1)
return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2) return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2)
end end
@ -25,6 +36,9 @@ Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerica
""" """
binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
# Re-definition to fix interaction with CuArrays.
CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
""" """
logitbinarycrossentropy(logŷ, y) logitbinarycrossentropy(logŷ, y)
@ -39,13 +53,60 @@ but it is more numerically stable.
""" """
logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ)
# Re-definition to fix interaction with CuArrays.
CuArrays.@cufunc logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ)
""" """
normalise(x::AbstractArray; dims=1) normalise(x::AbstractArray; dims=1)
Normalises x to mean 0 and standard deviation 1, across the dimensions given by dims. Defaults to normalising over columns. Normalises `x` to mean 0 and standard deviation 1, across the dimensions given by `dims`. Defaults to normalising over columns.
julia> a = reshape(collect(1:9), 3, 3)
3×3 Array{Int64,2}:
1 4 7
2 5 8
3 6 9
julia> normalise(a)
3×3 Array{Float64,2}:
-1.22474 -1.22474 -1.22474
0.0 0.0 0.0
1.22474 1.22474 1.22474
julia> normalise(a, dims=2)
3×3 Array{Float64,2}:
-1.22474 0.0 1.22474
-1.22474 0.0 1.22474
-1.22474 0.0 1.22474
""" """
function normalise(x::AbstractArray; dims=1) function normalise(x::AbstractArray; dims=1)
μ′ = mean(x, dims = dims) μ′ = mean(x, dims = dims)
σ = std(x, dims = dims, mean = μ′, corrected=false) σ = std(x, dims = dims, mean = μ′, corrected=false)
return (x .- μ′) ./ σ return (x .- μ′) ./ σ
end end
"""
kldivergence(ŷ, y)
KL divergence is a measure of how much one probability distribution differs from another.
It is always non-negative, and zero only when both distributions are equal everywhere.
[KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).
"""
function kldivergence(ŷ, y)
entropy = sum(y .* log.(y)) *1 //size(y,2)
cross_entropy = crossentropy(ŷ, y)
return entropy + cross_entropy
end
"""
poisson(ŷ, y)
The Poisson loss measures how much the predicted distribution diverges from the expected distribution.
[Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
"""
poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2)
"""
hinge(ŷ, y)
Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1).
[Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss).
"""
hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2)
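Quick, hand-checkable calls to the new losses (numbers invented):

```julia
using Flux

p = [0.1, 0.2, 0.7]           # target distribution
q = [0.2, 0.3, 0.5]           # predicted distribution
Flux.kldivergence(q, p)       # ≥ 0, and 0 only when q == p
Flux.poisson(q, p)
Flux.hinge([0.8, -0.4], [1.0, -1.0])  # max(0, 1 - 0.8) + max(0, 1 - 0.4) = 0.8
```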


@ -37,12 +37,10 @@ import Adapt: adapt, adapt_structure
adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data)) adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data))
if has_cuarrays() import .CuArrays: CuArray, cudaconvert
import .CuArrays: CuArray, cudaconvert import Base.Broadcast: BroadcastStyle, ArrayStyle
import Base.Broadcast: BroadcastStyle, ArrayStyle BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}()
BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}() cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
end
""" """
onehot(l, labels[, unk]) onehot(l, labels[, unk])
@ -127,6 +125,4 @@ onecold(y::AbstractMatrix, labels...) =
onecold(y::OneHotMatrix, labels...) = onecold(y::OneHotMatrix, labels...) =
mapreduce(x -> Flux.onecold(x, labels...), |, y.data, dims = 2, init = 0) mapreduce(x -> Flux.onecold(x, labels...), |, y.data, dims = 2, init = 0)
# TODO probably still want this as a custom adjoint Zygote @nograd onecold, onehot, onehotbatch
# onecold(x::TrackedVector, l...) = onecold(data(x), l...)
# onecold(x::TrackedMatrix, l...) = onecold(data(x), l...)


@ -1,5 +1,4 @@
using Flux using Flux
using Base: @get!
using MacroTools: @forward using MacroTools: @forward
const ϵ = 1e-8 const ϵ = 1e-8
@ -7,10 +6,28 @@ const ϵ = 1e-8
# TODO: should use weak refs # TODO: should use weak refs
""" """
Descent(η) Descent(η)
Classic gradient descent optimiser with learning rate `η`. Classic gradient descent optimiser with learning rate `η`.
For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`
## Parameters
- Learning Rate (η): The amount by which the gradients are discounted before updating the weights. Defaults to `0.1`.
## Example
```julia-repl
opt = Descent() # uses default η (0.1)
opt = Descent(0.3) # use provided η
ps = params(model)
gs = gradient(ps) do
loss(x, y)
end
Flux.Optimise.update!(opt, ps, gs)
```
""" """
mutable struct Descent mutable struct Descent
eta::Float64 eta::Float64
@ -23,9 +40,20 @@ function apply!(o::Descent, x, Δ)
end end
""" """
Momentum(η = 0.01; ρ = 0.9) Momentum(η, ρ)
Gradient descent with learning rate `η` and momentum `ρ`. Gradient descent with learning rate `η` and momentum `ρ`.
## Parameters
- Learning Rate (`η`): Amount by which gradients are discounted before updating the weights. Defaults to `0.01`.
- Momentum (`ρ`): Parameter that accelerates descent in the relevant direction and dampens oscillations. Defaults to `0.9`.
## Examples
```julia
opt = Momentum() # uses defaults of η = 0.01 and ρ = 0.9
opt = Momentum(0.01, 0.99)
```
""" """
mutable struct Momentum mutable struct Momentum
eta::Float64 eta::Float64
@ -43,9 +71,20 @@ function apply!(o::Momentum, x, Δ)
end end
""" """
Nesterov(eta, ρ = 0.9) Nesterov(η, ρ)
Gradient descent with learning rate `η` and Nesterov momentum `ρ`. Gradient descent with learning rate `η` and Nesterov momentum `ρ`.
## Parameters
- Learning Rate (η): Amount by which the gradients are discounted before updating the weights. Defaults to `0.001`.
- Nesterov Momentum (ρ): Parameter controlling the amount of Nesterov momentum to be applied. Defaults to `0.9`.
## Examples
```julia
opt = Nesterov() # uses defaults η = 0.001 and ρ = 0.9
opt = Nesterov(0.003, 0.95)
```
""" """
mutable struct Nesterov mutable struct Nesterov
eta::Float64 eta::Float64
@ -64,11 +103,23 @@ function apply!(o::Nesterov, x, Δ)
end end
""" """
RMSProp(η = 0.001, ρ = 0.9) RMSProp(η, ρ)
Implements the RMSProp algorithm. Often a good choice for recurrent networks. Parameters other than the learning rate generally don't need tuning.
## Parameters
- Learning Rate (η): Defaults to `0.001`.
- Rho (ρ): Defaults to `0.9`.
## Examples
```julia
opt = RMSProp() # uses default η = 0.001 and ρ = 0.9
opt = RMSProp(0.002, 0.95)
```
## References
[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) [RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
optimiser. Parameters other than learning rate don't need tuning. Often a good
choice for recurrent networks.
""" """
mutable struct RMSProp mutable struct RMSProp
eta::Float64 eta::Float64
@ -86,8 +137,22 @@ function apply!(o::RMSProp, x, Δ)
end end
""" """
ADAM(η = 0.001, β = (0.9, 0.999)) ADAM(η, β::Tuple)
Implements the ADAM optimiser.
## Parameters
- Learning Rate (`η`): Defaults to `0.001`.
- Beta (`β::Tuple`): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = ADAM() # uses the default η = 0.001 and β = (0.9, 0.999)
opt = ADAM(0.001, (0.9, 0.8))
```
## References
[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.
""" """
mutable struct ADAM mutable struct ADAM
@ -109,8 +174,23 @@ function apply!(o::ADAM, x, Δ)
end end
""" """
RADAM(η = 0.001, β = (0.9, 0.999)) RADAM(η, β::Tuple)
Implements the rectified ADAM optimizer.
## Parameters
- Learning Rate (η): Defaults to `0.001`
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = RADAM() # uses the default η = 0.001 and β = (0.9, 0.999)
opt = RADAM(0.001, (0.9, 0.8))
```
## References
[RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM). [RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM).
""" """
mutable struct RADAM mutable struct RADAM
@ -139,10 +219,22 @@ function apply!(o::RADAM, x, Δ)
end end
""" """
AdaMax(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08) AdaMax(η, β::Tuple)
[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on Variant of ADAM based on the ∞-norm.
the ∞-norm.
## Parameters
- Learning Rate (η): Defaults to `0.001`
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = AdaMax() # uses default η and β
opt = AdaMax(0.001, (0.9, 0.995))
```
## References
[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser.
""" """
mutable struct AdaMax mutable struct AdaMax
eta::Float64 eta::Float64
@ -163,8 +255,21 @@ function apply!(o::AdaMax, x, Δ)
end end
""" """
ADAGrad(η = 0.1; ϵ = 1e-8) ADAGrad(η)
Implements AdaGrad. It has parameter-specific learning rates based on how frequently each parameter is updated.
## Parameters
- Learning Rate (η): Defaults to `0.1`
## Examples
```julia
opt = ADAGrad() # uses default η = 0.1
opt = ADAGrad(0.001)
```
## References
[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser.
Parameters don't need tuning. Parameters don't need tuning.
""" """
@ -177,16 +282,27 @@ ADAGrad(η = 0.1) = ADAGrad(η, IdDict())
function apply!(o::ADAGrad, x, Δ) function apply!(o::ADAGrad, x, Δ)
η = o.eta η = o.eta
acc = get!(o.acc, x, fill(ϵ, size(x)))::typeof(x) acc = get!(o.acc, x, fill!(zero(x), ϵ))::typeof(x)
@. acc += Δ^2 @. acc += Δ^2
@. Δ *= η / (√acc + ϵ) @. Δ *= η / (√acc + ϵ)
end end
""" """
ADADelta(ρ = 0.9, ϵ = 1e-8) ADADelta(ρ)
[ADADelta](https://arxiv.org/abs/1212.5701) optimiser. Parameters don't need Version of ADAGrad that adapts learning rate based on a window of past gradient updates. Parameters don't need tuning.
tuning.
## Parameters
- Rho (ρ): Factor by which gradient is decayed at each time step. Defaults to `0.9`.
## Examples
```julia
opt = ADADelta() # uses default ρ = 0.9
opt = ADADelta(0.89)
```
## References
[ADADelta](https://arxiv.org/abs/1212.5701) optimiser.
""" """
mutable struct ADADelta mutable struct ADADelta
rho::Float64 rho::Float64
@ -205,10 +321,22 @@ function apply!(o::ADADelta, x, Δ)
end end
""" """
AMSGrad(η = 0.001, β = (0.9, 0.999)) AMSGrad(η, β::Tuple)
[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need Implements the AMSGrad version of the ADAM optimiser. Parameters don't need tuning.
tuning.
## Parameters
- Learning Rate (η): Defaults to `0.001`.
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = AMSGrad() # uses default η and β
opt = AMSGrad(0.001, (0.89, 0.995))
```
## References
[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser.
""" """
mutable struct AMSGrad mutable struct AMSGrad
eta::Float64 eta::Float64
@ -220,18 +348,30 @@ AMSGrad(η = 0.001, β = (0.9, 0.999)) = AMSGrad(η, β, IdDict())
function apply!(o::AMSGrad, x, Δ) function apply!(o::AMSGrad, x, Δ)
η, β = o.eta, o.beta η, β = o.eta, o.beta
mt, vt, v̂t = get!(o.state, x, (fill(ϵ, size(x)), fill(ϵ, size(x)), fill(ϵ, size(x)))) mt, vt, v̂t = get!(o.state, x, (fill!(zero(x), ϵ), fill!(zero(x), ϵ), fill!(zero(x), ϵ)))
@. mt = β[1] * mt + (1 - β[1]) * Δ @. mt = β[1] * mt + (1 - β[1]) * Δ
@. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2 @. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2
@. v̂t = max.(v̂t, vt) @. v̂t = max(v̂t, vt)
@. Δ = η * mt / (√v̂t + ϵ) @. Δ = η * mt / (√v̂t + ϵ)
end end
""" """
NADAM(η = 0.001, β = (0.9, 0.999)) NADAM(η, β::Tuple)
[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. Parameters don't need Nesterov variant of ADAM. Parameters don't need tuning.
tuning.
## Parameters
- Learning Rate (η): Defaults to `0.001`.
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = NADAM() # uses default η and β
opt = NADAM(0.002, (0.89, 0.995))
```
## References
[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser.
""" """
mutable struct NADAM mutable struct NADAM
eta::Float64 eta::Float64
@ -252,9 +392,23 @@ function apply!(o::NADAM, x, Δ)
end end
""" """
ADAMW((η = 0.001, β = (0.9, 0.999), decay = 0) ADAMW(η, β::Tuple, decay)
[ADAMW](https://arxiv.org/abs/1711.05101) fixing weight decay regularization in Adam. Variant of ADAM defined by fixing weight decay regularization.
## Parameters
- Learning Rate (η): Defaults to `0.001`.
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to (0.9, 0.999).
- decay: Decay applied to weights during optimisation. Defaults to 0.
## Examples
```julia
opt = ADAMW() # uses default η, β and decay
opt = ADAMW(0.001, (0.89, 0.995), 0.1)
```
## References
[ADAMW](https://arxiv.org/abs/1711.05101)
""" """
ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) = ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
Optimiser(ADAM(η, β), WeightDecay(decay)) Optimiser(ADAM(η, β), WeightDecay(decay))
@ -287,9 +441,15 @@ function apply!(o::Optimiser, x, Δ)
end end
""" """
`InvDecay(γ)` InvDecay(γ)
Apply inverse time decay to an optimiser Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified.
```
## Parameters
- gamma (γ): Defaults to `0.001`
## Example
```julia ```julia
Optimiser(InvDecay(..), Opt(..)) Optimiser(InvDecay(..), Opt(..))
``` ```
@ -310,13 +470,22 @@ function apply!(o::InvDecay, x, Δ)
end end
""" """
`ExpDecay(eta, decay, decay_step, clip)` ExpDecay(eta, decay, decay_step, clip)
Schedule the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`. Discount the learning rate `eta` by the multiplicative factor `decay` every `decay_step` steps until a minimum of `clip` is reached.
## Parameters
- Learning Rate (eta): Defaults to `0.001`.
- decay: Factor by which the learning rate is discounted. Defaults to `0.1`.
- decay_step: Schedules decay operations by setting the number of steps between two decay operations. Defaults to `1000`.
- clip: Minimum value of learning rate. Defaults to `1e-4`.
## Example
To apply exponential decay to an optimiser: To apply exponential decay to an optimiser:
```julia ```julia
Optimiser(ExpDecay(..), Opt(..)) Optimiser(ExpDecay(..), Opt(..))
opt = Optimiser(ExpDecay(), ADAM())
``` ```
""" """
mutable struct ExpDecay mutable struct ExpDecay
@ -340,9 +509,12 @@ function apply!(o::ExpDecay, x, Δ)
end end
""" """
`WeightDecay(wd)` WeightDecay(wd)
Decay the weight parameter by `wd` Decays the weights by `wd`.
## Parameters
- Weight decay (wd): Defaults to `0`.
""" """
mutable struct WeightDecay mutable struct WeightDecay
wd::Real wd::Real
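As with the decay schedules above, `WeightDecay` is meant to be composed with another optimiser; a minimal sketch (settings invented):

```julia
using Flux

# Hypothetical composition: weight decay of 1e-4 applied alongside ADAM.
opt = Flux.Optimiser(WeightDecay(1e-4), ADAM(0.001))
```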


@ -1,6 +1,11 @@
# Arrays # Arrays
glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0/sum(dims)) nfan() = 1, 1 #fan_in, fan_out
glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0/sum(dims)) nfan(n) = 1, n #A vector is treated as a n×1 matrix
nfan(n_out, n_in) = n_in, n_out #In case of Dense kernels: arranged as matrices
nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of convolution kernels
glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...)))
glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...)))
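A rough check that the reworked initialisers hit the intended variance 2/(fan_in + fan_out); `nfan` is internal, so it is qualified, and the tolerance is invented:

```julia
using Flux, Statistics

w = Flux.glorot_uniform(3, 3, 16, 32)      # a 3×3 conv kernel, 16 => 32 channels
fan_in, fan_out = Flux.nfan(3, 3, 16, 32)  # (3*3*16, 3*3*32)
isapprox(var(w), 2 / (fan_in + fan_out); rtol = 0.1)  # true for typical draws
```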
ones(T::Type, dims...) = Base.ones(T, dims...) ones(T::Type, dims...) = Base.ones(T, dims...)
zeros(T::Type, dims...) = Base.zeros(T, dims...) zeros(T::Type, dims...) = Base.zeros(T, dims...)
@ -98,6 +103,48 @@ function batchseq(xs, pad = nothing, n = maximum(length(x) for x in xs))
[batch([xs_[j][i] for j = 1:length(xs_)]) for i = 1:n] [batch([xs_[j][i] for j = 1:length(xs_)]) for i = 1:n]
end end
# Flattening models to weight vectors, and back
function _restructure(m, xs)
i = 0
fmap(m) do x
x isa AbstractArray || return x
x = reshape(xs[i.+(1:length(x))], size(x))
i += length(x)
return x
end
end
"""
destructure(m)
Flatten a model's parameters into a single weight vector.
julia> m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
julia> θ, re = destructure(m);
julia> θ
67-element Array{Float32,1}:
-0.1407104
...
The second return value `re` allows you to reconstruct the original network after making
modifications to the weight vector (for example, with a hypernetwork).
julia> re(θ .* 2)
Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
"""
function destructure(m)
xs = Zygote.Buffer([])
fmap(m) do x
x isa AbstractArray && push!(xs, x)
return x
end
return vcat(vec.(copy(xs))...), p -> _restructure(m, p)
end
# Other # Other
""" """


@ -25,9 +25,16 @@ cm = gpu(m)
@test all(p isa CuArray for p in params(cm)) @test all(p isa CuArray for p in params(cm))
@test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2}
x = [1,2,3] x = [1.,2.,3.]
cx = gpu(x) cx = gpu(x)
@test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx) @test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx)
@test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0)
@test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0]))
x = [-1.1491, 0.8619, 0.3127]
y = [1, 1, 0.]
@test Flux.binarycrossentropy.(σ.(x),y) ≈ Array(Flux.binarycrossentropy.(cu(σ.(x)),cu(y)))
@test Flux.logitbinarycrossentropy.(x,y) ≈ Array(Flux.logitbinarycrossentropy.(cu(x),cu(y)))
xs = rand(5, 5) xs = rand(5, 5)
ys = Flux.onehotbatch(1:5,1:5) ys = Flux.onehotbatch(1:5,1:5)
@ -51,10 +58,10 @@ end
@test y[3,:] isa CuArray @test y[3,:] isa CuArray
end end
if CuArrays.libcudnn != nothing if CuArrays.has_cudnn()
@info "Testing Flux/CUDNN" @info "Testing Flux/CUDNN"
include("cudnn.jl") include("cudnn.jl")
if !haskey(ENV, "CI_DISABLE_CURNN_TEST") include("curnn.jl")
include("curnn.jl") else
end @warn "CUDNN unavailable, not testing GPU DNN support"
end end


@ -22,8 +22,8 @@ end
rand(10, batch_size) rand(10, batch_size)
cux = gpu(x) cux = gpu(x)
y, back = pullback((r, x) -> (r(x)), rnn, x) y, back = pullback((r, x) -> r(x), rnn, x)
cuy, cuback = pullback((r, x) -> (r(x)), curnn, cux) cuy, cuback = pullback((r, x) -> r(x), curnn, cux)
@test y ≈ collect(cuy) @test y ≈ collect(cuy)
@test haskey(Flux.CUDA.descs, curnn.cell) @test haskey(Flux.CUDA.descs, curnn.cell)


@ -4,11 +4,13 @@ import Flux: activations
@testset "basic" begin @testset "basic" begin
@testset "helpers" begin @testset "helpers" begin
@testset "activations" begin @testset "activations" begin
dummy_model = Chain(Dense(10,5,σ),Dense(5,2),softmax) dummy_model = Chain(x->x.^2, x->x .- 3, x -> tan.(x))
x = rand(10) x = randn(10)
@test activations(Chain(), x) == [] @test activations(dummy_model, x)[1] == x.^2
@test activations(dummy_model, x)[1] == dummy_model[1](x) @test activations(dummy_model, x)[2] == (x.^2 .- 3)
@test activations(dummy_model, x)[2] == x |> dummy_model[1] |> dummy_model[2] @test activations(dummy_model, x)[3] == tan.(x.^2 .- 3)
@test activations(Chain(), x) == ()
@test activations(Chain(identity, x->:foo), x)[2] == :foo # results include `Any` type @test activations(Chain(identity, x->:foo), x)[2] == :foo # results include `Any` type
end end
end end
@ -19,6 +21,12 @@ import Flux: activations
# numeric test should be put into testset of corresponding layer # numeric test should be put into testset of corresponding layer
end end
@testset "Activations" begin
c = Chain(Dense(3,5,relu), Dense(5,1,relu))
X = Float32.([1.0; 1.0; 1.0])
@test_nowarn gradient(()->Flux.activations(c, X)[2][1], params(c))
end
@testset "Dense" begin @testset "Dense" begin
@test length(Dense(10, 5)(randn(10))) == 5 @test length(Dense(10, 5)(randn(10))) == 5
@test_throws DimensionMismatch Dense(10, 5)(randn(1)) @test_throws DimensionMismatch Dense(10, 5)(randn(1))
@ -84,4 +92,19 @@ import Flux: activations
@test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4) @test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4)
end end
end end
@testset "output dimensions" begin
m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
@test Flux.outdims(m, (10, 10)) == (6, 6)
m = Dense(10, 5)
@test Flux.outdims(m, (5, 2)) == (5,)
@test Flux.outdims(m, (10,)) == (5,)
m = Flux.Diagonal(10)
@test Flux.outdims(m, (10,)) == (10,)
m = Maxout(() -> Conv((3, 3), 3 => 16), 2)
@test Flux.outdims(m, (10, 10)) == (8, 8)
end
end end


@ -1,5 +1,6 @@
using Flux, Test using Flux, Test
using Flux: maxpool, meanpool using Flux: maxpool, meanpool
using Flux: gradient
@testset "Pooling" begin @testset "Pooling" begin
x = randn(Float32, 10, 10, 3, 2) x = randn(Float32, 10, 10, 3, 2)
@ -83,6 +84,10 @@ end
y = Conv((3,3), 1 => 1)(x) y = Conv((3,3), 1 => 1)(x)
x_hat = ConvTranspose((3, 3), 1 => 1)(y) x_hat = ConvTranspose((3, 3), 1 => 1)(y)
@test size(x_hat) == size(x) @test size(x_hat) == size(x)
m = ConvTranspose((3,3), 1=>1)
# Test that the gradient call does not throw: #900
@test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads
end end
@testset "CrossCor" begin @testset "CrossCor" begin
@ -90,7 +95,7 @@ end
w = rand(2,2,1,1) w = rand(2,2,1,1)
y = CrossCor(w, [0.0]) y = CrossCor(w, [0.0])
@test sum(w .* x[1:2, 1:2, :, :]) == y(x)[1, 1, 1, 1] @test isapprox(sum(w .* x[1:2, 1:2, :, :]), y(x)[1, 1, 1, 1], rtol=1e-7)
r = zeros(Float32, 28, 28, 1, 5) r = zeros(Float32, 28, 28, 1, 5)
m = Chain( m = Chain(
@ -113,17 +118,17 @@ end
l = Conv((3,3), 1=>1) l = Conv((3,3), 1=>1)
expected = zeros(eltype(l.weight),5,5,1,1) expected = zeros(eltype(l.weight),5,5,1,1)
expected[2:end-1,2:end-1,1,1] = l.weight expected[2:end-1,2:end-1,1,1] = l.weight
@test expected == l(data) @test expected ≈ l(data)
l = Conv((3,1), 1=>1) l = Conv((3,1), 1=>1)
expected = zeros(eltype(l.weight),5,7,1,1) expected = zeros(eltype(l.weight),5,7,1,1)
expected[2:end-1,4,1,1] = l.weight expected[2:end-1,4,1,1] = l.weight
@test expected == l(data) @test expected ≈ l(data)
l = Conv((1,3), 1=>1) l = Conv((1,3), 1=>1)
expected = zeros(eltype(l.weight),7,5,1,1) expected = zeros(eltype(l.weight),7,5,1,1)
expected[4,2:end-1,1,1] = l.weight expected[4,2:end-1,1,1] = l.weight
@test expected == l(data) @test expected ≈ l(data)
@test begin @test begin
# we test that the next expression does not throw # we test that the next expression does not throw
@ -131,3 +136,55 @@ end
true true
end end
end end
@testset "conv output dimensions" begin
m = Conv((3, 3), 3 => 16)
@test Flux.outdims(m, (10, 10)) == (8, 8)
m = Conv((3, 3), 3 => 16; stride = 2)
@test Flux.outdims(m, (5, 5)) == (2, 2)
m = Conv((3, 3), 3 => 16; stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = Conv((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = ConvTranspose((3, 3), 3 => 16)
@test Flux.outdims(m, (8, 8)) == (10, 10)
m = ConvTranspose((3, 3), 3 => 16; stride = 2)
@test Flux.outdims(m, (2, 2)) == (5, 5)
m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2)
@test Flux.outdims(m, (4, 4)) == (5, 5)
m = DepthwiseConv((3, 3), 3 => 6)
@test Flux.outdims(m, (10, 10)) == (8, 8)
m = DepthwiseConv((3, 3), 3 => 6; stride = 2)
@test Flux.outdims(m, (5, 5)) == (2, 2)
m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3, dilation = 2)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = CrossCor((3, 3), 3 => 16)
@test Flux.outdims(m, (10, 10)) == (8, 8)
m = CrossCor((3, 3), 3 => 16; stride = 2)
@test Flux.outdims(m, (5, 5)) == (2, 2)
m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = MaxPool((2, 2))
@test Flux.outdims(m, (10, 10)) == (5, 5)
m = MaxPool((2, 2); stride = 1)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = MaxPool((2, 2); stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = MeanPool((2, 2))
@test Flux.outdims(m, (10, 10)) == (5, 5)
m = MeanPool((2, 2); stride = 1)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = MeanPool((2, 2); stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
end


@ -191,6 +191,7 @@ end
end end
if VERSION >= v"1.1"
@testset "GroupNorm" begin @testset "GroupNorm" begin
# begin tests # begin tests
squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions
@ -289,5 +290,5 @@ end
x = Float32.(reshape(collect(1:prod(sizes)), sizes)) x = Float32.(reshape(collect(1:prod(sizes)), sizes))
@test BN(x) ≈ GN(x) @test BN(x) ≈ GN(x)
end end
end
end end


@ -49,12 +49,33 @@ const ϵ = 1e-7
@testset "logitbinarycrossentropy" begin @testset "logitbinarycrossentropy" begin
@test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0) @test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0)
end end
y = [1 2 3]
y1 = [4.0 5.0 6.0]
@testset "kldivergence" begin
@test Flux.kldivergence(y, y1) ≈ 4.761838062403337
@test Flux.kldivergence(y, y) ≈ 0
end
y = [1 2 3 4]
y1 = [5.0 6.0 7.0 8.0]
@testset "hinge" begin
@test Flux.hinge(y, y1) ≈ 0
@test Flux.hinge(y, 0.5 .* y) ≈ 0.125
end
y = [0.1 0.2 0.3]
y1 = [0.4 0.5 0.6]
@testset "poisson" begin
@test Flux.poisson(y, y1) ≈ 1.0160455586700767
@test Flux.poisson(y, y) ≈ 0.5044459776946685
end
@testset "no spurious promotions" begin @testset "no spurious promotions" begin
for T in (Float32, Float64) for T in (Float32, Float64)
y = rand(T, 2) y = rand(T, 2)
ŷ = rand(T, 2) ŷ = rand(T, 2)
for f in (mse, crossentropy, logitcrossentropy) for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson)
fwd, back = Flux.pullback(f, ŷ, y) fwd, back = Flux.pullback(f, ŷ, y)
@test fwd isa T @test fwd isa T
@test eltype(back(one(T))[1]) == T @test eltype(back(one(T))[1]) == T


@ -19,7 +19,7 @@ include("layers/normalisation.jl")
include("layers/stateless.jl") include("layers/stateless.jl")
include("layers/conv.jl") include("layers/conv.jl")
if isdefined(Flux, :CUDA) if Flux.use_cuda[]
include("cuda/cuda.jl") include("cuda/cuda.jl")
else else
@warn "CUDA unavailable, not testing GPU support" @warn "CUDA unavailable, not testing GPU support"


@ -1,6 +1,6 @@
using Flux using Flux
using Flux: throttle, glorot_uniform, glorot_normal, stack, unstack using Flux: throttle, nfan, glorot_uniform, glorot_normal, stack, unstack
using StatsBase: std using StatsBase: var
using Random using Random
using Test using Test
@ -56,18 +56,26 @@ end
# Set random seed so that these tests don't fail randomly # Set random seed so that these tests don't fail randomly
Random.seed!(0) Random.seed!(0)
# glorot_uniform should yield a kernel with stddev ~= sqrt(6/(n_in + n_out)), @testset "Fan in/out" begin
# and glorot_normal should yield a kernel with stddev != 2/(n_in _ n_out) @test nfan() == (1, 1) #For a constant
for (n_in, n_out) in [(100, 100), (100, 400)] @test nfan(100) == (1, 100) #For vector
v = glorot_uniform(n_in, n_out) @test nfan(100, 200) == (200, 100) #For Dense layer
@test minimum(v) > -1.1*sqrt(6/(n_in + n_out)) @test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer
@test minimum(v) < -0.9*sqrt(6/(n_in + n_out)) @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer
@test maximum(v) > 0.9*sqrt(6/(n_in + n_out)) @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D Conv layer
@test maximum(v) < 1.1*sqrt(6/(n_in + n_out)) end
v = glorot_normal(n_in, n_out) @testset "glorot" begin
@test std(v) > 0.9*sqrt(2/(n_in + n_out)) # glorot_uniform and glorot_normal should both yield a kernel with
@test std(v) < 1.1*sqrt(2/(n_in + n_out)) # variance ≈ 2/(fan_in + fan_out)
for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)]
for init ∈ [glorot_uniform, glorot_normal]
v = init(dims...)
fan_in, fan_out = nfan(dims...)
σ2 = 2 / (fan_in + fan_out)
@test 0.9σ2 < var(v) < 1.1σ2
end
end
end end
end end