commit 6f0710d364

.github/workflows/CompatHelper.yml (new file)
@@ -0,0 +1,24 @@
+name: CompatHelper
+
+on:
+  schedule:
+    - cron: '00 00 * * *'
+
+jobs:
+  CompatHelper:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        julia-version: [1.3]
+        julia-arch: [x64]
+        os: [ubuntu-latest]
+    steps:
+      - uses: julia-actions/setup-julia@latest
+        with:
+          version: ${{ matrix.julia-version }}
+      - name: Pkg.add("CompatHelper")
+        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
+      - name: CompatHelper.main()
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: julia -e 'using CompatHelper; CompatHelper.main()'
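The workflow simply installs CompatHelper and calls its entry point; a sketch of the equivalent one-off local run (the token value is a hypothetical placeholder — in CI it comes from `secrets.GITHUB_TOKEN`):

```julia
# One-off local equivalent of the nightly workflow (sketch).
using Pkg
Pkg.add("CompatHelper")              # same as the workflow's Pkg.add step
ENV["GITHUB_TOKEN"] = "<your-token>" # hypothetical placeholder for a personal access token

using CompatHelper
CompatHelper.main()                  # scans [compat] bounds and opens update PRs
```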
.gitlab-ci.yml
@@ -4,26 +4,26 @@ include:
 image: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04


-julia:1.0:
-  extends:
-    - .julia:1.0
-    - .test
-  tags:
-    - nvidia
-
-julia:1.1:
-  extends:
-    - .julia:1.1
-    - .test
-  tags:
-    - nvidia
-
-julia:1.2:
-  extends:
-    - .julia:1.2
-    - .test
-  tags:
-    - nvidia
+# julia:1.0:
+#   extends:
+#     - .julia:1.0
+#     - .test
+#   tags:
+#     - nvidia
+#
+# julia:1.1:
+#   extends:
+#     - .julia:1.1
+#     - .test
+#   tags:
+#     - nvidia
+#
+# julia:1.2:
+#   extends:
+#     - .julia:1.2
+#     - .test
+#   tags:
+#     - nvidia

 julia:1.3:
   extends:
.travis.yml
@@ -6,7 +6,6 @@ os:
 # - osx

 julia:
-  - 1.2
   - 1.3
   - nightly

@@ -17,7 +16,7 @@ matrix:
 jobs:
   include:
     - stage: "Documentation"
-      julia: 1.2
+      julia: 1.3
       os: linux
       script:
         - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd()));
Manifest.toml
@@ -2,15 +2,15 @@

 [[AbstractFFTs]]
 deps = ["LinearAlgebra"]
-git-tree-sha1 = "380e36c66edfa099cd90116b24c1ce8cafccac40"
+git-tree-sha1 = "051c95d6836228d120f5f4b984dd5aba1624f716"
 uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
-version = "0.4.1"
+version = "0.5.0"

 [[AbstractTrees]]
-deps = ["Markdown", "Test"]
-git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b"
+deps = ["Markdown"]
+git-tree-sha1 = "8201f932428d25a2e2903300764515754847d87d"
 uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
-version = "0.2.1"
+version = "0.3.0"

 [[Adapt]]
 deps = ["LinearAlgebra"]

@@ -21,12 +21,6 @@ version = "1.0.0"
 [[Base64]]
 uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"

-[[BinDeps]]
-deps = ["Compat", "Libdl", "SHA", "URIParser"]
-git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9"
-uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
-version = "0.8.10"
-
 [[BinaryProvider]]
 deps = ["Libdl", "SHA"]
 git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c"

@@ -38,29 +32,23 @@ git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb"
 uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
 version = "0.2.0"

-[[CSTParser]]
-deps = ["Tokenize"]
-git-tree-sha1 = "99dda94f5af21a4565dc2b97edf6a95485f116c3"
-uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f"
-version = "1.0.0"
-
 [[CUDAapi]]
 deps = ["Libdl", "Logging"]
-git-tree-sha1 = "e063efb91cfefd7e6afd92c435d01398107a500b"
+git-tree-sha1 = "56a813440ac98a1aa64672ab460a1512552211a7"
 uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
-version = "1.2.0"
+version = "2.1.0"

 [[CUDAdrv]]
-deps = ["CEnum", "Printf"]
-git-tree-sha1 = "96eabc95ebb83e361311330ffb574a3e2df73251"
+deps = ["CEnum", "CUDAapi", "Printf"]
+git-tree-sha1 = "1fce616fa0806c67c133eb1d2f68f0f1a7504665"
 uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
-version = "4.0.2"
+version = "5.0.1"

 [[CUDAnative]]
 deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"]
-git-tree-sha1 = "dd642afe5fd6633663a8c3d42f3b7638f2210b79"
+git-tree-sha1 = "6e11d5c2c91fc623952e94c4fb73f9c4db74795a"
 uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
-version = "2.5.3"
+version = "2.7.0"

 [[CodecZlib]]
 deps = ["BinaryProvider", "Libdl", "TranscodingStreams"]

@@ -70,9 +58,9 @@ version = "0.6.0"

 [[ColorTypes]]
 deps = ["FixedPointNumbers", "Random"]
-git-tree-sha1 = "10050a24b09e8e41b951e9976b109871ce98d965"
+git-tree-sha1 = "7b62b728a5f3dd6ee3b23910303ccf27e82fad5e"
 uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
-version = "0.8.0"
+version = "0.8.1"

 [[Colors]]
 deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"]

@@ -86,29 +74,11 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0"
 uuid = "bbf7d656-a473-5ed7-a52c-81e309532950"
 version = "0.2.0"

-[[Compat]]
-deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
-git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb"
-uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "2.2.0"
-
-[[Conda]]
-deps = ["JSON", "VersionParsing"]
-git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032"
-uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d"
-version = "1.3.0"
-
-[[Crayons]]
-deps = ["Test"]
-git-tree-sha1 = "f621b8ef51fd2004c7cf157ea47f027fdeac5523"
-uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
-version = "4.0.0"
-
 [[CuArrays]]
 deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"]
-git-tree-sha1 = "bc94d6cb335d418088f12641751aab63ff56509d"
+git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b"
 uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
-version = "1.4.2"
+version = "1.6.0"

 [[DataAPI]]
 git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252"

@@ -117,9 +87,9 @@ version = "1.1.0"

 [[DataStructures]]
 deps = ["InteractiveUtils", "OrderedCollections"]
-git-tree-sha1 = "1fe8fad5fc84686dcbc674aa255bc867a64f8132"
+git-tree-sha1 = "f784254f428fb8fd7ac15982e5862a38a44523d3"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.17.5"
+version = "0.17.7"

 [[Dates]]
 deps = ["Printf"]

@@ -130,32 +100,38 @@ deps = ["Mmap"]
 uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"

 [[DiffResults]]
-deps = ["Compat", "StaticArrays"]
-git-tree-sha1 = "34a4a1e8be7bc99bc9c611b895b5baf37a80584c"
+deps = ["StaticArrays"]
+git-tree-sha1 = "da24935df8e0c6cf28de340b958f6aac88eaa0cc"
 uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
-version = "0.0.4"
+version = "1.0.2"

 [[DiffRules]]
-deps = ["Random", "Test"]
-git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7"
+deps = ["NaNMath", "Random", "SpecialFunctions"]
+git-tree-sha1 = "10dca52cf6d4a62d82528262921daf63b99704a2"
 uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
-version = "0.0.10"
+version = "1.0.0"

 [[Distributed]]
 deps = ["Random", "Serialization", "Sockets"]
 uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"

 [[FFTW]]
-deps = ["AbstractFFTs", "BinaryProvider", "Conda", "Libdl", "LinearAlgebra", "Reexport", "Test"]
-git-tree-sha1 = "6c5b420da0b8c12098048561b8d58f81adea506f"
+deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"]
+git-tree-sha1 = "109d82fa4b00429f9afcce873e9f746f11f018d3"
 uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
-version = "1.0.1"
+version = "1.2.0"

+[[FFTW_jll]]
+deps = ["Libdl", "Pkg"]
+git-tree-sha1 = "05674f209a6e3387dd103a945b0113eeb64b1a58"
+uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a"
+version = "3.3.9+3"
+
 [[FillArrays]]
 deps = ["LinearAlgebra", "Random", "SparseArrays"]
-git-tree-sha1 = "6827a8f73ff12707f209c920d204238a16892b55"
+git-tree-sha1 = "fec413d4fc547992eb62a5c544cedb6d7853c1f5"
 uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
-version = "0.8.0"
+version = "0.8.4"

 [[FixedPointNumbers]]
 git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b"

@@ -164,15 +140,15 @@ version = "0.6.1"

 [[ForwardDiff]]
 deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"]
-git-tree-sha1 = "adf88d6da1f0294058f38295becf8807986bb7d0"
+git-tree-sha1 = "840700059391d36e2498d89c2e82c08f261f2a2a"
 uuid = "f6369f11-7733-5829-9624-2563aa707210"
-version = "0.10.5"
+version = "0.10.8"

 [[GPUArrays]]
 deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
-git-tree-sha1 = "a0a3b927b1a06e63fb8b91950cc7df340b7d912c"
+git-tree-sha1 = "e756da6cee76a5f1436a05827fa8fdf3badc577f"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "2.0.0"
+version = "2.0.1"

 [[IRTools]]
 deps = ["InteractiveUtils", "MacroTools", "Test"]

@@ -180,16 +156,16 @@ git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306"
 uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
 version = "0.3.0"

+[[IntelOpenMP_jll]]
+deps = ["Libdl", "Pkg"]
+git-tree-sha1 = "fb8e1c7a5594ba56f9011310790e03b5384998d6"
+uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0"
+version = "2018.0.3+0"
+
 [[InteractiveUtils]]
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

 [[JSON]]
 deps = ["Dates", "Mmap", "Parsers", "Unicode"]
 git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e"
 uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 version = "0.21.0"

 [[Juno]]
 deps = ["Base64", "Logging", "Media", "Profile", "Test"]
 git-tree-sha1 = "30d94657a422d09cb97b6f86f04f750fa9c50df8"

@@ -198,9 +174,9 @@ version = "0.7.2"

 [[LLVM]]
 deps = ["CEnum", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "74fe444b8b6d1ac01d639b2f9eaf395bcc2e24fc"
+git-tree-sha1 = "1d08d7e4250f452f6cb20e4574daaebfdbee0ff7"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "1.3.2"
+version = "1.3.3"

 [[LibGit2]]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"

@@ -215,11 +191,17 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 [[Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

+[[MKL_jll]]
+deps = ["Libdl", "Pkg"]
+git-tree-sha1 = "61069ae718b8ab1e325bbfb4e5268902e7ea08e3"
+uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
+version = "2019.0.117+0"
+
 [[MacroTools]]
-deps = ["CSTParser", "Compat", "DataStructures", "Test", "Tokenize"]
-git-tree-sha1 = "d6e9dedb8c92c3465575442da456aec15a89ff76"
+deps = ["DataStructures", "Markdown", "Random"]
+git-tree-sha1 = "e2fc7a55bb2224e203bbd8b59f72b91323233458"
 uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
-version = "0.5.1"
+version = "0.5.3"

 [[Markdown]]
 deps = ["Base64"]

@@ -241,16 +223,21 @@ version = "0.4.3"
 uuid = "a63ad114-7e13-5084-954f-fe012c677804"

 [[NNlib]]
-deps = ["Libdl", "LinearAlgebra", "Requires", "Statistics", "TimerOutputs"]
-git-tree-sha1 = "0c667371391fc6bb31f7f12f96a56a17098b3de8"
+deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"]
+git-tree-sha1 = "135c0de4794d5e214b06f1fb4787af4a72896e61"
 uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-version = "0.6.0"
+version = "0.6.2"

 [[NaNMath]]
-deps = ["Compat"]
-git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2"
+git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f"
 uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
-version = "0.3.2"
+version = "0.3.3"

+[[OpenSpecFun_jll]]
+deps = ["Libdl", "Pkg"]
+git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90"
+uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
+version = "0.5.3+1"
+
 [[OrderedCollections]]
 deps = ["Random", "Serialization", "Test"]

@@ -258,12 +245,6 @@ git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
 uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 version = "1.1.0"

-[[Parsers]]
-deps = ["Dates", "Test"]
-git-tree-sha1 = "c56ecb484f286639f161e712b8311f5ab77e8d32"
-uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "0.3.8"
-
 [[Pkg]]
 deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"

@@ -291,10 +272,10 @@ uuid = "189a3867-3050-52da-a836-e630ba90ab69"
 version = "0.2.0"

 [[Requires]]
-deps = ["Test"]
-git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1"
+deps = ["UUIDs"]
+git-tree-sha1 = "999513b7dea8ac17359ed50ae8ea089e4464e35e"
 uuid = "ae029012-a4dd-5104-9daa-d747884805df"
-version = "0.5.2"
+version = "1.0.0"

 [[SHA]]
 uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"

@@ -302,10 +283,6 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 [[Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

-[[SharedArrays]]
-deps = ["Distributed", "Mmap", "Random", "Serialization"]
-uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
-
 [[Sockets]]
 uuid = "6462fe0b-24de-5631-8697-dd941f90decc"

@@ -320,16 +297,16 @@ deps = ["LinearAlgebra", "Random"]
 uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

 [[SpecialFunctions]]
-deps = ["BinDeps", "BinaryProvider", "Libdl"]
-git-tree-sha1 = "3bdd374b6fd78faf0119b8c5d538788dbf910c6e"
+deps = ["OpenSpecFun_jll"]
+git-tree-sha1 = "268052ee908b2c086cc0011f528694f02f3e2408"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "0.8.0"
+version = "0.9.0"

 [[StaticArrays]]
 deps = ["LinearAlgebra", "Random", "Statistics"]
-git-tree-sha1 = "1e9c5d89cba8047d518f1ffef432906ef1a3e8bd"
+git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "0.12.0"
+version = "0.12.1"

 [[Statistics]]
 deps = ["LinearAlgebra", "SparseArrays"]

@@ -346,15 +323,10 @@ deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
 uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

 [[TimerOutputs]]
-deps = ["Crayons", "Printf", "Test", "Unicode"]
-git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c"
+deps = ["Printf"]
+git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03"
 uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 version = "0.5.0"

 [[Tokenize]]
 git-tree-sha1 = "dfcdbbfb2d0370716c815cbd6f8a364efb6f42cf"
 uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624"
-version = "0.5.6"
+version = "0.5.3"

 [[TranscodingStreams]]
 deps = ["Random", "Test"]

@@ -362,12 +334,6 @@ git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c"
 uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
 version = "0.9.5"

-[[URIParser]]
-deps = ["Test", "Unicode"]
-git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69"
-uuid = "30578b45-9adc-5946-b283-645ec420af67"
-version = "0.4.0"
-
 [[UUIDs]]
 deps = ["Random", "SHA"]
 uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

@@ -375,23 +341,23 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 [[Unicode]]
 uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

-[[VersionParsing]]
-deps = ["Compat"]
-git-tree-sha1 = "c9d5aa108588b978bd859554660c8a5c4f2f7669"
-uuid = "81def892-9a0e-5fdd-b105-ffc91e053289"
-version = "1.1.3"
-
 [[ZipFile]]
-deps = ["BinaryProvider", "Libdl", "Printf"]
-git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d"
+deps = ["Libdl", "Printf", "Zlib_jll"]
+git-tree-sha1 = "5de8320a46812da1a8ca98b16a8a4546d44efa62"
 uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
-version = "0.8.3"
+version = "0.9.0"

+[[Zlib_jll]]
+deps = ["Libdl", "Pkg"]
+git-tree-sha1 = "5618a43055eb09377edca21d19d0e99bce24a9c3"
+uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
+version = "1.2.11+7"
+
 [[Zygote]]
 deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
-git-tree-sha1 = "b2e42a21dc3d1ecd3cbe8c83a454ca56fbf423c4"
+git-tree-sha1 = "74382bcc4c1e8075e14554da67d75565f8fb7827"
 uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
-version = "0.4.0"
+version = "0.4.5"

 [[ZygoteRules]]
 deps = ["MacroTools"]
NEWS.md
@@ -1,3 +1,16 @@
+# v0.10.0
+* The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669).
+  - The dependency on Tracker.jl has been removed.
+  - This means Flux no longer relies on a specialised `TrackedArray` type, and can be used with normal `Array` implementations directly.
+  - Tracker compatibility is maintained in most common cases, but Zygote will be the preferred AD backend for Flux from now on.
+* The CUDNN wrappers have been [moved from Flux into CuArrays](https://github.com/FluxML/Flux.jl/pull/874), to better support the CUDA backend, improve the user experience, and make Flux leaner.
+* `*crossentropy` functions now [work as expected with CuArrays](https://github.com/FluxML/Flux.jl/pull/926). [PR for binarycrossentropy](https://github.com/FluxML/Flux.jl/pull/940).
+* Added [clearer docs](https://github.com/FluxML/Flux.jl/pull/904) around training and the Optimiser interface.
+* [Layer initialisations](https://github.com/FluxML/Flux.jl/pull/937) have been improved, with a clearer API on how to extend them for other purposes.
+* [Better messaging around CUDA availability](https://github.com/FluxML/Flux.jl/pull/924), with hooks to initialize the GPU as default where possible.
+* `@treelike` has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation.
+* `testmode!` is deprecated in favour of [istraining](https://github.com/FluxML/Flux.jl/pull/669).
+
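A minimal sketch of what the Zygote switch means in practice — gradients of plain-`Array` code, with no `param`/`TrackedArray` wrappers (assumes Flux 0.10):

```julia
using Flux

W = randn(2, 10)   # ordinary Arrays; no param() wrapper needed any more
b = randn(2)
model(x) = σ.(W * x .+ b)

x = rand(10)
gs = gradient(() -> sum(model(x)), params(W, b))
gs[W]              # the gradient w.r.t. W is itself a plain Array
```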
 # v0.9.0
 * [Depthwise convolutional layer API changes](https://github.com/FluxML/Flux.jl/pull/756) from `in => mult` channel specification to `in => out` channel specification, and deprecates the implicit `out` constructor.
 * New [SkipConnection](https://github.com/FluxML/Flux.jl/pull/446), which can be used to train residual neural network architectures.
Project.toml
@@ -1,11 +1,10 @@
 name = "Flux"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.9.0"
+version = "0.10.1"

 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"

@@ -25,9 +24,17 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

 [compat]
-CUDAdrv = "4.0.1"
-CuArrays = "1.4.2"
+AbstractTrees = "0.2, 0.3"
+Adapt = "1"
+CodecZlib = "0.5, 0.6"
+Colors = "0.8, 0.9"
+CuArrays = "1.6"
+Juno = "0.5, 0.6, 0.7"
+MacroTools = "0.3, 0.4, 0.5"
+NNlib = "0.6"
+Reexport = "0.2"
+StatsBase = "0"
+ZipFile = "0.7, 0.8, 0.9"
+Zygote = "0.4"
 julia = "1"
README.md
@@ -7,93 +7,9 @@
 Flux is an elegant approach to machine learning. It's a 100% pure-Julia stack, and provides lightweight abstractions on top of Julia's native GPU and AD support. Flux makes the easy things easy while remaining fully hackable.

 ```julia
-julia> Pkg.add("Flux")
+] add Flux
 ```

 See the [documentation](https://fluxml.github.io/Flux.jl/) or the [model zoo](https://github.com/FluxML/model-zoo/) for examples.

-If you use Flux in research, please cite the following paper:
-
-```
-@article{innes:2018,
-  author  = {Mike Innes},
-  title   = {Flux: Elegant Machine Learning with Julia},
-  journal = {Journal of Open Source Software},
-  year    = {2018},
-  doi     = {10.21105/joss.00602},
-}
-```
-
-## Features
-
-Flux has powerful high-level features, and common architectures can be defined in a few lines.
-
-```julia
-model = Chain(
-  Dense(768, 128, σ),
-  LSTM(128, 256),
-  LSTM(256, 128),
-  Dense(128, 10),
-  softmax)
-
-loss(x, y) = crossentropy(model(x), y)
-
-Flux.train!(loss, params(model), data, ADAM(...))
-```
-
-Yet you can easily strip away the layers, and directly write the mathematics for your problem. Flux will seamlessly take gradients of any Julia code, so your model looks just like the paper.
-
-```julia
-W = param(randn(2, 10))
-b = param(randn(2))
-
-y(x) = σ.(W * x .+ b)
-```
-
-If that's *still* not enough, you can go as deep as you want, even writing your own CUDA kernels with [CUDAnative](https://github.com/JuliaGPU/CUDAnative.jl)! All this can be freely mixed-and-matched in a single model or script, and it all runs interactively via Jupyter or Juno.
-
-```julia
-function gpu_add(a, b, c)
-  i = (blockIdx().x-1) * blockDim().x + threadIdx().x
-  c[i] = a[i] + b[i]
-  return nothing
-end
-```
-
-Unusual architectures are no problem in Flux, as you can use all the loops, control flow and even macros that you're used to. Here's a Tree RNN in 4 lines.
-
-```julia
-tree() = rand() < 0.5 ? rand(10) : (tree(), tree()) # dummy data
-
-shrink = Dense(20, 10)
-combine(a, b) = shrink([a; b])
-
-model(x) = x
-model(x::Tuple) = combine(model(x[1]), model(x[2]))
-
-model(tree()) # Sample output
-```
-
-Despite this flexibility, Julia's advanced compiler lets us do some powerful optimisations. For example, this definition of `sigmoid` automatically gets fused into a *single* GPU kernel – so it's really fast.
-
-```julia
-sigmoid(xs) = 1 ./ (1 .+ exp.(.-xs))
-```
-
-Similarly, Flux is the first dynamic framework to support [compiling to the browser](https://fluxml.github.io/experiments/) and model import via [formats like ONNX](https://github.com/FluxML/ONNX.jl/), both of which are thinly-veiled compiler problems.
-
-For more on our philosophy on machine learning, check out our article [On Machine Learning & Programming Languages](https://julialang.org/blog/2017/12/ml&pl).
-
-## Contributing & Help
-
-For general questions and help, check out Julia's [community forum](https://discourse.julialang.org/c/domain/ML).
-
-Flux development is carried out via our [GitHub issues](https://github.com/FluxML/Flux.jl/issues), so feel free to open feature requests or PRs here.
-
-For more informal discussions we'd love to have you on the [Julia slack](https://slackinvite.julialang.org/), where we hang out on the #machine-learning channel.
-
-## Related Packages
-
-Check out [Metalhead.jl](https://github.com/FluxML/Metalhead.jl) for common computer vision datasets and trained models.
-
-[MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) provides further common datasets.
+If you use Flux in your research, please [cite](CITATION.bib) our work.
bors.toml
@@ -1,4 +1,4 @@
 status = [
-  "ci/gitlab/%"
+  "ci/gitlab%"
 ]
-timeout-sec = 14400
+timeout-sec = 7200
docs/src/models/layers.md
@@ -65,3 +65,15 @@ AlphaDropout
 LayerNorm
 GroupNorm
 ```
+
+## Cost Functions
+```@docs
+mse
+crossentropy
+logitcrossentropy
+binarycrossentropy
+logitbinarycrossentropy
+kldivergence
+poisson
+hinge
+```
docs/src/training/training.md
@@ -1,8 +1,9 @@
 # Training

-To actually train a model we need three things:
+To actually train a model we need four things:

 * An *objective function* that evaluates how well a model is doing given some input data.
+* The trainable parameters of the model.
 * A collection of data points that will be provided to the objective function.
 * An [optimiser](optimisers.md) that will update the model parameters appropriately.
@@ -32,6 +33,14 @@ Flux.train!(loss, ps, data, opt)

 The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross-entropy loss, but you can calculate it however you want.

+At first glance it may seem strange that the model we want to train is not among the input arguments of `Flux.train!`. However, the target of the optimiser is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, so there is no need to give it explicitly. Passing the objective function, instead of the model and a cost function separately, provides more flexibility and the possibility of optimising the calculations.
+
+## Model parameters
+
+The model to be trained must have a set of tracked parameters that are used to calculate the gradients of the objective function. In the [basics](../models/basics.md) section it is explained how to create models with such parameters. The second argument of `Flux.train!` must be an object containing those parameters, which can be obtained from a model `m` as `params(m)`.
+
+Such an object holds a reference to the model's parameters, not a copy, so that after training the model behaves according to the updated values.
+
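A short sketch tying the four pieces together (the names here are illustrative): the loss closes over the model, which is why `Flux.train!` never needs `m` as an argument.

```julia
using Flux

m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
loss(x, y) = Flux.crossentropy(m(x), y)  # the model is captured by the closure
ps = Flux.params(m)                      # references to m's arrays, not copies
data = [(rand(10), Flux.onehot(1, 1:2))] # a single dummy (input, target) pair
opt = Descent(0.1)

Flux.train!(loss, ps, data, opt)         # m's weights are updated in place
```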
 ## Datasets

 The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point:
@@ -101,3 +110,30 @@ cb = function ()
   accuracy() > 0.9 && Flux.stop()
 end
 ```
+
+## Custom Training loops
+
+The `Flux.train!` function can be very convenient, especially for simple problems.
+It is also very flexible with the use of callbacks.
+But for some problems it is much cleaner to write your own custom training loop.
+An example follows that works similarly to the default `Flux.train!` but with no callbacks.
+You don't need callbacks if you just code the calls to your functions directly into the loop,
+e.g. in the places marked with comments.
+
+```julia
+function my_custom_train!(loss, ps, data, opt)
+  ps = Params(ps)
+  for d in data
+    gs = gradient(ps) do
+      training_loss = loss(d...)
+      # Insert whatever code you want here that needs the training loss, e.g. logging.
+      return training_loss
+    end
+    # Insert whatever code you want here that needs the gradient,
+    # e.g. logging it with TensorBoardLogger.jl as a histogram so you can see if it is becoming huge.
+    update!(opt, ps, gs)
+    # Here you might like to check validation-set accuracy, and break out to do early stopping.
+  end
+end
+```
+You could simplify this further, for example by hard-coding the loss function.
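For instance, a hypothetical hard-coded variant of the loop above, with the loss fixed to `mse` against a captured model `m` (a sketch, not part of the Flux API):

```julia
using Flux
using Flux.Optimise: update!

function my_mse_train!(m, data, opt)
  ps = Flux.params(m)
  for (x, y) in data
    gs = gradient(ps) do
      Flux.mse(m(x), y)  # loss hard-coded instead of passed in
    end
    update!(opt, ps, gs)
  end
end
```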
src/Flux.jl
@@ -6,7 +6,7 @@ using Base: tail
 using Zygote, MacroTools, Juno, Reexport, Statistics, Random
 using MacroTools: @forward
 @reexport using NNlib
-using Zygote: Params, @adjoint, gradient, pullback
+using Zygote: Params, @adjoint, gradient, pullback, @nograd
 export gradient

 export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool,

@@ -21,8 +21,7 @@ export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
        ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay


-ENV["CUDA_INIT_SILENT"] = true
-using CUDAdrv, CuArrays
+using CuArrays
 const use_cuda = Ref(false)

 include("utils.jl")

@@ -40,12 +39,14 @@ include("data/Data.jl")
 include("deprecations.jl")

 function __init__()
-  if !CUDAdrv.functional()
-    @warn "CUDA available, but CUDAdrv.jl failed to load"
-  elseif length(devices()) == 0
-    @warn "CUDA available, but no GPU detected"
-  elseif !CuArrays.functional()
-    @warn "CUDA GPU available, but CuArrays.jl failed to load"
+  precompiling = ccall(:jl_generating_output, Cint, ()) != 0
+
+  # we don't want to include the CUDA module when precompiling,
+  # or we could end up replacing it at run time (triggering a warning)
+  precompiling && return
+
+  if !CuArrays.functional()
+    # nothing to do here, and either CuArrays or one of its dependencies will have warned
   else
     use_cuda[] = true

@@ -54,7 +55,7 @@ function __init__()
     if CuArrays.has_cudnn()
       include(joinpath(@__DIR__, "cuda/cuda.jl"))
     else
-      @warn "CUDA GPU available, but CuArrays.jl did not find libcudnn. Some functionality will not be available."
+      @warn "CuArrays.jl did not find libcudnn. Some functionality will not be available."
     end
   end
 end
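For context, `ccall(:jl_generating_output, Cint, ())` returns nonzero while Julia is generating precompile output, so the guard pattern looks like this in isolation (a sketch):

```julia
function __init__()
  # Skip runtime-only setup while the module is being precompiled;
  # anything initialised here would otherwise be baked into the cache.
  ccall(:jl_generating_output, Cint, ()) != 0 && return

  # ... runtime initialisation (GPU detection, etc.) goes here ...
end
```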
src/cuda/curnn.jl
@@ -1,6 +1,5 @@
 import ..Flux: Flux, relu
 using CuArrays.CUDAnative
-using CuArrays: @cuindex, cudims

 CuRNN{T} = Flux.RNNCell{<:Union{typeof(tanh),typeof(relu)},<:CuArray{T,2},<:CuArray{T,1}}
 CuGRU{T} = Flux.GRUCell{<:CuArray{T,2},<:CuArray{T,1}}
src/layers/basic.jl
@@ -44,19 +44,23 @@ end
 # it might be replaced in the future for better performance
 # see issue https://github.com/FluxML/Flux.jl/issues/702
 # Johnny Chen -- @johnnychen94
+# only slightly changed to better handle interaction with Zygote @dsweber2
 """
     activations(c::Chain, input)

 Calculate the forward results of each layer in Chain `c` with `input` as model input.
 """
 function activations(c::Chain, input)
-  rst = []
-  for l in c
-    x = get(rst, length(rst), input)
-    push!(rst, l(x))
-  end
-  return rst
+  extraChain(c.layers, input)
 end

+function extraChain(fs::Tuple, x)
+  res = first(fs)(x)
+  return (res, extraChain(Base.tail(fs), res)...)
+end
+
+extraChain(::Tuple{}, x) = ()
+
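Usage sketch: `activations` now returns a tuple of per-layer outputs built by the recursive `extraChain`, rather than an untyped vector:

```julia
using Flux
using Flux: activations

c = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
x = rand(10)

acts = activations(c, x)  # (layer1(x), layer2(layer1(x)), softmax(...))
length(acts)              # 3, one entry per layer
acts[end] == c(x)         # the last activation is the chain's output
```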
 """
     Dense(in::Integer, out::Integer, σ = identity)
src/layers/conv.jl
@@ -118,6 +118,9 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
   )
 end

+# TODO: Find proper fix for https://github.com/FluxML/Flux.jl/issues/900
+@nograd conv_transpose_dims
+
 function (c::ConvTranspose)(x::AbstractArray)
   # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
   σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
@ -45,8 +45,7 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")")
|
|||
"""
|
||||
reset!(rnn)
|
||||
|
||||
Reset the hidden state of a recurrent layer back to its original value. See also
|
||||
`truncate!`.
|
||||
Reset the hidden state of a recurrent layer back to its original value.
|
||||
|
||||
Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to
|
||||
|
||||
|
|
|
src/layers/stateless.jl
@@ -1,3 +1,4 @@
+using CuArrays
 using NNlib: logsoftmax, logσ

 # Cost functions

@@ -35,6 +36,9 @@ Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerical stability.
 """
 binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)

+# Re-definition to fix interaction with CuArrays.
+CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
+
 """
     logitbinarycrossentropy(logŷ, y)

@@ -49,6 +53,9 @@ but it is more numerically stable.
 """
 logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ)

+# Re-definition to fix interaction with CuArrays.
+CuArrays.@cufunc logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ)
+
 """
     normalise(x::AbstractArray; dims=1)

@@ -77,3 +84,29 @@ function normalise(x::AbstractArray; dims=1)
   σ′ = std(x, dims = dims, mean = μ′, corrected=false)
   return (x .- μ′) ./ σ′
 end
+
+"""
+    kldivergence(ŷ, y)
+
+KL divergence is a measure of how much one probability distribution differs from another.
+It is always non-negative, and zero only when both distributions are equal everywhere.
+[KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).
+"""
+function kldivergence(ŷ, y)
+  entropy = sum(y .* log.(y)) * 1 // size(y, 2)
+  cross_entropy = crossentropy(ŷ, y)
+  return entropy + cross_entropy
+end
+
+"""
+    poisson(ŷ, y)
+
+The Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution.
+[Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
+"""
+poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) * 1 // size(y, 2)
+
+"""
+    hinge(ŷ, y)
+
+Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1).
+[Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss).
+"""
+hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) * 1 // size(y, 2)
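A quick sketch of the three new losses on toy data (values chosen for illustration; shapes follow the `1 // size(y, 2)` convention above, i.e. observations along the second dimension):

```julia
using Flux

ŷ = [0.3 0.7]           # predicted distribution (1×2 row)
y = [0.5 0.5]           # target distribution
Flux.kldivergence(ŷ, y) # ≥ 0; ≈ 0 when ŷ == y

Flux.poisson(ŷ, y)      # sum(ŷ .- y .* log.(ŷ)) / 2

Flux.hinge([0.9 -0.8], [1 -1]) # small when predictions match the ±1 labels
```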
src/optimise/optimisers.jl
@@ -283,7 +283,7 @@ ADAGrad(η = 0.1) = ADAGrad(η, IdDict())

 function apply!(o::ADAGrad, x, Δ)
   η = o.eta
-  acc = get!(o.acc, x, fill(ϵ, size(x)))::typeof(x)
+  acc = get!(o.acc, x, fill!(zero(x), ϵ))::typeof(x)
   @. acc += Δ^2
   @. Δ *= η / (√acc + ϵ)
 end

@@ -349,10 +349,10 @@ AMSGrad(η = 0.001, β = (0.9, 0.999)) = AMSGrad(η, β, IdDict())

 function apply!(o::AMSGrad, x, Δ)
   η, β = o.eta, o.beta
-  mt, vt, v̂t = get!(o.state, x, (fill(ϵ, size(x)), fill(ϵ, size(x)), fill(ϵ, size(x))))
+  mt, vt, v̂t = get!(o.state, x, (fill!(zero(x), ϵ), fill!(zero(x), ϵ), fill!(zero(x), ϵ)))
   @. mt = β[1] * mt + (1 - β[1]) * Δ
   @. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2
-  @. v̂t = max.(v̂t, vt)
+  @. v̂t = max(v̂t, vt)
   @. Δ = η * mt / (√v̂t + ϵ)
 end
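The `fill(ϵ, size(x))` → `fill!(zero(x), ϵ)` change matters on the GPU: `fill` always allocates a CPU `Array`, while `zero(x)` preserves the array type of `x`, so the `::typeof(x)` assertion keeps holding for `CuArray` parameters. A CPU-side sketch:

```julia
x = rand(Float32, 3)        # stand-in for a parameter array (a CuArray on GPU)
acc = fill!(zero(x), 1e-8)  # same type and shape as x, every element set to ϵ
typeof(acc) == typeof(x)    # true, which is what apply! asserts
```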
@@ -444,7 +444,8 @@ end
 """
     InvDecay(γ)

-Applies inverse time decay to an optimiser
+Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified.
+

 ## Parameters
   - gamma (γ): Defaults to `0.001`

@@ -472,7 +473,7 @@ end
 """
     ExpDecay(eta, decay, decay_step, clip)

-Discount the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`.
+Discount the learning rate `eta` by a multiplicative factor `decay` every `decay_step` steps, till a minimum of `clip`.

 ## Parameters
   - Learning Rate (eta): Defaults to `0.001`.
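As a usage sketch, the decay wrappers compose with a base optimiser through `Flux.Optimise.Optimiser` (the values here are the documented defaults, chosen for illustration):

```julia
using Flux
using Flux.Optimise: Optimiser

# Inverse time decay: effective step at iteration n is eta / (1 + 0.001n).
opt_inv = Optimiser(InvDecay(0.001), Descent(0.1))

# Exponential decay: multiply the step by 0.1 every 1000 steps, floored at 1e-4.
opt_exp = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), ADAM())
```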
src/utils.jl
@@ -1,6 +1,11 @@
 # Arrays
-glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0/sum(dims))
-glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0/sum(dims))
+nfan() = 1, 1 # fan_in, fan_out
+nfan(n) = 1, n # A vector is treated as a n×1 matrix
+nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices
+nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) # In case of convolution kernels
+
+glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...)))
+glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...)))

 ones(T::Type, dims...) = Base.ones(T, dims...)
 zeros(T::Type, dims...) = Base.zeros(T, dims...)
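A sketch of the fan computation for a convolution kernel, matching the last `nfan` method above (dims are `(width, height, in, out)`):

```julia
dims = (3, 3, 16, 32)  # 3×3 kernel, 16 input channels, 32 output channels
fan_in, fan_out = prod(dims[1:end-2]) .* (dims[end-1], dims[end])  # (144, 288)

# glorot_uniform scales a uniform(-0.5, 0.5) draw by sqrt(24 / (fan_in + fan_out)),
# so the kernel ends up with variance ≈ 2 / (fan_in + fan_out).
```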
@@ -98,6 +103,48 @@ function batchseq(xs, pad = nothing, n = maximum(length(x) for x in xs))
   [batch([xs_[j][i] for j = 1:length(xs_)]) for i = 1:n]
 end

+# Flattening models to weight vectors, and back
+
+function _restructure(m, xs)
+  i = 0
+  fmap(m) do x
+    x isa AbstractArray || return x
+    x = reshape(xs[i.+(1:length(x))], size(x))
+    i += length(x)
+    return x
+  end
+end
+
+"""
+    destructure(m)
+
+Flatten a model's parameters into a single weight vector.
+
+    julia> m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
+    Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
+
+    julia> θ, re = destructure(m);
+
+    julia> θ
+    67-element Array{Float32,1}:
+    -0.1407104
+    ...
+
+The second return value `re` allows you to reconstruct the original network after making
+modifications to the weight vector (for example, with a hypernetwork).
+
+    julia> re(θ .* 2)
+    Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
+"""
+function destructure(m)
+  xs = Zygote.Buffer([])
+  fmap(m) do x
+    x isa AbstractArray && push!(xs, x)
+    return x
+  end
+  return vcat(vec.(copy(xs))...), p -> _restructure(m, p)
+end
+
 # Other

 """
test/cuda/cuda.jl
@@ -25,12 +25,17 @@ cm = gpu(m)
 @test all(p isa CuArray for p in params(cm))
 @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2}

-x = [1,2,3]
+x = [1.,2.,3.]
 cx = gpu(x)
 @test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx)
+@test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0)
+@test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0]))
+
+x = [-1.1491, 0.8619, 0.3127]
+y = [1, 1, 0.]
+@test Flux.binarycrossentropy.(σ.(x),y) ≈ Array(Flux.binarycrossentropy.(cu(σ.(x)),cu(y)))
+@test Flux.logitbinarycrossentropy.(x,y) ≈ Array(Flux.logitbinarycrossentropy.(cu(x),cu(y)))

 xs = rand(5, 5)
 ys = Flux.onehotbatch(1:5,1:5)
 @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys)
test/layers/basic.jl
@@ -4,11 +4,13 @@ import Flux: activations

 @testset "basic" begin
   @testset "helpers" begin
     @testset "activations" begin
-      dummy_model = Chain(Dense(10,5,σ),Dense(5,2),softmax)
-      x = rand(10)
-      @test activations(Chain(), x) == []
-      @test activations(dummy_model, x)[1] == dummy_model[1](x)
-      @test activations(dummy_model, x)[2] == x |> dummy_model[1] |> dummy_model[2]
+      dummy_model = Chain(x->x.^2, x->x .- 3, x -> tan.(x))
+      x = randn(10)
+      @test activations(dummy_model, x)[1] == x.^2
+      @test activations(dummy_model, x)[2] == (x.^2 .- 3)
+      @test activations(dummy_model, x)[3] == tan.(x.^2 .- 3)
+
+      @test activations(Chain(), x) == ()
+      @test activations(Chain(identity, x->:foo), x)[2] == :foo # results include `Any` type
     end
   end

@@ -19,6 +21,12 @@ import Flux: activations
   # numeric test should be put into testset of corresponding layer
   end

+  @testset "Activations" begin
+    c = Chain(Dense(3,5,relu), Dense(5,1,relu))
+    X = Float32.([1.0; 1.0; 1.0])
+    @test_nowarn gradient(()->Flux.activations(c, X)[2][1], params(c))
+  end
+
   @testset "Dense" begin
     @test length(Dense(10, 5)(randn(10))) == 5
     @test_throws DimensionMismatch Dense(10, 5)(randn(1))
test/layers/conv.jl
@@ -1,5 +1,6 @@
 using Flux, Test
 using Flux: maxpool, meanpool
+using Flux: gradient

 @testset "Pooling" begin
   x = randn(Float32, 10, 10, 3, 2)

@@ -54,6 +55,10 @@ end
   y = Conv((3,3), 1 => 1)(x)
   x_hat = ConvTranspose((3, 3), 1 => 1)(y)
   @test size(x_hat) == size(x)
+
+  m = ConvTranspose((3,3), 1=>1)
+  # Test that the gradient call does not throw: #900
+  @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads
 end

 @testset "CrossCor" begin

@@ -61,7 +66,7 @@ end
   w = rand(2,2,1,1)
   y = CrossCor(w, [0.0])

-  @test sum(w .* x[1:2, 1:2, :, :]) == y(x)[1, 1, 1, 1]
+  @test isapprox(sum(w .* x[1:2, 1:2, :, :]), y(x)[1, 1, 1, 1], rtol=1e-7)

   r = zeros(Float32, 28, 28, 1, 5)
   m = Chain(

@@ -84,17 +89,17 @@ end
   l = Conv((3,3), 1=>1)
   expected = zeros(eltype(l.weight),5,5,1,1)
   expected[2:end-1,2:end-1,1,1] = l.weight
-  @test expected == l(data)
+  @test expected ≈ l(data)

   l = Conv((3,1), 1=>1)
   expected = zeros(eltype(l.weight),5,7,1,1)
   expected[2:end-1,4,1,1] = l.weight
-  @test expected == l(data)
+  @test expected ≈ l(data)

   l = Conv((1,3), 1=>1)
   expected = zeros(eltype(l.weight),7,5,1,1)
   expected[4,2:end-1,1,1] = l.weight
-  @test expected == l(data)
+  @test expected ≈ l(data)

   @test begin
     # we test that the next expression does not throw
test/layers/normalisation.jl
@@ -191,6 +191,7 @@ end

 end

+if VERSION >= v"1.1"
 @testset "GroupNorm" begin
   # begin tests
   squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions

@@ -289,5 +290,5 @@ end
     x = Float32.(reshape(collect(1:prod(sizes)), sizes))
     @test BN(x) ≈ GN(x)
   end
-
+end
 end
test/layers/stateless.jl
@@ -49,12 +49,33 @@ const ϵ = 1e-7
   @testset "logitbinarycrossentropy" begin
     @test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0)
   end

+  y = [1 2 3]
+  y1 = [4.0 5.0 6.0]
+  @testset "kldivergence" begin
+    @test Flux.kldivergence(y, y1) ≈ 4.761838062403337
+    @test Flux.kldivergence(y, y) ≈ 0
+  end
+
+  y = [1 2 3 4]
+  y1 = [5.0 6.0 7.0 8.0]
+  @testset "hinge" begin
+    @test Flux.hinge(y, y1) ≈ 0
+    @test Flux.hinge(y, 0.5 .* y) ≈ 0.125
+  end
+
+  y = [0.1 0.2 0.3]
+  y1 = [0.4 0.5 0.6]
+  @testset "poisson" begin
+    @test Flux.poisson(y, y1) ≈ 1.0160455586700767
+    @test Flux.poisson(y, y) ≈ 0.5044459776946685
+  end
+
   @testset "no spurious promotions" begin
     for T in (Float32, Float64)
       y = rand(T, 2)
       ŷ = rand(T, 2)
-      for f in (mse, crossentropy, logitcrossentropy)
+      for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson)
         fwd, back = Flux.pullback(f, ŷ, y)
         @test fwd isa T
         @test eltype(back(one(T))[1]) == T
test/utils.jl
@@ -1,6 +1,6 @@
 using Flux
-using Flux: throttle, glorot_uniform, glorot_normal, stack, unstack
-using StatsBase: std
+using Flux: throttle, nfan, glorot_uniform, glorot_normal, stack, unstack
+using StatsBase: var
 using Random
 using Test

@@ -56,18 +56,26 @@ end
   # Set random seed so that these tests don't fail randomly
   Random.seed!(0)

-  # glorot_uniform should yield a kernel with stddev ~= sqrt(6/(n_in + n_out)),
-  # and glorot_normal should yield a kernel with stddev ~= sqrt(2/(n_in + n_out))
-  for (n_in, n_out) in [(100, 100), (100, 400)]
-    v = glorot_uniform(n_in, n_out)
-    @test minimum(v) > -1.1*sqrt(6/(n_in + n_out))
-    @test minimum(v) < -0.9*sqrt(6/(n_in + n_out))
-    @test maximum(v) > 0.9*sqrt(6/(n_in + n_out))
-    @test maximum(v) < 1.1*sqrt(6/(n_in + n_out))
+  @testset "Fan in/out" begin
+    @test nfan() == (1, 1) # For a constant
+    @test nfan(100) == (1, 100) # For a vector
+    @test nfan(100, 200) == (200, 100) # For a Dense layer
+    @test nfan(2, 30, 40) == (2 * 30, 2 * 40) # For a 1D Conv layer
+    @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) # For a 2D Conv layer
+    @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) # For a 3D Conv layer
+  end

-    v = glorot_normal(n_in, n_out)
-    @test std(v) > 0.9*sqrt(2/(n_in + n_out))
-    @test std(v) < 1.1*sqrt(2/(n_in + n_out))
+  @testset "glorot" begin
+    # glorot_uniform and glorot_normal should both yield a kernel with
+    # variance ≈ 2/(fan_in + fan_out)
+    for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)]
+      for init ∈ [glorot_uniform, glorot_normal]
+        v = init(dims...)
+        fan_in, fan_out = nfan(dims...)
+        σ2 = 2 / (fan_in + fan_out)
+        @test 0.9σ2 < var(v) < 1.1σ2
+      end
+    end
   end
 end