diff --git a/.gitignore b/.gitignore index e2cb9ecd..eb18605c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,4 @@ *.jl.mem docs/build/ docs/site/ -docs/flux.css deps -Manifest.toml diff --git a/.travis.yml b/.travis.yml index b26597e9..edc8dca9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,19 +1,25 @@ # Documentation: http://docs.travis-ci.com/user/languages/julia/ language: julia + os: - linux # - osx + julia: - - 0.7 - 1.0 - nightly -# uncomment the following lines to override the default test script -# script: -# - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi -# - julia -e 'Pkg.clone(pwd()); Pkg.build("Flux"); Pkg.test("Flux"; coverage=true)' + matrix: allow_failures: - julia: nightly -after_success: - - julia -e 'using Pkg; Pkg.add("Documenter"); Pkg.add("NNlib")' - - julia -e 'using Pkg; cd(Pkg.dir("Flux")); include(joinpath("docs", "make.jl"))' + +jobs: + include: + - stage: "Documentation" + julia: 1.0 + os: linux + script: + - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); + Pkg.instantiate()' + - julia --project=docs/ docs/make.jl + after_success: skip diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 00000000..ebf4c577 --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,272 @@ +# This file is machine-generated - editing it directly is not advised + +[[AbstractTrees]] +deps = ["Markdown", "Test"] +git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b" +uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +version = "0.2.1" + +[[Adapt]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "53d8fec4f662088c1202530e338a11a919407f3b" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "0.4.2" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinDeps]] +deps = ["Compat", "Libdl", "SHA", "URIParser"] +git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9" +uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee" +version = "0.8.10" + +[[BinaryProvider]] +deps = ["Libdl", "Pkg", "SHA", "Test"] +git-tree-sha1 = "055eb2690182ebc31087859c3dd8598371d3ef9e" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.3" + +[[CodecZlib]] +deps = ["BinaryProvider", "Libdl", "Test", "TranscodingStreams"] +git-tree-sha1 = "e3df104c84dfc108f0ca203fd7f5bbdc98641ae9" +uuid = "944b1d66-785c-5afd-91f1-9de20f533193" +version = "0.5.1" + +[[ColorTypes]] +deps = ["FixedPointNumbers", "Random", "Test"] +git-tree-sha1 = "f73b0e10f2a5756de7019818a41654686da06b09" +uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" +version = "0.7.5" + +[[Colors]] +deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport", "Test"] +git-tree-sha1 = "9f0a0210450acb91c730b730a994f8eef1d3d543" +uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" +version = "0.9.5" + +[[CommonSubexpressions]] +deps = ["Test"] +git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" +uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" +version = "0.2.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "ec61a16eed883ad0cfa002d7489b3ce6d039bb9a" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "1.4.0" + +[[DataStructures]] +deps = ["InteractiveUtils", "OrderedCollections", "Random", "Serialization", "Test"] +git-tree-sha1 = "ca971f03e146cf144a9e2f2ce59674f5bf0e8038" +uuid = 
"864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.15.0" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[DiffResults]] +deps = ["Compat", "StaticArrays"] +git-tree-sha1 = "db8acf46717b13d6c48deb7a12007c7f85a70cf7" +uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" +version = "0.0.3" + +[[DiffRules]] +deps = ["Random", "Test"] +git-tree-sha1 = "c49ec69428ffea0c1d1bbdc63d1a70f5df5860ad" +uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" +version = "0.0.7" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[FixedPointNumbers]] +deps = ["Test"] +git-tree-sha1 = "b8045033701c3b10bf2324d7203404be7aef88ba" +uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" +version = "0.5.3" + +[[ForwardDiff]] +deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"] +git-tree-sha1 = "e393bd3b9102659fb24fe88caedec41f2bc2e7de" +uuid = "f6369f11-7733-5829-9624-2563aa707210" +version = "0.10.2" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[Juno]] +deps = ["Base64", "Logging", "Media", "Profile", "Test"] +git-tree-sha1 = "ce6246e19061e36cbdce954caaae717498daeed8" +uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" +version = "0.5.4" + +[[LibGit2]] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Compat"] +git-tree-sha1 = "c443e1c8d58a4e9f61b708ad0a88286c7042145b" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.4.4" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[Media]] +deps = ["MacroTools", "Test"] +git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58" +uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27" +version = "0.5.0" + +[[Missings]] +deps = ["Dates", "InteractiveUtils", "SparseArrays", "Test"] +git-tree-sha1 = "d1d2585677f2bd93a97cfeb8faa7a0de0f982042" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "0.4.0" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[NNlib]] +deps = ["Libdl", "LinearAlgebra", "MacroTools", "Requires", "Test"] +git-tree-sha1 = "51330bb45927379007e089997bf548fbe232589d" +uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +version = "0.4.3" + +[[NaNMath]] +deps = ["Compat"] +git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2" +uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" +version = "0.3.2" + +[[OrderedCollections]] +deps = ["Random", "Serialization", "Test"] +git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.0.2" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[Profile]] +deps = ["Printf"] +uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Reexport]] +deps = ["Pkg"] +git-tree-sha1 = 
"7b1d07f411bc8ddb7977ec7f377b97b158514fe0" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "0.2.0" + +[[Requires]] +deps = ["Test"] +git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "0.5.2" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SortingAlgorithms]] +deps = ["DataStructures", "Random", "Test"] +git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "0.3.1" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["BinDeps", "BinaryProvider", "Libdl", "Test"] +git-tree-sha1 = "0b45dc2e45ed77f445617b99ff2adf0f5b0f23ea" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "0.7.2" + +[[StaticArrays]] +deps = ["InteractiveUtils", "LinearAlgebra", "Random", "Statistics", "Test"] +git-tree-sha1 = "1eb114d6e23a817cd3e99abc3226190876d7c898" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "0.10.2" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[StatsBase]] +deps = ["DataStructures", "DelimitedFiles", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "Test"] +git-tree-sha1 = "7b596062316c7d846b67bf625d5963a832528598" +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.27.0" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TranscodingStreams]] +deps = ["Pkg", "Random", "Test"] +git-tree-sha1 = "a34a2d588e2d2825602bf14a24216d5c8b0921ec" +uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +version = "0.8.1" + +[[URIParser]] +deps = ["Test", "Unicode"] +git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69" +uuid = "30578b45-9adc-5946-b283-645ec420af67" +version = "0.4.0" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[ZipFile]] +deps = ["BinaryProvider", "Libdl", "Printf", "Test"] +git-tree-sha1 = "4000c633efe994b2e10b31b6d91382c4b7412dac" +uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" +version = "0.8.0" diff --git a/Project.toml b/Project.toml new file mode 100644 index 00000000..f1545010 --- /dev/null +++ b/Project.toml @@ -0,0 +1,25 @@ +name = "Flux" +uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" + +[deps] +AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" +Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" +DiffRules = "b552c78f-8df3-52c6-915a-8e097449b14b" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +NaNMath = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +Requires = 
"ae029012-a4dd-5104-9daa-d747884805df" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" diff --git a/REQUIRE b/REQUIRE index ad3306d6..edfe56bb 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,9 +1,9 @@ -julia 0.7 +julia 1.0 Juno MacroTools 0.3.3 NNlib Requires -Adapt +Adapt 0.4 CodecZlib Colors ZipFile diff --git a/docs/Manifest.toml b/docs/Manifest.toml new file mode 100644 index 00000000..0bb294e1 --- /dev/null +++ b/docs/Manifest.toml @@ -0,0 +1,288 @@ +[[AbstractTrees]] +deps = ["Markdown", "Test"] +git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b" +uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +version = "0.2.1" + +[[Adapt]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "04d15700419b6949d76be1428ab6e0277ff43b06" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "0.4.1" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinDeps]] +deps = ["Compat", "Libdl", "SHA", "URIParser"] +git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9" +uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee" +version = "0.8.10" + +[[BinaryProvider]] +deps = ["Libdl", "Pkg", "SHA", "Test"] +git-tree-sha1 = "055eb2690182ebc31087859c3dd8598371d3ef9e" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.3" + +[[CodecZlib]] +deps = ["BinaryProvider", "Libdl", "Test", "TranscodingStreams"] +git-tree-sha1 = "e3df104c84dfc108f0ca203fd7f5bbdc98641ae9" +uuid = "944b1d66-785c-5afd-91f1-9de20f533193" +version = "0.5.1" + +[[ColorTypes]] +deps = ["FixedPointNumbers", "Random", "Test"] +git-tree-sha1 = "f73b0e10f2a5756de7019818a41654686da06b09" +uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" +version = "0.7.5" + +[[Colors]] +deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport", "Test"] +git-tree-sha1 = "9f0a0210450acb91c730b730a994f8eef1d3d543" +uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" +version = "0.9.5" + +[[CommonSubexpressions]] +deps = ["Test"] +git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" +uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" +version = "0.2.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "ec61a16eed883ad0cfa002d7489b3ce6d039bb9a" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "1.4.0" + +[[DataStructures]] +deps = ["InteractiveUtils", "OrderedCollections", "Random", "Serialization", "Test"] +git-tree-sha1 = "ca971f03e146cf144a9e2f2ce59674f5bf0e8038" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.15.0" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[DiffResults]] +deps = ["Compat", "StaticArrays"] +git-tree-sha1 = "db8acf46717b13d6c48deb7a12007c7f85a70cf7" +uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" +version = "0.0.3" + +[[DiffRules]] +deps = ["Random", "Test"] +git-tree-sha1 = "c49ec69428ffea0c1d1bbdc63d1a70f5df5860ad" +uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" +version = "0.0.7" + +[[Distributed]] +deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + 
+[[DocStringExtensions]] +deps = ["LibGit2", "Markdown", "Pkg", "Test"] +git-tree-sha1 = "1df01539a1c952cef21f2d2d1c092c2bcf0177d7" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.6.0" + +[[Documenter]] +deps = ["Base64", "DocStringExtensions", "InteractiveUtils", "LibGit2", "Logging", "Markdown", "Pkg", "REPL", "Random", "Test", "Unicode"] +git-tree-sha1 = "a6db1c69925cdc53aafb38caec4446be26e0c617" +uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +version = "0.21.0" + +[[FixedPointNumbers]] +deps = ["Test"] +git-tree-sha1 = "b8045033701c3b10bf2324d7203404be7aef88ba" +uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" +version = "0.5.3" + +[[Flux]] +deps = ["AbstractTrees", "Adapt", "CodecZlib", "Colors", "DiffRules", "ForwardDiff", "Juno", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Printf", "Random", "Reexport", "Requires", "SpecialFunctions", "Statistics", "StatsBase", "Test", "ZipFile"] +path = ".." +uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" +version = "0.6.10+" + +[[ForwardDiff]] +deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"] +git-tree-sha1 = "b91250044374764e7c29af59a774c4b8d6100b6e" +uuid = "f6369f11-7733-5829-9624-2563aa707210" +version = "0.10.1" + +[[InteractiveUtils]] +deps = ["LinearAlgebra", "Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[Juno]] +deps = ["Base64", "Logging", "Media", "Profile", "Test"] +git-tree-sha1 = "3c29a199713e7ec62cfdc11f44d7760219d5f658" +uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" +version = "0.5.3" + +[[LibGit2]] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Compat"] +git-tree-sha1 = "c443e1c8d58a4e9f61b708ad0a88286c7042145b" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.4.4" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[Media]] +deps = ["MacroTools", "Test"] +git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58" +uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27" +version = "0.5.0" + +[[Missings]] +deps = ["Dates", "InteractiveUtils", "SparseArrays", "Test"] +git-tree-sha1 = "adc26d2ee85a49c413464110d922cf21efc9d233" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "0.3.1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[NNlib]] +deps = ["Libdl", "LinearAlgebra", "MacroTools", "Requires", "Test"] +git-tree-sha1 = "51330bb45927379007e089997bf548fbe232589d" +uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +version = "0.4.3" + +[[NaNMath]] +deps = ["Compat"] +git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2" +uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" +version = "0.3.2" + +[[OrderedCollections]] +deps = ["Random", "Serialization", "Test"] +git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.0.2" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[Profile]] +deps = ["Printf"] +uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = 
"3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Reexport]] +deps = ["Pkg"] +git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "0.2.0" + +[[Requires]] +deps = ["Test"] +git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "0.5.2" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SortingAlgorithms]] +deps = ["DataStructures", "Random", "Test"] +git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "0.3.1" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["BinDeps", "BinaryProvider", "Libdl", "Test"] +git-tree-sha1 = "0b45dc2e45ed77f445617b99ff2adf0f5b0f23ea" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "0.7.2" + +[[StaticArrays]] +deps = ["InteractiveUtils", "LinearAlgebra", "Random", "Statistics", "Test"] +git-tree-sha1 = "1eb114d6e23a817cd3e99abc3226190876d7c898" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "0.10.2" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[StatsBase]] +deps = ["DataStructures", "DelimitedFiles", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "Test"] +git-tree-sha1 = "7b596062316c7d846b67bf625d5963a832528598" +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.27.0" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TranscodingStreams]] +deps = ["Pkg", "Random", "Test"] +git-tree-sha1 = "a34a2d588e2d2825602bf14a24216d5c8b0921ec" +uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +version = "0.8.1" + +[[URIParser]] +deps = ["Test", "Unicode"] +git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69" +uuid = "30578b45-9adc-5946-b283-645ec420af67" +version = "0.4.0" + +[[UUIDs]] +deps = ["Random"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[ZipFile]] +deps = ["BinaryProvider", "Libdl", "Printf", "Test"] +git-tree-sha1 = "4000c633efe994b2e10b31b6d91382c4b7412dac" +uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" +version = "0.8.0" diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 00000000..c882d475 --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,4 @@ +[deps] +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" diff --git a/docs/make.jl b/docs/make.jl index b35beb3c..eb0b7470 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,10 +2,11 @@ using Documenter, Flux, NNlib makedocs(modules=[Flux, NNlib], doctest = false, - format = :html, analytics = "UA-36890222-9", sitename = "Flux", - assets = ["../flux.css"], + # Uncomment below for local build + #format = Documenter.HTML(prettyurls = false), + assets = ["assets/flux.css"], pages = ["Home" => "index.md", "Building Models" => ["Basics" => "models/basics.md", @@ -22,10 +23,4 @@ 
makedocs(modules=[Flux, NNlib], ["Backpropagation" => "internals/tracker.md"], "Community" => "community.md"]) -deploydocs( - repo = "github.com/FluxML/Flux.jl.git", - target = "build", - osname = "linux", - julia = "1.0", - deps = nothing, - make = nothing) +deploydocs(repo = "github.com/FluxML/Flux.jl.git") diff --git a/docs/src/assets/flux.css b/docs/src/assets/flux.css new file mode 100644 index 00000000..541ead5f --- /dev/null +++ b/docs/src/assets/flux.css @@ -0,0 +1,113 @@ +@import url('https://fonts.googleapis.com/css?family=Lato:400,400i'); + +body { + font-family: Lato, "Segoe UI",Roboto,"Helvetica Neue",Arial,sans-serif; +} + +nav.toc { + padding-top: 0; + background: rgb(240, 240, 240); + line-height: 2em; + cursor: default; + user-select: none; +} + +h1+h2 { + margin-top: 0; +} + +/* Green banner in ToC */ +nav.toc > h1 { + margin-top: 0; + padding-top: 0.4em; + padding-bottom: 0.5em; + border-bottom: 5px solid white; + box-shadow: 0px -2px 5px rgb(60,60,60); + margin-bottom: 0.5em; + background: rgb(60, 150, 60); + + font-style: italic; + font-weight: normal; + font-size: 50pt; + text-transform: lowercase; + text-shadow: 2px 2px 5px rgba(0,0,0,0.2); + color: white; +} + +/* Reduce ToC font size */ +.toctext { + font-size: 10pt; +} + +/* Fade out non-clickable ToC headers */ +nav.toc ul span.toctext { + color: rgb(180, 180, 180); +} + +nav.toc ul .toctext { + color: rgb(100, 100, 100); +} + +nav.toc ul a.toctext:hover { + color: inherit; + background: rgb(220, 220, 220); + cursor: default; +} + +nav.toc li.current > .toctext { + background: linear-gradient(90deg, rgb(245,245,245) 0%, white 90%); + font-weight: normal; +} + +nav.toc ul.internal li.toplevel { + font-weight: normal; +} + +/* Content */ + +article { max-width: none; } + +article > p, article > ul { + max-width: 45em; +} + +/* Links */ +a, a:visited { color: rgb(0, 120, 0); } +article p a { border-bottom: 1px solid rgb(200, 230, 200); } +a:hover, a:visited:hover { color: rgb(0, 80, 0); } + +/* Article Links */ +article p a { border-bottom: 1px solid rgb(200, 230, 200); } +article p a:hover, article a:visited:hover { color: rgb(0, 120, 0); } +article p a:hover { border-bottom: 1px solid rgb(150, 200, 150); } + +/* Doctstrings */ +article section.docstring { + padding: 0.5em 0; + border-left: none; + border-right: none; + border-bottom: none; +} + +/* Code */ + +article pre, article p > code { + background: rgb(245, 250, 245); +} + +article pre { + border: none; + max-width: none; + padding: 1em; + border-radius: 10px 0px 0px 10px; + margin-left: -1em; + margin-right: -2em; +} + +.hljs-comment { + font-style: italic; +} + +.hljs-number { + color: rgb(0, 150, 150); +} diff --git a/docs/src/gpu.md b/docs/src/gpu.md index 6be2d7b0..17a7ca5c 100644 --- a/docs/src/gpu.md +++ b/docs/src/gpu.md @@ -4,7 +4,7 @@ Support for array operations on other hardware backends, like GPUs, is provided For example, we can use `CuArrays` (with the `cu` converter) to run our [basic example](models/basics.md) on an NVIDIA GPU. -(Note that you need to build Julia 0.6 from source and have CUDA available to use CuArrays – please see the [CUDAnative.jl](https://github.com/JuliaGPU/CUDAnative.jl) instructions for more details.) +(Note that you need to have CUDA available to use CuArrays – please see the [CuArrays.jl](https://github.com/JuliaGPU/CuArrays.jl) instructions for more details.) 
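+Flux's CUDA backend (see the `version_check()` added in `src/cuda/cuda.jl`) expects the CuArrays 0.9 series; as a minimal, optional sketch (assuming the registered `CuArrays` package and the stock Pkg API), a compatible version can be installed and pinned before running the example below:
+
+```julia
+# Optional setup sketch: install CuArrays and pin it to the 0.9 series that
+# version_check() expects; `] pin CuArrays@0.9` at the Pkg REPL is the
+# one-liner suggested by the warning message.
+using Pkg
+Pkg.add("CuArrays")
+Pkg.pin(PackageSpec(name = "CuArrays", version = v"0.9.0"))  # assumes a 0.9.0 release is registered
+```
+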
```julia using CuArrays diff --git a/docs/src/internals/tracker.md b/docs/src/internals/tracker.md index 3d39451d..456a9129 100644 --- a/docs/src/internals/tracker.md +++ b/docs/src/internals/tracker.md @@ -100,16 +100,16 @@ minus(a, b) = a - b Firstly, we must tell the tracker system to stop when it sees a call to `minus`, and record it. We can do this using dispatch: ```julia -using Flux.Tracker: TrackedReal, track, @grad +using Flux.Tracker: TrackedArray, track, @grad -minus(a::TrackedArray, b::TrackedArray) = Tracker.track(minus, a, b) +minus(a::TrackedArray, b::TrackedArray) = track(minus, a, b) ``` `track` takes care of building a new `Tracked` object and recording the operation on the tape. We just need to provide a gradient definition. ```julia @grad function minus(a, b) - return minus(data(a),data(b)), Δ -> (Δ, -Δ) + return minus(data(a), data(b)), Δ -> (Δ, -Δ) end ``` @@ -121,6 +121,19 @@ Note that in the backpropagator we don't call `data(a)`; we *do* in fact want to @grad a * b = data(a)*data(b), Δ -> (Δ*b, a*Δ) ``` +We can then calculate the first derivative of `minus` as follows: + +```julia +a = param([1,2,3]) +b = param([3,2,1]) + +c = minus(a, b) # [-2.0 (tracked), 0.0 (tracked), 2.0 (tracked)] + +Tracker.back!(c, 1) +Tracker.grad(a) # [1.00, 1.00, 1.00] +Tracker.grad(b) # [-1.00, -1.00, -1.00] +``` + For multi-argument functions with custom gradients, you likely want to catch not just `minus(::TrackedArray, ::TrackedArray)` but also `minus(::Array, TrackedArray)` and so on. To do so, just define those extra signatures as needed: ```julia diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index a0a39ab5..606dac1c 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -28,7 +28,7 @@ When a function has many parameters, we can pass them all in explicitly: f(W, b, x) = W * x + b Tracker.gradient(f, 2, 3, 4) -(4.0 (tracked), 1.0, 2.0 (tracked)) +(4.0 (tracked), 1.0 (tracked), 2.0 (tracked)) ``` But machine learning models can have *hundreds* of parameters! Flux offers a nice way to handle this. We can tell Flux to treat something as a parameter via `param`. Then we can collect these together and tell `gradient` to collect the gradients of all of them at once. @@ -102,6 +102,8 @@ All deep learning in Flux, however complex, is a simple generalisation of this e It's common to create more complex models than the linear regression above. For example, we might want to have two linear layers with a nonlinearity like [sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function) (`σ`) in between them. In the above style we could write this as: ```julia +using Flux + W1 = param(rand(3, 5)) b1 = param(rand(3)) layer1(x) = W1 * x .+ b1 diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 4bbb2ba0..47d9dc35 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -10,6 +10,12 @@ MaxPool MeanPool ``` +## Additional Convolution Layers + +```@docs +DepthwiseConv +``` + ## Recurrent Layers Much like the core layers above, but can be used to process sequence data (as well as other kinds of structured data). diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 968622be..e9b02865 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -23,44 +23,30 @@ We want to update each parameter, using the gradient, in order to improve (reduc ```julia using Flux.Tracker: grad, update! 
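+# Note (added for illustration, not part of the original snippet): `W` and `b`
+# are the `param`s defined earlier in this page, and `grads` is assumed to be
+# the result of an earlier gradient call such as
+#   grads = Tracker.gradient(() -> loss(x, y), Params([W, b]))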
-function sgd() - η = 0.1 # Learning Rate - for p in (W, b) - update!(p, -η * grads[p]) - end +η = 0.1 # Learning Rate +for p in (W, b) + update!(p, -η * grads[p]) end ``` -If we call `sgd`, the parameters `W` and `b` will change and our loss should go down. - -There are two pieces here: one is that we need a list of trainable parameters for the model (`[W, b]` in this case), and the other is the update step. In this case the update is simply gradient descent (`x .-= η .* Δ`), but we might choose to do something more advanced, like adding momentum. - -In this case, getting the variables is trivial, but you can imagine it'd be more of a pain with some complex stack of layers. +Running this will alter the parameters `W` and `b` and our loss should go down. Flux provides a more general way to do optimiser updates like this. ```julia -m = Chain( - Dense(10, 5, σ), - Dense(5, 2), softmax) +opt = Descent(0.1) # Gradient descent with learning rate 0.1 + +for p in (W, b) + update!(opt, p, -η * grads[p]) +end ``` -Instead of having to write `[m[1].W, m[1].b, ...]`, Flux provides a params function `params(m)` that returns a list of all parameters in the model for you. - -For the update step, there's nothing whatsoever wrong with writing the loop above – it'll work just fine – but Flux provides various *optimisers* that make it more convenient. - -```julia -opt = SGD([W, b], 0.1) # Gradient descent with learning rate 0.1 - -opt() # Carry out the update, modifying `W` and `b`. -``` - -An optimiser takes a parameter list and returns a function that does the same thing as `update` above. We can pass either `opt` or `update` to our [training loop](training.md), which will then run the optimiser after every mini-batch of data. +An optimiser `update!` accepts a parameter and a gradient, and updates the parameter according to the chosen rule. We can also pass `opt` to our [training loop](training.md), which will update all parameters of the model in a loop. However, we can now easily replace `Descent` with a more advanced optimiser such as `ADAM`. ## Optimiser Reference -All optimisers return a function that, when called, will update the parameters passed to it. +All optimisers return an object that, when passed to `train!`, will update the parameters passed to it. ```@docs -SGD +Descent Momentum Nesterov ADAM diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 5d1f87fa..ae483783 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -9,7 +9,7 @@ To actually train a model we need three things: With these we can call `Flux.train!`: ```julia -Flux.train!(objective, data, opt) +Flux.train!(objective, params, data, opt) ``` There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo). @@ -24,9 +24,10 @@ m = Chain( Dense(32, 10), softmax) loss(x, y) = Flux.mse(m(x), y) +ps = Flux.params(m) # later -Flux.train!(loss, data, opt) +Flux.train!(loss, ps, data, opt) ``` The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want. @@ -78,7 +79,7 @@ julia> @epochs 2 Flux.train!(...) `train!` takes an additional argument, `cb`, that's used for callbacks so that you can observe the training process. 
For example: ```julia -train!(objective, data, opt, cb = () -> println("training")) +train!(objective, ps, data, opt, cb = () -> println("training")) ``` Callbacks are called for every batch of training data. You can slow this down using `Flux.throttle(f, timeout)` which prevents `f` from being called more than once every `timeout` seconds. @@ -89,6 +90,6 @@ A more typical callback might look like this: test_x, test_y = # ... create single batch of test data ... evalcb() = @show(loss(test_x, test_y)) -Flux.train!(objective, data, opt, +Flux.train!(objective, ps, data, opt, cb = throttle(evalcb, 5)) ``` diff --git a/src/Flux.jl b/src/Flux.jl index 8c959fec..da040aa0 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -2,11 +2,12 @@ module Flux # Zero Flux Given +using Base: tail using MacroTools, Juno, Requires, Reexport, Statistics, Random using MacroTools: @forward export Chain, Dense, RNN, LSTM, GRU, Conv, MaxPool, MeanPool, - Dropout, LayerNorm, BatchNorm, + DepthwiseConv, Dropout, LayerNorm, BatchNorm, params, mapleaves, cpu, gpu @reexport using NNlib @@ -19,8 +20,9 @@ export Tracker, TrackedArray, TrackedVector, TrackedMatrix, param include("optimise/Optimise.jl") using .Optimise using .Optimise: @epochs -export SGD, ADAM, ADAMW, AdaMax, Momentum, Nesterov, - RMSProp, ADAGrad, ADADelta, AMSGrad, NADAM +export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, + ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, + ADAMW, InvDecay, ExpDecay, WeightDecay include("utils.jl") include("onehot.jl") diff --git a/src/cuda/cuda.jl b/src/cuda/cuda.jl index fe36bf5d..070c9228 100644 --- a/src/cuda/cuda.jl +++ b/src/cuda/cuda.jl @@ -1,7 +1,37 @@ module CUDA using ..CuArrays +using Pkg.TOML -CuArrays.cudnn_available() && include("cudnn.jl") +function version_check() + minor_version = 9 + project = joinpath(dirname(pathof(CuArrays)), "../Project.toml") + project = TOML.parse(String(read(project))) + version = VersionNumber(get(project, "version", "0.0.0")) + if !(version.major == 0 && version.minor == minor_version) + @warn """ + Flux is only supported with CuArrays v0.$minor_version. + Try running `] pin CuArrays@0.$minor_version`. + """ + end +end + +version_check() + +if !applicable(CuArray{UInt8}, undef, 1) + (T::Type{<:CuArray})(::UndefInitializer, sz...) = T(sz...) +end + +if CuArrays.libcudnn != nothing + if isdefined(CuArrays, :libcudnn_handle) + handle() = CuArrays.libcudnn_handle[] + else + handle() = CuArrays.CUDNN.handle() + end + include("curnn.jl") + include("cudnn.jl") +else + @warn("CUDNN is not installed, some functionality will not be available.") +end end diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl index f033595a..8bd8135e 100644 --- a/src/cuda/cudnn.jl +++ b/src/cuda/cudnn.jl @@ -1,6 +1,6 @@ -using .CuArrays.CUDNN: @check, libcudnn, cudnnStatus_t, libcudnn_handle, - cudnnDataType, TensorDesc, FilterDesc - +using .CuArrays.CUDNN: @check, libcudnn, cudnnStatus_t, cudnnTensorDescriptor_t, + cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc +import ..Flux: data using LinearAlgebra mutable struct DropoutDesc @@ -14,335 +14,215 @@ function DropoutDesc(ρ::Real; seed::Integer=0) d = [C_NULL] s = Csize_t[0] @check ccall((:cudnnCreateDropoutDescriptor,libcudnn), cudnnStatus_t, (Ptr{Ptr{Nothing}},), d) - @check ccall((:cudnnDropoutGetStatesSize,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Csize_t}),libcudnn_handle[],s) - states = CuArray{UInt8}(s[]) # TODO: can we drop this when ρ=0? 
+ @check ccall((:cudnnDropoutGetStatesSize,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Csize_t}),handle(),s) + states = CuArray{UInt8}(undef, s[]) # TODO: can we drop this when ρ=0? desc = DropoutDesc(d[], states) @check ccall((:cudnnSetDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Nothing},Cfloat,Ptr{Nothing},Csize_t,Culonglong), - desc,libcudnn_handle[],ρ,states,length(states),seed) + desc,handle(),ρ,states,length(states),seed) finalizer(desc) do x @check ccall((:cudnnDestroyDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x) end return desc end -const RNN_RELU = 0 # Stock RNN with ReLu activation -const RNN_TANH = 1 # Stock RNN with tanh activation -const LSTM = 2 # LSTM with no peephole connections -const GRU = 3 # Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1) +const BATCHNORM_SPATIAL = 1 +const BATCHNORM_ACTIVATION = 0 +const BATCHNORM_MIN_EPS = 1e-5 -const LINEAR_INPUT = 0 -const SKIP_INPUT = 1 +@inline _wsize(y) = (map(_ -> 1, size(y)[1:end-2])..., size(y)[end-1], 1) -const UNIDIRECTIONAL = 0 -const BIDIRECTIONAL = 1 +@inline _reddims(y) = (collect(1:ndims(y)-2)..., ndims(y)) -const RNN_ALGO_STANDARD = 0 -const RNN_ALGO_PERSIST_STATIC = 1 -const RNN_ALGO_PERSIST_DYNAMIC = 2 - -# param layout: -# RNN: [weight, bias] × [input, hidden] -# GRU: [weight, bias] × [input, hidden] × [reset, update, newmem] -# LSTM: [weight, bias] × [input, hidden] × [input, forget, newmem, output] - -function params(w::CuVector, input, hidden, n = 1) - slice(offset, shape) = reshape(w[offset.+(1:prod(shape))], shape) - wx = slice(0, (input, hidden*n)) - wh = slice(length(wx), (hidden, hidden*n)) - bias = w[length(wx)+length(wh) .+ (1:hidden*n)] - (wx, wh), bias +mutable struct BNCache + mean + ivar end -mutable struct RNNDesc{T} - mode::Int - input::Int - hidden::Int - params::CuVector{T} - weights::NTuple{2,CuMatrix{T}} - bias::CuVector{T} - ptr::Ptr{Nothing} +BNCache() = BNCache(nothing, nothing) + +# NOTE: CuDNN supports only 4D and 5D Tensors for BatchNorm Operations +# so reshape a 2D Tensor into 4D +batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2}, + running_mean::CuArray{T}, running_var::CuArray{T}, momentum; + cache = nothing, alpha = T(1), beta = T(0), + eps = T(1e-5), training = true) where T<:Union{Float32, Float64} = + dropdims(batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), running_mean, running_var, momentum, + cache = cache, alpha = alpha, beta = beta, eps = eps, training = training), dims = (1, 2)) + +function batchnorm(g::CuArray{T}, b::CuArray{T}, x::Union{CuArray{T, 4},CuArray{T,5}}, + running_mean::CuArray{T}, running_var::CuArray{T}, momentum; + cache = nothing, alpha = T(1), beta = T(0), + eps = T(1e-5), training = true) where T<:Union{Float32, Float64} + y = similar(x) + cudnnBNForward!(y, g, b, x, running_mean, running_var, momentum, cache = cache, + alpha = alpha, beta = beta, eps = eps, training = training) + y end -Base.unsafe_convert(::Type{Ptr{Nothing}}, d::RNNDesc) = d.ptr - -function rnnParamSize(T, r, input) - size = Csize_t[0] - @check ccall((:cudnnGetRNNParamsSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Ptr{Nothing},Ptr{Csize_t},Cint), - libcudnn_handle[], r, TensorDesc(T, (1,input,1)), size, cudnnDataType(T)) - return Int(size[])÷sizeof(T) -end - -ngates(mode) = [1, 1, 4, 3][mode+1] -ngates(r::RNNDesc) = ngates(r.mode) - -function RNNDesc{T}(mode::Int, input::Int, hidden::Int; layers = 1) where T - d = [C_NULL] - @check ccall((:cudnnCreateRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Ptr{Nothing}},),d) 
- - dropoutDesc = DropoutDesc(0) - inputMode = LINEAR_INPUT - direction = UNIDIRECTIONAL - algo = RNN_ALGO_STANDARD - @check ccall((:cudnnSetRNNDescriptor_v6,libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Cint,Ptr{Nothing},Cint,Cint,Cint,Cint,Cint), - libcudnn_handle[],d[],hidden,layers,dropoutDesc,inputMode,direction,mode,algo,cudnnDataType(T)) - - w = cuzeros(T, rnnParamSize(T, d[], input)) - # TODO: avoid reserve allocation here - rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[]) - finalizer(rd) do x - @check ccall((:cudnnDestroyRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x) +function cudnnBNForward!(y::CuArray{T}, g::CuArray{T}, b::CuArray{T}, x::CuArray{T}, + running_mean::CuArray{T}, running_var::CuArray{T}, + momentum; cache = nothing, + alpha = T(1), beta = T(0), + eps = T(1e-5), training = true) where T<:Union{Float32, Float64} + dims = _wsize(x) + if eps < BATCHNORM_MIN_EPS + # warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", BATCHNORM_MIN_EPS) + eps = BATCHNORM_MIN_EPS end - return rd -end + xd = TensorDesc(x) + yd = TensorDesc(y) + gd = TensorDesc(T, dims) -function rnnWorkspaceSize(r::RNNDesc, seqlen, xdesc) - size = Csize_t[0] - @check ccall((:cudnnGetRNNWorkspaceSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Ptr{Ptr{Nothing}},Ptr{Csize_t}), - libcudnn_handle[], r, seqlen, xdesc, size) - return Int(size[]) -end + if training -const workspace = [CuVector{UInt8}(1)] + if cache !== nothing + mean = zeros(CuArray{T}, dims...) + ivar = ones(CuArray{T}, dims...) + else + mean = C_NULL + ivar = C_NULL + end -getworkspace(bytes) = - length(workspace[]) ≥ bytes ? - workspace[] : - (workspace[] = CuVector{UInt8}(bytes)) - -getworkspace(r::RNNDesc, seqlen, xdesc) = - getworkspace(rnnWorkspaceSize(r, seqlen, xdesc)) - -function rnnTrainingReserveSize(r::RNNDesc, seqlen, xdesc) - size = Csize_t[0] - @check ccall((:cudnnGetRNNTrainingReserveSize,libcudnn), cudnnStatus_t, (Ptr{Nothing}, Ptr{Nothing}, Cint, Ptr{Ptr{Nothing}}, Ptr{Csize_t}), - libcudnn_handle[], r, seqlen, xdesc, size) - return Int(size[]) -end - -function cudnnRNNForward(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, - workspace, reserve=nothing) where T - if reserve == nothing - @check ccall((:cudnnRNNForwardInference, libcudnn), cudnnStatus_t, - (Ptr{Nothing}, Ptr{Nothing}, Cint, - Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, - Ptr{Nothing}, Ptr{T}, Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Nothing}, Ptr{T}, + @check ccall((:cudnnBatchNormalizationForwardTraining, libcudnn), cudnnStatus_t, + (cudnnHandle_t,cudnnBatchNormMode_t, + Ptr{T}, Ptr{T}, Ptr{Nothing}, Ptr{T}, - Ptr{Nothing}, Csize_t), - libcudnn_handle[], rnn, seqlen, - xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, - workspace, length(workspace)) + Ptr{Nothing}, Ptr{T}, + Ptr{Nothing}, Ptr{T}, Ptr{T}, + Cdouble, Ptr{T}, Ptr{T}, + Cdouble, Ptr{T}, Ptr{T}), + handle(), BATCHNORM_SPATIAL, + Ref(T(alpha)), Ref(T(beta)), + xd, x, + yd, y, + gd, g, b, + momentum, running_mean, running_var, + eps, mean, ivar) + + if cache !== nothing + cache.mean = mean + cache.ivar = ivar + end else - @check ccall((:cudnnRNNForwardTraining, libcudnn), cudnnStatus_t, - (Ptr{Nothing}, Ptr{Nothing}, Cint, - Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, - Ptr{Nothing}, Csize_t, Ptr{Nothing}, Csize_t), - 
libcudnn_handle[], rnn, seqlen, - xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, - workspace, length(workspace), reserve, length(reserve)) + @check ccall((:cudnnBatchNormalizationForwardInference, libcudnn), cudnnStatus_t, + (Ptr{cudnnHandle_t},cudnnBatchNormMode_t, + Ptr{T}, Ptr{T}, + Ptr{Nothing}, Ptr{T}, + Ptr{Nothing}, Ptr{T}, + Ptr{Nothing}, Ptr{T}, Ptr{T}, + Ptr{T}, Ptr{T}, + Cdouble), + handle(), BATCHNORM_SPATIAL, + Ref(T(alpha)), Ref(T(beta)), + xd, x, + yd, y, + gd, g, b, + running_mean, running_var, + eps) end end -xDesc(x) = [TensorDesc(eltype(x), (1, size(x, 1), size(x, 2)))] - -hDesc(h::Nothing) = C_NULL, C_NULL -hDesc(x::Integer) = (@assert x == 0; hDesc(nothing)) -function hDesc(h::CuArray) - TensorDesc(eltype(h), (size(h, 1), size(h, 2), 1)), h +function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2}, dy::CuArray{T, 2}, + running_mean::CuArray{T}, running_var::CuArray{T}, momentum; + cache = nothing, eps = T(1e-5), alpha = T(1), + beta = T(0), training = true) where T<:Union{Float32, Float64} + dg, db, dx = ∇batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), reshape(dy, 1, 1, size(dy, 1), + size(dy, 2)), running_mean, running_var, momentum, cache = cache, eps = eps, + alpha = alpha, beta = beta, training = training) + (dg, db, dropdims(dx, dims = (1, 2))) end -# TODO: can we just manipulate strides here? -# TODO: should use repmat, but this isn't implemented. -hBatch(x::AbstractVector, h::CuVector) = h -hBatch(x::AbstractMatrix, h::CuVector) = h .* cuones(1, size(x, 2)) -hBatch(x::AbstractMatrix, h::CuMatrix) = h .* cuones(1, size(h,2) == 1 ? size(x,2) : 1) - -function forward(rnn::RNNDesc{T}, x::CuArray{T}, h_::CuArray{T}, c_ = nothing, train = Val{false}) where T - h = hBatch(x, h_) - c = c_ == nothing ? nothing : hBatch(x, c_) - @assert size(x, 1) == rnn.input - @assert size(h, 1) == rnn.hidden - @assert size(x, 2) == size(h, 2) - seqLength = 1 - xdesc = xDesc(x) - y = x isa AbstractVector ? similar(x, rnn.hidden) : similar(x, rnn.hidden, size(x, 2)) - ho = similar(h) - ydesc = xDesc(y) - workspace = getworkspace(rnn, seqLength, xdesc) - reserve = train == Val{true} ? - CuVector{UInt8}(rnnTrainingReserveSize(rnn, seqLength, xdesc)) : - nothing - co = c == nothing ? c : similar(c) - cudnnRNNForward(rnn, seqLength, - xdesc, x, - hDesc(h)..., - hDesc(c)..., - FilterDesc(T, (1, 1, length(rnn.params))), rnn.params, - ydesc, y, - hDesc(ho)..., - hDesc(co)..., - workspace, reserve) - result = c == nothing ? (y, ho) : (y, ho, co) - return train == Val{true} ? 
(reserve, result) : result +function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T}, dy::CuArray{T}, + running_mean::CuArray{T}, running_var::CuArray{T}, momentum; + cache = nothing, eps = T(1e-5), alpha = T(1), + beta = T(0), training = true) where T<:Union{Float32, Float64} + dg = similar(g) + db = similar(b) + dx = similar(x) + cudnnBNBackward!(dg, g, db, dx, x, dy, running_mean, running_var, T(momentum), + training = training, cache = cache, eps = eps, alpha = alpha, beta = beta) + (dg, db, dx) end -forwardTrain(rnn::RNNDesc{T}, x::CuArray{T}, h::CuArray{T}, c = nothing) where T = - forward(rnn, x, h, c, Val{true}) +function cudnnBNBackward!(dg::CuArray{T}, g::CuArray{T}, db::CuArray{T}, + dx::CuArray{T}, x::CuArray{T}, dy::CuArray{T}, + running_mean::CuArray{T}, running_var::CuArray{T}, + momentum; cache = nothing, eps = T(1e-5), + alpha = T(1), beta = T(0), + dalpha = T(1), dbeta = T(0), training = true) where T<:Union{Float32, Float64} + if training + xd = TensorDesc(x) + dyd = TensorDesc(dy) + dxd = TensorDesc(dx) + gd = TensorDesc(T, _wsize(x)) + if cache !== nothing + mean, ivar = cache.mean, cache.ivar + info("mean and ivar are fetched from the cache") + else + mean, ivar = C_NULL, C_NULL + end -function cudnnRNNBackwardData(rnn::RNNDesc{T}, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco, - wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, rs) where T - @check ccall((:cudnnRNNBackwardData,libcudnn),cudnnStatus_t, - (Ptr{Nothing}, Ptr{Nothing}, Cint, - Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Nothing}, Ptr{T}, - Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, - Ptr{T}, Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, - Ptr{Nothing}, Csize_t, Ptr{Nothing}, Csize_t), - libcudnn_handle[], rnn, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco, - wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, length(ws), rs, length(rs)) -end + if eps < BATCHNORM_MIN_EPS + eps = BATCHNORM_MIN_EPS + end -function backwardData(rnn::RNNDesc{T}, y, dy_, dho, dco, h, c, reserve) where T - # Same as above, any more efficient way? - dy = dy_ isa Integer ? zero(y) : dy_ - yd = xDesc(y) - dx = y isa AbstractVector ? similar(dy, rnn.input) : similar(dy, rnn.input, size(dy, 2)) - dh = similar(h) - dc = c == nothing ? nothing : similar(c) - cudnnRNNBackwardData(rnn, 1, - yd, y, yd, dy, hDesc(dho)..., hDesc(dco)..., - FilterDesc(T, (1, 1, length(rnn.params))), rnn.params, - hDesc(h)..., hDesc(c)..., xDesc(dx), dx, hDesc(dh)..., hDesc(dc)..., - workspace[], reserve) - return c == nothing ? 
(dx, dh) : (dx, dh, dc) -end - -backwardData(rnn, y, dy, dho, hx, reserve) = - backwardData(rnn, y, dy, dho, nothing, hx, nothing, reserve) - -function cudnnRNNBackwardWeights(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, yd, y, dwd, dw, - workspace, reserve) where T - @check ccall((:cudnnRNNBackwardWeights,libcudnn), cudnnStatus_t, - (Ptr{Nothing}, Ptr{Nothing}, Cint, # handle, rnnDesc, seqLength - Ptr{Ptr{Nothing}}, Ptr{T}, #x - Ptr{Nothing}, Ptr{T}, #hx - Ptr{Ptr{Nothing}}, Ptr{T}, #y - Ptr{Nothing}, Csize_t, #ws - Ptr{Nothing}, Ptr{T}, #dw - Ptr{Nothing}, Csize_t), #rs - libcudnn_handle[], rnn, seqlen, xd, x, hd, h, yd, y, - workspace, length(workspace), dwd, dw, reserve, length(reserve)) -end - -function backwardWeights(rnn::RNNDesc{T}, x, h, y, reserve) where T - dw = zero(rnn.params) - cudnnRNNBackwardWeights(rnn, 1, - xDesc(x), x, hDesc(h)..., xDesc(y), y, - FilterDesc(T, (1, 1, length(dw))), dw, - workspace[], reserve) - return params(dw, rnn.input, rnn.hidden, ngates(rnn)) -end - -# Interface - -import ..Flux: Flux, relu -import ..Tracker: TrackedArray -using .CuArrays.CUDAnative -using .CuArrays: @cuindex, cudims - -function LinearAlgebra.copy_transpose!(dst::CuArray, src::CuArray) - function kernel(dst, src) - I = @cuindex dst - dst[I...] = src[reverse(I)...] - return - end - blk, thr = cudims(dst) - @cuda blocks=blk threads=thr kernel(dst, src) - return dst -end - -CuParam{T,N} = Union{CuArray{T,N},TrackedArray{T,N,CuArray{T,N}}} -CuRNN{T} = Flux.RNNCell{<:Union{typeof(tanh),typeof(relu)},<:CuParam{T,2},<:CuParam{T,1}} -CuGRU{T} = Flux.GRUCell{<:CuParam{T,2},<:CuParam{T,1}} -CuLSTM{T} = Flux.LSTMCell{<:CuParam{T,2},<:CuParam{T,1}} -CuRNNs{T} = Union{CuRNN{T},CuGRU{T},CuLSTM{T}} - -function copyparams!(m::CuRNNs, d::RNNDesc) - Wi, Wh = d.weights - copy_transpose!(Wi, Flux.data(m.Wi)) - copy_transpose!(Wh, Flux.data(m.Wh)) - copy_transpose!(d.bias, Flux.data(m.b)) - return -end - -function RNNDesc(m::CuRNNs{T}) where T - h, i = length(m.h), size(m.Wi, 2) - mode = m isa CuRNN ? - (m.σ == tanh ? RNN_TANH : RNN_RELU) : - m isa CuGRU ? GRU : LSTM - r = RNNDesc{T}(mode, i, h) - return r -end - -const descs = WeakKeyDict() - -function desc(rnn) - d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = RNNDesc(rnn)) - copyparams!(rnn, d) - return d -end - -import Flux.Tracker -import Flux.Tracker: data, istracked, track, unbroadcast, @grad, nobacksies - -istrain(m::CuRNNs, args...) = any(x -> x isa TrackedArray, (m.Wi, m.Wh, m.b, args...)) - -function (m::CuRNN{T})(h::CuParam{T}, x::CuParam{T}) where T <: Union{Float32,Float64} - result = istrain(m, h, x) ? - track(m, x, h, m.Wi, m.Wh, m.b) : - forward(desc(m), x, h) - return result[2], result[1] -end - -function (m::CuGRU{T})(h::CuParam{T}, x::CuParam{T}) where T <: Union{Float32,Float64} - result = istrain(m, h, x) ? - track(m, x, h, m.Wi, m.Wh, m.b) : - forward(desc(m), x, h) - return result[2], result[1] -end - -function (m::CuLSTM{T})(h::NTuple{2,CuParam{T}}, x::CuParam{T}) where T <: Union{Float32,Float64} - result = istrain(m, h, x) ? 
- track(m, x, h[1], h[2], m.Wi, m.Wh, m.b) : - forward(desc(m), x, h[1], h[2]) - return (result[2], result[3]), result[1] -end - -(m::CuRNN{T})(h::CuParam{T}, x) where T <: Union{Float32,Float64} = m(h, CuArray{T}(x)) -(m::CuGRU{T})(h::CuParam{T}, x) where T <: Union{Float32,Float64} = m(h, CuArray{T}(x)) -(m::CuLSTM{T})(h::NTuple{2,CuParam{T}}, x) where T <: Union{Float32,Float64} = m(h, CuArray{T}(x)) - -@grad function (m::Union{CuRNN,CuGRU})(x, h, Wi, Wh, b) - reserve, result = forwardTrain(desc(m), data(x), data(h)) - result, function (Δ) - y, ho = result - dy, dho = Δ - h_ = hBatch(x, data(h)) - dx, dh = backwardData(descs[m], y, dy, dho, h_, reserve) - (dWi, dWh), db = backwardWeights(descs[m], data(x), h_, y, reserve) - nobacksies(:RNN, (dx, unbroadcast(size(h), dh), transpose(dWi), transpose(dWh), db)) + @check ccall((:cudnnBatchNormalizationBackward, libcudnn), cudnnStatus_t, + (cudnnHandle_t,cudnnBatchNormMode_t, + Ptr{T}, Ptr{T}, + Ptr{T}, Ptr{T}, + Ptr{Nothing}, Ptr{T}, + Ptr{Nothing}, Ptr{T}, + Ptr{Nothing}, Ptr{T}, + Ptr{Nothing}, Ptr{T}, Ptr{T}, Ptr{T}, + Cdouble, Ptr{T}, Ptr{T}), + handle(), BATCHNORM_SPATIAL, + Ref(T(alpha)), Ref(T(beta)), + Ref(T(dalpha)), Ref(T(dbeta)), + xd, x, + dyd, dy, + dxd, dx, + gd, g, dg, db, + eps, mean, ivar) + else + ivar = 1 ./ sqrt.(reshape(running_var, _wsize(x)) .+ eps) + dx .= dy .* reshape(g, _wsize(x)) .* ivar + dg .= squeeze(sum(dy .* (x .- reshape(running_mean, _wsize(x))) .* ivar, _reddims(dy)), dims = (1,2,4)) + db .= squeeze(sum(dy, _reddims(dy)), dims = (1,2,4)) end end -@grad function (m::CuLSTM)(x, h, c, Wi, Wh, b) - reserve, result = forwardTrain(desc(m), data.((x, h, c))...) - result, function (Δ) - y, ho = result - dy, dho, dco = Δ - h_ = hBatch(x, data(h)) - c_ = hBatch(x, data(c)) - dx, dh, dc = backwardData(descs[m], y, dy, dho, dco, h_, c_, reserve) - (dWi, dWh), db = backwardWeights(descs[m], data(x), h_, y, reserve) - nobacksies(:RNN, - (dx, unbroadcast(size(h), dh), unbroadcast(size(c), dc), - transpose(dWi), transpose(dWh), db)) - end -end +# Flux Interface + +(BN::Flux.BatchNorm)(x::Union{CuParam{T,2},CuParam{T,4},CuParam{T,5}}, cache = nothing) where T<:Union{Float32, Float64} = + batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, training = BN.active) + +batchnorm(g::TrackedArray, b::TrackedArray, x::TrackedArray, running_mean::CuArray{T}, + running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} = + track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...) + +batchnorm(g::TrackedArray, b::TrackedArray, x::CuArray{T}, running_mean::CuArray{T}, + running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} = + track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...) + +batchnorm(g::TrackedArray, b::CuArray{T}, x::TrackedArray, running_mean::CuArray{T}, + running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} = + track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...) + +batchnorm(g::CuArray{T}, b::TrackedArray, x::CuArray{T}, running_mean::CuArray{T}, + running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} = + track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...) + +batchnorm(g::CuArray{T}, b::TrackedArray, x::TrackedArray, running_mean::CuArray{T}, + running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} = + track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...) 
+ +batchnorm(g::TrackedArray, b::CuArray{T}, x::CuArray{T}, running_mean::CuArray{T}, + running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} = + track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...) + +batchnorm(g::CuArray{T}, b::CuArray{T}, x::TrackedArray, running_mean::CuArray{T}, + running_var::CuArray{T}, momentum; kw...) where T<:Union{Float32, Float64} = + track(batchnorm, g, b, x, running_mean, running_var, momentum; kw...) + +@grad batchnorm(g, b, x, running_mean, running_var, momentum; kw...) = + batchnorm(data.((g, b, x))..., running_mean, running_var, momentum; kw...), Δ -> (nobacksies(:batchnorm, ∇batchnorm(data.((g, b, x, Δ))..., running_mean, running_var, momentum; kw...))..., nothing, nothing, nothing) diff --git a/src/cuda/curnn.jl b/src/cuda/curnn.jl new file mode 100644 index 00000000..210ddd7c --- /dev/null +++ b/src/cuda/curnn.jl @@ -0,0 +1,325 @@ +using .CuArrays.CUDNN: @check, libcudnn, cudnnStatus_t, cudnnTensorDescriptor_t, + cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc +using LinearAlgebra + +const RNN_RELU = 0 # Stock RNN with ReLu activation +const RNN_TANH = 1 # Stock RNN with tanh activation +const LSTM = 2 # LSTM with no peephole connections +const GRU = 3 # Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1) + +const LINEAR_INPUT = 0 +const SKIP_INPUT = 1 + +const UNIDIRECTIONAL = 0 +const BIDIRECTIONAL = 1 + +const RNN_ALGO_STANDARD = 0 +const RNN_ALGO_PERSIST_STATIC = 1 +const RNN_ALGO_PERSIST_DYNAMIC = 2 + +# param layout: +# RNN: [weight, bias] × [input, hidden] +# GRU: [weight, bias] × [input, hidden] × [reset, update, newmem] +# LSTM: [weight, bias] × [input, hidden] × [input, forget, newmem, output] + +function params(w::CuVector, input, hidden, n = 1) + slice(offset, shape) = reshape(view(w, offset.+(1:prod(shape))), shape) + wx = slice(0, (input, hidden*n)) + wh = slice(length(wx), (hidden, hidden*n)) + bias = view(w, length(wx)+length(wh) .+ (1:hidden*n)) + (wx, wh), bias +end + +mutable struct RNNDesc{T} + mode::Int + input::Int + hidden::Int + params::CuVector{T} + weights::NTuple{2,CuMatrix{T}} + bias::CuVector{T} + ptr::Ptr{Nothing} +end + +Base.unsafe_convert(::Type{Ptr{Nothing}}, d::RNNDesc) = d.ptr + +function rnnParamSize(T, r, input) + size = Csize_t[0] + @check ccall((:cudnnGetRNNParamsSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Ptr{Nothing},Ptr{Csize_t},Cint), + handle(), r, TensorDesc(T, (1,input,1)), size, cudnnDataType(T)) + return Int(size[])÷sizeof(T) +end + +ngates(mode) = [1, 1, 4, 3][mode+1] +ngates(r::RNNDesc) = ngates(r.mode) + +function RNNDesc{T}(mode::Int, input::Int, hidden::Int; layers = 1) where T + d = [C_NULL] + @check ccall((:cudnnCreateRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Ptr{Nothing}},),d) + + dropoutDesc = DropoutDesc(0) + inputMode = LINEAR_INPUT + direction = UNIDIRECTIONAL + algo = RNN_ALGO_STANDARD + @check ccall((:cudnnSetRNNDescriptor_v6,libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Cint,Ptr{Nothing},Cint,Cint,Cint,Cint,Cint), + handle(),d[],hidden,layers,dropoutDesc,inputMode,direction,mode,algo,cudnnDataType(T)) + + w = cuzeros(T, rnnParamSize(T, d[], input)) + # TODO: avoid reserve allocation here + rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[]) + finalizer(rd) do x + @check ccall((:cudnnDestroyRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x) + end + return rd +end + +function rnnWorkspaceSize(r::RNNDesc, seqlen, xdesc) + size = Csize_t[0] + 
@check ccall((:cudnnGetRNNWorkspaceSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Ptr{Ptr{Nothing}},Ptr{Csize_t}), + handle(), r, seqlen, xdesc, size) + return Int(size[]) +end + +const workspace = [CuVector{UInt8}(undef, 1)] + +getworkspace(bytes) = + length(workspace[]) ≥ bytes ? + workspace[] : + (workspace[] = CuVector{UInt8}(undef, bytes)) + +getworkspace(r::RNNDesc, seqlen, xdesc) = + getworkspace(rnnWorkspaceSize(r, seqlen, xdesc)) + +function rnnTrainingReserveSize(r::RNNDesc, seqlen, xdesc) + size = Csize_t[0] + @check ccall((:cudnnGetRNNTrainingReserveSize,libcudnn), cudnnStatus_t, (Ptr{Nothing}, Ptr{Nothing}, Cint, Ptr{Ptr{Nothing}}, Ptr{Csize_t}), + handle(), r, seqlen, xdesc, size) + return Int(size[]) +end + +function cudnnRNNForward(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, + workspace, reserve=nothing) where T + if reserve == nothing + @check ccall((:cudnnRNNForwardInference, libcudnn), cudnnStatus_t, + (Ptr{Nothing}, Ptr{Nothing}, Cint, + Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, + Ptr{Nothing}, Ptr{T}, Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Nothing}, Ptr{T}, + Ptr{Nothing}, Ptr{T}, + Ptr{Nothing}, Csize_t), + handle(), rnn, seqlen, + xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, + workspace, length(workspace)) + else + @check ccall((:cudnnRNNForwardTraining, libcudnn), cudnnStatus_t, + (Ptr{Nothing}, Ptr{Nothing}, Cint, + Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, + Ptr{Nothing}, Csize_t, Ptr{Nothing}, Csize_t), + handle(), rnn, seqlen, + xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co, + workspace, length(workspace), reserve, length(reserve)) + end +end + +xDesc(x) = [TensorDesc(eltype(x), (1, size(x, 1), size(x, 2)))] + +hDesc(h::Nothing) = C_NULL, C_NULL +hDesc(x::Integer) = (@assert x == 0; hDesc(nothing)) +function hDesc(h::CuArray) + TensorDesc(eltype(h), (size(h, 1), size(h, 2), 1)), h +end + +# TODO: can we just manipulate strides here? +# TODO: should use repmat, but this isn't implemented. +hBatch(x::AbstractVector, h::CuVector) = h +hBatch(x::AbstractMatrix, h::CuVector) = h .* cuones(1, size(x, 2)) +hBatch(x::AbstractMatrix, h::CuMatrix) = h .* cuones(1, size(h,2) == 1 ? size(x,2) : 1) + +function forward(rnn::RNNDesc{T}, x::CuArray{T}, h_::CuArray{T}, c_ = nothing, train = Val{false}) where T + h = hBatch(x, h_) + c = c_ == nothing ? nothing : hBatch(x, c_) + @assert size(x, 1) == rnn.input + @assert size(h, 1) == rnn.hidden + @assert size(x, 2) == size(h, 2) + seqLength = 1 + xdesc = xDesc(x) + y = x isa AbstractVector ? similar(x, rnn.hidden) : similar(x, rnn.hidden, size(x, 2)) + ho = similar(h) + ydesc = xDesc(y) + workspace = getworkspace(rnn, seqLength, xdesc) + reserve = train == Val{true} ? + CuVector{UInt8}(undef, rnnTrainingReserveSize(rnn, seqLength, xdesc)) : + nothing + co = c == nothing ? c : similar(c) + cudnnRNNForward(rnn, seqLength, + xdesc, x, + hDesc(h)..., + hDesc(c)..., + FilterDesc(T, (1, 1, length(rnn.params))), rnn.params, + ydesc, y, + hDesc(ho)..., + hDesc(co)..., + workspace, reserve) + result = c == nothing ? (y, ho) : (y, ho, co) + return train == Val{true} ? 
(reserve, result) : result +end + +forwardTrain(rnn::RNNDesc{T}, x::CuArray{T}, h::CuArray{T}, c = nothing) where T = + forward(rnn, x, h, c, Val{true}) + +function cudnnRNNBackwardData(rnn::RNNDesc{T}, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco, + wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, rs) where T + @check ccall((:cudnnRNNBackwardData,libcudnn),cudnnStatus_t, + (Ptr{Nothing}, Ptr{Nothing}, Cint, + Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Nothing}, Ptr{T}, + Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, + Ptr{T}, Ptr{Ptr{Nothing}}, Ptr{T}, Ptr{Nothing}, Ptr{T}, Ptr{Nothing}, Ptr{T}, + Ptr{Nothing}, Csize_t, Ptr{Nothing}, Csize_t), + handle(), rnn, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco, + wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, length(ws), rs, length(rs)) +end + +function backwardData(rnn::RNNDesc{T}, y, dy_, dho, dco, h, c, reserve) where T + # Same as above, any more efficient way? + dy = dy_ isa Integer ? zero(y) : dy_ + yd = xDesc(y) + dx = y isa AbstractVector ? similar(dy, rnn.input) : similar(dy, rnn.input, size(dy, 2)) + dh = similar(h) + dc = c == nothing ? nothing : similar(c) + cudnnRNNBackwardData(rnn, 1, + yd, y, yd, dy, hDesc(dho)..., hDesc(dco)..., + FilterDesc(T, (1, 1, length(rnn.params))), rnn.params, + hDesc(h)..., hDesc(c)..., xDesc(dx), dx, hDesc(dh)..., hDesc(dc)..., + workspace[], reserve) + return c == nothing ? (dx, dh) : (dx, dh, dc) +end + +backwardData(rnn, y, dy, dho, hx, reserve) = + backwardData(rnn, y, dy, dho, nothing, hx, nothing, reserve) + +function cudnnRNNBackwardWeights(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, yd, y, dwd, dw, + workspace, reserve) where T + @check ccall((:cudnnRNNBackwardWeights,libcudnn), cudnnStatus_t, + (Ptr{Nothing}, Ptr{Nothing}, Cint, # handle, rnnDesc, seqLength + Ptr{Ptr{Nothing}}, Ptr{T}, #x + Ptr{Nothing}, Ptr{T}, #hx + Ptr{Ptr{Nothing}}, Ptr{T}, #y + Ptr{Nothing}, Csize_t, #ws + Ptr{Nothing}, Ptr{T}, #dw + Ptr{Nothing}, Csize_t), #rs + handle(), rnn, seqlen, xd, x, hd, h, yd, y, + workspace, length(workspace), dwd, dw, reserve, length(reserve)) +end + +function backwardWeights(rnn::RNNDesc{T}, x, h, y, reserve) where T + dw = zero(rnn.params) + cudnnRNNBackwardWeights(rnn, 1, + xDesc(x), x, hDesc(h)..., xDesc(y), y, + FilterDesc(T, (1, 1, length(dw))), dw, + workspace[], reserve) + return params(dw, rnn.input, rnn.hidden, ngates(rnn)) +end + +# Interface + +import ..Flux: Flux, relu +import ..Tracker: TrackedArray +using .CuArrays.CUDAnative +using .CuArrays: @cuindex, cudims + +function LinearAlgebra.copy_transpose!(dst::CuArray, src::CuArray) + function kernel(dst, src) + I = @cuindex dst + dst[I...] = src[reverse(I)...] + return + end + blk, thr = cudims(dst) + @cuda blocks=blk threads=thr kernel(dst, src) + return dst +end + +CuParam{T,N} = Union{CuArray{T,N},TrackedArray{T,N,CuArray{T,N}}} +CuRNN{T} = Flux.RNNCell{<:Union{typeof(tanh),typeof(relu)},<:CuParam{T,2},<:CuParam{T,1}} +CuGRU{T} = Flux.GRUCell{<:CuParam{T,2},<:CuParam{T,1}} +CuLSTM{T} = Flux.LSTMCell{<:CuParam{T,2},<:CuParam{T,1}} +CuRNNs{T} = Union{CuRNN{T},CuGRU{T},CuLSTM{T}} + +function copyparams!(m::CuRNNs, d::RNNDesc) + Wi, Wh = d.weights + copy_transpose!(Wi, Flux.data(m.Wi)) + copy_transpose!(Wh, Flux.data(m.Wh)) + copy_transpose!(d.bias, Flux.data(m.b)) + return +end + +function RNNDesc(m::CuRNNs{T}) where T + h, i = length(m.h), size(m.Wi, 2) + mode = m isa CuRNN ? + (m.σ == tanh ? RNN_TANH : RNN_RELU) : + m isa CuGRU ? 
GRU : LSTM + r = RNNDesc{T}(mode, i, h) + return r +end + +const descs = WeakKeyDict() + +function desc(rnn) + d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = RNNDesc(rnn)) + copyparams!(rnn, d) + return d +end + +import Flux.Tracker +import Flux.Tracker: data, istracked, track, unbroadcast, @grad, nobacksies + +istrain(m::CuRNNs, args...) = any(x -> x isa TrackedArray, (m.Wi, m.Wh, m.b, args...)) + +function (m::CuRNN{T})(h::CuParam{T}, x::CuParam{T}) where T <: Union{Float32,Float64} + result = istrain(m, h, x) ? + track(m, x, h, m.Wi, m.Wh, m.b) : + forward(desc(m), x, h) + return result[2], result[1] +end + +function (m::CuGRU{T})(h::CuParam{T}, x::CuParam{T}) where T <: Union{Float32,Float64} + result = istrain(m, h, x) ? + track(m, x, h, m.Wi, m.Wh, m.b) : + forward(desc(m), x, h) + return result[2], result[1] +end + +function (m::CuLSTM{T})(h::NTuple{2,CuParam{T}}, x::CuParam{T}) where T <: Union{Float32,Float64} + result = istrain(m, h, x) ? + track(m, x, h[1], h[2], m.Wi, m.Wh, m.b) : + forward(desc(m), x, h[1], h[2]) + return (result[2], result[3]), result[1] +end + +(m::CuRNN{T})(h::CuParam{T}, x) where T <: Union{Float32,Float64} = m(h, CuArray{T}(x)) +(m::CuGRU{T})(h::CuParam{T}, x) where T <: Union{Float32,Float64} = m(h, CuArray{T}(x)) +(m::CuLSTM{T})(h::NTuple{2,CuParam{T}}, x) where T <: Union{Float32,Float64} = m(h, CuArray{T}(x)) + +@grad function (m::Union{CuRNN,CuGRU})(x, h, Wi, Wh, b) + reserve, result = forwardTrain(desc(m), data(x), data(h)) + result, function (Δ) + y, ho = result + dy, dho = Δ + h_ = hBatch(x, data(h)) + dx, dh = backwardData(descs[m], y, dy, dho, h_, reserve) + (dWi, dWh), db = backwardWeights(descs[m], data(x), h_, y, reserve) + nobacksies(:RNN, (dx, unbroadcast(h, dh), transpose(dWi), transpose(dWh), db)) + end +end + +@grad function (m::CuLSTM)(x, h, c, Wi, Wh, b) + reserve, result = forwardTrain(desc(m), data.((x, h, c))...) 
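  # forwardTrain returns CUDNN's reserve buffer together with (y, h′, c′); the
  # pullback below replays cudnnRNNBackwardData / cudnnRNNBackwardWeights against
  # that reserve, then transposes dWi and dWh back into Flux's (out, in) layout,
  # mirroring the transposed copy done by `copyparams!` on the forward side.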
+ result, function (Δ) + y, ho = result + dy, dho, dco = Δ + h_ = hBatch(x, data(h)) + c_ = hBatch(x, data(c)) + dx, dh, dc = backwardData(descs[m], y, dy, dho, dco, h_, c_, reserve) + (dWi, dWh), db = backwardWeights(descs[m], data(x), h_, y, reserve) + nobacksies(:RNN, + (dx, unbroadcast(h, dh), unbroadcast(c, dc), + transpose(dWi), transpose(dWh), db)) + end +end diff --git a/src/data/Data.jl b/src/data/Data.jl index d5b5f38d..ddf0624b 100644 --- a/src/data/Data.jl +++ b/src/data/Data.jl @@ -13,6 +13,9 @@ end include("mnist.jl") export MNIST +include("fashion-mnist.jl") +export FashionMNIST + include("cmudict.jl") using .CMUDict diff --git a/src/data/fashion-mnist.jl b/src/data/fashion-mnist.jl new file mode 100644 index 00000000..e4510b47 --- /dev/null +++ b/src/data/fashion-mnist.jl @@ -0,0 +1,64 @@ +module FashionMNIST + +using ..MNIST: gzopen, imageheader, rawimage, labelheader, rawlabel + +const dir = joinpath(@__DIR__, "../../deps/fashion-mnist") + +function load() + mkpath(dir) + cd(dir) do + for file in ["train-images-idx3-ubyte", + "train-labels-idx1-ubyte", + "t10k-images-idx3-ubyte", + "t10k-labels-idx1-ubyte"] + isfile(file) && continue + @info "Downloading Fashion-MNIST dataset" + download("http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/$file.gz", "$file.gz") + open(file, "w") do io + write(io, gzopen(read, "$file.gz")) + end + end + end +end + +const TRAINIMAGES = joinpath(dir, "train-images-idx3-ubyte") +const TRAINLABELS = joinpath(dir, "train-labels-idx1-ubyte") +const TESTIMAGES = joinpath(dir, "t10k-images-idx3-ubyte") +const TESTLABELS = joinpath(dir, "t10k-labels-idx1-ubyte") + +""" + images() + images(:test) + +Load the Fashion-MNIST images. + +Each image is a 28×28 array of `Gray` colour values (see Colors.jl). + +Returns the 60,000 training images by default; pass `:test` to retreive the +10,000 test images. +""" +function images(set = :train) + load() + io = IOBuffer(read(set == :train ? TRAINIMAGES : TESTIMAGES)) + _, N, nrows, ncols = imageheader(io) + [rawimage(io) for _ in 1:N] +end + +""" + labels() + labels(:test) + +Load the labels corresponding to each of the images returned from `images()`. +Each label is a number from 0-9. + +Returns the 60,000 training labels by default; pass `:test` to retreive the +10,000 test labels. +""" +function labels(set = :train) + load() + io = IOBuffer(read(set == :train ? TRAINLABELS : TESTLABELS)) + _, N = labelheader(io) + [rawlabel(io) for _ = 1:N] +end + +end diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 3e887472..758aa0a9 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -16,19 +16,21 @@ m(x) == m[2](m[1](x)) `Chain` also supports indexing and slicing, e.g. `m[2]` or `m[1:end-1]`. `m[1:3](x)` will calculate the output of the first three layers. """ -struct Chain - layers::Vector{Any} - Chain(xs...) = new([xs...]) +struct Chain{T<:Tuple} + layers::T + Chain(xs...) = new{typeof(xs)}(xs) end -@forward Chain.layers Base.getindex, Base.first, Base.last, Base.lastindex, Base.push! -@forward Chain.layers Base.iterate +@forward Chain.layers Base.getindex, Base.length, Base.first, Base.last, + Base.iterate, Base.lastindex children(c::Chain) = c.layers mapchildren(f, c::Chain) = Chain(f.(c.layers)...) -adapt(T, c::Chain) = Chain(map(x -> adapt(T, x), c.layers)...) 
-(c::Chain)(x) = foldl((x, m) -> m(x), c.layers; init = x) +applychain(::Tuple{}, x) = x +applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) + +(c::Chain)(x) = applychain(c.layers, x) Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...) @@ -75,7 +77,7 @@ end @treelike Dense -function (a::Dense)(x) +function (a::Dense)(x::AbstractArray) W, b, σ = a.W, a.b, a.σ σ.(W*x .+ b) end @@ -114,3 +116,11 @@ end function Base.show(io::IO, l::Diagonal) print(io, "Diagonal(", length(l.α), ")") end + +# Try to avoid hitting generic matmul in some simple cases +# Base's matmul is so slow that it's worth the extra conversion to hit BLAS +(a::Dense{<:Any,W})(x::AbstractArray{T}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = + invoke(a, Tuple{AbstractArray}, x) + +(a::Dense{<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = + a(T.(x)) diff --git a/src/layers/conv.jl b/src/layers/conv.jl index dbf8ccf9..99fc16f2 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,4 +1,4 @@ -using NNlib: conv +using NNlib: conv, depthwiseconv @generated sub2(::Val{N}) where N = :(Val($(N-2))) @@ -30,14 +30,14 @@ Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; stride = 1, pad = 0, dilation = 1) where {T,N} = Conv(σ, w, b, expand.(sub2(Val(N)), (stride, pad, dilation))...) -Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = initn, - stride = 1, pad = 0, dilation = 1) where N = +Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; + init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N = Conv(param(init(k..., ch...)), param(zeros(ch[2])), σ, stride = stride, pad = pad, dilation = dilation) @treelike Conv -function (c::Conv)(x) +function (c::Conv)(x::AbstractArray) # TODO: breaks gpu broadcast :( # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) @@ -51,6 +51,62 @@ function Base.show(io::IO, l::Conv) print(io, ")") end +(a::Conv{<:Any,<:Any,W})(x::AbstractArray{T}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = + invoke(a, Tuple{AbstractArray}, x) + +(a::Conv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = + a(T.(x)) + +""" + DepthwiseConv(size, in) + DepthwiseConv(size, in=>mul) + DepthwiseConv(size, in=>mul, relu) + +Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`. +`in` and `mul` specify the number of input channels and channel multiplier respectively. +In case the `mul` is not specified it is taken as 1. + +Data should be stored in WHCN order. In other words, a 100×100 RGB image would +be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array. + +Takes the keyword arguments `pad` and `stride`. +""" +struct DepthwiseConv{N,F,A,V} + σ::F + weight::A + bias::V + stride::NTuple{N,Int} + pad::NTuple{N,Int} +end + +DepthwiseConv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity; + stride = 1, pad = 0) where {T,N} = + DepthwiseConv(σ, w, b, expand.(sub2(Val(N)), (stride, pad))...) 
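# A short usage sketch of the DepthwiseConv layer added in this hunk, using the
# pair constructor defined just below; the kernel, channel and batch sizes are
# illustrative, and `init = glorot_uniform` is passed so the Float32 weights
# match the Float32 input.
using Flux

dc = Flux.DepthwiseConv((3, 3), 3 => 2, relu, init = Flux.glorot_uniform)
x  = rand(Float32, 28, 28, 3, 4)   # WHCN: four 28×28 images with 3 channels
y  = dc(x)                         # tracked output with 3 × 2 = 6 channels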
+ +DepthwiseConv(k::NTuple{N,Integer}, ch::Integer, σ = identity; init = initn, + stride = 1, pad = 0) where N = + DepthwiseConv(param(init(k..., 1, ch)), param(zeros(ch)), σ, + stride = stride, pad = pad) + +DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = initn, + stride::NTuple{N,Integer} = map(_->1,k), + pad::NTuple{N,Integer} = map(_->0,k)) where N = + DepthwiseConv(param(init(k..., ch[2], ch[1])), param(zeros(ch[2]*ch[1])), σ, + stride = stride, pad = pad) + +@treelike DepthwiseConv + +function (c::DepthwiseConv)(x) + σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) + σ.(depthwiseconv(x, c.weight, stride = c.stride, pad = c.pad) .+ b) +end + +function Base.show(io::IO, l::DepthwiseConv) + print(io, "DepthwiseConv(", size(l.weight)[1:ndims(l.weight)-2]) + print(io, ", ", size(l.weight, ndims(l.weight)), "=>", size(l.weight, ndims(l.weight)-1)) + l.σ == identity || print(io, ", ", l.σ) + print(io, ")") +end """ MaxPool(k) @@ -60,9 +116,9 @@ Max pooling layer. `k` stands for the size of the window for each dimension of t Takes the keyword arguments `pad` and `stride`. """ struct MaxPool{N} - k::NTuple{N,Int} - pad::NTuple{N,Int} - stride::NTuple{N,Int} + k::NTuple{N,Int} + pad::NTuple{N,Int} + stride::NTuple{N,Int} end MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N = diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 164f6fa7..9201e991 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -44,7 +44,6 @@ end _testmode!(a::Dropout, test) = (a.active = !test) """ - LayerNorm(h::Integer) A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be @@ -86,7 +85,6 @@ See [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/pdf/1502.03167.pdf). Example: - ```julia m = Chain( Dense(28^2, 64), @@ -101,14 +99,14 @@ mutable struct BatchNorm{F,V,W,N} β::V # bias γ::V # scale μ::W # moving mean - σ::W # moving std + σ²::W # moving std ϵ::N momentum::N active::Bool end BatchNorm(chs::Integer, λ = identity; - initβ = (i) -> zeros(i), initγ = (i) -> ones(i), ϵ = 1e-8, momentum = .1) = + initβ = (i) -> zeros(i), initγ = (i) -> ones(i), ϵ = 1e-5, momentum = .1) = BatchNorm(λ, param(initβ(chs)), param(initγ(chs)), zeros(chs), ones(chs), ϵ, momentum, true) @@ -124,31 +122,31 @@ function (BN::BatchNorm)(x) if !BN.active μ = reshape(BN.μ, affine_shape...) - σ = reshape(BN.σ, affine_shape...) + σ² = reshape(BN.σ², affine_shape...) else T = eltype(x) ϵ = data(convert(T, BN.ϵ)) axes = [1:dims-2; dims] # axes to reduce along (all but channels axis) μ = mean(x, dims = axes) - σ = sqrt.(mean((x .- μ).^2, dims = axes) .+ ϵ) + σ² = sum((x .- μ) .^ 2, dims = axes) ./ m # update moving mean/std mtm = data(convert(T, BN.momentum)) - BN.μ = (1 - mtm) .* BN.μ .+ mtm .* dropdims(data(μ), dims = (axes...,)) - BN.σ = (1 - mtm) .* BN.σ .+ mtm .* dropdims(data(σ), dims = (axes...,)) .* m ./ (m - 1) + BN.μ = (1 - mtm) .* BN.μ .+ mtm .* reshape(data(μ), :) + BN.σ² = ((1 - mtm) .* BN.σ² .+ mtm .* reshape(data(σ²), :) .* m ./ (m - 1)) end let λ = BN.λ - λ.(reshape(γ, affine_shape...) .* ((x .- μ) ./ σ) .+ reshape(β, affine_shape...)) + λ.(reshape(γ, affine_shape...) .* ((x .- μ) ./ sqrt.(σ² .+ BN.ϵ)) .+ reshape(β, affine_shape...)) end end children(BN::BatchNorm) = - (BN.λ, BN.β, BN.γ, BN.μ, BN.σ, BN.ϵ, BN.momentum, BN.active) + (BN.λ, BN.β, BN.γ, BN.μ, BN.σ², BN.ϵ, BN.momentum, BN.active) mapchildren(f, BN::BatchNorm) = # e.g. 
mapchildren(cu, BN) - BatchNorm(BN.λ, f(BN.β), f(BN.γ), f(BN.μ), f(BN.σ), BN.ϵ, BN.momentum, BN.active) + BatchNorm(BN.λ, f(BN.β), f(BN.γ), f(BN.μ), f(BN.σ²), BN.ϵ, BN.momentum, BN.active) _testmode!(BN::BatchNorm, test) = (BN.active = !test) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 3b40af04..40cd322a 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -148,7 +148,7 @@ Base.show(io::IO, l::LSTMCell) = print(io, "LSTMCell(", size(l.Wi, 2), ", ", size(l.Wi, 1)÷4, ")") """ - LSTM(in::Integer, out::Integer, σ = tanh) + LSTM(in::Integer, out::Integer) Long Short Term Memory recurrent layer. Behaves like an RNN but generally exhibits a longer memory span over sequences. @@ -189,7 +189,7 @@ Base.show(io::IO, l::GRUCell) = print(io, "GRUCell(", size(l.Wi, 2), ", ", size(l.Wi, 1)÷3, ")") """ - GRU(in::Integer, out::Integer, σ = tanh) + GRU(in::Integer, out::Integer) Gated Recurrent Unit layer. Behaves like an RNN but generally exhibits a longer memory span over sequences. diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 891ec230..95b1d44a 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -2,16 +2,16 @@ using NNlib: logsoftmax, logσ # Cost functions -mse(ŷ, y) = sum((ŷ .- y).^2)/length(y) +mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) function crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) - -sum(y .* log.(ŷ) .* weight) / size(y, 2) + -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2) end @deprecate logloss(x, y) crossentropy(x, y) function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) - return -sum(y .* logsoftmax(logŷ) .* weight) / size(y, 2) + return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2) end """ diff --git a/src/onehot.jl b/src/onehot.jl index 5d902c77..cd29f14e 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -28,9 +28,9 @@ Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs batch(xs::AbstractArray{<:OneHotVector}) = OneHotMatrix(length(first(xs)), xs) -import Adapt.adapt +import Adapt: adapt, adapt_structure -adapt(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data)) +adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data)) @init @require CuArrays="3a865a2d-5b23-5a0f-bc46-62713ec82fae" begin import .CuArrays: CuArray, cudaconvert @@ -68,3 +68,6 @@ end a::TrackedMatrix * b::OneHotVector = invoke(*, Tuple{AbstractMatrix,OneHotVector}, a, b) a::TrackedMatrix * b::OneHotMatrix = invoke(*, Tuple{AbstractMatrix,OneHotMatrix}, a, b) + +onecold(x::TrackedVector, l...) = onecold(data(x), l...) +onecold(x::TrackedMatrix, l...) = onecold(data(x), l...) 
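# The `onecold` overloads just above make it possible to decode predictions
# straight from a tracked model output. A small sketch; the model shape and the
# label set are illustrative.
using Flux

m = Chain(Dense(4, 3), softmax)
x = rand(Float32, 4, 5)                    # batch of five 4-feature samples
ŷ = m(x)                                   # TrackedMatrix of class probabilities
Flux.onecold(ŷ, ["cat", "dog", "bird"])    # plain Vector of labels, one per column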
diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index c4828c9e..5bb38d1e 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -1,23 +1,12 @@ module Optimise export train!, - SGD, ADAM, ADAMW, AdaMax, Momentum, Nesterov, - RMSProp, ADAGrad, ADADelta, AMSGrad, NADAM, stop, StopException - -struct Param{T} - x::T - Δ::T -end - -Param(x::AbstractArray) = Param(x, zero(x)) + SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, + ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW, + InvDecay, ExpDecay, WeightDecay, stop, Optimiser include("optimisers.jl") -include("interface.jl") include("train.jl") - -using Flux.Tracker: TrackedArray - -Param(x::TrackedArray) = Param(x.data, x.grad) -# Base.convert(::Type{Param}, x::TrackedArray) = Param(x.data, x.grad) +include("deprecations.jl") end diff --git a/src/optimise/deprecations.jl b/src/optimise/deprecations.jl new file mode 100644 index 00000000..34853bf6 --- /dev/null +++ b/src/optimise/deprecations.jl @@ -0,0 +1,126 @@ +using Base: depwarn +using Flux: Params + +check_decay(opt, decay) = decay == 0 ? opt : Optimiser(opt, InvDecay(decay)) + +# legacy update rule +updaterule(opt, ps) = () -> update!(opt, ps) + +function SGD(params::Union{AbstractArray, Params}, η = 0.1; decay = 0.) + depwarn("SGD(params) is deprecated; use Descent(η::Float64) instead", :SGD) + + ps = params + opt = Descent(η) + opt = check_decay(opt, decay) + updaterule(opt, ps) +end + +function Momentum(params::Union{AbstractArray, Params}, η = 0.01; ρ = 0.9, decay = 0.) + depwarn("Momentum(params) is deprecated; use Momentum(η::Float64) instead", :Momentum) + + ps = params + opt = Momentum(η, ρ) + opt = check_decay(opt, decay) + updaterule(opt, ps) +end + +function Nesterov(params::Union{AbstractArray, Params}, η = 0.001; ρ = 0.9, decay = 0.) + depwarn("Nesterov(params) is deprecated; use Nesterov(η::Float64) instead", :Nesterov) + + ps = params + opt = Nesterov(η, ρ) + opt = check_decay(opt, decay) + updaterule(opt, ps) +end + +function RMSProp(params::Union{AbstractArray, Params}, η = 0.001; ρ = 0.9, decay = 0.) + depwarn("RMSProp(params) is deprecated; use RMSProp(η::Float64) instead", :RMSProp) + + ps = params + opt = RMSProp(η, ρ) + opt = check_decay(opt, decay) + updaterule(opt, ps) +end + +function ADAM(params::Union{AbstractArray, Params}, η = 0.001; β1 = 0.9, β2 = 0.999, decay = 0.) + depwarn("ADAM(params) is deprecated; use ADAM(η::Float64) instead", :ADAM) + + ps = params + β = (β1, β2) + opt = ADAM(η, β) + opt = check_decay(opt, decay) + updaterule(opt, ps) +end + +function ADAGrad(params::Union{AbstractArray, Params}, η::Float64 = 0.1; decay = 0.) + depwarn("ADAGrad(params) is deprecated; use ADAGrad(η::Float64) instead", :ADAGrad) + + ps = params + opt = ADAGrad(η) + opt = check_decay(opt, decay) + updaterule(opt, ps) +end + +function ADADelta(params::Union{AbstractArray, Params}, ρ::Float64 = 0.9; decay = 0.) + depwarn("ADADelta(params) is deprecated; use ADADelta(η::Float64) instead", :ADADelta) + + ps = params + opt = ADADelta(ρ) + opt = check_decay(opt, decay) + updaterule(opt, ps) +end + +function AdaMax(params::Union{AbstractArray, Params}, η = 0.001; β1 = 0.9, β2 = 0.999, decay = 0.) + depwarn("AdaMax(params) is deprecated; use AdaMax(η::Float64) instead", :AdaMax) + + ps = params + β = (β1, β2) + opt = AdaMax(η, β) + opt = check_decay(opt, decay) + updaterule(opt, ps) +end + +function AMSGrad(params::Union{AbstractArray, Params}, η = 0.001; β1 = 0.9, β2 = 0.999, decay = 0.) 
+ depwarn("AMSGrad(params) is deprecated; use AMSGrad(η::Float64) instead", :AMSGrad) + + ps = params + β = (β1, β2) + opt = AMSGrad(η, β) + opt = check_decay(opt, decay) + updaterule(opt, ps) +end + +function NADAM(params::Union{AbstractArray, Params}, η = 0.001; β1 = 0.9, β2 = 0.999, decay = 0.) + depwarn("NADAM(params) is deprecated; use NADAM(η::Float64) instead", :NADAM) + + ps = params + β = (β1, β2) + opt = NADAM(η, β) + opt = check_decay(opt, decay) + updaterule(opt, ps) +end + +function ADAMW(params::Union{AbstractArray, Params}, η = 0.001; β1 = 0.9, β2 = 0.999, decay = 0.) + depwarn("ADAMW(params) is deprecated; use ADAMW(η::Float64) instead", :ADAMW) + + ps = params + β = (β1, β2) + opt = ADAMW(η, β) + opt = check_decay(opt, decay) + decay != 0 && (opt = Optimiser(opt, WeightDecay(decay))) + updaterule(opt, ps) +end + +# Old training loop + +struct OldOptimiser + func +end + +update!(opt::OldOptimiser, ps) = opt.func() + +# Train function +function train!(loss, data, opt; cb = () -> ()) + depwarn("train!(loss, data, opt) is deprecated; use train!(loss, params, data, opt) instead", :train!) + train!(loss, (), data, OldOptimiser(opt); cb = cb) +end diff --git a/src/optimise/interface.jl b/src/optimise/interface.jl deleted file mode 100644 index 096e2d87..00000000 --- a/src/optimise/interface.jl +++ /dev/null @@ -1,110 +0,0 @@ -call(f, xs...) = f(xs...) - -# note for optimisers: set to zero -# p.Δ at the end of the weights update -function optimiser(ps, fs...) - ps = [Param(p) for p in ps] - fs = map(ps) do p - os = map(f -> f(p), fs) - () -> foreach(call, os) - end - () -> foreach(call, fs) -end - -""" - SGD(params, η = 0.1; decay = 0) - -Classic gradient descent optimiser with learning rate `η`. -For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`. - -Supports inverse decaying learning rate if the `decay` argument is provided. -""" -SGD(ps, η = 0.1; decay = 0) = - optimiser(ps, p -> invdecay(p, decay), p -> descent(p,η)) - -""" - Momentum(params, η = 0.01; ρ = 0.9, decay = 0) - -SGD with learning rate `η`, momentum `ρ` and optional learning rate inverse decay. -""" -Momentum(ps, η = 0.01; ρ = 0.9, decay = 0) = - optimiser(ps, p->invdecay(p,decay), p->momentum(p, ρ, η), p->descent(p,1)) - -""" - Nesterov(params, η = 0.01; ρ = 0.9, decay = 0) - -SGD with learning rate `η`, Nesterov momentum `ρ` and optional learning rate inverse decay. -""" -Nesterov(ps, η = 0.01; ρ = 0.9, decay = 0) = - optimiser(ps, p->invdecay(p,decay), p->nesterov(p, ρ, η), p->descent(p,1)) - -""" - RMSProp(params, η = 0.001; ρ = 0.9, ϵ = 1e-8, decay = 0) - -[RMSProp](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) -optimiser. Parameters other than learning rate don't need tuning. Often a good -choice for recurrent networks. -""" -RMSProp(ps, η = 0.001; ρ = 0.9, ϵ = 1e-8, decay = 0) = - optimiser(ps, p->rmsprop(p; η=η, ρ=ρ, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1)) - -""" - ADAM(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) - -[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. -""" -ADAM(ps, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) = - optimiser(ps, p->adam(p; η=η, β1=β1, β2=β2, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1)) - -""" - ADAMW((params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) - -[ADAMW](https://arxiv.org/abs/1711.05101) fixing weight decay regularization in Adam. 
-""" -ADAMW(ps, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) = - optimiser(ps, p->adam(p; η=η, β1=β1, β2=β2, ϵ=ϵ), p->descentweightdecay(p,1,decay)) - -""" - AdaMax(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) - -[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on -the ∞-norm. -""" -AdaMax(ps, η = 0.002; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) = - optimiser(ps, p->adamax(p; η=η, β1=β1, β2=β2, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1)) - -""" - ADAGrad(params, η = 0.01; ϵ = 1e-8, decay = 0) - -[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. -Parameters don't need tuning. -""" -ADAGrad(ps, η = 0.01; ϵ = 1e-8, decay = 0) = - optimiser(ps, p->adagrad(p; η=η, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1)) - -""" - ADADelta(params; ρ = 0.9, ϵ = 1e-8, decay = 0) - -[ADADelta](http://arxiv.org/abs/1212.5701) optimiser. Parameters don't need -tuning. -""" -ADADelta(ps; ρ = 0.9, ϵ = 1e-8, decay = 0) = - optimiser(ps, p->adadelta(p; ρ=ρ, ϵ=ϵ), p->descent(p,1)) - -""" - AMSGrad(params; η = 0.001, β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) - -[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need -tuning. -""" -AMSGrad(ps, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) = - optimiser(ps, p -> amsgrad(p; η = η, β1 = β1, β2 = β2, ϵ = ϵ), p -> invdecay(p, decay), p -> descent(p, 1)) - -""" - NADAM(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) - -[NADAM](https://openreview.net/pdf?id=OM0jvwB8jIp57ZJjtNEZ) optimiser. Parameters other -than learning rate don't need tuning. -""" -NADAM(ps, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08, decay = 0) = - optimiser(ps, p->nadam(p; η=η, β1=β1, β2=β2, ϵ=ϵ), p->invdecay(p,decay), p->descent(p,1)) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 1f7a7c9c..1c7957ee 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -1,130 +1,327 @@ -function descent(p::Param, η::Real) - function () - @. p.x -= η * p.Δ - @. p.Δ = 0 +using Flux +using Base: @get! +using MacroTools: @forward + +const ϵ = 1e-8 + +# TODO: should use weak refs + +""" + Descent(η) + +Classic gradient descent optimiser with learning rate `η`. +For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`. +""" +mutable struct Descent + eta::Float64 +end + +Descent() = Descent(0.1) + +function update!(o::Descent, x, Δ) + Δ .*= o.eta +end + +""" + Momentum(params, η = 0.01; ρ = 0.9) + +Gradient descent with learning rate `η` and momentum `ρ`. +""" +mutable struct Momentum + eta::Float64 + rho::Float64 + velocity::IdDict +end + +Momentum(η = 0.01, ρ = 0.9) = Momentum(η, ρ, IdDict()) + +function update!(o::Momentum, x, Δ) + η, ρ = o.eta, o.rho + v = get!(o.velocity, x, zero(x))::typeof(x) + @. v = ρ * v - η * Δ + @. Δ = -v +end + +""" + Nesterov(eta, ρ = 0.9) + +Gradient descent with learning rate `η` and Nesterov momentum `ρ`. +""" +mutable struct Nesterov + eta::Float64 + rho::Float64 + velocity::IdDict +end + +Nesterov(η = 0.001, ρ = 0.9) = Nesterov(η, ρ, IdDict()) + +function update!(o::Nesterov, x, Δ) + η, ρ = o.eta, o.rho + v = get!(o.velocity, x, zero(x))::typeof(x) + d = @. ρ^2 * v - (1+ρ) * η * Δ + @. v = ρ*v - η*Δ + @. Δ = -d +end + +""" + RMSProp(η = 0.001, ρ = 0.9) + +[RMSProp](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) +optimiser. Parameters other than learning rate don't need tuning. Often a good +choice for recurrent networks. 
+""" +mutable struct RMSProp + eta::Float64 + rho::Float64 + acc::IdDict +end + +RMSProp(η = 0.001, ρ = 0.9) = RMSProp(η, ρ, IdDict()) + +function update!(o::RMSProp, x, Δ) + η, ρ = o.eta, o.rho + acc = get!(o.acc, x, zero(x))::typeof(x) + @. acc = ρ * acc + (1 - ρ) * Δ^2 + @. Δ *= η / (√acc + ϵ) +end + +""" + ADAM(η = 0.001, β = (0.9, 0.999)) + +[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. +""" +mutable struct ADAM + eta::Float64 + beta::Tuple{Float64,Float64} + state::IdDict +end + +ADAM(η = 0.001, β = (0.9, 0.999)) = ADAM(η, β, IdDict()) + +function update!(o::ADAM, x, Δ) + η, β = o.eta, o.beta + mt, vt, βp = get!(o.state, x, (zero(x), zero(x), β)) + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ^2 + @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + ϵ) * η + o.state[x] = (mt, vt, βp .* β) + return Δ +end + +""" + AdaMax(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08) + +[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on +the ∞-norm. +""" +mutable struct AdaMax + eta::Float64 + beta::Tuple{Float64,Float64} + state::IdDict +end + +AdaMax(η = 0.001, β = (0.9, 0.999)) = AdaMax(η, β, IdDict()) + +function update!(o::AdaMax, x, Δ) + η, β = o.eta, o.beta + mt, ut, βp = get!(o.state, x, (zero(x), zero(x), β)) + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. ut = max(β[2] * ut, abs(Δ)) + @. Δ = (η/(1 - βp[1])) * mt/(ut + ϵ) + o.state[x] = (mt, ut, βp .* β) + return Δ +end + +""" + ADAGrad(η = 0.1; ϵ = 1e-8) + +[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. +Parameters don't need tuning. +""" +mutable struct ADAGrad + eta::Float64 + acc::IdDict +end + +ADAGrad(η = 0.1) = ADAGrad(η, IdDict()) + +function update!(o::ADAGrad, x, Δ) + η = o.eta + acc = get!(o.acc, x, fill(ϵ, size(x)))::typeof(x) + @. acc += Δ^2 + @. Δ *= η / (√acc + ϵ) +end + +""" + ADADelta(ρ = 0.9, ϵ = 1e-8) + +[ADADelta](http://arxiv.org/abs/1212.5701) optimiser. Parameters don't need +tuning. +""" +mutable struct ADADelta + rho::Float64 + state::IdDict +end + +ADADelta(ρ = 0.9) = ADADelta(ρ, IdDict()) + +function update!(o::ADADelta, x, Δ) + ρ = o.rho + acc, Δacc = get!(o.state, x, (zero(x), zero(x))) + @. acc = ρ * acc + (1 - ρ) * Δ^2 + @. Δ *= √Δacc/ (√acc + ϵ) + @. Δacc = ρ * Δacc + (1 - ρ) * Δ^2 + return Δ +end + +""" + AMSGrad(η = 0.001, β = (0.9, 0.999)) + +[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need +tuning. +""" +mutable struct AMSGrad + eta::Float64 + beta::Tuple{Float64, Float64} + state::IdDict +end + +AMSGrad(η = 0.001, β = (0.9, 0.999)) = AMSGrad(η, β, IdDict()) + +function update!(o::AMSGrad, x, Δ) + η, β = o.eta, o.beta + mt, vt, v̂t = get!(o.state, x, (fill(ϵ, size(x)), fill(ϵ, size(x)), fill(ϵ, size(x)))) + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2 + @. v̂t = max.(v̂t, vt) + @. Δ = η * mt / (√v̂t + ϵ) +end + +""" + NADAM(η = 0.001, β = (0.9, 0.999)) + +[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. Parameters don't need +tuning. +""" +mutable struct NADAM + eta::Float64 + beta::Tuple{Float64, Float64} + state::IdDict +end + +NADAM(η = 0.001, β = (0.9, 0.999)) = NADAM(η, β, IdDict()) + +function update!(o::NADAM, x, Δ) + η, β = o.eta, o.beta + β1p, β2p = o.beta + mt, vt = get!(o.state, x, (zero(x), zero(x))) + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ^2 + @. 
Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / (√(vt * β[2] / (1 - β2p)) + ϵ) * η + o.state[x] = (mt, vt, (β1p * β[1], β2p * β[2])) + return Δ +end + +""" + ADAMW((η = 0.001, β = (0.9, 0.999), decay = 0) + +[ADAMW](https://arxiv.org/abs/1711.05101) fixing weight decay regularization in Adam. +""" +ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) = + Optimiser(ADAM(η, β), WeightDecay(decay)) + +# Compose optimizers + +""" + Optimiser(a, b, c...) + +Combine several optimisers into one; each optimiser produces a modified gradient +that will be fed into the next, and this is finally applied to the parameter as +usual. +""" +mutable struct Optimiser + os::Vector{Any} +end + +Optimiser(o...) = Optimiser(Any[o...]) + +@forward Optimiser.os Base.getindex, Base.first, Base.last, Base.lastindex, Base.push!, Base.setindex! +@forward Optimiser.os Base.iterate + +Base.getindex(c::Optimiser, i::AbstractArray) = Optimiser(c.os[i]...) + +function update!(o::Optimiser, x, Δ) + for opt in o.os + Δ = update!(opt, x, Δ) end + return Δ end -# Ref: https://arxiv.org/abs/1711.05101.pdf -function descentweightdecay(p::Param, η::Real, γ::Real) - function () - @. p.x = p.x - η * (p.Δ + γ * p.x) - @. p.Δ = 0 +""" +`InvDecay(γ)` + +Apply inverse time decay to an optimiser +```julia + Optimiser(InvDecay(..), Opt(..)) +``` +""" +mutable struct InvDecay + gamma::Float64 + state::IdDict +end + +InvDecay(γ = 0.001) = InvDecay(γ, IdDict()) + +function update!(o::InvDecay, x, Δ) + γ = o.gamma + n = get!(o.state, x, 1) + Δ .*= 1 / (1 + γ * n) + o.state[x] = n + 1 + return Δ +end + +""" +`ExpDecay(eta, decay, decay_step, clip)` + +Schedule the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`. + +To apply exponential decay to an optimiser: +```julia + Optimiser(ExpDecay(..), Opt(..)) +``` +""" +mutable struct ExpDecay + eta::Float64 + decay::Float64 + step::Int64 + clip::Float64 + current::IdDict +end + +ExpDecay(opt = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4) = ExpDecay(opt, decay, decay_step, clip, IdDict()) + +function update!(o::ExpDecay, x, Δ) + η, s, decay = o.eta, o.step, o.decay + n = o.current[x] = get(o.current, x, 0) + 1 + if o.current[x]%s == 0 && count(x -> x%s == 0, values(o.current)) == 1 + η = max(η * decay^(s / n), o.clip) + o.eta = η end + @. Δ *= decay end -function momentum(p::Param, ρ, η) - v = zero(p.x) - function () - @. v = ρ * v - η * p.Δ - @. p.Δ = -v - end +""" +`WeightDecay(wd)` + +Decay the weight parameter by `wd` +""" +mutable struct WeightDecay + wd::Real end -# Ref. https://arxiv.org/pdf/1212.0901.pdf -function nesterov(p::Param, ρ, η) - v = zero(p.x) - function () - d = @. ρ^2 * v - (1+ρ) * η * p.Δ - @. v = ρ*v - η*p.Δ - @. p.Δ = -d - end -end +WeightDecay() = WeightDecay(0) -function rmsprop(p::Param; η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = 1e-8) - acc = zero(p.x) - function () - @. acc = ρ * acc + (1 - ρ) * p.Δ^2 - @. p.Δ *= η / √(acc + ϵ) - end -end - -function adagrad(p::Param; η::Real = 0.01, ϵ::Real = 1e-8) - acc = zero(p.x) .+ ϵ - function () - @. acc += p.Δ^2 - @. p.Δ *= η / √(acc + ϵ) - end -end - -function adadelta(p::Param; ρ::Real = 0.9, ϵ::Real = 1e-8) - acc = zero(p.x) - Δacc = zero(p.x) - function () - @. acc = ρ * acc + (1 - ρ) * p.Δ^2 - @. p.Δ *= √(Δacc + ϵ) / √(acc + ϵ) - @. Δacc = ρ * Δacc + (1 - ρ) * p.Δ^2 - end -end - -function adam(p::Param; η::Real = 0.001, β1::Real = 0.9, β2::Real = 0.999, ϵ::Real = 1e-8) - mt = zero(p.x) - vt = zero(p.x) - β1p, β2p = β1, β2 - function () - @. mt = β1 * mt + (1 - β1) * p.Δ - @. 
vt = β2 * vt + (1 - β2) * p.Δ^2 - @. p.Δ = mt / (1 - β1p) / √(vt / (1 - β2p) + ϵ) * η - β1p *= β1 - β2p *= β2 - end -end - -function adamax(p::Param; η::Real = 0.002, β1::Real = 0.9, β2::Real = 0.999, ϵ::Real = 1e-8) - mt = zero(p.x) - ut = zero(p.x) - β1p = β1 - function () - @. mt = β1 * mt + (1 - β1) * p.Δ - @. ut = max(β2 * ut, abs(p.Δ)) - @. p.Δ = (η/(1 - β1p)) * mt/(ut + ϵ) - β1p *= β1 - end -end - -function amsgrad(p::Param; η::Real = 0.001, β1::Real = 0.9, β2::Real = 0.999, ϵ::Real = 1e-8) - mt = zero(p.x) - vt = zero(p.x) .+ ϵ - v̂t = zero(p.x) .+ ϵ - function () - @. mt = β1 * mt + (1 - β1) * p.Δ - @. vt = β2 * vt + (1 - β2) * p.Δ ^ 2 - @. v̂t = max.(v̂t, vt) - @. p.Δ = η * mt / √v̂t - end -end - -function nadam(p::Param; η::Real = 0.001, β1::Real = 0.9, β2::Real = 0.999, ϵ::Real = 1e-8) - mt = zero(p.x) - vt = zero(p.x) - β1p, β2p = β1, β2 - function () - @. mt = β1 * mt + (1 - β1) * p.Δ - @. vt = β2 * vt + (1 - β2) * p.Δ^2 - @. p.Δ = (β1 * mt / (1 - β1 * β1p) + (1 - β1) * p.Δ / (1 - β1p)) / √(vt * β2 / (1 - β2p) + ϵ) * η - β1p *= β1 - β2p *= β2 - end -end - -clip(p::Param, thresh::Real) = () -> clamp!(p.Δ, -thresh, thresh) - -function expdecay(p::Param, γ::Real) - if γ != 0 - return () -> p.Δ .+= γ .* p.x - else - return () -> nothing - end -end - -function invdecay(p::Param, γ::Real) - if γ != 0 - n = 0 - return () -> begin - p.Δ .*= 1 / (1 + γ * n) - n += 1 - end - else - return () -> nothing - end +function update!(o::WeightDecay, x, Δ) + wd = o.wd + @. Δ += wd * x end diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 09893873..cd8296ce 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -1,7 +1,17 @@ using Juno -using Flux.Tracker: back! +using Flux.Tracker: data, grad, back! import Base.depwarn +function update!(opt, xs) + for x in xs + Δ = update!(opt, x.data, x.grad) + x.data .-= Δ + Δ .= 0 + end +end + +# Callback niceties +call(f, xs...) = f(xs...) runall(f) = f runall(fs::AbstractVector) = () -> foreach(call, fs) @@ -35,7 +45,7 @@ function stop() end """ - train!(loss, data, opt) + train!(loss, params, data, opt; cb) For each datapoint `d` in `data` computes the gradient of `loss(d...)` through backpropagation and calls the optimizer `opt`. @@ -44,22 +54,22 @@ Takes a callback as keyword argument `cb`. For example, this will print "trainin every 10 seconds: ```julia -Flux.train!(loss, data, opt, +Flux.train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10)) ``` -The callback can return `:stop` to interrupt the training loop. +The callback can call `Flux.stop()` to interrupt the training loop. Multiple optimisers and callbacks can be passed to `opt` and `cb` as arrays. """ -function train!(loss, data, opt; cb = () -> ()) +function train!(loss, ps, data, opt; cb = () -> ()) cb = runall(cb) opt = runall(opt) @progress for d in data try l = loss(d...) @interrupts back!(l) - opt() + update!(opt, ps) if cb() == :stop depwarn("Use of `:stop` is deprecated; use `Flux.stop()` instead", :stop) break diff --git a/src/tracker/Tracker.jl b/src/tracker/Tracker.jl index 190837ab..010f9f4f 100644 --- a/src/tracker/Tracker.jl +++ b/src/tracker/Tracker.jl @@ -5,7 +5,8 @@ using MacroTools: @q, @forward import Base: == -export TrackedArray, TrackedVector, TrackedMatrix, Params, param, back! +export TrackedArray, TrackedVector, TrackedMatrix, Params, gradient, + param, back! tracker(x) = nothing @@ -60,17 +61,11 @@ macro grad(ex) @q(Tracker._forward($(args...)) where $(T...) 
= $body) |> esc end -function update!(x, Δ) - x.data .+= data(Δ) - tracker(x).grad .= 0 - return x -end - include("idset.jl") include("back.jl") -include("scalar.jl") -include("array.jl") include("numeric.jl") +include("lib/real.jl") +include("lib/array.jl") """ hook(f, x) -> x′ @@ -99,7 +94,8 @@ end nobacksies(f, x) = track(nobacksies, f, x) nobacksies(f, xs::Tuple) = map(x -> nobacksies(f, x), xs) -@grad nobacksies(f, x) = data(x), Δ -> error("Nested AD not defined for $f") +@grad nobacksies(f::Symbol, x) = data(x), Δ -> error("Nested AD not defined for $f") +@grad nobacksies(f::String, x) = data(x), Δ -> error(f) param(x::Number) = TrackedReal(float(x)) param(xs::AbstractArray) = TrackedArray(float.(xs)) @@ -108,10 +104,8 @@ param(xs::AbstractArray) = TrackedArray(float.(xs)) param(x::TrackedReal) = track(identity, x) param(x::TrackedArray) = track(identity, x) -import NNlib.cudata -import Adapt.adapt +import Adapt: adapt, adapt_structure -cudata(x::TrackedArray) = data(x) -adapt(T, xs::TrackedArray) = param(adapt(T, data(xs))) +adapt_structure(T, xs::TrackedArray) = param(adapt(T, data(xs))) end diff --git a/src/tracker/back.jl b/src/tracker/back.jl index e5a84a71..a8a6e2f1 100644 --- a/src/tracker/back.jl +++ b/src/tracker/back.jl @@ -19,62 +19,87 @@ function scan(x) return end -function back_(c::Call, Δ) +function back_(c::Call, Δ, once) Δs = c.func(Δ) (Δs isa Tuple && length(Δs) >= length(c.args)) || error("Gradient is not a tuple of length $(length(c.args))") - foreach(back, c.args, data.(Δs)) + foreach((x, d) -> back(x, d, once), c.args, data.(Δs)) end -back_(::Call{Nothing}, Δ) = nothing +back_(::Call{Nothing}, Δ, once) = nothing +back_(::Call{Missing}, Δ, once) = error("`back!` was already used") accum!(x, Δ) = x .+ Δ accum!(x::AbstractArray, Δ) = (x .+= Δ) -function back(x::Tracked, Δ) +function back(x::Tracked, Δ, once) x.isleaf && (x.grad = accum!(x.grad, Δ); return) ref = x.ref -= 1 - if ref > 0 || isdefined(x, :grad) - if isdefined(x, :grad) - x.grad = accum!(x.grad, Δ) - else - x.grad = Δ - end - ref == 0 && back_(x.f, x.grad) + grad = if isdefined(x, :grad) + x.grad = accum!(x.grad, Δ) + elseif ref > 0 + x.grad = Δ else - ref == 0 && back_(x.f, Δ) + Δ + end + if ref == 0 + back_(x.f, grad, once) + once && !x.isleaf && (x.f = Call(missing, ())) end return end -back(::Nothing, _) = return +back(::Nothing, Δ, once) = return # Interface methods # TODO: if an error occurs in `back` the refcounts will be broken # and `back` will silently fail to update. +# (but only if you re-use intermediate values between passes) # Refcounts are also probably not safe in some situations (e.g. back called # from within a backpropagator) -function back!(x, Δ) +function back!(x, Δ; once = true) istracked(x) || return scan(x) - back(tracker(x), Δ) + back(tracker(x), Δ, once) return end +function gradient_(f, xs...) + xs = param.(data.(xs)) + l = f(xs...) + losscheck(l) + back!(l) + nobacksies("Use `gradient(...; nest = true)` for nested derivatives", + grad.(xs)) +end + # Out-of-place gradients struct Params - params::IdSet - Params(xs) = new(IdSet(xs)) + order::Vector{Any} + params::IdSet{Any} + Params() = new([], IdSet()) end -@forward Params.params Base.iterate, Base.length +@forward Params.order Base.iterate, Base.length + +function Base.push!(ps::Params, x) + if !(x in ps.params) + push!(ps.order, x) + push!(ps.params, x) + end + return ps +end + +Base.push!(ps::Params, x...) = (foreach(x -> push!(ps, x), x); ps) + +Params(xs) = push!(Params(), xs...) 
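# `Params` is now an ordered, de-duplicating collection: the `order` vector
# drives iteration while the IdSet guards against double insertion, so
# parameters are visited deterministically. A quick sketch:
using Flux.Tracker: Params, param

W, b = param(rand(3, 3)), param(rand(3))
ps = Params([W, b, W])      # the duplicate W is dropped
length(ps) == 2             # true
first(ps) === W             # true — iteration follows insertion order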
function Base.show(io::IO, ps::Params) print(io, "Params([") - join(io, ps.params, ", ") + join(io, ps.order, ", ") print(io, "])") end @@ -91,12 +116,12 @@ Grads() = Grads(IdDict()) Grads(ps::Params) = Grads(IdDict(tracker(p) => init_grad(data(p)) for p in ps)) Base.getindex(g::Grads, x::Tracked) = g.grads[x] + function Base.getindex(g::Grads, x) istracked(x) || error("Object not tracked: $x") g[tracker(x)] end - accum!(g::Grads, x, Δ) = g[x] = haskey(g, x) ? g[x] .+ Δ : Δ function back_(g::Grads, c::Call, Δ) @@ -146,20 +171,13 @@ function losscheck(x) isnan(x) && error("Loss is NaN") end -function gradient(f, args...) +function gradient_nested(f, args...) y, back = forward(f, args...) losscheck(y) return back(1) end -derivative(f, x) = gradient(f, x)[1] +gradient(f, xs...; nest = false) = + nest ? gradient_nested(f, xs...) : gradient_(f, xs...) -# Non-nesting versions - -function gradient_(f, xs...) - xs = param.(xs) - l = f(xs...) - losscheck(l) - back!(l) - grad.(xs) -end +gradient(f, ps::Params) = gradient_nested(f, ps) diff --git a/src/tracker/idset.jl b/src/tracker/idset.jl index 62570c99..372e262a 100644 --- a/src/tracker/idset.jl +++ b/src/tracker/idset.jl @@ -7,6 +7,7 @@ Base.eltype(::IdSet{T}) where T = T IdSet() = IdSet{Any}() +Base.push!(s::IdSet) = s Base.push!(s::IdSet{T}, x::T) where T = (s.dict[x] = nothing; s) Base.delete!(s::IdSet{T}, x::T) where T = (delete!(s.dict, x); s) Base.in(x, s::IdSet) = haskey(s.dict, x) diff --git a/src/tracker/array.jl b/src/tracker/lib/array.jl similarity index 75% rename from src/tracker/array.jl rename to src/tracker/lib/array.jl index 3d9836d0..690b0e18 100644 --- a/src/tracker/array.jl +++ b/src/tracker/lib/array.jl @@ -33,8 +33,18 @@ TrackedArray(x::AbstractArray) = TrackedArray(Call(), x, zero(x)) Base.eltype(x::Type{<:TrackedArray{T}}) where T <: Real = TrackedReal{T} -Base.show(io::IO, ::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} = - print(io, "TrackedArray{…,$A}") +Base.convert(::Type{T}, x::S) where {T<:TrackedArray,S<:T} = x + +Base.convert(::Type{<:TrackedArray}, x::TrackedArray) = + error("Not implemented: convert $(typeof(x)) to $T") + +Base.convert(::Type{<:TrackedArray{T,N,A}}, x::AbstractArray) where {T,N,A} = + TrackedArray(convert(A, x)) + +Base.show(io::IO, t::Type{TrackedArray{T,N,A}}) where {T,N,A<:AbstractArray{T,N}} = + @isdefined(A) ? + print(io, "TrackedArray{…,$A}") : + invoke(show, Tuple{IO,DataType}, io, t) function Base.summary(io::IO, x::TrackedArray) print(io, "Tracked ") @@ -43,11 +53,24 @@ end Base.print_array(io::IO, x::TrackedArray) = Base.print_array(io, data(x)) +function Base.show(io::IO, x::TrackedArray) + show(io, data(x)) + print(io, " (tracked)") +end + +Base.copy(x::TrackedArray) = x + Base.setindex!(xs::TrackedArray, v, i...) = error("Can't differentiate `setindex!`") back!(::TrackedArray) = error("Value is not scalar; use `back!(sum(x))` or `back!(x, Δ)`") +function update!(x::TrackedArray, Δ) + x.data .+= data(Δ) + tracker(x).grad .= 0 + return x +end + # Fallthrough methods for f in :[Base.size, Base.ndims, Base.collect].args @@ -80,6 +103,17 @@ Base.getindex(xs::TrackedArray, i...) = track(getindex, xs, i...) end end +Base.view(x::TrackedArray, inds...) = track(Base.view, x, inds...) + +@grad function view(x::AbstractArray, inds...) + view(data(x), inds...), function (Δ) + grad_output = zero(x) + subgrad = view(grad_output, inds...) + subgrad[:] = data(Δ) + (nobacksies(:view, grad_output), map(_->nothing, inds)...) 
+ end +end + Base.:-(xs::TrackedArray) = track(-, xs) @grad -(xs) = -data(xs), Δ -> (-Δ,) @@ -87,8 +121,8 @@ Base.:-(xs::TrackedArray) = track(-, xs) Base.transpose(xs::TrackedArray) = track(transpose, xs) Base.adjoint(xs::TrackedArray) = track(adjoint, xs) -@grad transpose(xs) = transpose(data(xs)), Δ -> (reshape(transpose(Δ), size(xs)),) -@grad adjoint(xs) = data(xs)', Δ -> (reshape(Δ', size(xs)),) +@grad transpose(xs) = transpose(data(xs)), Δ -> (trim(xs, transpose(Δ)),) +@grad adjoint(xs) = data(xs)', Δ -> (trim(xs, Δ'),) Base.repeat(xs::TrackedArray; kw...) = track(repeat, xs; kw...) @@ -108,30 +142,28 @@ Base.repeat(xs::TrackedArray; kw...) = track(repeat, xs; kw...) end end -for f in [:vcat, :hcat] - UArray = :(Union{TrackedArray,Vector,Matrix,Adjoint,Transpose}) - @eval begin - # This section is a bit of a hack since julia doesn't have a standardised - # promotion mechanism for concatenation yet - # https://github.com/JuliaLang/julia/pull/20815 +function combinations(xs, n) + n < 1 && return [[]] + cs = combinations(xs, n-1) + [[x, c...] for x in xs, c in cs] +end - # It should support tracked concatenation with rank ∈ (1,2) with a - # TrackedArray anywhere among the arguments This works as long as base has - # other functions that captures `(::Union{Vector,RowVector,Matrix}...)`. - Base.$f(a::$UArray...) = track($f, a...) +for i = 0:2, c = combinations([:AbstractArray, :TrackedArray], i), f = [:hcat, :vcat] + cnames = map(_ -> gensym(), c) + @eval Base.$f($([:($x::$c) for (x, c) in zip(cnames, c)]...), x::TrackedArray, xs::AbstractArray...) = + track($f, $(cnames...), x, xs...) +end - # It should support tracked concatenation with rank>2 if the TrackedArray is - # first - Base.$f(a::TrackedArray, b::AbstractArray...) = track($f, a, b...) - Base.$f(a::TrackedArray, b::$UArray...) = track($f, a, b...) # resolves ambiguity introduced by previous row +for i = 0:2, c = combinations([:AbstractVecOrMat, :TrackedVecOrMat], i), f = [:hcat, :vcat] + cnames = map(_ -> gensym(), c) + @eval Base.$f($([:($x::$c{T}) for (x, c) in zip(cnames, c)]...), x::TrackedVecOrMat{T}, xs::AbstractVecOrMat{T}...) where T = + track($f, $(cnames...), x, xs...) +end - # It should support tracked concatenation with rank>2 if the TrackedArray is - # second - Base.$f(a::Array, b::TrackedArray, c::AbstractArray...) = track($f, a, b, c...) - Base.$f(a::Union{Vector,Matrix,Adjoint,Transpose}, b::TrackedArray, - c::$UArray...) = - track($f, a, b, c...) # resolves ambiguity introduced by previous row - end +for i = 0:2, c = combinations([:AbstractVector, :TrackedVector], i), f = [:hcat, :vcat] + cnames = map(_ -> gensym(), c) + @eval Base.$f($([:($x::$c{T}) for (x, c) in zip(cnames, c)]...), x::TrackedVector{T}, xs::AbstractVector{T}...) where T = + track($f, $(cnames...), x, xs...) end @grad function vcat(xs...) 
@@ -164,10 +196,11 @@ end end end -Base.cat(a::TrackedArray; dims) = track(cat, a, dims = dims) -Base.cat(a::TrackedArray, b::TrackedArray, c::AbstractArray...; dims) = track(cat, a, b, c..., dims = dims) -Base.cat(a::TrackedArray, b::AbstractArray, c::AbstractArray...; dims) = track(cat, a, b, c..., dims = dims) -Base.cat(a::AbstractArray, b::TrackedArray, c::AbstractArray...; dims) = track(cat, a, b, c..., dims = dims) +for i = 0:2, c = combinations([:AbstractArray, :TrackedArray], i) + cnames = map(_ -> gensym(), c) + @eval Base.cat($([:($x::$c) for (x, c) in zip(cnames, c)]...), x::TrackedArray, xs::AbstractArray...; dims) = + track(cat, $(cnames...), x, xs..., dims = dims) +end @grad function cat(Xs...; dims) cat(data.(Xs)..., dims = dims), function (Δ) @@ -307,8 +340,8 @@ end # BLAS -LinearAlgebra.diagm(x::TrackedVector) = track(diagm, x) -@grad diagm(x) = diagm(data(x)), Δ -> (diag(Δ),) +LinearAlgebra.diagm(x::Pair{<:Integer, <:TrackedVector}) = track(diagm, x...) +@grad diagm(i, x) = diagm(i => data(x)), Δ -> (nothing, diag(Δ, i)) x::TrackedMatrix * y::AbstractMatrix = track(*, x, y) x::AbstractMatrix * y::TrackedMatrix = track(*, x, y) @@ -328,7 +361,7 @@ x::TrackedVector * y::TrackedVector = track(*, x, y) # NNlib using NNlib -import NNlib: softmax, ∇softmax, logsoftmax, ∇logsoftmax, conv, maxpool, meanpool +import NNlib: softmax, ∇softmax, logsoftmax, ∇logsoftmax, conv, depthwiseconv, maxpool, meanpool softmax(xs::TrackedArray) = track(softmax, xs) @@ -338,6 +371,16 @@ logsoftmax(xs::TrackedArray) = track(logsoftmax, xs) @grad logsoftmax(xs) = logsoftmax(data(xs)), Δ -> (nobacksies(:logsoftmax, ∇logsoftmax(data(Δ), data(xs))),) +depthwiseconv(x::TrackedArray, w::TrackedArray; kw...) = track(depthwiseconv, x, w; kw...) +depthwiseconv(x::AbstractArray, w::TrackedArray; kw...) = track(depthwiseconv, x, w; kw...) +depthwiseconv(x::TrackedArray, w::AbstractArray; kw...) = track(depthwiseconv, x, w; kw...) + +@grad depthwiseconv(x, w; kw...) = + depthwiseconv(data(x), data(w); kw...), + Δ -> nobacksies(:depthwiseconv, + (NNlib.∇depthwiseconv_data(data.((Δ, x, w))...; kw...), + NNlib.∇depthwiseconv_filter(data.((Δ, x, w))...; kw...))) + conv(x::TrackedArray, w::TrackedArray; kw...) = track(conv, x, w; kw...) conv(x::AbstractArray, w::TrackedArray; kw...) = track(conv, x, w; kw...) conv(x::TrackedArray, w::AbstractArray; kw...) = track(conv, x, w; kw...) @@ -374,8 +417,7 @@ unbroadcast(x::AbstractArray, Δ) = trim(x, sum(Δ, dims = ntuple(i -> size(x, i) == 1 ? i : ndims(Δ)+1, Val(ndims(Δ))))) unbroadcast(x::Number, Δ) = sum(Δ) -unbroadcast(x::Base.RefValue{<:Function}, _) = nothing -unbroadcast(x::Base.RefValue{<:Val}, _) = nothing +unbroadcast(x::Base.RefValue, _) = nothing dual(x, p) = x dual(x::Real, p) = Dual(x, p) @@ -423,26 +465,28 @@ end using Requires # https://github.com/FluxML/Flux.jl/issues/353 -@init Requires.isprecompiling() || @eval Base.Broadcast begin - function flatten(bc::Broadcasted{Style}) where {Style} - isflat(bc) && return bc - args = cat_nested(bc) - let makeargs = make_makeargs(bc), f = bc.f - newf = @inline function(args::Vararg{Any,N}) where N - f(makeargs(args...)...) +if VERSION < v"1.1.0-DEV.548" + @init Requires.isprecompiling() || @eval Base.Broadcast begin + function flatten(bc::Broadcasted{Style}) where {Style} + isflat(bc) && return bc + args = cat_nested(bc) + let makeargs = make_makeargs(bc), f = bc.f + newf = @inline function(args::Vararg{Any,N}) where N + f(makeargs(args...)...) 
+ end + return Broadcasted{Style}(newf, args, bc.axes) end - return Broadcasted{Style}(newf, args, bc.axes) end - end - @inline function make_makeargs(makeargs, t::Tuple{<:Broadcasted,Vararg{Any}}) - bc = t[1] - let makeargs = make_makeargs(makeargs, tail(t)), f = bc.f - let makeargs = make_makeargs(makeargs, bc.args) - headargs, tailargs = make_headargs(bc.args), make_tailargs(bc.args) - return @inline function(args::Vararg{Any,N}) where N - args1 = makeargs(args...) - a, b = headargs(args1...), tailargs(args1...) - (f(a...), b...) + @inline function make_makeargs(makeargs, t::Tuple{<:Broadcasted,Vararg{Any}}) + bc = t[1] + let makeargs = make_makeargs(makeargs, tail(t)), f = bc.f + let makeargs = make_makeargs(makeargs, bc.args) + headargs, tailargs = make_headargs(bc.args), make_tailargs(bc.args) + return @inline function(args::Vararg{Any,N}) where N + args1 = makeargs(args...) + a, b = headargs(args1...), tailargs(args1...) + (f(a...), b...) + end end end end diff --git a/src/tracker/scalar.jl b/src/tracker/lib/real.jl similarity index 69% rename from src/tracker/scalar.jl rename to src/tracker/lib/real.jl index 81ccb9a3..a4f90a0c 100644 --- a/src/tracker/scalar.jl +++ b/src/tracker/lib/real.jl @@ -1,4 +1,4 @@ -struct TrackedReal{T<:Real} <: Real +mutable struct TrackedReal{T<:Real} <: Real data::T tracker::Tracked{T} end @@ -10,19 +10,28 @@ tracker(x::TrackedReal) = x.tracker track(f::Call, x::Real) = TrackedReal(x, Tracked{typeof(x)}(f, zero(x))) -function back!(x::TrackedReal) +function back!(x::TrackedReal; once = true) isinf(x) && error("Loss is Inf") isnan(x) && error("Loss is NaN") - return back!(x, 1) + return back!(x, 1, once = once) +end + +function update!(x::TrackedReal, Δ) + x.data += data(Δ) + tracker(x).grad = 0 + return x end function Base.show(io::IO, x::TrackedReal) + T = get(io, :typeinfo, Any) show(io, data(x)) - print(io, " (tracked)") + T <: TrackedReal || print(io, " (tracked)") end Base.decompose(x::TrackedReal) = Base.decompose(data(x)) +Base.copy(x::TrackedReal) = x + Base.convert(::Type{TrackedReal{T}}, x::TrackedReal{T}) where T = x Base.convert(::Type{TrackedReal{T}}, x::Real) where T = TrackedReal(convert(T, x)) @@ -30,23 +39,32 @@ Base.convert(::Type{TrackedReal{T}}, x::Real) where T = TrackedReal(convert(T, x Base.convert(::Type{TrackedReal{T}}, x::TrackedReal{S}) where {T,S} = error("Not implemented: convert tracked $S to tracked $T") -for op in [:(==), :≈, :<] +for op in [:(==), :≈, :<, :(<=)] @eval Base.$op(x::TrackedReal, y::Real) = Base.$op(data(x), y) @eval Base.$op(x::Real, y::TrackedReal) = Base.$op(x, data(y)) @eval Base.$op(x::TrackedReal, y::TrackedReal) = Base.$op(data(x), data(y)) end Base.eps(x::TrackedReal) = eps(data(x)) +Base.eps(::Type{TrackedReal{T}}) where T = eps(T) for f in :[isinf, isnan, isfinite].args @eval Base.$f(x::TrackedReal) = Base.$f(data(x)) end -Base.Printf.fix_dec(x::TrackedReal, n::Int) = Base.Printf.fix_dec(data(x), n) +Base.Printf.fix_dec(x::TrackedReal, n::Int, a...) = Base.Printf.fix_dec(data(x), n, a...) 
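# With TrackedReal now mutable and `back!` taking a `once` keyword, the scalar
# workflow looks like this; the non-nested `gradient` defined in back.jl wraps
# its result in `nobacksies`, which is why it prints as tracked.
using Flux.Tracker: gradient, param, back!, grad

f(x) = 3x^2 + 2x + 1
gradient(f, 2)        # (14.0 (tracked),)
x = param(2.0)
back!(f(x))           # `once = true` by default, so this tape cannot be replayed
grad(x)               # 14.0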
+ +Base.float(x::TrackedReal) = x Base.promote_rule(::Type{TrackedReal{S}},::Type{T}) where {S,T} = TrackedReal{promote_type(S,T)} +using Random + +for f in :[rand, randn, randexp].args + @eval Random.$f(rng::AbstractRNG,::Type{TrackedReal{T}}) where {T} = param(rand(rng,T)) +end + using DiffRules, SpecialFunctions, NaNMath for (M, f, arity) in DiffRules.diffrules() @@ -58,12 +76,18 @@ for (M, f, arity) in DiffRules.diffrules() end end +# Work around zero(π) not working, for some reason +_zero(::Irrational) = nothing +_zero(x) = zero(x) + for (M, f, arity) in DiffRules.diffrules() arity == 2 || continue da, db = DiffRules.diffrule(M, f, :a, :b) f = :($M.$f) @eval begin - @grad $f(a::Real, b::Real) = $f(data(a), data(b)), Δ -> (Δ * $da, Δ * $db) + @grad $f(a::TrackedReal, b::TrackedReal) = $f(data(a), data(b)), Δ -> (Δ * $da, Δ * $db) + @grad $f(a::TrackedReal, b::Real) = $f(data(a), b), Δ -> (Δ * $da, _zero(b)) + @grad $f(a::Real, b::TrackedReal) = $f(a, data(b)), Δ -> (_zero(a), Δ * $db) $f(a::TrackedReal, b::TrackedReal) = track($f, a, b) $f(a::TrackedReal, b::Real) = track($f, a, b) $f(a::Real, b::TrackedReal) = track($f, a, b) @@ -75,6 +99,12 @@ import Base:^ ^(a::TrackedReal, b::Integer) = track(^, a, b) +# Hack for conversions + +using ForwardDiff: Dual + +(T::Type{<:Real})(x::Dual) = Dual(T(x.value), map(T, x.partials.values)) + # Tuples struct TrackedTuple{T<:Tuple} @@ -115,8 +145,8 @@ function scan(c::Call{typeof(collect)}) foreach(scan, c.args[1]) end -function back_(c::Call{typeof(collect)}, Δ) - foreach(back, c.args[1], data(Δ)) +function back_(c::Call{typeof(collect)}, Δ, once) + foreach((x, d) -> back(x, d, once), c.args[1], data(Δ)) end function back_(g::Grads, c::Call{typeof(collect)}, Δ) diff --git a/src/treelike.jl b/src/treelike.jl index 9b3518d3..88e878c4 100644 --- a/src/treelike.jl +++ b/src/treelike.jl @@ -40,7 +40,7 @@ function prefor(f, x; seen = IdSet()) end function params(m) - ps = [] + ps = Params() prefor(p -> Tracker.istracked(p) && Tracker.isleaf(p) && !any(p′ -> p′ === p, ps) && push!(ps, p), diff --git a/src/utils.jl b/src/utils.jl index c53f7864..9bad3760 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,8 +1,12 @@ # Arrays +glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0/sum(dims)) +glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0/sum(dims)) -initn(dims...) = randn(dims...)/100 -glorot_uniform(dims...) = (rand(dims...) .- 0.5) .* sqrt(24.0/(sum(dims))) -glorot_normal(dims...) = randn(dims...) .* sqrt(2.0/sum(dims)) +ones(T::Type, dims...) = Base.ones(T, dims...) +zeros(T::Type, dims...) = Base.zeros(T, dims...) + +ones(dims...) = Base.ones(Float32, dims...) +zeros(dims...) = Base.zeros(Float32, dims...) unsqueeze(xs, dim) = reshape(xs, (size(xs)[1:dim-1]..., 1, size(xs)[dim:end]...)) @@ -24,7 +28,7 @@ julia> chunk(1:10, 3) """ chunk(xs, n) = collect(Iterators.partition(xs, ceil(Int, length(xs)/n))) -batchindex(xs, i) = (reverse(Base.tail(reverse(indices(xs))))..., i) +batchindex(xs, i) = (reverse(Base.tail(reverse(axes(xs))))..., i) """ frequencies(xs) @@ -66,7 +70,7 @@ julia> batch([[1,2,3],[4,5,6]]) function batch(xs) data = first(xs) isa AbstractArray ? similar(first(xs), size(first(xs))..., length(xs)) : - Vector{eltype(xs)}(length(xs)) + Vector{eltype(xs)}(undef, length(xs)) for (i, x) in enumerate(xs) data[batchindex(data, i)...] 
= x end @@ -147,9 +151,24 @@ function jacobian(m,x) n = length(x) J = Matrix{eltype(x)}(undef,n,k) for i = 1:k - Flux.back!(y[i]) # Populate gradient accumulator + Flux.back!(y[i], once = false) # Populate gradient accumulator J[:,i] = xp.grad - xp.grad .*= 0 # Reset gradient accumulator + xp.grad .= 0 # Reset gradient accumulator end J' end + +""" + @jit ... + +The `@jit` annotation can be applied to any code, and the code will be compiled +for performance. + + @jit f(x) = @jit(x) + @jit(x) + +Note that compilation happens regardless of the `@jit` macro, so it should only +be used for aesthetic purposes, or by recovering Python users. +""" +macro jit(ex) + esc(ex) +end diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 16f90e89..f7a08503 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -11,6 +11,8 @@ x = param(randn(5, 5)) cx = gpu(x) @test cx isa TrackedArray && cx.data isa CuArray +@test Flux.onecold(param(gpu([1.,2.,3.]))) == 3 + x = Flux.onehotbatch([1, 2, 3], 1:3) cx = gpu(x) @test cx isa Flux.OneHotMatrix && cx.data isa CuArray @@ -36,4 +38,8 @@ Flux.back!(sum(l)) end -CuArrays.cudnn_available() && include("cudnn.jl") +if CuArrays.libcudnn != nothing + @info "Testing Flux/CUDNN" + include("cudnn.jl") + include("curnn.jl") +end diff --git a/test/cuda/cudnn.jl b/test/cuda/cudnn.jl index d5cf442b..9a154961 100644 --- a/test/cuda/cudnn.jl +++ b/test/cuda/cudnn.jl @@ -1,48 +1,48 @@ -using Flux, CuArrays, Test +using Flux, Flux.Tracker, CuArrays, Test +using Flux.Tracker: TrackedArray, data -@info "Testing Flux/CUDNN" +@testset "CUDNN BatchNorm" begin + @testset "4D Input" begin + x = TrackedArray(Float64.(collect(reshape(1:12, 2, 2, 3, 1)))) + m = BatchNorm(3) + cx = gpu(x) + cm = gpu(m) -@testset "RNN" begin - @testset for R in [RNN, GRU, LSTM] - rnn = R(10, 5) - curnn = mapleaves(gpu, rnn) - @testset for batch_size in (1, 5) - Flux.reset!(rnn) - Flux.reset!(curnn) - x = batch_size == 1 ? - param(rand(10)) : - param(rand(10,batch_size)) - cux = gpu(x) - y = (rnn(x); rnn(x)) - cuy = (curnn(cux); curnn(cux)) + y = m(x) + cy = cm(cx) - @test y.data ≈ collect(cuy.data) - @test haskey(Flux.CUDA.descs, curnn.cell) + @test cy isa TrackedArray{Float32,4,CuArray{Float32,4}} - Δ = randn(size(y)) + @test cpu(data(cy)) ≈ data(y) - Flux.back!(y, Δ) - Flux.back!(cuy, gpu(Δ)) + g = rand(size(y)...) + Flux.back!(y, g) + Flux.back!(cy, gpu(g)) - @test x.grad ≈ collect(cux.grad) - @test rnn.cell.Wi.grad ≈ collect(curnn.cell.Wi.grad) - @test rnn.cell.Wh.grad ≈ collect(curnn.cell.Wh.grad) - @test rnn.cell.b.grad ≈ collect(curnn.cell.b.grad) - @test rnn.cell.h.grad ≈ collect(curnn.cell.h.grad) - if isdefined(rnn.cell, :c) - @test rnn.cell.c.grad ≈ collect(curnn.cell.c.grad) - end - - Flux.reset!(rnn) - Flux.reset!(curnn) - ohx = batch_size == 1 ? - Flux.onehot(rand(1:10), 1:10) : - Flux.onehotbatch(rand(1:10, batch_size), 1:10) - cuohx = gpu(ohx) - y = (rnn(ohx); rnn(ohx)) - cuy = (curnn(cuohx); curnn(cuohx)) - - @test y.data ≈ collect(cuy.data) + @test m.γ.grad ≈ cpu(cm.γ.grad) + @test m.β.grad ≈ cpu(cm.β.grad) + @test x.grad ≈ cpu(x.grad) + end + + @testset "2D Input" begin + x = TrackedArray(Float64.(collect(reshape(1:12, 3, 4)))) + m = BatchNorm(3) + cx = gpu(x) + cm = gpu(m) + + y = m(x) + cy = cm(cx) + + @test cy isa TrackedArray{Float32,2,CuArray{Float32,2}} + + @test cpu(data(cy)) ≈ data(y) + + g = rand(size(y)...) 
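The rewritten `jacobian` above relies on the new `once` keyword: `back!(y, once = false)` keeps the tape reusable, so each output component can be back-propagated in turn while gradients accumulate in `.grad`. A small sketch of that pattern (illustrative values only):

using Flux, Flux.Tracker

W = param([1.0 2.0; 3.0 4.0])
x = param([1.0, 1.0])
y = W * x                            # tracked 2-vector

Flux.back!(y[1], once = false)       # back-propagate the first output only
row1 = copy(x.grad)                  # first Jacobian row, equals W.data[1, :]
x.grad .= 0                          # reset the accumulator, as `jacobian` does
Flux.back!(y[2], once = false)
row2 = copy(x.grad)                  # second Jacobian row, equals W.data[2, :]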
+ Flux.back!(y, g) + Flux.back!(cy, gpu(g)) + + @test m.γ.grad ≈ cpu(cm.γ.grad) + @test m.β.grad ≈ cpu(cm.β.grad) + @test x.grad ≈ cpu(x.grad) end - end end diff --git a/test/cuda/curnn.jl b/test/cuda/curnn.jl new file mode 100644 index 00000000..3f5e1819 --- /dev/null +++ b/test/cuda/curnn.jl @@ -0,0 +1,46 @@ +using Flux, CuArrays, Test + +@testset "RNN" begin + @testset for R in [RNN, GRU, LSTM] + rnn = R(10, 5) + curnn = mapleaves(gpu, rnn) + @testset for batch_size in (1, 5) + Flux.reset!(rnn) + Flux.reset!(curnn) + x = batch_size == 1 ? + param(rand(10)) : + param(rand(10,batch_size)) + cux = gpu(x) + y = (rnn(x); rnn(x)) + cuy = (curnn(cux); curnn(cux)) + + @test y.data ≈ collect(cuy.data) + @test haskey(Flux.CUDA.descs, curnn.cell) + + Δ = randn(size(y)) + + Flux.back!(y, Δ) + Flux.back!(cuy, gpu(Δ)) + + @test x.grad ≈ collect(cux.grad) + @test rnn.cell.Wi.grad ≈ collect(curnn.cell.Wi.grad) + @test rnn.cell.Wh.grad ≈ collect(curnn.cell.Wh.grad) + @test rnn.cell.b.grad ≈ collect(curnn.cell.b.grad) + @test rnn.cell.h.grad ≈ collect(curnn.cell.h.grad) + if isdefined(rnn.cell, :c) + @test rnn.cell.c.grad ≈ collect(curnn.cell.c.grad) + end + + Flux.reset!(rnn) + Flux.reset!(curnn) + ohx = batch_size == 1 ? + Flux.onehot(rand(1:10), 1:10) : + Flux.onehotbatch(rand(1:10, batch_size), 1:10) + cuohx = gpu(ohx) + y = (rnn(ohx); rnn(ohx)) + cuy = (curnn(cuohx); curnn(cuohx)) + + @test y.data ≈ collect(cuy.data) + end + end +end diff --git a/test/data.jl b/test/data.jl index 9c2901cb..a73d1ec3 100644 --- a/test/data.jl +++ b/test/data.jl @@ -10,4 +10,7 @@ using Test @test MNIST.images()[1] isa Matrix @test MNIST.labels() isa Vector{Int64} +@test FashionMNIST.images()[1] isa Matrix +@test FashionMNIST.labels() isa Vector{Int64} + @test Data.Sentiment.train() isa Vector{Data.Tree{Any}} diff --git a/test/layers/basic.jl b/test/layers/basic.jl new file mode 100644 index 00000000..b8d9efd1 --- /dev/null +++ b/test/layers/basic.jl @@ -0,0 +1,33 @@ +using Test, Random + +@testset "basic" begin + @testset "Chain" begin + @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10)) + @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn(10)) + # numeric test should be put into testset of corresponding layer + end + + @testset "Dense" begin + @test length(Dense(10, 5)(randn(10))) == 5 + @test_throws DimensionMismatch Dense(10, 5)(randn(1)) + @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting + @test_throws MethodError Dense(10, 5).(randn(10)) # avoid broadcasting + + @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(1, 1) + @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2) + @test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1) + @test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] + + end + + @testset "Diagonal" begin + @test length(Flux.Diagonal(10)(randn(10))) == 10 + @test length(Flux.Diagonal(10)(1)) == 10 + @test length(Flux.Diagonal(10)(randn(1))) == 10 + @test_throws DimensionMismatch Flux.Diagonal(10)(randn(2)) + + @test Flux.Diagonal(2)([1 2]) == [1 2; 1 2] + @test Flux.Diagonal(2)([1,2]) == [1,2] + @test Flux.Diagonal(2)([1 2; 3 4]) == [1 2; 3 4] + end +end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 5928bd75..160b7fbb 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -2,7 +2,7 @@ using Flux, Test using Flux: maxpool, meanpool @testset "Pooling" begin - x = randn(10, 10, 3, 2) + x = 
randn(Float32, 10, 10, 3, 2) mp = MaxPool((2, 2)) @test mp(x) == maxpool(x, (2,2)) mp = MeanPool((2, 2)) @@ -10,7 +10,7 @@ using Flux: maxpool, meanpool end @testset "CNN" begin - r = zeros(28, 28, 1, 5) + r = zeros(Float32, 28, 28, 1, 5) m = Chain( Conv((2, 2), 1=>16, relu), MaxPool((2,2)), diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index b17120b0..18276140 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -1,4 +1,5 @@ using Flux: testmode! +using Flux.Tracker: data @testset "Dropout" begin x = [1.,2.,3.] @@ -28,7 +29,8 @@ using Flux: testmode! end @testset "BatchNorm" begin - let m = BatchNorm(2), x = param([1 2; 3 4; 5 6]') + let m = BatchNorm(2), x = param([1 3 5; + 2 4 6]) @test m.β.data == [0, 0] # initβ(2) @test m.γ.data == [1, 1] # initγ(2) @@ -53,29 +55,30 @@ end # .1 * 4 + 0 = .4 @test m.μ ≈ reshape([0.3, 0.4], 2, 1) - # julia> .1 .* std(x, dims = 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.] + # julia> .1 .* var(x, dims = 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.] # 2×1 Array{Float64,2}: - # 1.14495 - # 1.14495 - @test m.σ ≈ .1 .* std(x.data, dims = 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.] + # 1.3 + # 1.3 + @test m.σ² ≈ .1 .* var(x.data, dims = 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.] testmode!(m) @test !m.active x′ = m(x).data - @test x′[1] ≈ (1 .- 0.3) / 1.1449489742783179 + @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.3), atol = 1.0e-5) end # with activation function - let m = BatchNorm(2, σ), x = param([1 2; 3 4; 5 6]') + let m = BatchNorm(2, sigmoid), x = param([1 3 5; + 2 4 6]) @test m.active m(x) testmode!(m) @test !m.active - x′ = m(x).data - @test x′[1] ≈ σ((1 - 0.3) / 1.1449489742783179) + y = m(x).data + @test isapprox(y, data(sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ))), atol = 1.0e-7) end let m = BatchNorm(2), x = param(reshape(1:6, 3, 2, 1)) @@ -85,7 +88,7 @@ end end let m = BatchNorm(2), x = param(reshape(1:12, 2, 3, 2, 1)) - y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :) + y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :) y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4]) @test m(x) == y end diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index d4599908..34abb8cb 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -49,4 +49,16 @@ const ϵ = 1e-7 @testset "logitbinarycrossentropy" begin @test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0) end + + @testset "no spurious promotions" begin + for T in (Float16, Float32, Float64) + y = rand(T, 2) + ŷ = rand(T, 2) + for f in (mse, crossentropy, logitcrossentropy) + fwd, back = Flux.Tracker.forward(mse, ŷ, y) + @test typeof(fwd) == Flux.Tracker.TrackedReal{T} + @test eltype(back(one(T))[1]) == Flux.Tracker.TrackedReal{T} + end + end + end end diff --git a/test/optimise.jl b/test/optimise.jl index 502d9ab2..fcc40dd1 100644 --- a/test/optimise.jl +++ b/test/optimise.jl @@ -1,16 +1,40 @@ using Flux.Optimise +using Flux.Optimise: runall using Flux.Tracker using Test @testset "Optimise" begin w = randn(10, 10) - @testset for Opt in [SGD, Nesterov, Momentum, ADAM, AdaMax, RMSProp, ps -> ADAGrad(ps, 0.1), ADADelta, AMSGrad, NADAM] + @testset for Opt in [ADAMW, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, Descent, ADAM, Nesterov, RMSProp, Momentum] w′ = param(randn(10, 10)) loss(x) = Flux.mse(w*x, w′*x) - opt = Opt([w′]) - for t=1:10^5 + opt = Opt(0.001) + if opt isa Descent || opt isa ADAGrad + opt = Opt(0.1) + end + if opt isa ADADelta + opt = Opt(0.9) + end + for t = 1: 10^5 l = 
loss(rand(10)) back!(l) - opt() + delta = Optimise.update!(opt, w′.data, w′.grad) + w′.data .-= delta + end + @test Flux.mse(w, w′) < 0.01 + end +end + +@testset "Optimiser" begin + w = randn(10, 10) + @testset for Opt in [InvDecay, WeightDecay, ExpDecay] + w′ = param(randn(10, 10)) + loss(x) = Flux.mse(w*x, w′*x) + opt = Optimiser(Opt(), ADAM(0.001)) + for t = 1:10^5 + l = loss(rand(10)) + back!(l) + delta = Optimise.update!(opt, w′.data, w′.grad) + w′.data .-= delta end @test Flux.mse(w, w′) < 0.01 end @@ -21,9 +45,17 @@ end l = param(1) Flux.train!(() -> (sleep(0.1); i += 1; l), + (), Iterators.repeated((), 100), - ()->(), - cb = Flux.throttle(() -> (i > 3 && stop()), 1)) + Descent(), + cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1)) @test 3 < i < 50 + + # Test multiple callbacks + x = 0 + fs = [() -> (), () -> x = 1] + cbs = runall(fs) + cbs() + @test x == 1 end diff --git a/test/runtests.jl b/test/runtests.jl index 7a55dca6..25d600dd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,19 +1,4 @@ -# Pkg.test runs with --check_bounds=1, forcing all bounds checks. -# This is incompatible with CUDAnative (see JuliaGPU/CUDAnative.jl#98) -if Base.JLOptions().check_bounds == 1 - file = @__FILE__ - run(``` - $(Base.julia_cmd()) - --color=$(Base.have_color ? "yes" : "no") - --compiled-modules=$(Bool(Base.JLOptions().use_compiled_modules) ? "yes" : "no") - --startup-file=$(Base.JLOptions().startupfile != 2 ? "yes" : "no") - --code-coverage=$(["none", "user", "all"][1+Base.JLOptions().code_coverage]) - $(file) - ```) - exit() -end - -using Flux, Test, Random +using Flux, Test, Random, Statistics using Random Random.seed!(0) @@ -32,6 +17,7 @@ include("data.jl") @info "Testing Layers" +include("layers/basic.jl") include("layers/normalisation.jl") include("layers/stateless.jl") include("layers/conv.jl") diff --git a/test/tracker.jl b/test/tracker.jl index a4772f2e..4380402e 100644 --- a/test/tracker.jl +++ b/test/tracker.jl @@ -1,9 +1,9 @@ using Flux using Flux.Tracker, Test, NNlib -using Flux.Tracker: TrackedReal, gradcheck, grad, derivative, checkpoint -using NNlib: conv +using Flux.Tracker: TrackedReal, gradient, gradcheck, grad, checkpoint +using NNlib: conv, depthwiseconv using Printf: @sprintf -using LinearAlgebra: Diagonal, dot, LowerTriangular, norm +using LinearAlgebra: diagm, dot, LowerTriangular, norm using Statistics: mean, std using Random # using StatsBase @@ -33,16 +33,16 @@ gradtest(f, dims...) = gradtest(f, rand.(Float64, dims)...) 
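The updated optimiser tests reflect the new interface: optimisers are constructed from hyperparameters (`Descent(0.1)`, `ADAM(0.001)`), composed with `Optimiser`, and stepped manually through `update!`, which at this revision returns the step to subtract (exactly as the tests above do). A sketch of one hand-rolled iteration; the target and trainable matrices are placeholders, and the explicit gradient reset is a conservative addition rather than something the tests show:

using Flux, Flux.Optimise, Flux.Tracker

W  = randn(4, 4)                      # fixed target
W′ = param(randn(4, 4))               # trainable parameter
loss(x) = Flux.mse(W * x, W′ * x)

opt = Optimiser(WeightDecay(), ADAM(0.001))   # composition replaces the old closure-style wrappers

x = rand(4)
l = loss(x)
Tracker.back!(l)
delta = Optimise.update!(opt, W′.data, W′.grad)
W′.data .-= delta
W′.grad .= 0                          # clear the accumulated gradient before the next step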
@test gradtest(Flux.crossentropy, rand(5,5), rand(5, 5)) @test gradtest(x -> x', rand(5)) + +@testset "indexing & slicing" begin + gradtest(x->view(x, 1:2, 1:2), rand(4, 4)) +end + function promotiontest(f, A, B, C) r0 = f(A, B, C) r1 = f(param(A), B, C) r2 = f(A, param(B), C) - if all(ndims.((A,B,C)) .≤ 2) && f ∈ [hcat, vcat] - r3 = f(A, B, param(C)) - else - @test_throws MethodError f(A, B, param(C)) # until julia#20815 is resolved - r3 = r2 - end + r3 = f(A, B, param(C)) r4 = f(param(A), param(B), param(C)) @test !isa(r0, TrackedArray) @@ -127,7 +127,7 @@ end @test gradtest(kron, rand(5,1), rand(3,1), rand(8,1)) @test gradtest(kron, rand(5,2), rand(3,2), rand(8,2)) -@test gradtest(f-> Matrix(Diagonal(f)), rand(3)) +@test gradtest(x -> diagm(0 => x), rand(3)) @test gradtest(W -> inv(log.(W * W)), (5,5)) @test gradtest((A, B) -> A / B , (1,5), (5,5)) @@ -181,12 +181,16 @@ end @test gradtest(conv, rand(10, 10, 3, 2), randn(Float64,2, 2, 3, 2)) @test gradtest(conv, rand(10, 10, 10, 3, 2), randn(Float64,2, 2, 2, 3, 2)) +@test gradtest(depthwiseconv, rand(10,10,3,2), randn(2, 2, 2, 3)) + @test gradtest(x -> maxpool(x, (2,2)), rand(10, 10, 3, 2)) @test gradtest(x -> maxpool(x, (2,2,2)), rand(10, 10, 10, 3, 2)) @test gradtest(x -> meanpool(x, (2,2)), rand(10, 10, 3, 2)) @test gradtest(x -> meanpool(x, (2,2,2)), rand(5, 5, 5, 3, 2)) +@test gradtest(x -> Float64.(x), 5) + @testset "equality & order" begin # TrackedReal @test param(2)^2 == param(4) @@ -230,10 +234,10 @@ end @testset "Intermediates" begin x = param([1]) l = sum((x .+ x).^2) - Flux.back!(l) + Flux.back!(l, once = false) @test x.grad == [8] x.grad .= 0 - Flux.back!(l) + Flux.back!(l, once = false) @test x.grad == [8] end @@ -258,7 +262,7 @@ Tracker.back!(b) back!(z) @test grad.((x,y)) == (3, 2) - @test Tracker.gradient(2, 3) do x, y + @test gradient(2, 3) do x, y xy = Tracker.collect([x, y]) xy[1]*xy[2] end == (3, 2) @@ -278,10 +282,27 @@ end count += 1 a * b end - @test derivative(x -> mul(5, x), 3) == 5 + @test gradient(x -> mul(5, x), 3)[1] == 5 @test count == 1 - @test derivative(x -> checkpoint(mul, 5, x), 3) == 5 + @test gradient(x -> checkpoint(mul, 5, x), 3)[1] == 5 @test count == 3 end +@testset "Updates" begin + xs = param([1, 2, 3]) + Tracker.update!(xs, param([4, 5, 6])) + @test xs == [5, 7, 9] + x = param(3) + Tracker.update!(x, param(4)) + @test x == 7 +end + +@testset "Params" begin + W = param(randn(5, 10)) + x = rand(10) + dW = gradient(W -> sum(W*x), W)[1] + gs = gradient(() -> sum(W*x), Tracker.Params([W])) + @test gs[W] == dW +end + end #testset diff --git a/test/utils.jl b/test/utils.jl index 2aade669..af0d50fe 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,5 +1,5 @@ using Flux -using Flux: throttle, jacobian, initn, glorot_uniform, glorot_normal +using Flux: throttle, jacobian, glorot_uniform, glorot_normal using StatsBase: std using Random using Test @@ -64,10 +64,6 @@ end @testset "Initialization" begin # Set random seed so that these tests don't fail randomly Random.seed!(0) - # initn() should yield a kernel with stddev ~= 1e-2 - v = initn(10, 10) - @test std(v) > 0.9*1e-2 - @test std(v) < 1.1*1e-2 # glorot_uniform should yield a kernel with stddev ~= sqrt(6/(n_in + n_out)), # and glorot_normal should yield a kernel with stddev != 2/(n_in _ n_out)
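For reference on the initialization tests here: the new Float32 `glorot_uniform` draws from ±sqrt(6 / (fan_in + fan_out)) and `glorot_normal` scales `randn` by sqrt(2 / (fan_in + fan_out)), so both end up with a standard deviation of roughly sqrt(2 / (fan_in + fan_out)). A quick sanity check (expect sampling noise):

using Flux: glorot_uniform, glorot_normal
using Statistics: std

n_in, n_out = 10, 10
std(glorot_uniform(n_in, n_out))   # ≈ sqrt(2 / 20) ≈ 0.32, Float32
std(glorot_normal(n_in, n_out))    # same target standard deviation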
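The `Params` testset and the `params(m)` change (which now returns a `Params` collection rather than a plain vector) both serve the implicit-parameter gradient style. A short sketch of the two call forms, mirroring that testset; the closing `Dense` line is an extrapolation from these pieces, not something tested above:

using Flux
using Flux.Tracker: gradient, Params

W = param(randn(5, 10))
x = rand(10)

dW = gradient(W -> sum(W * x), W)[1]          # explicit: differentiate w.r.t. a named argument
gs = gradient(() -> sum(W * x), Params([W]))  # implicit: collect grads of every tracked parameter
gs[W] == dW                                   # the two forms agree

m  = Dense(10, 5)
gs = gradient(() -> sum(m(x)), params(m))     # params(m) is already a Params collection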