merge conflicts
commit 5086c0f4f0

.travis.yml (11 lines changed)
@@ -7,16 +7,16 @@ os:
 
 julia:
   - 1.3
+  - 1
   - nightly
 
-matrix:
-  allow_failures:
-    - julia: nightly
+notifications:
+  email: false
 
 jobs:
   include:
     - stage: "Documentation"
-      julia: 1.3
+      julia: 1
       os: linux
       script:
        - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd()));
@@ -24,6 +24,9 @@ jobs:
        - julia --project=docs/ docs/make.jl
       after_success: skip
 
+allow_failures:
+  - julia: nightly
+
 ## uncomment the following lines to override the default test script
 script:
   - julia --color=yes -e 'using Pkg; Pkg.activate(); Pkg.instantiate(); Pkg.test()'
Manifest.toml (173 lines changed)
@@ -8,15 +8,21 @@ version = "0.5.0"
 
 [[AbstractTrees]]
 deps = ["Markdown"]
-git-tree-sha1 = "8201f932428d25a2e2903300764515754847d87d"
+git-tree-sha1 = "86d092c2599f1f7bb01668bf8eb3412f98d61e47"
 uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
-version = "0.3.0"
+version = "0.3.2"
 
 [[Adapt]]
 deps = ["LinearAlgebra"]
-git-tree-sha1 = "82dab828020b872fa9efd3abec1152b075bc7cbf"
+git-tree-sha1 = "c88cfc7f9c1f9f8633cddf0b56e86302b70f64c5"
 uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-version = "1.0.0"
+version = "1.0.1"
 
+[[ArrayLayouts]]
+deps = ["FillArrays", "LinearAlgebra"]
+git-tree-sha1 = "41956a49a8a4fefa1bf6664bca4a3035aba4c3a0"
+uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
+version = "0.2.3"
+
 [[Base64]]
 uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
@@ -34,39 +40,45 @@ version = "0.2.0"
 
 [[CUDAapi]]
 deps = ["Libdl", "Logging"]
-git-tree-sha1 = "56a813440ac98a1aa64672ab460a1512552211a7"
+git-tree-sha1 = "831b825d10104bd29e28f6da93312a976830717b"
 uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
-version = "2.1.0"
+version = "4.0.0"
 
 [[CUDAdrv]]
 deps = ["CEnum", "CUDAapi", "Printf"]
-git-tree-sha1 = "1fce616fa0806c67c133eb1d2f68f0f1a7504665"
+git-tree-sha1 = "e650cbaee92b60433313157926b1e80d0c3a0e2e"
 uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
-version = "5.0.1"
+version = "6.2.2"
 
 [[CUDAnative]]
-deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"]
+deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "MacroTools", "Pkg", "Printf", "TimerOutputs"]
-git-tree-sha1 = "6e11d5c2c91fc623952e94c4fb73f9c4db74795a"
+git-tree-sha1 = "d1fc99635d0002c8a819b78cb1f441eb44310725"
 uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
-version = "2.7.0"
+version = "3.0.2"
 
+[[CodeTracking]]
+deps = ["InteractiveUtils", "UUIDs"]
+git-tree-sha1 = "0becdab7e6fbbcb7b88d8de5b72e5bb2f28239f3"
+uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
+version = "0.5.8"
+
 [[CodecZlib]]
-deps = ["BinaryProvider", "Libdl", "TranscodingStreams"]
+deps = ["TranscodingStreams", "Zlib_jll"]
-git-tree-sha1 = "05916673a2627dd91b4969ff8ba6941bc85a960e"
+git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da"
 uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
-version = "0.6.0"
+version = "0.7.0"
 
 [[ColorTypes]]
 deps = ["FixedPointNumbers", "Random"]
-git-tree-sha1 = "7b62b728a5f3dd6ee3b23910303ccf27e82fad5e"
+git-tree-sha1 = "c4c1cca28748906265ed62c788d6fe6f0134d264"
 uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
-version = "0.8.1"
+version = "0.10.0"
 
 [[Colors]]
-deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"]
+deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"]
-git-tree-sha1 = "c9c1845d6bf22e34738bee65c357a69f416ed5d1"
+git-tree-sha1 = "2fdeb981ebcf52cd800ddb6a0aa5eac34153552d"
 uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
-version = "0.9.6"
+version = "0.12.0"
 
 [[CommonSubexpressions]]
 deps = ["Test"]
@@ -74,11 +86,23 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0"
 uuid = "bbf7d656-a473-5ed7-a52c-81e309532950"
 version = "0.2.0"
 
+[[CompilerSupportLibraries_jll]]
+deps = ["Libdl", "Pkg"]
+git-tree-sha1 = "7c4f882c41faa72118841185afc58a2eb00ef612"
+uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
+version = "0.3.3+0"
+
+[[Cthulhu]]
+deps = ["CodeTracking", "InteractiveUtils", "REPL", "Unicode"]
+git-tree-sha1 = "484790098c85c26f8e59051f8ff1a0745c034a7d"
+uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f"
+version = "1.0.1"
+
 [[CuArrays]]
-deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"]
+deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"]
-git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b"
+git-tree-sha1 = "e8c55b38dcca955f5aed8ec4479cdc95810db1e1"
 uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
-version = "1.6.0"
+version = "2.0.1"
 
 [[DataAPI]]
 git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252"
@@ -87,9 +111,9 @@ version = "1.1.0"
 
 [[DataStructures]]
 deps = ["InteractiveUtils", "OrderedCollections"]
-git-tree-sha1 = "f784254f428fb8fd7ac15982e5862a38a44523d3"
+git-tree-sha1 = "73eb18320fe3ba58790c8b8f6f89420f0a622773"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.17.7"
+version = "0.17.11"
 
 [[Dates]]
 deps = ["Printf"]
@@ -107,78 +131,61 @@ version = "1.0.2"
 
 [[DiffRules]]
 deps = ["NaNMath", "Random", "SpecialFunctions"]
-git-tree-sha1 = "10dca52cf6d4a62d82528262921daf63b99704a2"
+git-tree-sha1 = "eb0c34204c8410888844ada5359ac8b96292cfd1"
 uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
-version = "1.0.0"
+version = "1.0.1"
 
 [[Distributed]]
 deps = ["Random", "Serialization", "Sockets"]
 uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 
-[[FFTW]]
-deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"]
-git-tree-sha1 = "109d82fa4b00429f9afcce873e9f746f11f018d3"
-uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
-version = "1.2.0"
-
-[[FFTW_jll]]
-deps = ["Libdl", "Pkg"]
-git-tree-sha1 = "05674f209a6e3387dd103a945b0113eeb64b1a58"
-uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a"
-version = "3.3.9+3"
-
 [[FillArrays]]
 deps = ["LinearAlgebra", "Random", "SparseArrays"]
-git-tree-sha1 = "fec413d4fc547992eb62a5c544cedb6d7853c1f5"
+git-tree-sha1 = "51cc2f9bc4eb9c6c0e81ec2f779d1085583cc956"
 uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
-version = "0.8.4"
+version = "0.8.7"
 
 [[FixedPointNumbers]]
-git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b"
+git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238"
 uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
-version = "0.6.1"
+version = "0.8.0"
 
 [[ForwardDiff]]
 deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"]
-git-tree-sha1 = "840700059391d36e2498d89c2e82c08f261f2a2a"
+git-tree-sha1 = "869540e4367122fbffaace383a5bdc34d6e5e5ac"
 uuid = "f6369f11-7733-5829-9624-2563aa707210"
-version = "0.10.8"
+version = "0.10.10"
 
 [[GPUArrays]]
 deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
-git-tree-sha1 = "e756da6cee76a5f1436a05827fa8fdf3badc577f"
+git-tree-sha1 = "d586762b08dcda13228df8967119b9cb6f22ade5"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "2.0.1"
+version = "3.1.0"
 
 [[IRTools]]
 deps = ["InteractiveUtils", "MacroTools", "Test"]
-git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306"
+git-tree-sha1 = "1a4355e4b5b50be2311ebb644f34f3306dbd0410"
 uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
-version = "0.3.0"
+version = "0.3.1"
 
-[[IntelOpenMP_jll]]
-deps = ["Libdl", "Pkg"]
-git-tree-sha1 = "fb8e1c7a5594ba56f9011310790e03b5384998d6"
-uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0"
-version = "2018.0.3+0"
-
 [[InteractiveUtils]]
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 
 [[Juno]]
-deps = ["Base64", "Logging", "Media", "Profile", "Test"]
+deps = ["Base64", "Logging", "Media", "Profile"]
-git-tree-sha1 = "30d94657a422d09cb97b6f86f04f750fa9c50df8"
+git-tree-sha1 = "e1ba2a612645b3e07c773c3a208f215745081fe6"
 uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
-version = "0.7.2"
+version = "0.8.1"
 
 [[LLVM]]
 deps = ["CEnum", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "1d08d7e4250f452f6cb20e4574daaebfdbee0ff7"
+git-tree-sha1 = "b6b86801ae2f2682e0a4889315dc76b68db2de71"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "1.3.3"
+version = "1.3.4"
 
 [[LibGit2]]
+deps = ["Printf"]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
 
 [[Libdl]]
@@ -191,17 +198,11 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 [[Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
 
-[[MKL_jll]]
-deps = ["Libdl", "Pkg"]
-git-tree-sha1 = "61069ae718b8ab1e325bbfb4e5268902e7ea08e3"
-uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
-version = "2019.0.117+0"
-
 [[MacroTools]]
-deps = ["DataStructures", "Markdown", "Random"]
+deps = ["Markdown", "Random"]
-git-tree-sha1 = "e2fc7a55bb2224e203bbd8b59f72b91323233458"
+git-tree-sha1 = "f7d2e3f654af75f01ec49be82c231c382214223a"
 uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
-version = "0.5.3"
+version = "0.5.5"
 
 [[Markdown]]
 deps = ["Base64"]
@@ -224,9 +225,9 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804"
 
 [[NNlib]]
 deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"]
-git-tree-sha1 = "135c0de4794d5e214b06f1fb4787af4a72896e61"
+git-tree-sha1 = "d9f196d911f55aeaff11b11f681b135980783824"
 uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-version = "0.6.2"
+version = "0.6.6"
 
 [[NaNMath]]
 git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f"
@@ -234,10 +235,10 @@ uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
 version = "0.3.3"
 
 [[OpenSpecFun_jll]]
-deps = ["Libdl", "Pkg"]
+deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"]
-git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90"
+git-tree-sha1 = "d51c416559217d974a1113522d5919235ae67a87"
 uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
-version = "0.5.3+1"
+version = "0.5.3+3"
 
 [[OrderedCollections]]
 deps = ["Random", "Serialization", "Test"]
@@ -273,9 +274,9 @@ version = "0.2.0"
 
 [[Requires]]
 deps = ["UUIDs"]
-git-tree-sha1 = "999513b7dea8ac17359ed50ae8ea089e4464e35e"
+git-tree-sha1 = "d37400976e98018ee840e0ca4f9d20baa231dc6b"
 uuid = "ae029012-a4dd-5104-9daa-d747884805df"
-version = "1.0.0"
+version = "1.0.1"
 
 [[SHA]]
 uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
@@ -298,9 +299,9 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
 [[SpecialFunctions]]
 deps = ["OpenSpecFun_jll"]
-git-tree-sha1 = "268052ee908b2c086cc0011f528694f02f3e2408"
+git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "0.9.0"
+version = "0.10.0"
 
 [[StaticArrays]]
 deps = ["LinearAlgebra", "Random", "Statistics"]
@@ -314,9 +315,9 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [[StatsBase]]
 deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"]
-git-tree-sha1 = "c53e809e63fe5cf5de13632090bc3520649c9950"
+git-tree-sha1 = "a6102b1f364befdb05746f386b67c6b7e3262c45"
 uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
-version = "0.32.0"
+version = "0.33.0"
 
 [[Test]]
 deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
@@ -343,21 +344,21 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 
 [[ZipFile]]
 deps = ["Libdl", "Printf", "Zlib_jll"]
-git-tree-sha1 = "5de8320a46812da1a8ca98b16a8a4546d44efa62"
+git-tree-sha1 = "8748302cfdec02c4ae9c97b112cf10003f7f767f"
 uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
-version = "0.9.0"
+version = "0.9.1"
 
 [[Zlib_jll]]
 deps = ["Libdl", "Pkg"]
-git-tree-sha1 = "5618a43055eb09377edca21d19d0e99bce24a9c3"
+git-tree-sha1 = "2f6c3e15e20e036ee0a0965879b31442b7ec50fa"
 uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
-version = "1.2.11+7"
+version = "1.2.11+9"
 
 [[Zygote]]
-deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
+deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
-git-tree-sha1 = "74382bcc4c1e8075e14554da67d75565f8fb7827"
+git-tree-sha1 = "1ccbfbe8930376e31752b812daa2532c723dc332"
 uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
-version = "0.4.5"
+version = "0.4.13"
 
 [[ZygoteRules]]
 deps = ["MacroTools"]
NEWS.md (3 lines changed)
@@ -1,3 +1,6 @@
+# v0.10.5
+* Add option for [same padding](https://github.com/FluxML/Flux.jl/pull/901) to conv and pooling layers by setting `pad=SamePad()`.
+
 # v0.10.0
 * The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669)
   - The dependency on Tracker.jl has been removed.
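For reference, a minimal sketch of the `pad=SamePad()` option mentioned in the new NEWS entry (the layer sizes and input shape here are illustrative assumptions, not taken from the commit):

```julia
using Flux

# SamePad() picks the padding so that, for stride 1, the spatial output size
# matches the input size.
c = Conv((3, 3), 1 => 8, relu, pad = SamePad())
x = rand(Float32, 28, 28, 1, 1)
size(c(x))  # (28, 28, 8, 1)
```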
Project.toml (16 lines changed)
@@ -1,6 +1,6 @@
 name = "Flux"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.10.2"
+version = "0.10.4"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@@ -26,21 +26,23 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 [compat]
 AbstractTrees = "0.2, 0.3"
 Adapt = "1"
-CodecZlib = "0.5, 0.6"
+CodecZlib = "0.5, 0.6, 0.7"
-Colors = "0.8, 0.9, 0.10, 0.11"
+Colors = "0.8, 0.9, 0.10, 0.11, 0.12"
-CuArrays = "1.6"
+CuArrays = "2"
 Juno = "0.5, 0.6, 0.7, 0.8"
 MacroTools = "0.3, 0.4, 0.5"
 NNlib = "0.6"
 Reexport = "0.2"
 StatsBase = "0"
 ZipFile = "0.7, 0.8, 0.9"
-Zygote = "0.4"
+Zygote = "0.4.13"
-julia = "1"
+julia = "1.3"
 
 [extras]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Documenter"]
+test = ["Test", "Documenter", "IterTools", "LinearAlgebra"]
docs/Manifest.toml (deleted)

@@ -1,89 +0,0 @@
-# This file is machine-generated - editing it directly is not advised
-
-[[Base64]]
-uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
-
-[[Dates]]
-deps = ["Printf"]
-uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
-
-[[Distributed]]
-deps = ["Random", "Serialization", "Sockets"]
-uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
-
-[[DocStringExtensions]]
-deps = ["LibGit2", "Markdown", "Pkg", "Test"]
-git-tree-sha1 = "0513f1a8991e9d83255e0140aace0d0fc4486600"
-uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
-version = "0.8.0"
-
-[[Documenter]]
-deps = ["Base64", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"]
-git-tree-sha1 = "c61d6eedbc3c4323c08b64af12d29c8ee0fcbb5f"
-uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
-version = "0.23.2"
-
-[[InteractiveUtils]]
-deps = ["Markdown"]
-uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
-
-[[JSON]]
-deps = ["Dates", "Mmap", "Parsers", "Unicode"]
-git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e"
-uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
-version = "0.21.0"
-
-[[LibGit2]]
-uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
-
-[[Logging]]
-uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
-
-[[Markdown]]
-deps = ["Base64"]
-uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
-
-[[Mmap]]
-uuid = "a63ad114-7e13-5084-954f-fe012c677804"
-
-[[Parsers]]
-deps = ["Dates", "Test"]
-git-tree-sha1 = "db2b35dedab3c0e46dc15996d170af07a5ab91c9"
-uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "0.3.6"
-
-[[Pkg]]
-deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
-uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-
-[[Printf]]
-deps = ["Unicode"]
-uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
-
-[[REPL]]
-deps = ["InteractiveUtils", "Markdown", "Sockets"]
-uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
-
-[[Random]]
-deps = ["Serialization"]
-uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-
-[[SHA]]
-uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
-
-[[Serialization]]
-uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
-
-[[Sockets]]
-uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
-
-[[Test]]
-deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
-uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-
-[[UUIDs]]
-deps = ["Random", "SHA"]
-uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
-
-[[Unicode]]
-uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
docs/Project.toml

@@ -1,2 +1,6 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
+
+[compat]
+Documenter = "0.24"
docs/make.jl (31 lines changed)
@@ -1,29 +1,36 @@
-using Pkg;
-Pkg.activate(joinpath(@__DIR__, "..")); Pkg.instantiate()
-Pkg.activate(); Pkg.instantiate()
-
-pushfirst!(LOAD_PATH, joinpath(@__DIR__, ".."))
-
 using Documenter, Flux, NNlib
 
+DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
 makedocs(modules=[Flux, NNlib],
+         doctest = VERSION >= v"1.4",
         sitename = "Flux",
         pages = ["Home" => "index.md",
                  "Building Models" =>
                    ["Basics" => "models/basics.md",
                     "Recurrence" => "models/recurrence.md",
                     "Regularisation" => "models/regularisation.md",
-                    "Model Reference" => "models/layers.md"],
+                    "Model Reference" => "models/layers.md",
+                    "Advanced Model Building" => "models/advanced.md",
+                    "NNlib" => "models/nnlib.md"],
+                 "Handling Data" =>
+                   ["One-Hot Encoding" => "data/onehot.md",
+                    "DataLoader" => "data/dataloader.md"],
                  "Training Models" =>
                    ["Optimisers" => "training/optimisers.md",
                     "Training" => "training/training.md"],
-                 "One-Hot Encoding" => "data/onehot.md",
                  "GPU Support" => "gpu.md",
                  "Saving & Loading" => "saving.md",
+                 "The Julia Ecosystem" => "ecosystem.md",
+                 "Utility Functions" => "utilities.md",
                  "Performance Tips" => "performance.md",
+                 "Datasets" => "datasets.md",
                  "Community" => "community.md"],
-         format = Documenter.HTML(assets = ["assets/flux.css"],
+         format = Documenter.HTML(
             analytics = "UA-36890222-9",
-            prettyurls = haskey(ENV, "CI")))
+            assets = ["assets/flux.css"],
+            prettyurls = get(ENV, "CI", nothing) == "true"),
+)
 
-deploydocs(repo = "github.com/FluxML/Flux.jl.git")
+deploydocs(repo = "github.com/FluxML/Flux.jl.git",
+           target = "build",
+           push_preview = true)
docs/src/data/dataloader.md (new file)

@@ -0,0 +1,6 @@
+# DataLoader
+Flux provides the `DataLoader` type in the `Flux.Data` module to handle iteration over mini-batches of data.
+
+```@docs
+Flux.Data.DataLoader
+```
docs/src/data/onehot.md

@@ -31,6 +31,11 @@ julia> onecold([0.3, 0.2, 0.5], [:a, :b, :c])
 :c
 ```
 
+```@docs
+Flux.onehot
+Flux.onecold
+```
+
 ## Batches
 
 `onehotbatch` creates a batch (matrix) of one-hot vectors, and `onecold` treats matrices as batches.
@@ -52,3 +57,7 @@ julia> onecold(ans, [:a, :b, :c])
 ```
 
 Note that these operations returned `OneHotVector` and `OneHotMatrix` rather than `Array`s. `OneHotVector`s behave like normal vectors but avoid any unnecessary cost compared to using an integer index directly. For example, multiplying a matrix with a one-hot vector simply slices out the relevant row of the matrix under the hood.
+
+```@docs
+Flux.onehotbatch
+```
docs/src/datasets.md (new file)

@@ -0,0 +1,20 @@
+# Datasets
+
+Flux includes several standard machine learning datasets.
+
+```@docs
+Flux.Data.Iris.features()
+Flux.Data.Iris.labels()
+Flux.Data.MNIST.images()
+Flux.Data.MNIST.labels()
+Flux.Data.FashionMNIST.images()
+Flux.Data.FashionMNIST.labels()
+Flux.Data.CMUDict.phones()
+Flux.Data.CMUDict.symbols()
+Flux.Data.CMUDict.rawdict()
+Flux.Data.CMUDict.cmudict()
+Flux.Data.Sentiment.train()
+Flux.Data.Sentiment.test()
+Flux.Data.Sentiment.dev()
+```
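A quick sketch of how the dataset accessors listed above are typically called (the data is downloaded on first use; the sizes in the comments are stated from memory rather than from the commit):

```julia
using Flux

imgs   = Flux.Data.MNIST.images()    # vector of 28×28 greyscale images
labels = Flux.Data.MNIST.labels()    # vector of integer labels 0 to 9

features = Flux.Data.Iris.features() # 4×150 feature matrix
species  = Flux.Data.Iris.labels()   # 150-element vector of class names
```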
docs/src/ecosystem.md (new file)

@@ -0,0 +1,21 @@
+# The Julia Ecosystem
+
+One of the main strengths of Julia lies in an ecosystem of packages
+globally providing a rich and consistent user experience.
+
+This is a non-exhaustive list of Julia packages, nicely complementing `Flux` in typical
+machine learning and deep learning workflows:
+
+- [ArgParse.jl](https://github.com/carlobaldassi/ArgParse.jl): package for parsing command-line arguments to Julia programs.
+- [Augmentor.jl](https://github.com/Evizero/Augmentor.jl): a fast image augmentation library in Julia for machine learning.
+- [BSON.jl](https://github.com/JuliaIO/BSON.jl): package for working with the Binary JSON serialisation format
+- [DataFrames.jl](https://github.com/joshday/OnlineStats.jl): in-memory tabular data in Julia
+- [DrWatson.jl](https://github.com/JuliaDynamics/DrWatson.jl): a scientific project assistant software
+- [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl): utility package for accessing common machine learning datasets
+- [OnlineStats.jl](https://github.com/joshday/OnlineStats.jl): single-pass algorithms for statistics
+- [Parameters.jl](https://github.com/mauro3/Parameters.jl): types with default field values, keyword constructors and (un-)pack macros
+- [ProgressMeters.jl](https://github.com/timholy/ProgressMeter.jl): progress meters for long-running computations
+- [TensorBoardLogger.jl](https://github.com/PhilipVinc/TensorBoardLogger.jl): easy peasy logging to [tensorboard](https://www.tensorflow.org/tensorboard) in Julia
+
+This tight integration among Julia packages is shown in some of the examples in the [model-zoo](https://github.com/FluxML/model-zoo) repository.
docs/src/gpu.md

@@ -30,7 +30,7 @@ If you define a structured model, like a `Dense` layer or `Chain`, you just need
 ```julia
 d = Dense(10, 5, σ)
 d = fmap(cu, d)
-d.W # Tracked CuArray
+d.W # CuArray
 d(cu(rand(10))) # CuArray output
 
 m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
@@ -53,7 +53,7 @@ julia> x = rand(10) |> gpu
  0.511655
 
 julia> m(x)
-Tracked 5-element CuArray{Float32,1}:
+5-element CuArray{Float32,1}:
  -0.30535
  ⋮
  -0.618002
docs/src/models/advanced.md (new file)

@@ -0,0 +1,73 @@
+# Advanced Model Building and Customisation
+
+Here we will try and describe usage of some more advanced features that Flux provides to give more control over model building.
+
+## Customising Parameter Collection for a Model
+
+Taking reference from our example `Affine` layer from the [basics](basics.md#Building-Layers-1).
+
+By default all the fields in the `Affine` type are collected as its parameters, however, in some cases it may be desired to hold other metadata in our "layers" that may not be needed for training, and are hence supposed to be ignored while the parameters are collected. With Flux, it is possible to mark the fields of our layers that are trainable in two ways.
+
+The first way of achieving this is through overloading the `trainable` function.
+
+```julia-repl
+julia> @functor Affine
+
+julia> a = Affine(rand(3,3), rand(3))
+Affine{Array{Float64,2},Array{Float64,1}}([0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955])
+
+julia> Flux.params(a) # default behavior
+Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]])
+
+julia> Flux.trainable(a::Affine) = (a.W,)
+
+julia> Flux.params(a)
+Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297]])
+```
+
+Only the fields returned by `trainable` will be collected as trainable parameters of the layer when calling `Flux.params`.
+
+Another way of achieving this is through the `@functor` macro directly. Here, we can mark the fields we are interested in by grouping them in the second argument:
+
+```julia
+Flux.@functor Affine (W,)
+```
+
+However, doing this requires the `struct` to have a corresponding constructor that accepts those parameters.
+
+## Freezing Layer Parameters
+
+When it is desired to not include all the model parameters (for e.g. transfer learning), we can simply not pass in those layers into our call to `params`.
+
+Consider a simple multi-layer perceptron model where we want to avoid optimising the first two `Dense` layers. We can obtain
+this using the slicing features `Chain` provides:
+
+```julia
+m = Chain(
+      Dense(784, 64, relu),
+      Dense(64, 64, relu),
+      Dense(64, 10)
+    )
+
+ps = Flux.params(m[3:end])
+```
+
+The `Zygote.Params` object `ps` now holds a reference to only the parameters of the layers passed to it.
+
+During training, the gradients will only be computed for (and applied to) the last `Dense` layer, therefore only that would have its parameters changed.
+
+`Flux.params` also takes multiple inputs to make it easy to collect parameters from heterogeneous models with a single call. A simple demonstration would be if we wanted to omit optimising the second `Dense` layer in the previous example. It would look something like this:
+
+```julia
+Flux.params(m[1], m[3:end])
+```
+
+Sometimes, a more fine-tuned control is needed.
+We can freeze a specific parameter of a specific layer which already entered a `Params` object `ps`,
+by simply deleting it from `ps`:
+
+```julia
+ps = params(m)
+delete!(ps, m[2].b)
+```
docs/src/models/basics.md

@@ -69,8 +69,8 @@ b = rand(2)
 predict(x) = W*x .+ b
 
 function loss(x, y)
   ŷ = predict(x)
   sum((y .- ŷ).^2)
 end
 
 x, y = rand(5), rand(2) # Dummy data
@@ -220,6 +220,8 @@ Flux.@functor Affine
 
 This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md).
 
+For some more helpful tricks, including parameter freezing, please checkout the [advanced usage guide](advanced.md).
+
 ## Utility functions
 
 Flux provides some utility functions to help you generate models in an automated fashion.
@@ -238,5 +240,5 @@ Currently limited to the following layers:
 - `MeanPool`
 
 ```@docs
-outdims
+Flux.outdims
 ```
docs/src/models/layers.md

@@ -14,10 +14,13 @@ These layers are used to build convolutional neural networks (CNNs).
 ```@docs
 Conv
 MaxPool
+GlobalMaxPool
 MeanPool
+GlobalMeanPool
 DepthwiseConv
 ConvTranspose
 CrossCor
+flatten
 ```
 
 ## Recurrent Layers
@@ -29,6 +32,7 @@ RNN
 LSTM
 GRU
 Flux.Recur
+Flux.reset!
 ```
 
 ## Other General Purpose Layers
@@ -40,40 +44,45 @@ Maxout
 SkipConnection
 ```
 
-## Activation Functions
-
-Non-linearities that go between layers of your model. Most of these functions are defined in [NNlib](https://github.com/FluxML/NNlib.jl) but are available by default in Flux.
-
-Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on.
-
-```@docs
-σ
-relu
-leakyrelu
-elu
-swish
-```
-
 ## Normalisation & Regularisation
 
 These layers don't affect the structure of the network but may improve training times or reduce overfitting.
 
 ```@docs
+Flux.normalise
 BatchNorm
+Flux.dropout
 Dropout
 AlphaDropout
 LayerNorm
+InstanceNorm
 GroupNorm
 ```
 
+### Testmode
+
+Many normalisation layers behave differently under training and inference (testing). By default, Flux will automatically determine when a layer evaluation is part of training or inference. Still, depending on your use case, it may be helpful to manually specify when these layers should be treated as being trained or not. For this, Flux provides `Flux.testmode!`. When called on a model (e.g. a layer or chain of layers), this function will place the model into the mode specified.
+
+```@docs
+Flux.testmode!
+trainmode!
+```
+
 ## Cost Functions
 ```@docs
-mse
-crossentropy
-logitcrossentropy
-binarycrossentropy
-logitbinarycrossentropy
-kldivergence
-poisson
-hinge
+Flux.mae
+Flux.mse
+Flux.msle
+Flux.huber_loss
+Flux.crossentropy
+Flux.logitcrossentropy
+Flux.binarycrossentropy
+Flux.logitbinarycrossentropy
+Flux.kldivergence
+Flux.poisson
+Flux.hinge
+Flux.squared_hinge
+Flux.dice_coeff_loss
+Flux.tversky_loss
 ```
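The new Testmode prose above ships without an example in this commit, so here is a minimal sketch of the API it describes (the model and sizes are arbitrary assumptions):

```julia
using Flux

m = Chain(Dense(10, 5, relu), Dropout(0.5), Dense(5, 2))

Flux.testmode!(m)          # force inference behaviour (Dropout becomes a no-op)
y = m(rand(Float32, 10))
Flux.trainmode!(m)         # force training behaviour again
Flux.testmode!(m, :auto)   # back to automatic detection, the default
```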
docs/src/models/nnlib.md (new file)

@@ -0,0 +1,61 @@
+# NNlib
+
+Flux re-exports all of the functions exported by the [NNlib](https://github.com/FluxML/NNlib.jl) package.
+
+## Activation Functions
+
+Non-linearities that go between layers of your model. Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on.
+
+```@docs
+NNlib.celu
+NNlib.elu
+NNlib.gelu
+NNlib.hardsigmoid
+NNlib.hardtanh
+NNlib.leakyrelu
+NNlib.lisht
+NNlib.logcosh
+NNlib.logsigmoid
+NNlib.mish
+NNlib.relu
+NNlib.relu6
+NNlib.rrelu
+NNlib.selu
+NNlib.sigmoid
+NNlib.softplus
+NNlib.softshrink
+NNlib.softsign
+NNlib.swish
+NNlib.tanhshrink
+NNlib.trelu
+```
+
+## Softmax
+
+```@docs
+NNlib.softmax
+NNlib.logsoftmax
+```
+
+## Pooling
+
+```@docs
+NNlib.maxpool
+NNlib.meanpool
+```
+
+## Convolution
+
+```@docs
+NNlib.conv
+NNlib.depthwiseconv
+```
+
+## Batched Operations
+
+```@docs
+NNlib.batched_mul
+NNlib.batched_mul!
+NNlib.batched_adjoint
+NNlib.batched_transpose
+```
docs/src/models/regularisation.md

@@ -31,7 +31,7 @@ julia> params(m)
 param([0.0, 0.0, 0.0, 0.0, 0.0])
 
 julia> sum(norm, params(m))
-26.01749952921026 (tracked)
+26.01749952921026
 ```
 
 Here's a larger example with a multi-layer perceptron.
@@ -52,7 +52,7 @@ One can also easily add per-layer regularisation via the `activations` function:
 ```julia
 julia> using Flux: activations
 
-julia> c = Chain(Dense(10,5,σ),Dense(5,2),softmax)
+julia> c = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
 Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
 
 julia> activations(c, rand(10))
@@ -64,3 +64,7 @@ julia> activations(c, rand(10))
 julia> sum(norm, ans)
 2.1166067f0
 ```
+
+```@docs
+Flux.activations
+```
docs/src/performance.md

@@ -4,7 +4,7 @@ All the usual [Julia performance tips apply](https://docs.julialang.org/en/v1/ma
 As always [profiling your code](https://docs.julialang.org/en/v1/manual/profile/#Profiling-1) is generally a useful way of finding bottlenecks.
 Below follow some Flux specific tips/reminders.
 
-## Don't use more precision than you need.
+## Don't use more precision than you need
 
 Flux works great with all kinds of number types.
 But often you do not need to be working with say `Float64` (let alone `BigFloat`).
@@ -14,7 +14,8 @@ Which means allocations occur much faster.
 And you use less memory.
 
 
-## Make sure your activation and loss functions preserve the type of their inputs
+## Preserve inputs' types
 
 Not only should your activation and loss functions be [type-stable](https://docs.julialang.org/en/v1/manual/performance-tips/#Write-%22type-stable%22-functions-1),
 they should also preserve the type of their inputs.
@@ -29,31 +30,29 @@ because it results in having to use slow mixed type multiplication in the dense
 Similar situations can occur in the loss function during backpropagation.
 
 Which means if you change your data say from `Float64` to `Float32` (which should give a speedup: see above),
-you will see a large slow-down
+you will see a large slow-down.
 
 This can occur sneakily, because you can cause type-promotion by interacting with a numeric literals.
 E.g. the following will have run into the same problem as above:
 
 ```
-leaky_tanh(x) = 0.01x + tanh(x)
+leaky_tanh(x) = 0.01*x + tanh(x)
 ```
 
-While one could change your activation function (e.g. to use `0.01f0x`) to avoid this when ever your inputs change,
-the idiomatic (and safe way) is to use `oftype`.
+While one could change the activation function (e.g. to use `0.01f0x`), the idiomatic (and safe way) to avoid type casts whenever inputs changes is to use `oftype`:
 
 ```
-leaky_tanh(x) = oftype(x/1, 0.01)x + tanh(x)
+leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x)
 ```
 
 
-## Evaluate batches as Matrices of features, rather than sequences of Vector features
+## Evaluate batches as Matrices of features
 
 While it can sometimes be tempting to process your observations (feature vectors) one at a time
 e.g.
 ```julia
 function loss_total(xs::AbstractVector{<:Vector}, ys::AbstractVector{<:Vector})
   sum(zip(xs, ys)) do (x, y_target)
     y_pred = model(x) # evaluate the model
     return loss(y_pred, y_target)
   end
 end
```
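The hunk above stops before the batched counterpart, so here is a rough sketch of the matrix version it argues for (it reuses the `model`, `loss`, `xs` and `ys` names from the example; everything else is assumed):

```julia
# Concatenate the feature vectors into matrices once, then make a single
# model call per batch instead of one call per observation.
function loss_total(x_batch::AbstractMatrix, y_batch::AbstractMatrix)
    y_preds = model(x_batch)
    sum(zip(eachcol(y_preds), eachcol(y_batch))) do (y_pred, y_target)
        loss(y_pred, y_target)
    end
end

loss_total(reduce(hcat, xs), reduce(hcat, ys))
```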
docs/src/training/optimisers.md

@@ -21,7 +21,7 @@ grads = gradient(() -> loss(x, y), θ)
 We want to update each parameter, using the gradient, in order to improve (reduce) the loss. Here's one way to do that:
 
 ```julia
-using Flux: update!
+using Flux.Optimise: update!
 
 η = 0.1 # Learning Rate
 for p in (W, b)
@@ -46,11 +46,13 @@ An optimiser `update!` accepts a parameter and a gradient, and updates the param
 All optimisers return an object that, when passed to `train!`, will update the parameters passed to it.
 
 ```@docs
+Flux.Optimise.update!
 Descent
 Momentum
 Nesterov
 RMSProp
 ADAM
+RADAM
 AdaMax
 ADAGrad
 ADADelta
@@ -61,7 +63,7 @@ ADAMW
 
 ## Optimiser Interface
 
-Flux's optimsers are built around a `struct` that holds all the optimiser parameters along with a definition of how to apply the update rule associated with it. We do this via the `apply!` function which takes the optimiser as the first argument followed by the parameter and its corresponding gradient.
+Flux's optimisers are built around a `struct` that holds all the optimiser parameters along with a definition of how to apply the update rule associated with it. We do this via the `apply!` function which takes the optimiser as the first argument followed by the parameter and its corresponding gradient.
 
 In this manner Flux also allows one to create custom optimisers to be used seamlessly. Let's work this with a simple example.
 
@@ -99,15 +101,15 @@ Flux internally calls on this function via the `update!` function. It shares the
 
 ## Composing Optimisers
 
-Flux defines a special kind of optimiser called simply as `Optimiser` which takes in a arbitrary optimisers as input. Its behaviour is similar to the usual optimisers, but differs in that it acts by calling the optimisers listed in it sequentially. Each optimiser produces a modified gradient
+Flux defines a special kind of optimiser simply called `Optimiser` which takes in arbitrary optimisers as input. Its behaviour is similar to the usual optimisers, but differs in that it acts by calling the optimisers listed in it sequentially. Each optimiser produces a modified gradient
 that will be fed into the next, and the resultant update will be applied to the parameter as usual. A classic use case is where adding decays is desirable. Flux defines some basic decays including `ExpDecay`, `InvDecay` etc.
 
 ```julia
 opt = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), Descent())
 ```
 
-Here we apply exponential decay to the `Descent` optimser. The defaults of `ExpDecay` say that its learning rate will be decayed every 1000 steps.
-It is then applied like any optimser.
+Here we apply exponential decay to the `Descent` optimiser. The defaults of `ExpDecay` say that its learning rate will be decayed every 1000 steps.
+It is then applied like any optimiser.
 
 ```julia
 w = randn(10, 10)
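The custom-optimiser example referenced in the prose above falls outside the shown hunks; a minimal sketch of the `apply!` interface it describes (the `MyDescent` name and the toy loss are invented for illustration):

```julia
using Flux

mutable struct MyDescent
    eta::Float64
end

# apply! receives the optimiser, the parameter and its gradient, and returns
# the update that update! will subtract from the parameter.
function Flux.Optimise.apply!(o::MyDescent, p, Δ)
    Δ .*= o.eta
    return Δ
end

# Usable anywhere a built-in optimiser is expected:
W, b = rand(2, 5), rand(2)
loss(x, y) = sum((W * x .+ b .- y) .^ 2)
θ = Flux.params(W, b)
gs = Flux.gradient(() -> loss(rand(5), rand(2)), θ)
Flux.Optimise.update!(MyDescent(0.1), θ, gs)
```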
|
|
|
@ -7,10 +7,10 @@ To actually train a model we need four things:
|
||||||
* A collection of data points that will be provided to the objective function.
|
* A collection of data points that will be provided to the objective function.
|
||||||
* An [optimiser](optimisers.md) that will update the model parameters appropriately.
|
* An [optimiser](optimisers.md) that will update the model parameters appropriately.
|
||||||
|
|
||||||
With these we can call `Flux.train!`:
|
With these we can call `train!`:
|
||||||
|
|
||||||
```julia
|
```@docs
|
||||||
Flux.train!(objective, params, data, opt)
|
Flux.Optimise.train!
|
||||||
```
|
```
|
||||||
|
|
||||||
There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo).
|
There are plenty of examples in the [model zoo](https://github.com/FluxML/model-zoo).
|
||||||
|
@ -32,6 +32,7 @@ Flux.train!(loss, ps, data, opt)
|
||||||
```
|
```
|
||||||
|
|
||||||
The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want.
|
The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want.
|
||||||
|
For a list of all built-in loss functions, check out the [layer reference](../models/layers.md).
|
||||||
|
|
||||||
At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimizing the calculations.
|
At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimizing the calculations.
|
||||||
|
|
||||||
|
@ -41,6 +42,8 @@ The model to be trained must have a set of tracked parameters that are used to c
|
||||||
|
|
||||||
Such an object contains a reference to the model's parameters, not a copy, such that after their training, the model behaves according to their updated values.
|
Such an object contains a reference to the model's parameters, not a copy, such that after their training, the model behaves according to their updated values.
|
||||||
|
|
||||||
|
Handling all the parameters on a layer by layer basis is explained in the [Layer Helpers](../models/basics.md) section. Also, for freezing model parameters, see the [Advanced Usage Guide](../models/advanced.md).
|
||||||
|
|
||||||
## Datasets
|
## Datasets
|
||||||
|
|
||||||
The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point:
|
The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point:
|
||||||
|
@ -56,7 +59,8 @@ data = [(x, y)]
|
||||||
```julia
|
```julia
|
||||||
data = [(x, y), (x, y), (x, y)]
|
data = [(x, y), (x, y), (x, y)]
|
||||||
# Or equivalently
|
# Or equivalently
|
||||||
data = Iterators.repeated((x, y), 3)
|
using IterTools: ncycle
|
||||||
|
data = ncycle([(x, y)], 3)
|
||||||
```
|
```
|
||||||
|
|
||||||
It's common to load the `x`s and `y`s separately. In this case you can use `zip`:
|
It's common to load the `x`s and `y`s separately. In this case you can use `zip`:
|
||||||
|
@ -67,6 +71,14 @@ ys = [rand( 10), rand( 10), rand( 10)]
|
||||||
data = zip(xs, ys)
```
|
||||||
|
|
||||||
|
Training data can be conveniently partitioned for mini-batch training using the [`Flux.Data.DataLoader`](@ref) type:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
X = rand(28, 28, 60000)
|
||||||
|
Y = rand(0:9, 60000)
|
||||||
|
data = DataLoader(X, Y, batchsize=128)
|
||||||
|
```
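The resulting loader can then be passed to `train!` in place of `data`. A sketch, assuming `loss`, `ps` and `opt` are defined as earlier on this page:

```julia
train_loader = DataLoader(X, Y, batchsize=128, shuffle=true)
Flux.train!(loss, ps, train_loader, opt)
```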
|
||||||
|
|
||||||
Note that, by default, `train!` only loops over the data once (a single "epoch").
A convenient way to run multiple epochs from the REPL is provided by `@epochs`.
|
||||||
|
|
||||||
|
@ -83,6 +95,10 @@ julia> @epochs 2 Flux.train!(...)
|
||||||
# Train for two epochs
|
# Train for two epochs
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```@docs
|
||||||
|
Flux.@epochs
|
||||||
|
```
|
||||||
|
|
||||||
## Callbacks
|
## Callbacks
|
||||||
|
|
||||||
`train!` takes an additional argument, `cb`, that's used for callbacks so that you can observe the training process. For example:
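(A minimal sketch, assuming `loss`, `ps`, `data` and `opt` are defined; `Flux.throttle` limits the callback to at most one call every ten seconds.)

```julia
Flux.train!(loss, ps, data, opt,
            cb = Flux.throttle(() -> println("training..."), 10))
```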
|
@ -120,7 +136,7 @@ An example follows that works similar to the default `Flux.train` but with no ca
|
||||||
You don't need callbacks if you just code the calls to your functions directly into the loop, e.g. in the places marked with comments in the sketch below.
|
||||||
|
|
||||||
```julia
|
||||||
function my_custom_train!(loss, ps, data, opt)
  ps = Params(ps)
  for d in data
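    # A sketch of how the body might continue, using the standard `gradient`/`update!`
    # calls; the comments mark the places where callback-style code could go.
    gs = gradient(ps) do
      loss(d...)
    end
    # e.g. log the current loss or save a checkpoint here
    Flux.Optimise.update!(opt, ps, gs)
    # ...and run any other per-step logic here
  end
end
```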
@ -0,0 +1,49 @@
|
||||||
|
# Utility Functions
|
||||||
|
|
||||||
|
Flux contains some utility functions for working with data; these functions
|
||||||
|
help create inputs for your models or batch your dataset.
|
||||||
|
Other functions can be used to initialize your layers or to regularly execute
|
||||||
|
callback functions.
|
||||||
|
|
||||||
|
## Working with Data
|
||||||
|
|
||||||
|
```@docs
|
||||||
|
Flux.unsqueeze
|
||||||
|
Flux.stack
|
||||||
|
Flux.unstack
|
||||||
|
Flux.chunk
|
||||||
|
Flux.frequencies
|
||||||
|
Flux.batch
|
||||||
|
Flux.batchseq
|
||||||
|
Base.rpad(v::AbstractVector, n::Integer, p)
|
||||||
|
```
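A quick, illustrative sketch of a few of these helpers (resulting shapes noted in the comments):

```julia
using Flux

Flux.unsqueeze([1, 2, 3], 2)         # 3×1 matrix: inserts a new dimension at position 2
Flux.batch([[1, 2, 3], [4, 5, 6]])   # 3×2 matrix: one column per observation
Flux.chunk(1:10, 3)                  # splits the collection into 3 roughly equal parts
```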
|
||||||
|
|
||||||
|
## Layer Initialization
|
||||||
|
|
||||||
|
These are primarily useful if you are planning to write your own layers.
|
||||||
|
Flux initializes convolutional layers and recurrent cells with `glorot_uniform`
|
||||||
|
by default.
|
||||||
|
To change the default on an applicable layer, pass the desired function with the
|
||||||
|
`init` keyword. For example:
|
||||||
|
```jldoctest; setup = :(using Flux)
|
||||||
|
julia> conv = Conv((3, 3), 1 => 8, relu; init=Flux.glorot_normal)
|
||||||
|
Conv((3, 3), 1=>8, relu)
|
||||||
|
```
|
||||||
|
|
||||||
|
```@docs
|
||||||
|
Flux.glorot_uniform
|
||||||
|
Flux.glorot_normal
|
||||||
|
```
|
||||||
|
|
||||||
|
## Model Abstraction
|
||||||
|
|
||||||
|
```@docs
|
||||||
|
Flux.destructure
|
||||||
|
```
|
||||||
|
|
||||||
|
## Callback Helpers
|
||||||
|
|
||||||
|
```@docs
|
||||||
|
Flux.throttle
|
||||||
|
Flux.stop
|
||||||
|
```
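A short sketch of how these are typically combined with `train!` (assuming `loss`, `x`, `y`, `ps`, `data` and `opt` are defined):

```julia
evalcb = Flux.throttle(() -> @show(loss(x, y)), 10)    # run at most once every 10 s
stopcb() = loss(x, y) < 0.01 && Flux.stop()            # end training early from a callback

Flux.train!(loss, ps, data, opt, cb = [evalcb, stopcb])
```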
|
31
src/Flux.jl
31
src/Flux.jl
|
@ -7,16 +7,18 @@ using Zygote, MacroTools, Juno, Reexport, Statistics, Random
|
||||||
using MacroTools: @forward
|
using MacroTools: @forward
|
||||||
@reexport using NNlib
|
@reexport using NNlib
|
||||||
using Zygote: Params, @adjoint, gradient, pullback, @nograd
|
using Zygote: Params, @adjoint, gradient, pullback, @nograd
|
||||||
|
|
||||||
export gradient
|
export gradient
|
||||||
|
|
||||||
export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool,
|
export Chain, Dense, Maxout, RNN, LSTM, GRU, SamePad, Conv, CrossCor, ConvTranspose,
|
||||||
|
GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, flatten,
|
||||||
DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
|
DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
|
||||||
SkipConnection, params, fmap, cpu, gpu, f32, f64
|
SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode!
|
||||||
|
|
||||||
include("optimise/Optimise.jl")
|
include("optimise/Optimise.jl")
|
||||||
using .Optimise
|
using .Optimise
|
||||||
using .Optimise: @epochs
|
using .Optimise: @epochs
|
||||||
export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
|
export Descent, ADAM, Momentum, Nesterov, RMSProp,
|
||||||
ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
|
ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
|
||||||
ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay
|
ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay
|
||||||
|
|
||||||
|
@ -38,24 +40,13 @@ include("data/Data.jl")
|
||||||
|
|
||||||
include("deprecations.jl")
|
include("deprecations.jl")
|
||||||
|
|
||||||
|
include("cuda/cuda.jl")
|
||||||
|
|
||||||
function __init__()
|
function __init__()
|
||||||
precompiling = ccall(:jl_generating_output, Cint, ()) != 0
|
use_cuda[] = CuArrays.functional() # Can be overridden after load with `Flux.use_cuda[] = false`
|
||||||
|
if CuArrays.functional()
|
||||||
# we don't want to include the CUDA module when precompiling,
|
if !CuArrays.has_cudnn()
|
||||||
# or we could end up replacing it at run time (triggering a warning)
|
@warn "CuArrays.jl found cuda, but did not find libcudnn. Some functionality will not be available."
|
||||||
precompiling && return
|
|
||||||
|
|
||||||
if !CuArrays.functional()
|
|
||||||
# nothing to do here, and either CuArrays or one of its dependencies will have warned
|
|
||||||
else
|
|
||||||
use_cuda[] = true
|
|
||||||
|
|
||||||
# FIXME: this functionality should be conditional at run time by checking `use_cuda`
|
|
||||||
# (or even better, get moved to CuArrays.jl as much as possible)
|
|
||||||
if CuArrays.has_cudnn()
|
|
||||||
include(joinpath(@__DIR__, "cuda/cuda.jl"))
|
|
||||||
else
|
|
||||||
@warn "CuArrays.jl did not find libcudnn. Some functionality will not be available."
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -3,6 +3,9 @@ module Data
|
||||||
import ..Flux
|
import ..Flux
|
||||||
import SHA
|
import SHA
|
||||||
|
|
||||||
|
using Random: shuffle!
|
||||||
|
using Base: @propagate_inbounds
|
||||||
|
|
||||||
export CMUDict, cmudict
|
export CMUDict, cmudict
|
||||||
|
|
||||||
deps(path...) = joinpath(@__DIR__, "..", "..", "deps", path...)
|
deps(path...) = joinpath(@__DIR__, "..", "..", "deps", path...)
|
||||||
|
@ -26,6 +29,9 @@ function __init__()
|
||||||
mkpath(deps())
|
mkpath(deps())
|
||||||
end
|
end
|
||||||
|
|
||||||
|
include("dataloader.jl")
|
||||||
|
export DataLoader
|
||||||
|
|
||||||
include("mnist.jl")
|
include("mnist.jl")
|
||||||
export MNIST
|
export MNIST
|
||||||
|
|
||||||
|
@ -42,4 +48,7 @@ using .Sentiment
|
||||||
include("iris.jl")
|
include("iris.jl")
|
||||||
export Iris
|
export Iris
|
||||||
|
|
||||||
|
include("housing.jl")
|
||||||
|
export Housing
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -24,18 +24,35 @@ function load()
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
phones()
|
||||||
|
|
||||||
|
Return a `Vector` containing the phones used in the CMU Pronouncing Dictionary.
|
||||||
|
"""
|
||||||
function phones()
|
function phones()
|
||||||
load()
|
load()
|
||||||
Symbol.(first.(split.(split(read(deps("cmudict", "cmudict.phones"),String),
|
Symbol.(first.(split.(split(read(deps("cmudict", "cmudict.phones"),String),
|
||||||
"\n", keepempty = false), "\t")))
|
"\n", keepempty = false), "\t")))
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
symbols()
|
||||||
|
|
||||||
|
Return a `Vector` containing the symbols used in the CMU Pronouncing Dictionary.
|
||||||
|
A symbol is a phone with optional auxiliary symbols, indicating for example the
|
||||||
|
amount of stress on the phone.
|
||||||
|
"""
|
||||||
function symbols()
|
function symbols()
|
||||||
load()
|
load()
|
||||||
Symbol.(split(read(deps("cmudict", "cmudict.symbols"),String),
|
Symbol.(split(read(deps("cmudict", "cmudict.symbols"),String),
|
||||||
"\n", keepempty = false))
|
"\n", keepempty = false))
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
rawdict()
|
||||||
|
|
||||||
|
Return the unfiltered CMU Pronouncing Dictionary.
|
||||||
|
"""
|
||||||
function rawdict()
|
function rawdict()
|
||||||
load()
|
load()
|
||||||
Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in
|
Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in
|
||||||
|
@ -44,6 +61,14 @@ end
|
||||||
|
|
||||||
validword(s) = isascii(s) && occursin(r"^[\w\-\.]+$", s)
|
validword(s) = isascii(s) && occursin(r"^[\w\-\.]+$", s)
|
||||||
|
|
||||||
|
"""
|
||||||
|
cmudict()
|
||||||
|
|
||||||
|
Return a filtered CMU Pronouncing Dictionary.
|
||||||
|
|
||||||
|
It is filtered so each word contains only ASCII characters and a combination of
|
||||||
|
word characters (as determined by the regex engine using `\\w`), '-' and '.'.
|
||||||
|
"""
|
||||||
cmudict() = filter(p -> validword(p.first), rawdict())
|
cmudict() = filter(p -> validword(p.first), rawdict())
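# Illustrative usage (a sketch; the dictionary files are downloaded on first use by `load()`):
#   d = cmudict()
#   d["HELLO"]      # pronunciation of the word as a Vector of phone Symbols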
|
||||||
|
|
||||||
alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.']
|
alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.']
|
||||||
|
|
|
@ -0,0 +1,92 @@
|
||||||
|
# Adapted from Knet's src/data.jl (author: Deniz Yuret)
|
||||||
|
|
||||||
|
struct DataLoader
|
||||||
|
data
|
||||||
|
batchsize::Int
|
||||||
|
nobs::Int
|
||||||
|
partial::Bool
|
||||||
|
imax::Int
|
||||||
|
indices::Vector{Int}
|
||||||
|
shuffle::Bool
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
DataLoader(data...; batchsize=1, shuffle=false, partial=true)
|
||||||
|
|
||||||
|
An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations
|
||||||
|
(except possibly the last one).
|
||||||
|
|
||||||
|
Takes as input one or more data tensors, e.g. X in unsupervised learning, X and Y in
|
||||||
|
supervised learning. The last dimension in each tensor is considered to be the observation
|
||||||
|
dimension.
|
||||||
|
|
||||||
|
If `shuffle=true`, shuffles the observations each time iterations are re-started.
|
||||||
|
If `partial=false`, drops the last mini-batch if it is smaller than the batchsize.
|
||||||
|
|
||||||
|
The original data is preserved as a tuple in the `data` field of the DataLoader.
|
||||||
|
|
||||||
|
Example usage:
|
||||||
|
|
||||||
|
Xtrain = rand(10, 100)
|
||||||
|
train_loader = DataLoader(Xtrain, batchsize=2)
|
||||||
|
# iterate over 50 mini-batches of size 2
|
||||||
|
for x in train_loader
|
||||||
|
@assert size(x) == (10, 2)
|
||||||
|
...
|
||||||
|
end
|
||||||
|
|
||||||
|
train_loader.data # original dataset
|
||||||
|
|
||||||
|
Xtrain = rand(10, 100)
|
||||||
|
Ytrain = rand(100)
|
||||||
|
train_loader = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true)
|
||||||
|
for epoch in 1:100
|
||||||
|
for (x, y) in train_loader
|
||||||
|
@assert size(x) == (10, 2)
|
||||||
|
@assert size(y) == (2,)
|
||||||
|
...
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# train for 10 epochs
|
||||||
|
using IterTools: ncycle
|
||||||
|
Flux.train!(loss, ps, ncycle(train_loader, 10), opt)
|
||||||
|
"""
|
||||||
|
function DataLoader(data...; batchsize=1, shuffle=false, partial=true)
|
||||||
|
length(data) > 0 || throw(ArgumentError("Need at least one data input"))
|
||||||
|
batchsize > 0 || throw(ArgumentError("Need positive batchsize"))
|
||||||
|
|
||||||
|
nx = size(data[1])[end]
|
||||||
|
for i=2:length(data)
|
||||||
|
nx != size(data[i])[end] && throw(DimensionMismatch("All data should contain same number of observations"))
|
||||||
|
end
|
||||||
|
if nx < batchsize
|
||||||
|
@warn "Number of data points less than batchsize, decreasing the batchsize to $nx"
|
||||||
|
batchsize = nx
|
||||||
|
end
|
||||||
|
imax = partial ? nx : nx - batchsize + 1
|
||||||
|
ids = 1:min(nx, batchsize)
|
||||||
|
DataLoader(data, batchsize, nx, partial, imax, [1:nx;], shuffle)
|
||||||
|
end
|
||||||
|
|
||||||
|
getdata(x::AbstractArray, ids) = x[(Base.Colon() for _=1:ndims(x)-1)..., ids]
|
||||||
|
|
||||||
|
@propagate_inbounds function Base.iterate(d::DataLoader, i=0) # returns data in d.indices[i+1:i+batchsize]
|
||||||
|
i >= d.imax && return nothing
|
||||||
|
if d.shuffle && i == 0
|
||||||
|
shuffle!(d.indices)
|
||||||
|
end
|
||||||
|
nexti = min(i + d.batchsize, d.nobs)
|
||||||
|
ids = d.indices[i+1:nexti]
|
||||||
|
if length(d.data) == 1
|
||||||
|
batch = getdata(d.data[1], ids)
|
||||||
|
else
|
||||||
|
batch = ((getdata(x, ids) for x in d.data)...,)
|
||||||
|
end
|
||||||
|
return (batch, nexti)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.length(d::DataLoader)
|
||||||
|
n = d.nobs / d.batchsize
|
||||||
|
d.partial ? ceil(Int,n) : floor(Int,n)
|
||||||
|
end
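# Illustrative example of how `partial` interacts with `length`:
#   length(DataLoader(rand(2, 10), batchsize=3))                 == 4   # last, smaller batch kept
#   length(DataLoader(rand(2, 10), batchsize=3, partial=false))  == 3   # last, smaller batch dropped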
|
|
@ -33,9 +33,10 @@ const TESTLABELS = joinpath(dir, "t10k-labels-idx1-ubyte")
|
||||||
|
|
||||||
Load the Fashion-MNIST images.
|
Load the Fashion-MNIST images.
|
||||||
|
|
||||||
Each image is a 28×28 array of `Gray` colour values (see Colors.jl).
|
Each image is a 28×28 array of `Gray` colour values
|
||||||
|
(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)).
|
||||||
|
|
||||||
Returns the 60,000 training images by default; pass `:test` to retreive the
|
Return the 60,000 training images by default; pass `:test` to retrieve the
|
||||||
10,000 test images.
|
10,000 test images.
|
||||||
"""
|
"""
|
||||||
function images(set = :train)
|
function images(set = :train)
|
||||||
|
@ -49,10 +50,10 @@ end
|
||||||
labels()
|
labels()
|
||||||
labels(:test)
|
labels(:test)
|
||||||
|
|
||||||
Load the labels corresponding to each of the images returned from `images()`.
|
Load the labels corresponding to each of the images returned from [`images()`](@ref).
|
||||||
Each label is a number from 0-9.
|
Each label is a number from 0-9.
|
||||||
|
|
||||||
Returns the 60,000 training labels by default; pass `:test` to retreive the
|
Return the 60,000 training labels by default; pass `:test` to retrieve the
|
||||||
10,000 test labels.
|
10,000 test labels.
|
||||||
"""
|
"""
|
||||||
function labels(set = :train)
|
function labels(set = :train)
|
||||||
|
|
|
@ -0,0 +1,136 @@
|
||||||
|
"""
|
||||||
|
1. Title: Boston Housing Data
|
||||||
|
|
||||||
|
2. Sources:
|
||||||
|
(a) Origin: This dataset was taken from the StatLib library which is
|
||||||
|
maintained at Carnegie Mellon University.
|
||||||
|
(b) Creator: Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the
|
||||||
|
demand for clean air', J. Environ. Economics & Management,
|
||||||
|
vol.5, 81-102, 1978.
|
||||||
|
(c) Date: July 7, 1993
|
||||||
|
|
||||||
|
3. Number of Instances: 506
|
||||||
|
|
||||||
|
4. Number of Attributes: 13 continuous attributes (including "class"
|
||||||
|
attribute "MEDV"), 1 binary-valued attribute.
|
||||||
|
|
||||||
|
5. Attribute Information:
|
||||||
|
|
||||||
|
1. CRIM per capita crime rate by town
|
||||||
|
2. ZN proportion of residential land zoned for lots over
|
||||||
|
25,000 sq.ft.
|
||||||
|
3. INDUS proportion of non-retail business acres per town
|
||||||
|
4. CHAS Charles River dummy variable (= 1 if tract bounds
|
||||||
|
river; 0 otherwise)
|
||||||
|
5. NOX nitric oxides concentration (parts per 10 million)
|
||||||
|
6. RM average number of rooms per dwelling
|
||||||
|
7. AGE proportion of owner-occupied units built prior to 1940
|
||||||
|
8. DIS weighted distances to five Boston employment centres
|
||||||
|
9. RAD index of accessibility to radial highways
|
||||||
|
10. TAX full-value property-tax rate per 10,000 dollars
|
||||||
|
11. PTRATIO pupil-teacher ratio by town
|
||||||
|
12. B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks
|
||||||
|
by town
|
||||||
|
13. LSTAT % lower status of the population
|
||||||
|
14. MEDV Median value of owner-occupied homes in 1000's of dollars
|
||||||
|
|
||||||
|
Downloaded From: https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data
|
||||||
|
|
||||||
|
"""
|
||||||
|
module Housing
|
||||||
|
|
||||||
|
using DelimitedFiles
|
||||||
|
using ..Data: deps, download_and_verify
|
||||||
|
|
||||||
|
#Uncomment if package exists
|
||||||
|
#const cache_prefix = "https://cache.julialang.org/"
|
||||||
|
const cache_prefix = ""
|
||||||
|
|
||||||
|
function load()
|
||||||
|
isfile(deps("housing.data")) && return
|
||||||
|
|
||||||
|
@info "Downloading the Boston housing Dataset"
|
||||||
|
download_and_verify("$(cache_prefix)http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
|
||||||
|
deps("housing.data"),
|
||||||
|
"baadf72995725d76efe787b664e1f083388c79ba21ef9a7990d87f774184735a")
|
||||||
|
|
||||||
|
#@info "Download complete. Working on the files"
|
||||||
|
path = deps()
|
||||||
|
isfile(deps("housing.data")) && touch(joinpath(path, "tempfile.data"))
|
||||||
|
open(joinpath(path, "tempfile.data"), "a") do fout
|
||||||
|
open(deps("housing.data"), "r") do fin
|
||||||
|
for line in eachline(fin)
|
||||||
|
line = replace(lstrip(line), r" +" => s",")
|
||||||
|
println(fout, line)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
mv(joinpath(path, "tempfile.data"), deps("housing.data"), force=true)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
Get the targets for the Boston housing dataset: a 506-element array listing the target (median home value) for each example.
|
||||||
|
|
||||||
|
```jldoctest
|
||||||
|
julia> using Flux
|
||||||
|
|
||||||
|
julia> target = Flux.Data.Housing.targets()
|
||||||
|
|
||||||
|
julia> summary(target)
|
||||||
|
506×1 Array{Float64,2}
|
||||||
|
|
||||||
|
julia> target[1]
|
||||||
|
24.0
|
||||||
|
|
||||||
|
"""
|
||||||
|
function targets()
|
||||||
|
load()
|
||||||
|
housing = readdlm(deps("housing.data"), ',')
|
||||||
|
reshape(Vector{Float64}(housing[1:end,end]), (506, 1))
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Get the names of the features provided in the dataset.
|
||||||
|
|
||||||
|
"""
|
||||||
|
function feature_names()
|
||||||
|
["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"]
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Get the features of the Boston Housing dataset. This is a 506×13 `Matrix` of `Float64` values.
|
||||||
|
The values are in the order ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"].
|
||||||
|
It has 506 examples.
|
||||||
|
|
||||||
|
```jldoctest
|
||||||
|
julia> using Flux
|
||||||
|
|
||||||
|
julia> features = Flux.Data.Housing.features()
|
||||||
|
|
||||||
|
julia> summary(features)
|
||||||
|
506×13 Array{Float64,2}
|
||||||
|
|
||||||
|
julia> features[1, :]
|
||||||
|
13-element Array{Float64,1}:
|
||||||
|
0.00632
|
||||||
|
18.0
|
||||||
|
2.31
|
||||||
|
0.0
|
||||||
|
0.538
|
||||||
|
⋮
|
||||||
|
296.0
|
||||||
|
15.3
|
||||||
|
396.9
|
||||||
|
4.98
|
||||||
|
|
||||||
|
"""
|
||||||
|
function features()
|
||||||
|
load()
|
||||||
|
housing = readdlm(deps("housing.data"), ',')
|
||||||
|
Matrix{Float64}(housing[1:end, 1:13])
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
end
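# Illustrative usage (a sketch): assemble the dataset for training.
#   x = permutedims(Flux.Data.Housing.features())   # 13×506, one column per example
#   y = permutedims(Flux.Data.Housing.targets())    # 1×506
#   data = Flux.Data.DataLoader(x, y, batchsize=32, shuffle=true)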
|
|
@ -2,13 +2,12 @@
|
||||||
Fisher's classic iris dataset.
|
Fisher's classic iris dataset.
|
||||||
|
|
||||||
Measurements from 3 different species of iris: setosa, versicolor and
|
Measurements from 3 different species of iris: setosa, versicolor and
|
||||||
virginica. There are 50 examples of each species.
|
virginica. There are 50 examples of each species.
|
||||||
|
|
||||||
There are 4 measurements for each example: sepal length, sepal width, petal
|
There are 4 measurements for each example: sepal length, sepal width,
|
||||||
length and petal width. The measurements are in centimeters.
|
petal length and petal width. The measurements are in centimeters.
|
||||||
|
|
||||||
The module retrieves the data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/iris).
|
The module retrieves the data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/iris).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
module Iris
|
module Iris
|
||||||
|
|
||||||
|
@ -28,15 +27,12 @@ function load()
|
||||||
end
|
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
labels()
|
labels()
|
||||||
|
|
||||||
Get the labels of the iris dataset, a 150 element array of strings listing the
|
Get the labels of the iris dataset, a 150 element array of strings listing the
|
||||||
species of each example.
|
species of each example.
|
||||||
|
|
||||||
```jldoctest
|
```jldoctest; setup = :(Flux.Data.Iris.load())
|
||||||
julia> using Flux
|
|
||||||
|
|
||||||
julia> labels = Flux.Data.Iris.labels();
|
julia> labels = Flux.Data.Iris.labels();
|
||||||
|
|
||||||
julia> summary(labels)
|
julia> summary(labels)
|
||||||
|
@ -53,16 +49,13 @@ function labels()
|
||||||
end
|
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
features()
|
features()
|
||||||
|
|
||||||
Get the features of the iris dataset. This is a 4x150 matrix of Float64
|
Get the features of the iris dataset. This is a 4x150 matrix of Float64
|
||||||
elements. It has a row for each feature (sepal length, sepal width,
|
elements. It has a row for each feature (sepal length, sepal width,
|
||||||
petal length, petal width) and a column for each example.
|
petal length, petal width) and a column for each example.
|
||||||
|
|
||||||
```jldoctest
|
```jldoctest; setup = :(Flux.Data.Iris.load())
|
||||||
julia> using Flux
|
|
||||||
|
|
||||||
julia> features = Flux.Data.Iris.features();
|
julia> features = Flux.Data.Iris.features();
|
||||||
|
|
||||||
julia> summary(features)
|
julia> summary(features)
|
||||||
|
|
|
@ -83,9 +83,10 @@ getfeatures(io::IO, index::Integer) = vec(getimage(io, index))
|
||||||
|
|
||||||
Load the MNIST images.
|
Load the MNIST images.
|
||||||
|
|
||||||
Each image is a 28×28 array of `Gray` colour values (see Colors.jl).
|
Each image is a 28×28 array of `Gray` colour values
|
||||||
|
(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)).
|
||||||
|
|
||||||
Returns the 60,000 training images by default; pass `:test` to retreive the
|
Return the 60,000 training images by default; pass `:test` to retrieve the
|
||||||
10,000 test images.
|
10,000 test images.
|
||||||
"""
|
"""
|
||||||
function images(set = :train)
|
function images(set = :train)
|
||||||
|
@ -99,10 +100,10 @@ end
|
||||||
labels()
|
labels()
|
||||||
labels(:test)
|
labels(:test)
|
||||||
|
|
||||||
Load the labels corresponding to each of the images returned from `images()`.
|
Load the labels corresponding to each of the images returned from [`images()`](@ref).
|
||||||
Each label is a number from 0-9.
|
Each label is a number from 0-9.
|
||||||
|
|
||||||
Returns the 60,000 training labels by default; pass `:test` to retreive the
|
Return the 60,000 training labels by default; pass `:test` to retrieve the
|
||||||
10,000 test labels.
|
10,000 test labels.
|
||||||
"""
|
"""
|
||||||
function labels(set = :train)
|
function labels(set = :train)
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
"Stanford Sentiment Treebank dataset."
|
||||||
module Sentiment
|
module Sentiment
|
||||||
|
|
||||||
using ZipFile
|
using ZipFile
|
||||||
|
@ -39,8 +40,28 @@ function gettrees(name)
|
||||||
return parsetree.(ss)
|
return parsetree.(ss)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
train()
|
||||||
|
|
||||||
|
Return the train split of the Stanford Sentiment Treebank.
|
||||||
|
The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
|
||||||
|
"""
|
||||||
train() = gettrees("train")
|
train() = gettrees("train")
|
||||||
|
|
||||||
|
"""
|
||||||
|
test()
|
||||||
|
|
||||||
|
Return the test split of the Stanford Sentiment Treebank.
|
||||||
|
The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
|
||||||
|
"""
|
||||||
test() = gettrees("test")
|
test() = gettrees("test")
|
||||||
|
|
||||||
|
"""
|
||||||
|
dev()
|
||||||
|
|
||||||
|
Return the dev split of the Stanford Sentiment Treebank.
|
||||||
|
The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
|
||||||
|
"""
|
||||||
dev() = gettrees("dev")
|
dev() = gettrees("dev")
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -39,6 +39,38 @@ end
|
||||||
|
|
||||||
trainable(m) = functor(m)[1]
|
trainable(m) = functor(m)[1]
|
||||||
|
|
||||||
|
"""
|
||||||
|
testmode!(m, mode = true)
|
||||||
|
|
||||||
|
Set a layer or model's test mode (see below).
|
||||||
|
Using `:auto` mode will treat any gradient computation as training.
|
||||||
|
|
||||||
|
_Note_: if you manually set a model into test mode, you need to manually place
|
||||||
|
it back into train mode during the training phase.
|
||||||
|
|
||||||
|
Possible values include:
|
||||||
|
- `false` for training
|
||||||
|
- `true` for testing
|
||||||
|
- `:auto` or `nothing` for Flux to detect the mode automatically
|
||||||
|
"""
|
||||||
|
testmode!(m, mode = true) = m
|
||||||
|
|
||||||
|
"""
|
||||||
|
trainmode!(m, mode = true)
|
||||||
|
|
||||||
|
Set a layer or model's train mode (see below).
|
||||||
|
Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`).
|
||||||
|
|
||||||
|
_Note_: if you manually set a model into train mode, you need to manually place
|
||||||
|
it into test mode during the testing phase.
|
||||||
|
|
||||||
|
Possible values include:
|
||||||
|
- `true` for training
|
||||||
|
- `false` for testing
|
||||||
|
- `:auto` or `nothing` for Flux to detect the mode automatically
|
||||||
|
"""
|
||||||
|
trainmode!(m, mode = true) = mode isa Bool ? testmode!(m, !mode) : testmode!(m, mode)
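# Illustrative usage (a sketch, assuming a model containing stochastic layers such as Dropout):
#   m = Chain(Dense(10, 5), Dropout(0.5), Dense(5, 2))
#   testmode!(m)          # disable dropout for evaluation
#   trainmode!(m)         # re-enable it before further training
#   testmode!(m, :auto)   # let Flux detect the mode from gradient computation again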
|
||||||
|
|
||||||
params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x)
|
params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x)
|
||||||
|
|
||||||
function params!(p::Params, x, seen = IdSet())
|
function params!(p::Params, x, seen = IdSet())
|
||||||
|
|
|
@ -4,17 +4,23 @@
|
||||||
Chain multiple layers / functions together, so that they are called in sequence
|
Chain multiple layers / functions together, so that they are called in sequence
|
||||||
on a given input.
|
on a given input.
|
||||||
|
|
||||||
```julia
|
|
||||||
m = Chain(x -> x^2, x -> x+1)
|
|
||||||
m(5) == 26
|
|
||||||
|
|
||||||
m = Chain(Dense(10, 5), Dense(5, 2))
|
|
||||||
x = rand(10)
|
|
||||||
m(x) == m[2](m[1](x))
|
|
||||||
```
|
|
||||||
|
|
||||||
`Chain` also supports indexing and slicing, e.g. `m[2]` or `m[1:end-1]`.
|
`Chain` also supports indexing and slicing, e.g. `m[2]` or `m[1:end-1]`.
|
||||||
`m[1:3](x)` will calculate the output of the first three layers.
|
`m[1:3](x)` will calculate the output of the first three layers.
|
||||||
|
|
||||||
|
# Examples
|
||||||
|
```jldoctest
|
||||||
|
julia> m = Chain(x -> x^2, x -> x+1);
|
||||||
|
|
||||||
|
julia> m(5) == 26
|
||||||
|
true
|
||||||
|
|
||||||
|
julia> m = Chain(Dense(10, 5), Dense(5, 2));
|
||||||
|
|
||||||
|
julia> x = rand(10);
|
||||||
|
|
||||||
|
julia> m(x) == m[2](m[1](x))
|
||||||
|
true
|
||||||
|
```
|
||||||
"""
|
"""
|
||||||
struct Chain{T<:Tuple}
|
struct Chain{T<:Tuple}
|
||||||
layers::T
|
layers::T
|
||||||
|
@ -33,6 +39,8 @@ applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x))
|
||||||
|
|
||||||
Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...)
|
Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]...)
|
||||||
|
|
||||||
|
testmode!(m::Chain, mode = true) = (map(x -> testmode!(x, mode), m.layers); m)
|
||||||
|
|
||||||
function Base.show(io::IO, c::Chain)
|
function Base.show(io::IO, c::Chain)
|
||||||
print(io, "Chain(")
|
print(io, "Chain(")
|
||||||
join(io, c.layers, ", ")
|
join(io, c.layers, ", ")
|
||||||
|
@ -58,6 +66,7 @@ outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers))(
|
||||||
# only slightly changed to better handle interaction with Zygote @dsweber2
|
# only slightly changed to better handle interaction with Zygote @dsweber2
|
||||||
"""
|
"""
|
||||||
activations(c::Chain, input)
|
activations(c::Chain, input)
|
||||||
|
|
||||||
Calculate the forward results of each layers in Chain `c` with `input` as model input.
|
Calculate the forward results of each layers in Chain `c` with `input` as model input.
|
||||||
"""
|
"""
|
||||||
function activations(c::Chain, input)
|
function activations(c::Chain, input)
|
||||||
|
@ -76,22 +85,22 @@ extraChain(::Tuple{}, x) = ()
|
||||||
"""
|
"""
|
||||||
Dense(in::Integer, out::Integer, σ = identity)
|
Dense(in::Integer, out::Integer, σ = identity)
|
||||||
|
|
||||||
Creates a traditional `Dense` layer with parameters `W` and `b`.
|
Create a traditional `Dense` layer with parameters `W` and `b`.
|
||||||
|
|
||||||
y = σ.(W * x .+ b)
|
y = σ.(W * x .+ b)
|
||||||
|
|
||||||
The input `x` must be a vector of length `in`, or a batch of vectors represented
|
The input `x` must be a vector of length `in`, or a batch of vectors represented
|
||||||
as an `in × N` matrix. The out `y` will be a vector or batch of length `out`.
|
as an `in × N` matrix. The out `y` will be a vector or batch of length `out`.
|
||||||
|
|
||||||
```julia
|
# Examples
|
||||||
|
```jldoctest; setup = :(using Random; Random.seed!(0))
|
||||||
julia> d = Dense(5, 2)
|
julia> d = Dense(5, 2)
|
||||||
Dense(5, 2)
|
Dense(5, 2)
|
||||||
|
|
||||||
julia> d(rand(5))
|
julia> d(rand(5))
|
||||||
Tracked 2-element Array{Float64,1}:
|
2-element Array{Float32,1}:
|
||||||
0.00257447
|
-0.16210233
|
||||||
-0.00449443
|
0.12311903```
|
||||||
```
|
|
||||||
"""
|
"""
|
||||||
struct Dense{F,S,T}
|
struct Dense{F,S,T}
|
||||||
W::S
|
W::S
|
||||||
|
@ -143,7 +152,7 @@ outdims(l::Dense, isize) = (size(l.W)[1],)
|
||||||
"""
|
"""
|
||||||
Diagonal(in::Integer)
|
Diagonal(in::Integer)
|
||||||
|
|
||||||
Creates an element-wise linear transformation layer with learnable
|
Create an element-wise linear transformation layer with learnable
|
||||||
vectors `α` and `β`:
|
vectors `α` and `β`:
|
||||||
|
|
||||||
y = α .* x .+ β
|
y = α .* x .+ β
|
||||||
|
@ -174,18 +183,11 @@ outdims(l::Diagonal, isize) = (length(l.α),)
|
||||||
"""
|
"""
|
||||||
Maxout(over)
|
Maxout(over)
|
||||||
|
|
||||||
`Maxout` is a neural network layer, which has a number of internal layers,
|
The [Maxout](https://arxiv.org/pdf/1302.4389.pdf) layer has a number of
|
||||||
which all have the same input, and the maxout returns the elementwise maximium
|
internal layers which all receive the same input. It returns the elementwise
|
||||||
of the internal layers' outputs.
|
maximum of the internal layers' outputs.
|
||||||
|
|
||||||
Maxout over linear dense layers satisfies the universal approximation theorem.
|
||||||
|
|
||||||
Reference:
|
|
||||||
Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville, and Yoshua Bengio.
|
|
||||||
2013. Maxout networks.
|
|
||||||
In Proceedings of the 30th International Conference on International Conference on Machine Learning - Volume 28 (ICML'13),
|
|
||||||
Sanjoy Dasgupta and David McAllester (Eds.), Vol. 28. JMLR.org III-1319-III-1327.
|
|
||||||
https://arxiv.org/pdf/1302.4389.pdf
|
|
||||||
"""
|
"""
|
||||||
struct Maxout{FS<:Tuple}
|
struct Maxout{FS<:Tuple}
|
||||||
over::FS
|
over::FS
|
||||||
|
@ -194,17 +196,18 @@ end
|
||||||
"""
|
"""
|
||||||
Maxout(f, n_alts)
|
Maxout(f, n_alts)
|
||||||
|
|
||||||
Constructs a Maxout layer over `n_alts` instances of the layer given by `f`.
|
Construct a Maxout layer over `n_alts` instances of the layer given by `f`.
|
||||||
The function takes no arguement and should return some callable layer.
|
The function takes no arguments and should return some callable layer.
|
||||||
Conventionally this is a linear dense layer.
|
Conventionally, this is a linear dense layer.
|
||||||
|
|
||||||
For example the following example which
|
# Examples
|
||||||
will construct a `Maxout` layer over 4 internal dense linear layers,
|
|
||||||
each identical in structure (784 inputs, 128 outputs).
|
This constructs a `Maxout` layer over 4 internal dense linear layers, each
|
||||||
|
identical in structure (784 inputs, 128 outputs):
|
||||||
```julia
|
```julia
|
||||||
insize = 784
|
insize = 784
|
||||||
outsize = 128
|
outsize = 128
|
||||||
Maxout(()->Dense(insize, outsize), 4)
|
Maxout(()->Dense(insize, outsize), 4)
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
function Maxout(f, n_alts)
|
function Maxout(f, n_alts)
|
||||||
|
@ -221,16 +224,18 @@ end
|
||||||
outdims(l::Maxout, isize) = outdims(first(l.over), isize)
|
outdims(l::Maxout, isize) = outdims(first(l.over), isize)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
SkipConnection(layers, connection)
|
SkipConnection(layer, connection)
|
||||||
|
|
||||||
Creates a Skip Connection, of a layer or `Chain` of consecutive layers
|
Create a skip connection which consists of a layer or `Chain` of consecutive
|
||||||
plus a shortcut connection. The connection function will combine the result of the layers
|
layers and a shortcut connection linking the block's input to the output
|
||||||
with the original input, to give the final output.
|
through a user-supplied 2-argument callable. The first argument to the callable
|
||||||
|
will be propagated through the given `layer` while the second is the unchanged,
|
||||||
|
"skipped" input.
|
||||||
|
|
||||||
The simplest 'ResNet'-type connection is just `SkipConnection(layer, +)`,
|
The simplest "ResNet"-type connection is just `SkipConnection(layer, +)`,
|
||||||
and requires the output of the layers to be the same shape as the input.
|
and requires the output of the layers to be the same shape as the input.
|
||||||
Here is a more complicated example:
|
Here is a more complicated example:
|
||||||
```
|
```julia
|
||||||
m = Conv((3,3), 4=>7, pad=(1,1))
|
m = Conv((3,3), 4=>7, pad=(1,1))
|
||||||
x = ones(5,5,4,10);
|
x = ones(5,5,4,10);
|
||||||
size(m(x)) == (5, 5, 7, 10)
|
size(m(x)) == (5, 5, 7, 10)
|
||||||
|
|
|
@ -9,20 +9,38 @@ expand(N, i::Tuple) = i
|
||||||
expand(N, i::Integer) = ntuple(_ -> i, N)
|
expand(N, i::Integer) = ntuple(_ -> i, N)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Conv(filter::Tuple, in=>out)
|
SamePad
|
||||||
Conv(filter::Tuple, in=>out, activation)
|
|
||||||
|
|
||||||
Standard convolutional layer. `filter` should be a tuple like `(2, 2)`.
|
Padding for convolutional layers will be calculated so that outputshape == inputshape when stride = 1.
|
||||||
`in` and `out` specify the number of input and output channels respectively.
|
|
||||||
|
|
||||||
Example: Applying Conv layer to a 1-channel input using a 2x2 window size,
|
For stride > 1 the output shape depends on the type of convolution layer.
|
||||||
giving us a 16-channel output. Output is activated with ReLU.
|
"""
|
||||||
|
struct SamePad end
|
||||||
|
|
||||||
|
calc_padding(pad, k::NTuple{N,T}, dilation, stride) where {T,N}= expand(Val(2*N), pad)
|
||||||
|
function calc_padding(::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T}
|
||||||
|
#Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/pdf/1603.07285
|
||||||
|
|
||||||
|
# Effective kernel size, including dilation
|
||||||
|
k_eff = @. k + (k - 1) * (dilation - 1)
|
||||||
|
# How much total padding needs to be applied?
|
||||||
|
pad_amt = @. k_eff - 1
|
||||||
|
# In case amount of padding is odd we need to apply different amounts to each side.
|
||||||
|
return Tuple(mapfoldl(i -> [ceil(Int, i/2), floor(Int, i/2)], vcat, pad_amt))
|
||||||
|
end
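# Worked example of the arithmetic above (illustrative only): for k = (3, 3) and
# dilation = 1, k_eff = (3, 3) and pad_amt = (2, 2), giving the padding (1, 1, 1, 1),
# so a stride-1 convolution keeps the input size unchanged.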
|
||||||
|
|
||||||
|
"""
|
||||||
|
Conv(filter, in => out, σ = identity; init = glorot_uniform,
|
||||||
|
stride = 1, pad = 0, dilation = 1)
|
||||||
|
|
||||||
filter = (2,2)
|
filter = (2,2)
|
||||||
in = 1
|
in = 1
|
||||||
out = 16
|
out = 16
|
||||||
Conv((2, 2), 1=>16, relu)
|
Conv((2, 2), 1=>16, relu)
|
||||||
|
|
||||||
|
Standard convolutional layer. `filter` should be a tuple like `(2, 2)`.
|
||||||
|
`in` and `out` specify the number of input and output channels respectively.
|
||||||
|
|
||||||
Data should be stored in WHCN order (width, height, # channels, batch size).
|
Data should be stored in WHCN order (width, height, # channels, batch size).
|
||||||
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
|
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
|
||||||
and a batch of 50 would be a `100×100×3×50` array.
|
and a batch of 50 would be a `100×100×3×50` array.
|
||||||
|
@ -31,6 +49,18 @@ Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
|
||||||
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
|
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
|
||||||
|
|
||||||
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
||||||
|
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
|
||||||
|
|
||||||
|
# Examples
|
||||||
|
|
||||||
|
Apply a `Conv` layer to a 1-channel input using a 2×2 window filter size, giving us a
|
||||||
|
16-channel output. Output is activated with ReLU.
|
||||||
|
```julia
|
||||||
|
filter = (2,2)
|
||||||
|
in = 1
|
||||||
|
out = 16
|
||||||
|
Conv(filter, in => out, relu)
|
||||||
|
```
|
||||||
"""
|
"""
|
||||||
struct Conv{N,M,F,A,V}
|
struct Conv{N,M,F,A,V}
|
||||||
σ::F
|
σ::F
|
||||||
|
@ -46,18 +76,27 @@ end
|
||||||
Conv(weight::AbstractArray, bias::AbstractArray, activation)
|
Conv(weight::AbstractArray, bias::AbstractArray, activation)
|
||||||
|
|
||||||
Constructs the convolutional layer with user defined weight and bias arrays.
|
Constructs the convolutional layer with user defined weight and bias arrays.
|
||||||
All other behaviours of the Conv layer apply with regard to data order and
|
|
||||||
forward pass.
|
|
||||||
|
|
||||||
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
|
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
|
||||||
|
|
||||||
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
||||||
|
|
||||||
|
There is also a keyword-only constructor available for all convolutional
|
||||||
|
layers.
|
||||||
|
|
||||||
|
```julia
|
||||||
|
weight = rand(Float32, 3, 3, 5)
|
||||||
|
bias = zeros(Float32, 5)
|
||||||
|
Conv(weight = weight,
|
||||||
|
bias = bias,
|
||||||
|
σ = sigmoid)
|
||||||
|
```
|
||||||
"""
|
"""
|
||||||
function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
|
function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
|
||||||
stride = 1, pad = 0, dilation = 1) where {T,N}
|
stride = 1, pad = 0, dilation = 1) where {T,N}
|
||||||
stride = expand(Val(N-2), stride)
|
stride = expand(Val(N-2), stride)
|
||||||
pad = expand(Val(2*(N-2)), pad)
|
|
||||||
dilation = expand(Val(N-2), dilation)
|
dilation = expand(Val(N-2), dilation)
|
||||||
|
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
|
||||||
return Conv(σ, w, b, stride, pad, dilation)
|
return Conv(σ, w, b, stride, pad, dilation)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -114,8 +153,8 @@ end
|
||||||
"""
|
"""
|
||||||
outdims(l::Conv, isize::Tuple)
|
outdims(l::Conv, isize::Tuple)
|
||||||
|
|
||||||
Calculate the output dimensions given the input dimensions, `isize`.
|
Calculate the output dimensions given the input dimensions `isize`.
|
||||||
Batch size and channel size are ignored as per `NNlib.jl`.
|
Batch size and channel size are ignored as per [NNlib.jl](https://github.com/FluxML/NNlib.jl).
|
||||||
|
|
||||||
```julia
|
```julia
|
||||||
m = Conv((3, 3), 3 => 16)
|
m = Conv((3, 3), 3 => 16)
|
||||||
|
@ -127,19 +166,23 @@ outdims(l::Conv, isize) =
|
||||||
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
|
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
|
||||||
|
|
||||||
"""
|
"""
|
||||||
ConvTranspose(size, in=>out)
|
ConvTranspose(filter, in=>out)
|
||||||
ConvTranspose(size, in=>out, activation)
|
ConvTranspose(filter, in=>out, activation)
|
||||||
|
ConvTranspose(filter, in => out, σ = identity; init = glorot_uniform,
|
||||||
|
stride = 1, pad = 0, dilation = 1)
|
||||||
|
|
||||||
Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`.
|
Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`.
|
||||||
`in` and `out` specify the number of input and output channels respectively.
|
`in` and `out` specify the number of input and output channels respectively.
|
||||||
|
|
||||||
Data should be stored in WHCN order. In other words, a 100×100 RGB image would
|
Data should be stored in WHCN order (width, height, # channels, batch size).
|
||||||
be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array.
|
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
|
||||||
|
and a batch of 50 would be a `100×100×3×50` array.
|
||||||
|
|
||||||
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
|
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
|
||||||
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
|
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
|
||||||
|
|
||||||
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
||||||
|
Use `pad=SamePad()` to apply padding so that outputsize == stride * inputsize - stride + 1.
|
||||||
"""
|
"""
|
||||||
struct ConvTranspose{N,M,F,A,V}
|
struct ConvTranspose{N,M,F,A,V}
|
||||||
σ::F
|
σ::F
|
||||||
|
@ -155,18 +198,19 @@ end
|
||||||
ConvTranspose(weight::AbstractArray, bias::AbstractArray, activation)
|
ConvTranspose(weight::AbstractArray, bias::AbstractArray, activation)
|
||||||
|
|
||||||
Constructs the convolutional transpose layer with user defined weight and bias arrays.
|
Constructs the convolutional transpose layer with user defined weight and bias arrays.
|
||||||
All other behaviours of the ConvTranspose layer apply with regard to data order and
|
|
||||||
forward pass.
|
forward pass.
|
||||||
|
|
||||||
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
|
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
|
||||||
|
|
||||||
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
||||||
|
|
||||||
|
For the keyword-only constructor, see also [`Conv`](@ref).
|
||||||
"""
|
"""
|
||||||
function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
|
function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
|
||||||
stride = 1, pad = 0, dilation = 1) where {T,N}
|
stride = 1, pad = 0, dilation = 1) where {T,N}
|
||||||
stride = expand(Val(N-2), stride)
|
stride = expand(Val(N-2), stride)
|
||||||
pad = expand(Val(2*(N-2)), pad)
|
|
||||||
dilation = expand(Val(N-2), dilation)
|
dilation = expand(Val(N-2), dilation)
|
||||||
|
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
|
||||||
return ConvTranspose(σ, w, b, stride, pad, dilation)
|
return ConvTranspose(σ, w, b, stride, pad, dilation)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -227,18 +271,22 @@ outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size
|
||||||
"""
|
"""
|
||||||
DepthwiseConv(filter::Tuple, in=>out)
|
DepthwiseConv(filter::Tuple, in=>out)
|
||||||
DepthwiseConv(filter::Tuple, in=>out, activation)
|
DepthwiseConv(filter::Tuple, in=>out, activation)
|
||||||
|
DepthwiseConv(filter, in => out, σ = identity; init = glorot_uniform,
|
||||||
|
stride = 1, pad = 0, dilation = 1)
|
||||||
|
|
||||||
Depthwise convolutional layer. `filter` should be a tuple like `(2, 2)`.
|
Depthwise convolutional layer. `filter` should be a tuple like `(2, 2)`.
|
||||||
`in` and `out` specify the number of input and output channels respectively.
|
`in` and `out` specify the number of input and output channels respectively.
|
||||||
Note that `out` must be an integer multiple of `in`.
|
Note that `out` must be an integer multiple of `in`.
|
||||||
|
|
||||||
Data should be stored in WHCN order. In other words, a 100×100 RGB image would
|
Data should be stored in WHCN order (width, height, # channels, batch size).
|
||||||
be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array.
|
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
|
||||||
|
and a batch of 50 would be a `100×100×3×50` array.
|
||||||
|
|
||||||
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
|
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
|
||||||
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
|
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
|
||||||
|
|
||||||
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
||||||
|
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
|
||||||
"""
|
"""
|
||||||
struct DepthwiseConv{N,M,F,A,V}
|
struct DepthwiseConv{N,M,F,A,V}
|
||||||
σ::F
|
σ::F
|
||||||
|
@ -254,18 +302,19 @@ end
|
||||||
DepthwiseConv(weight::AbstractArray, bias::AbstractArray, activation)
|
DepthwiseConv(weight::AbstractArray, bias::AbstractArray, activation)
|
||||||
|
|
||||||
Constructs the `DepthwiseConv` layer with user defined weight and bias arrays.
|
Constructs the `DepthwiseConv` layer with user defined weight and bias arrays.
|
||||||
All other behaviours of the `DepthwiseConv` layer apply with regard to data order and
|
|
||||||
forward pass.
|
forward pass.
|
||||||
|
|
||||||
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
|
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
|
||||||
|
|
||||||
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
||||||
|
|
||||||
|
For the keyword-only constructor, see also [`Conv`](@ref).
|
||||||
"""
|
"""
|
||||||
function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
|
function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
|
||||||
stride = 1, pad = 0, dilation = 1) where {T,N}
|
stride = 1, pad = 0, dilation = 1) where {T,N}
|
||||||
stride = expand(Val(N-2), stride)
|
stride = expand(Val(N-2), stride)
|
||||||
pad = expand(Val(2*(N-2)), pad)
|
|
||||||
dilation = expand(Val(N-2), dilation)
|
dilation = expand(Val(N-2), dilation)
|
||||||
|
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
|
||||||
return DepthwiseConv(σ, w, b, stride, pad, dilation)
|
return DepthwiseConv(σ, w, b, stride, pad, dilation)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -328,21 +377,15 @@ outdims(l::DepthwiseConv, isize) =
|
||||||
output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
|
output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
|
||||||
|
|
||||||
"""
|
"""
|
||||||
CrossCor(size, in=>out)
|
CrossCor(filter, in=>out)
|
||||||
CrossCor(size, in=>out, activation)
|
CrossCor(filter, in=>out, activation)
|
||||||
|
CrossCor(filter, in => out, σ = identity; init = glorot_uniform,
|
||||||
|
stride = 1, pad = 0, dilation = 1)
|
||||||
|
|
||||||
Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`.
|
Standard cross convolutional layer. `filter` should be a tuple like `(2, 2)`.
|
||||||
`in` and `out` specify the number of input and output channels respectively.
|
`in` and `out` specify the number of input and output channels respectively.
|
||||||
|
|
||||||
Example: Applying CrossCor layer to a 1-channel input using a 2x2 window size,
|
Data should be stored in WHCN order (width, height, # channels, batch size).
|
||||||
giving us a 16-channel output. Output is activated with ReLU.
|
|
||||||
|
|
||||||
size = (2,2)
|
|
||||||
in = 1
|
|
||||||
out = 16
|
|
||||||
CrossCor((2, 2), 1=>16, relu)
|
|
||||||
|
|
||||||
Data should be stored in WHCN order (width, height, # channels, # batches).
|
|
||||||
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
|
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
|
||||||
and a batch of 50 would be a `100×100×3×50` array.
|
and a batch of 50 would be a `100×100×3×50` array.
|
||||||
|
|
||||||
|
@ -350,6 +393,18 @@ Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
|
||||||
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
|
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
|
||||||
|
|
||||||
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
||||||
|
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
|
||||||
|
|
||||||
|
# Examples
|
||||||
|
|
||||||
|
Apply a `CrossCor` layer to a 1-channel input using a 2×2 window filter size, giving us a
|
||||||
|
16-channel output. Output is activated with ReLU.
|
||||||
|
```julia
|
||||||
|
filter = (2,2)
|
||||||
|
in = 1
|
||||||
|
out = 16
|
||||||
|
CrossCor((2, 2), 1=>16, relu)
|
||||||
|
```
|
||||||
"""
|
"""
|
||||||
struct CrossCor{N,M,F,A,V}
|
struct CrossCor{N,M,F,A,V}
|
||||||
σ::F
|
σ::F
|
||||||
|
@ -365,18 +420,19 @@ end
|
||||||
CrossCor(weight::AbstractArray, bias::AbstractArray, activation)
|
CrossCor(weight::AbstractArray, bias::AbstractArray, activation)
|
||||||
|
|
||||||
Constructs the standard cross convolutional layer with user defined weight and bias
|
Constructs the standard cross convolutional layer with user defined weight and bias
|
||||||
arrays. All other behaviours of the CrossCor layer apply with regard to data order and
|
arrays.
|
||||||
forward pass.
|
|
||||||
|
|
||||||
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
|
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
|
||||||
|
|
||||||
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
Takes the keyword arguments `pad`, `stride` and `dilation`.
|
||||||
|
|
||||||
|
For the keyword-only constructor, see also [`Conv`](@ref).
|
||||||
"""
|
"""
|
||||||
function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
|
function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
|
||||||
stride = 1, pad = 0, dilation = 1) where {T,N}
|
stride = 1, pad = 0, dilation = 1) where {T,N}
|
||||||
stride = expand(Val(N-2), stride)
|
stride = expand(Val(N-2), stride)
|
||||||
pad = expand(Val(2*(N-2)), pad)
|
|
||||||
dilation = expand(Val(N-2), dilation)
|
dilation = expand(Val(N-2), dilation)
|
||||||
|
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
|
||||||
return CrossCor(σ, w, b, stride, pad, dilation)
|
return CrossCor(σ, w, b, stride, pad, dilation)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -425,11 +481,62 @@ outdims(l::CrossCor, isize) =
|
||||||
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
|
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
|
||||||
|
|
||||||
"""
|
"""
|
||||||
MaxPool(k)
|
GlobalMaxPool()
|
||||||
|
|
||||||
Max pooling layer. `k` stands for the size of the window for each dimension of the input.
|
Global max pooling layer.
|
||||||
|
|
||||||
Takes the keyword arguments `pad` and `stride`.
|
Transforms (w,h,c,b)-shaped input into (1,1,c,b)-shaped output,
|
||||||
|
by performing max pooling on the complete (w,h)-shaped feature maps.
|
||||||
|
"""
|
||||||
|
struct GlobalMaxPool end
|
||||||
|
|
||||||
|
function (g::GlobalMaxPool)(x)
|
||||||
|
# Input size
|
||||||
|
x_size = size(x)
|
||||||
|
# Kernel size
|
||||||
|
k = x_size[1:end-2]
|
||||||
|
# Pooling dimensions
|
||||||
|
pdims = PoolDims(x, k)
|
||||||
|
|
||||||
|
return maxpool(x, pdims)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.show(io::IO, g::GlobalMaxPool)
|
||||||
|
print(io, "GlobalMaxPool()")
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
GlobalMeanPool()
|
||||||
|
|
||||||
|
Global mean pooling layer.
|
||||||
|
|
||||||
|
Transforms (w,h,c,b)-shaped input into (1,1,c,b)-shaped output,
|
||||||
|
by performing mean pooling on the complete (w,h)-shaped feature maps.
|
||||||
|
"""
|
||||||
|
struct GlobalMeanPool end
|
||||||
|
|
||||||
|
function (g::GlobalMeanPool)(x)
|
||||||
|
# Input size
|
||||||
|
x_size = size(x)
|
||||||
|
# Kernel size
|
||||||
|
k = x_size[1:end-2]
|
||||||
|
# Pooling dimensions
|
||||||
|
pdims = PoolDims(x, k)
|
||||||
|
|
||||||
|
return meanpool(x, pdims)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.show(io::IO, g::GlobalMeanPool)
|
||||||
|
print(io, "GlobalMeanPool()")
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
MaxPool(k; pad = 0, stride = k)
|
||||||
|
|
||||||
|
Max pooling layer. `k` is the size of the window for each dimension of the input.
|
||||||
|
|
||||||
|
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
struct MaxPool{N,M}
|
struct MaxPool{N,M}
|
||||||
k::NTuple{N,Int}
|
k::NTuple{N,Int}
|
||||||
|
@ -439,8 +546,7 @@ end
|
||||||
|
|
||||||
function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
|
function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
|
||||||
stride = expand(Val(N), stride)
|
stride = expand(Val(N), stride)
|
||||||
pad = expand(Val(2*N), pad)
|
pad = calc_padding(pad, k, 1, stride)
|
||||||
|
|
||||||
return MaxPool(k, pad, stride)
|
return MaxPool(k, pad, stride)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -456,11 +562,11 @@ end
|
||||||
outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))
|
outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))
|
||||||
|
|
||||||
"""
|
"""
|
||||||
MeanPool(k)
|
MeanPool(k; pad = 0, stride = k)
|
||||||
|
|
||||||
Mean pooling layer. `k` stands for the size of the window for each dimension of the input.
|
Mean pooling layer. `k` is the size of the window for each dimension of the input.
|
||||||
|
|
||||||
Takes the keyword arguments `pad` and `stride`.
|
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
|
||||||
"""
|
"""
|
||||||
struct MeanPool{N,M}
|
struct MeanPool{N,M}
|
||||||
k::NTuple{N,Int}
|
k::NTuple{N,Int}
|
||||||
|
@ -470,7 +576,7 @@ end
|
||||||
|
|
||||||
function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
|
function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
|
||||||
stride = expand(Val(N), stride)
|
stride = expand(Val(N), stride)
|
||||||
pad = expand(Val(2*N), pad)
|
pad = calc_padding(pad, k, 1, stride)
|
||||||
return MeanPool(k, pad, stride)
|
return MeanPool(k, pad, stride)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@@ -2,11 +2,23 @@ istraining() = false

@adjoint istraining() = true, _ -> nothing

_isactive(m) = isnothing(m.active) ? istraining() : m.active

_dropout_shape(s, ::Colon) = size(s)
_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...)

_dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0)

"""
    dropout(x, p; dims = :)

The dropout function. For each input, either sets that input to `0` (with probability
`p`) or scales it by `1 / (1 - p)`. `dims` specifies the unbroadcasted dimensions,
e.g. `dims=1` applies dropout along columns and `dims=2` along rows.
This is used as a regularisation, i.e. it reduces overfitting during training.

See also the [`Dropout`](@ref) layer.
"""
dropout(x, p; dims = :) = x

@adjoint function dropout(x, p; dims = :)
@@ -18,22 +30,31 @@ end

"""
    Dropout(p, dims = :)

Dropout layer. In the forward pass, apply the [`Flux.dropout`](@ref) function on the input.

Does nothing to the input once [`Flux.testmode!`](@ref) is `true`.
"""
mutable struct Dropout{F,D}
  p::F
  dims::D
  active::Union{Bool, Nothing}
end

# TODO: deprecate in v0.11
Dropout(p, dims) = Dropout(p, dims, nothing)

function Dropout(p; dims = :)
  @assert 0 ≤ p ≤ 1
  Dropout{typeof(p),typeof(dims)}(p, dims, nothing)
end

function (a::Dropout)(x)
  _isactive(a) || return x
  return dropout(x, a.p; dims = a.dims)
end

testmode!(m::Dropout, mode = true) =
  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)

function Base.show(io::IO, d::Dropout)
  print(io, "Dropout(", d.p)
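# Example sketch of the new `active` field in action (assumes `using Flux`):
using Flux
m = Dropout(0.5)
x = ones(Float32, 5)
Flux.testmode!(m)         # sets m.active = false, so dropout is skipped
m(x) == x                 # true
Flux.testmode!(m, :auto)  # back to automatic: follow istraining()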
@@ -43,20 +64,25 @@ end

"""
    AlphaDropout(p)

A dropout layer. Used in
[Self-Normalizing Neural Networks](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf).
The AlphaDropout layer ensures that mean and variance of activations
remain the same as before.

Does nothing to the input once [`testmode!`](@ref) is true.
"""
mutable struct AlphaDropout{F}
  p::F
  active::Union{Bool, Nothing}
  function AlphaDropout(p, active = nothing)
    @assert 0 ≤ p ≤ 1
    new{typeof(p)}(p, active)
  end
end

function (a::AlphaDropout)(x)
  _isactive(a) || return x
  λ = eltype(x)(1.0507009873554804934193349852946)
  α = eltype(x)(1.6732632423543772848170429916717)
  α1 = eltype(x)(-λ*α)

@@ -68,12 +94,15 @@ function (a::AlphaDropout)(x)
  return x
end

testmode!(m::AlphaDropout, mode = true) =
  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)

"""
    LayerNorm(h::Integer)

A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be
used with recurrent hidden states of size `h`. Normalises the mean and standard
deviation of each input before applying a per-neuron gain/bias.
"""
struct LayerNorm{T}
  diag::Diagonal{T}

@@ -95,8 +124,8 @@ end
              initβ = zeros, initγ = ones,
              ϵ = 1e-8, momentum = .1)

[Batch Normalization](https://arxiv.org/pdf/1502.03167.pdf) layer.
`channels` should be the size of the channel dimension in your data (see below).

Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For
a batch of feature vectors this is just the data dimension, for `WHCN` images

@@ -106,10 +135,9 @@ it's the usual channel dimension.)
shifts them to have a new mean and variance (corresponding to the learnable,
per-channel `bias` and `scale` parameters).

Use [`testmode!`](@ref) during inference.

# Examples
```julia
m = Chain(
  Dense(28^2, 64),
@@ -127,12 +155,16 @@ mutable struct BatchNorm{F,V,W,N}
  σ²::W  # moving std
  ϵ::N
  momentum::N
  active::Union{Bool, Nothing}
end

# TODO: deprecate in v0.11
BatchNorm(λ, β, γ, μ, σ², ϵ, momentum) = BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing)

BatchNorm(chs::Integer, λ = identity;
          initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) =
  BatchNorm(λ, initβ(chs), initγ(chs),
            zeros(chs), ones(chs), ϵ, momentum, nothing)

trainable(bn::BatchNorm) = (bn.β, bn.γ)

@@ -145,7 +177,7 @@ function (BN::BatchNorm)(x)
  m = div(prod(size(x)), channels)
  γ = reshape(BN.γ, affine_shape...)
  β = reshape(BN.β, affine_shape...)
  if !_isactive(BN)
    μ = reshape(BN.μ, affine_shape...)
    σ² = reshape(BN.σ², affine_shape...)
    ϵ = BN.ϵ

@@ -170,41 +202,15 @@ end

@functor BatchNorm

testmode!(m::BatchNorm, mode = true) =
  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)

function Base.show(io::IO, l::BatchNorm)
  print(io, "BatchNorm($(join(size(l.β), ", "))")
  (l.λ == identity) || print(io, ", λ = $(l.λ)")
  print(io, ")")
end
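# Example sketch: the same `testmode!` pattern applies to the normalisation layers
# (assumes `using Flux`):
using Flux
bn = BatchNorm(4)
Flux.testmode!(bn)          # use the stored moving statistics μ and σ²
Flux.testmode!(bn, false)   # force training-mode statistics even outside a gradient call
Flux.testmode!(bn, :auto)   # default: decide based on istraining()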
expand_inst = (x, as) -> reshape(repeat(x, outer=[1, as[length(as)]]), as...)

mutable struct InstanceNorm{F,V,W,N}

@@ -215,12 +221,44 @@ mutable struct InstanceNorm{F,V,W,N}
  σ²::W  # moving std
  ϵ::N
  momentum::N
  active::Union{Bool, Nothing}
end

# TODO: deprecate in v0.11
"""
    InstanceNorm(channels::Integer, σ = identity;
                 initβ = zeros, initγ = ones,
                 ϵ = 1e-8, momentum = .1)

[Instance Normalization](https://arxiv.org/abs/1607.08022) layer.
`channels` should be the size of the channel dimension in your data (see below).

Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For
a batch of feature vectors this is just the data dimension, for `WHCN` images
it's the usual channel dimension.)

`InstanceNorm` computes the mean and variance for each `W×H×1×1` slice and
shifts them to have a new mean and variance (corresponding to the learnable,
per-channel `bias` and `scale` parameters).

Use [`testmode!`](@ref) during inference.

# Examples
```julia
m = Chain(
  Dense(28^2, 64),
  InstanceNorm(64, relu),
  Dense(64, 10),
  InstanceNorm(10),
  softmax)
```
"""
InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum) = InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing)

InstanceNorm(chs::Integer, λ = identity;
             initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) =
  InstanceNorm(λ, initβ(chs), initγ(chs),
               zeros(chs), ones(chs), ϵ, momentum, nothing)

trainable(in::InstanceNorm) = (in.β, in.γ)
@@ -237,7 +275,7 @@ function (in::InstanceNorm)(x)
  m = div(prod(size(x)), c*bs)
  γ, β = expand_inst(in.γ, affine_shape), expand_inst(in.β, affine_shape)

  if !_isactive(in)
    μ = expand_inst(in.μ, affine_shape)
    σ² = expand_inst(in.σ², affine_shape)
    ϵ = in.ϵ

@@ -263,6 +301,9 @@ end

@functor InstanceNorm

testmode!(m::InstanceNorm, mode = true) =
  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)

function Base.show(io::IO, l::InstanceNorm)
  print(io, "InstanceNorm($(join(size(l.β), ", "))")
  (l.λ == identity) || print(io, ", λ = $(l.λ)")

@@ -270,26 +311,27 @@ function Base.show(io::IO, l::InstanceNorm)
end
"""
    GroupNorm(chs::Integer, G::Integer, λ = identity;
              initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i),
              ϵ = 1f-5, momentum = 0.1f0)

[Group Normalization](https://arxiv.org/pdf/1803.08494.pdf) layer.
This layer can outperform Batch Normalization and Instance Normalization.

`chs` is the number of channels, the channel dimension of your input.
For an array of N dimensions, the `N-1`th index is the channel dimension.

`G` is the number of groups along which the statistics are computed.
The number of channels must be an integer multiple of the number of groups.

Use [`testmode!`](@ref) during inference.

# Examples
```julia
m = Chain(Conv((3,3), 1=>32, leakyrelu;pad = 1),
          GroupNorm(32,16))
          # 32 channels, 16 groups (G = 16), thus 2 channels per group used
```
"""
mutable struct GroupNorm{F,V,W,N,T}
  G::T  # number of groups

@@ -300,12 +342,16 @@ mutable struct GroupNorm{F,V,W,N,T}
  σ²::W  # moving std
  ϵ::N
  momentum::N
  active::Union{Bool, Nothing}
end

# TODO: deprecate in v0.11
GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum) = GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum, nothing)

GroupNorm(chs::Integer, G::Integer, λ = identity;
          initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) =
  GroupNorm(G, λ, initβ(chs), initγ(chs),
            zeros(G,1), ones(G,1), ϵ, momentum, nothing)

trainable(gn::GroupNorm) = (gn.β, gn.γ)

@@ -329,7 +375,7 @@ function(gn::GroupNorm)(x)
  β = reshape(gn.β, affine_shape...)

  y = reshape(x,((size(x))[1:end-2]...,channels_per_group,groups,batches))
  if !_isactive(gn)
    og_shape = size(x)
    μ = reshape(gn.μ, μ_affine_shape...) # Shape : (1,1,...C/G,G,1)
    σ² = reshape(gn.σ², μ_affine_shape...) # Shape : (1,1,...C/G,G,1)

@@ -360,6 +406,9 @@ end

@functor GroupNorm

testmode!(m::GroupNorm, mode = true) =
  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)

function Base.show(io::IO, l::GroupNorm)
  print(io, "GroupNorm($(join(size(l.β), ", "))")
  (l.λ == identity) || print(io, ", λ = $(l.λ)")
@@ -12,16 +12,16 @@ in the background. `cell` should be a model of the form:

    h, y = cell(h, x...)

For example, here's a recurrent network that keeps a running total of its inputs:

```julia
accum(h, x) = (h + x, x)
rnn = Flux.Recur(accum, 0)
rnn(2) # 2
rnn(3) # 3
rnn.state # 5
rnn.(1:10) # apply to a sequence
rnn.state # 60
```
"""
mutable struct Recur{T}

@@ -47,9 +47,10 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")")

Reset the hidden state of a recurrent layer back to its original value.

Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to:
```julia
rnn.state = hidden(rnn.cell)
```
"""
reset!(m::Recur) = (m.state = m.init)
reset!(m) = foreach(reset!, functor(m)[1])

@@ -135,8 +136,8 @@ Base.show(io::IO, l::LSTMCell) =
"""
    LSTM(in::Integer, out::Integer)

[Long Short Term Memory](https://www.researchgate.net/publication/13853244_Long_Short-term_Memory)
recurrent layer. Behaves like an RNN but generally exhibits a longer memory span over sequences.

See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
for a good overview of the internals.

@@ -176,8 +177,8 @@ Base.show(io::IO, l::GRUCell) =
"""
    GRU(in::Integer, out::Integer)

[Gated Recurrent Unit](https://arxiv.org/abs/1406.1078) layer. Behaves like an
RNN but generally exhibits a longer memory span over sequences.

See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
for a good overview of the internals.
@@ -1,10 +1,58 @@

# Cost functions
"""
    mae(ŷ, y)

Return the mean of absolute error; calculated as
`sum(abs.(ŷ .- y)) / length(y)`.
"""
mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y)


"""
    mse(ŷ, y)

Return the mean squared error between ŷ and y; calculated as
`sum((ŷ .- y).^2) / length(y)`.

# Examples
```jldoctest
julia> Flux.mse([0, 2], [1, 1])
1//1
```
"""
mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)


"""
    msle(ŷ, y; ϵ=eps(eltype(ŷ)))

Return the mean of the squared logarithmic errors; calculated as
`sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) / length(y)`.
The `ϵ` term provides numerical stability.

Penalizes an under-predicted estimate greater than an over-predicted estimate.
"""
msle(ŷ, y; ϵ=eps(eltype(ŷ))) = sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) * 1 // length(y)


"""
    huber_loss(ŷ, y; δ=1.0)

Return the mean of the [Huber loss](https://en.wikipedia.org/wiki/Huber_loss)
given the prediction `ŷ` and true values `y`.

                 | 0.5 * |ŷ - y|^2,          for |ŷ - y| <= δ
    Huber loss = |
                 |  δ * (|ŷ - y| - 0.5 * δ), otherwise
"""
function huber_loss(ŷ, y; δ=eltype(ŷ)(1))
  abs_error = abs.(ŷ .- y)
  temp = abs_error .< δ
  x = eltype(ŷ)(0.5)
  hub_loss = sum(((abs_error.^2) .* temp) .* x .+ δ*(abs_error .- x*δ) .* (1 .- temp)) * 1 // length(y)
end
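# Worked example for `huber_loss` (assumes the definition above):
# both absolute errors are 0.5 ≤ δ, so the quadratic branch applies and the
# mean is 0.5 * 0.5^2 = 0.125.
ŷ = [1.0, 2.5]
y = [1.5, 2.0]
huber_loss(ŷ, y)   # 0.125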
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
  return -sum(y .* log.(ŷ)) * 1 // size(y, 2)
end

@@ -17,22 +65,63 @@ function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Abstr
  return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2)
end

"""
    crossentropy(ŷ, y; weight = nothing)

Return the cross entropy between the given probability distributions;
calculated as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`.

`weight` can be `Nothing`, a `Number` or an `AbstractVector`.
`weight=nothing` acts like `weight=1` but is faster.

See also: [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)

# Examples
```jldoctest
julia> Flux.crossentropy(softmax([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
3.085467254747739
```
"""
crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight)

"""
    logitcrossentropy(ŷ, y; weight = 1)

Return the crossentropy computed after a [`Flux.logsoftmax`](@ref) operation;
calculated as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`.

`logitcrossentropy(ŷ, y)` is mathematically equivalent to
[`Flux.crossentropy(softmax(ŷ), y)`](@ref) but it is more numerically stable.

See also: [`Flux.crossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)

# Examples
```jldoctest
julia> Flux.logitcrossentropy([-1.1491, 0.8619, 0.3127], [1, 1, 0])
3.085467254747738
```
"""
function logitcrossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1)
  return -sum(y .* logsoftmax(ŷ) .* weight) * 1 // size(y, 2)
end
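# Example sketch: `logitcrossentropy` on raw scores matches `crossentropy` on the
# softmax of those scores, up to floating-point error (assumes `using Flux`,
# which re-exports `softmax`):
using Flux
x = [-1.1491, 0.8619, 0.3127]
y = [1, 1, 0]
Flux.logitcrossentropy(x, y) ≈ Flux.crossentropy(softmax(x), y)   # true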
"""
    binarycrossentropy(ŷ, y; ϵ=eps(ŷ))

Return ``-y*\\log(ŷ + ϵ) - (1-y)*\\log(1-ŷ + ϵ)``. The `ϵ` term provides numerical stability.

Typically, the prediction `ŷ` is given by the output of a [`sigmoid`](@ref) activation.

See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)

# Examples
```jldoctest
julia> Flux.binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
3-element Array{Float64,1}:
 1.424397097347566
 0.35231664672364077
 0.8616703662235441
```
"""
binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)

@@ -40,44 +129,52 @@ binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ
CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)

"""
    logitbinarycrossentropy(ŷ, y)

`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to
[`Flux.binarycrossentropy(σ(ŷ), y)`](@ref) but it is more numerically stable.

See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref)

# Examples
```jldoctest
julia> Flux.logitbinarycrossentropy.([-1.1491, 0.8619, 0.3127], [1, 1, 0])
3-element Array{Float64,1}:
 1.4243970973475661
 0.35231664672364094
 0.8616703662235443
```
"""
logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ)

# Re-definition to fix interaction with CuArrays.
CuArrays.@cufunc logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ)

"""
    normalise(x; dims=1)

Normalise `x` to mean 0 and standard deviation 1 across the dimensions given by `dims`.
Defaults to normalising over columns.

```jldoctest
julia> a = reshape(collect(1:9), 3, 3)
3×3 Array{Int64,2}:
 1  4  7
 2  5  8
 3  6  9

julia> Flux.normalise(a)
3×3 Array{Float64,2}:
 -1.22474  -1.22474  -1.22474
  0.0       0.0       0.0
  1.22474   1.22474   1.22474

julia> Flux.normalise(a, dims=2)
3×3 Array{Float64,2}:
 -1.22474  0.0  1.22474
 -1.22474  0.0  1.22474
 -1.22474  0.0  1.22474
```
"""
function normalise(x::AbstractArray; dims=1)
  μ′ = mean(x, dims = dims)
@@ -87,26 +184,81 @@ end

"""
    kldivergence(ŷ, y)

Return the
[Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence)
between the given probability distributions.

KL divergence is a measure of how much one probability distribution is different
from the other.
It is always non-negative and zero only when both the distributions are equal
everywhere.
"""
function kldivergence(ŷ, y)
  entropy = sum(y .* log.(y)) * 1 //size(y,2)
  cross_entropy = crossentropy(ŷ, y)
  return entropy + cross_entropy
end

"""
    poisson(ŷ, y)

Return how much the predicted distribution `ŷ` diverges from the expected Poisson
distribution `y`; calculated as `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`.

[More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
"""
poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) * 1 // size(y,2)

"""
    hinge(ŷ, y)

Return the [hinge loss](https://en.wikipedia.org/wiki/Hinge_loss) given the
prediction `ŷ` and true labels `y` (containing 1 or -1); calculated as
`sum(max.(0, 1 .- ŷ .* y)) / size(y, 2)`.

See also: [`squared_hinge`](@ref)
"""
hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) * 1 // size(y, 2)

"""
    squared_hinge(ŷ, y)

Return the squared hinge loss given the prediction `ŷ` and true labels `y`
(containing 1 or -1); calculated as `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)`.

See also: [`hinge`](@ref)
"""
squared_hinge(ŷ, y) = sum((max.(0, 1 .- ŷ .* y)).^2) * 1 // size(y, 2)
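# Worked example for the hinge losses (assumes the definitions above); `y` holds
# ±1 labels and the batch dimension is the second one, so the sums are divided by 3:
ŷ = [0.3 -0.8 1.2]
y = [1 -1 1]
hinge(ŷ, y)          # (0.7 + 0.2 + 0.0) / 3 ≈ 0.3
squared_hinge(ŷ, y)  # (0.49 + 0.04 + 0.0) / 3 ≈ 0.177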
"""
    dice_coeff_loss(ŷ, y; smooth=1)

Return a loss based on the dice coefficient.
Used in the [V-Net](https://arxiv.org/pdf/1606.04797v1.pdf) image segmentation
architecture.
Similar to the F1_score. Calculated as:

    1 - 2*sum(|ŷ .* y| + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth)
"""
dice_coeff_loss(ŷ, y; smooth=eltype(ŷ)(1.0)) = 1 - (2*sum(y .* ŷ) + smooth) / (sum(y.^2) + sum(ŷ.^2) + smooth)

"""
    tversky_loss(ŷ, y; β=0.7)

Return the [Tversky loss](https://arxiv.org/pdf/1706.05721.pdf).
Used with imbalanced data to give more weight to false negatives.
A larger β weighs recall higher than precision (by placing more emphasis on false negatives).
Calculated as:

    1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)
"""
tversky_loss(ŷ, y; β=eltype(ŷ)(0.7)) = 1 - (sum(y .* ŷ) + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)

"""
    flatten(x::AbstractArray)

Transform (w, h, c, b)-shaped input into (w × h × c, b)-shaped output
by linearizing all values for each element in the batch.
"""
function flatten(x::AbstractArray)
  return reshape(x, :, size(x)[end])
end
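# Example sketch for `flatten` (assumes the definition above):
x = rand(Float32, 4, 4, 3, 2)   # (w, h, c, b)
size(flatten(x))                # (48, 2): 4 * 4 * 3 values per batch element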
@@ -37,30 +37,28 @@ import Adapt: adapt, adapt_structure

adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data))

import .CuArrays: CuArray, CuArrayStyle, cudaconvert
import Base.Broadcast: BroadcastStyle, ArrayStyle
BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = CuArrayStyle{2}()
cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))

"""
    onehot(l, labels[, unk])

Create a `OneHotVector` with its `l`-th element `true` based on the
possible set of `labels`.
If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found
in `labels`; otherwise it will error.

# Examples
```jldoctest
julia> Flux.onehot(:b, [:a, :b, :c])
3-element Flux.OneHotVector:
 0
 1
 0

julia> Flux.onehot(:c, [:a, :b, :c])
3-element Flux.OneHotVector:
 0
 0

@@ -82,15 +80,14 @@ end
"""
    onehotbatch(ls, labels[, unk...])

Create a `OneHotMatrix` with a batch of labels based on the
possible set of `labels`.
If `unk` is given, return [`onehot(unk, labels)`](@ref) if one of the input
labels `ls` is not found in `labels`; otherwise it will error.

# Examples
```jldoctest
julia> Flux.onehotbatch([:b, :a, :b], [:a, :b, :c])
3×3 Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}:
 0  1  0
 1  0  1

@@ -107,13 +104,12 @@ Base.argmax(xs::OneHotVector) = xs.ix

Inverse operations of [`onehot`](@ref).

# Examples
```jldoctest
julia> Flux.onecold([true, false, false], [:a, :b, :c])
:a

julia> Flux.onecold([0.3, 0.2, 0.5], [:a, :b, :c])
:c
```
"""
@@ -1,7 +1,7 @@
module Optimise

export train!, update!,
  Descent, ADAM, Momentum, Nesterov, RMSProp,
  ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW, RADAM,
  InvDecay, ExpDecay, WeightDecay, stop, Optimiser
@@ -6,24 +6,25 @@ const ϵ = 1e-8
# TODO: should use weak refs

"""
    Descent(η = 0.1)

Classic gradient descent optimiser with learning rate `η`.
For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
  the weights.

# Examples
```julia
opt = Descent()

opt = Descent(0.3)

ps = params(model)

gs = gradient(ps) do
  loss(x, y)
end

Flux.Optimise.update!(opt, ps, gs)
|
||||||
end
|
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Momentum(η, ρ)
|
Momentum(η = 0.01, ρ = 0.9)
|
||||||
|
|
||||||
Gradient descent with learning rate `η` and momentum `ρ`.
|
Gradient descent optimizer with learning rate `η` and momentum `ρ`.
|
||||||
|
|
||||||
## Parameters
|
# Parameters
|
||||||
- Learning Rate (`η`): Amount by which gradients are discounted before updating the weights. Defaults to `0.01`.
|
- Learning rate (`η`): Amount by which gradients are discounted before updating
|
||||||
- Momentum (`ρ`): Parameter that accelerates descent in the relevant direction and dampens oscillations. Defaults to `0.9`.
|
the weights.
|
||||||
|
- Momentum (`ρ`): Controls the acceleration of gradient descent in the
|
||||||
|
prominent direction, in effect dampening oscillations.
|
||||||
|
|
||||||
## Examples
|
# Examples
|
||||||
```julia
|
```julia
|
||||||
opt = Momentum() # uses defaults of η = 0.01 and ρ = 0.9
|
opt = Momentum()
|
||||||
|
|
||||||
opt = Momentum(0.01, 0.99)
|
opt = Momentum(0.01, 0.99)
|
||||||
```
|
```
|
||||||
|
@ -71,17 +74,19 @@ function apply!(o::Momentum, x, Δ)
|
||||||
end
|
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Nesterov(η, ρ)
|
Nesterov(η = 0.001, ρ = 0.9)
|
||||||
|
|
||||||
Gradient descent with learning rate `η` and Nesterov momentum `ρ`.
|
Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`.
|
||||||
|
|
||||||
## Parameters
|
# Parameters
|
||||||
- Learning Rate (η): Amount by which the gradients are dicsounted berfore updating the weights. Defaults to `0.001`.
|
- Learning rate (`η`): Amount by which gradients are discounted before updating
|
||||||
- Nesterov Momentum (ρ): Paramters controlling the amount of nesterov momentum to be applied. Defaults to `0.9`.
|
the weights.
|
||||||
|
- Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the
|
||||||
|
prominent direction, in effect dampening oscillations.
|
||||||
|
|
||||||
## Examples
|
# Examples
|
||||||
```julia
|
```julia
|
||||||
opt = Nesterov() # uses defaults η = 0.001 and ρ = 0.9
|
opt = Nesterov()
|
||||||
|
|
||||||
opt = Nesterov(0.003, 0.95)
|
opt = Nesterov(0.003, 0.95)
|
||||||
```
|
```
|
||||||
|
@@ -103,23 +108,25 @@ function apply!(o::Nesterov, x, Δ)
end

"""
    RMSProp(η = 0.001, ρ = 0.9)

Optimizer using the
[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
algorithm. Often a good choice for recurrent networks. Parameters other than learning rate
generally don't need tuning.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
  the weights.
- Momentum (`ρ`): Controls the acceleration of gradient descent in the
  prominent direction, in effect dampening oscillations.

# Examples
```julia
opt = RMSProp()

opt = RMSProp(0.002, 0.95)
```
"""
mutable struct RMSProp
  eta::Float64
@@ -137,23 +144,22 @@ function apply!(o::RMSProp, x, Δ)
end

"""
    ADAM(η = 0.001, β::Tuple = (0.9, 0.999))

[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
  the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
  second (β2) momentum estimate.

# Examples
```julia
opt = ADAM()

opt = ADAM(0.001, (0.9, 0.8))
```
"""
mutable struct ADAM
  eta::Float64

@@ -174,24 +180,22 @@ function apply!(o::ADAM, x, Δ)
end

"""
    RADAM(η = 0.001, β::Tuple = (0.9, 0.999))

[Rectified ADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimizer.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
  the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
  second (β2) momentum estimate.

# Examples
```julia
opt = RADAM()

opt = RADAM(0.001, (0.9, 0.8))
```
"""
mutable struct RADAM
  eta::Float64
@@ -219,22 +223,22 @@ function apply!(o::RADAM, x, Δ)
end

"""
    AdaMax(η = 0.001, β::Tuple = (0.9, 0.999))

[AdaMax](https://arxiv.org/abs/1412.6980v9) is a variant of ADAM based on the ∞-norm.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
  the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
  second (β2) momentum estimate.

# Examples
```julia
opt = AdaMax()

opt = AdaMax(0.001, (0.9, 0.995))
```
"""
mutable struct AdaMax
  eta::Float64

@@ -255,23 +259,22 @@ function apply!(o::AdaMax, x, Δ)
end

"""
    ADAGrad(η = 0.1)

[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has
parameter specific learning rates based on how frequently it is updated.
Parameters don't need tuning.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
  the weights.

# Examples
```julia
opt = ADAGrad()

opt = ADAGrad(0.001)
```
"""
mutable struct ADAGrad
  eta::Float64
@@ -288,21 +291,21 @@ function apply!(o::ADAGrad, x, Δ)
end

"""
    ADADelta(ρ = 0.9)

[ADADelta](https://arxiv.org/abs/1212.5701) is a version of ADAGrad adapting its learning
rate based on a window of past gradient updates.
Parameters don't need tuning.

# Parameters
- Rho (`ρ`): Factor by which the gradient is decayed at each time step.

# Examples
```julia
opt = ADADelta()

opt = ADADelta(0.89)
```
"""
mutable struct ADADelta
  rho::Float64

@@ -321,22 +324,23 @@ function apply!(o::ADADelta, x, Δ)
end

"""
    AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999))

The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM
optimiser. Parameters don't need tuning.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
  the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
  second (β2) momentum estimate.

# Examples
```julia
opt = AMSGrad()

opt = AMSGrad(0.001, (0.89, 0.995))
```
"""
mutable struct AMSGrad
  eta::Float64
@@ -356,22 +360,23 @@ function apply!(o::AMSGrad, x, Δ)
end

"""
    NADAM(η = 0.001, β::Tuple = (0.9, 0.999))

[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) is a Nesterov variant of ADAM.
Parameters don't need tuning.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
  the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
  second (β2) momentum estimate.

# Examples
```julia
opt = NADAM()

opt = NADAM(0.002, (0.89, 0.995))
```
"""
mutable struct NADAM
  eta::Float64

@@ -392,23 +397,24 @@ function apply!(o::NADAM, x, Δ)
end

"""
    ADAMW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0)

[ADAMW](https://arxiv.org/abs/1711.05101) is a variant of ADAM fixing (as in repairing) its
weight decay regularization.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
  the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
  second (β2) momentum estimate.
- `decay`: Decay applied to weights during optimisation.

# Examples
```julia
opt = ADAMW()

opt = ADAMW(0.001, (0.89, 0.995), 0.1)
```
"""
ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
  Optimiser(ADAM(η, β), WeightDecay(decay))
@@ -441,17 +447,15 @@ function apply!(o::Optimiser, x, Δ)
end

"""
    InvDecay(γ = 0.001)

Apply inverse time decay to an optimiser, so that the effective step size at
iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size.
The wrapped optimiser's step size is not modified.

# Examples
```julia
Optimiser(InvDecay(..), Opt(..))
```
"""
mutable struct InvDecay

@@ -470,22 +474,25 @@ function apply!(o::InvDecay, x, Δ)
end

"""
    ExpDecay(η = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4)

Discount the learning rate `η` by the factor `decay` every `decay_step` steps till
a minimum of `clip`.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
  the weights.
- `decay`: Factor by which the learning rate is discounted.
- `decay_step`: Schedule decay operations by setting the number of steps between
  two decay operations.
- `clip`: Minimum value of learning rate.

# Examples
To apply exponential decay to an optimiser:
```julia
Optimiser(ExpDecay(..), Opt(..))

opt = Optimiser(ExpDecay(), ADAM())
```
"""
mutable struct ExpDecay

@@ -509,12 +516,12 @@ function apply!(o::ExpDecay, x, Δ)
end

"""
    WeightDecay(wd = 0)

Decay weights by `wd`.

# Parameters
- Weight decay (`wd`)
"""
mutable struct WeightDecay
  wd::Real
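# Example sketch: the decay wrappers above are composed with a "real" optimiser
# via `Optimiser` (assumes `using Flux`):
using Flux
opt = Optimiser(WeightDecay(1f-4), ADAM(0.001))   # weight decay is applied to the gradient before the ADAM step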
@@ -1,11 +1,26 @@
using Juno
import Zygote: Params, gradient

"""
    update!(x, x̄)

Update the array `x` according to `x .-= x̄`.
"""
function update!(x::AbstractArray, x̄)
  x .-= x̄
end

"""
    update!(opt, p, g)
    update!(opt, ps::Params, gs)

Perform an update step of the parameters `ps` (or the single parameter `p`)
according to optimizer `opt` and the gradients `gs` (the gradient `g`).

As a result, the parameters are mutated and the optimizer's internal state may change.
"""
function update!(opt, x, x̄)
  x .-= apply!(opt, x, x̄)
end
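# Example sketch for `update!` with an optimiser (assumes `using Flux`):
using Flux
w = [1.0, 2.0, 3.0]
g = [0.5, 0.5, 0.5]
Flux.Optimise.update!(Descent(0.1), w, g)   # w .-= 0.1 .* g
w                                           # [0.95, 1.95, 2.95]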
@@ -28,11 +43,10 @@ struct StopException <: Exception end
     stop()
 
 Call `Flux.stop()` in a callback to indicate when a callback condition is met.
-This would trigger the train loop to stop and exit.
+This will trigger the train loop to stop and exit.
 
+# Examples
 ```julia
-# Example callback:
-
 cb = function ()
   accuracy() > 0.9 && Flux.stop()
 end
@@ -45,18 +59,19 @@ end
 """
     train!(loss, params, data, opt; cb)
 
-For each datapoint `d` in `data` computes the gradient of `loss(d...)` through
-backpropagation and calls the optimizer `opt`.
+For each datapoint `d` in `data` compute the gradient of `loss(d...)` through
+backpropagation and call the optimizer `opt`.
 
-Takes a callback as keyword argument `cb`. For example, this will print "training"
-every 10 seconds:
+In case datapoints `d` are of numeric array type, assume no splatting is needed
+and compute the gradient of `loss(d)`.
 
-```julia
-Flux.train!(loss, params, data, opt,
-            cb = throttle(() -> println("training"), 10))
-```
+A callback is given with the keyword argument `cb`. For example, this will print
+"training" every 10 seconds (using [`Flux.throttle`](@ref)):
 
-The callback can call `Flux.stop()` to interrupt the training loop.
+    train!(loss, params, data, opt,
+           cb = throttle(() -> println("training"), 10))
+
+The callback can call [`Flux.stop`](@ref) to interrupt the training loop.
 
 Multiple optimisers and callbacks can be passed to `opt` and `cb` as arrays.
 """
@@ -65,8 +80,14 @@ function train!(loss, ps, data, opt; cb = () -> ())
   cb = runall(cb)
   @progress for d in data
     try
-      gs = gradient(ps) do
-        loss(d...)
+      if d isa AbstractArray{<:Number}
+        gs = gradient(ps) do
+          loss(d)
+        end
+      else
+        gs = gradient(ps) do
+          loss(d...)
+        end
       end
       update!(opt, ps, gs)
       cb()
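The branch added above distinguishes tuple datapoints (splatted into the loss) from plain numeric arrays (passed whole). A small sketch of both call styles, with a made-up model and data:

```julia
using Flux

# Toy model and data, made up for illustration.
m = Dense(3, 1)
ps = Flux.params(m)
opt = Descent(0.1)

# Tuple datapoints: each (x, y) is splatted, so the loss takes two arguments.
loss_xy(x, y) = Flux.mse(m(x), y)
data_tuples = [(rand(Float32, 3), rand(Float32, 1)) for _ in 1:4]
Flux.train!(loss_xy, ps, data_tuples, opt)

# Numeric-array datapoints: each x is passed whole, so the loss takes one argument.
loss_x(x) = sum(abs2, m(x))
data_arrays = [rand(Float32, 3) for _ in 1:4]
Flux.train!(loss_x, ps, data_arrays, opt)
```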
@@ -86,11 +107,12 @@ end
 Run `body` `N` times. Mainly useful for quickly doing multiple epochs of
 training in a REPL.
 
-```julia
-julia> @epochs 2 println("hello")
-INFO: Epoch 1
+# Examples
+```jldoctest
+julia> Flux.@epochs 2 println("hello")
+[ Info: Epoch 1
 hello
-INFO: Epoch 2
+[ Info: Epoch 2
 hello
 ```
 """
175 src/utils.jl
@@ -1,10 +1,40 @@
 # Arrays
-nfan() = 1, 1 #fan_in, fan_out
-nfan(n) = 1, n #A vector is treated as a n×1 matrix
-nfan(n_out, n_in) = n_in, n_out #In case of Dense kernels: arranged as matrices
-nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of convolution kernels
+nfan() = 1, 1 # fan_in, fan_out
+nfan(n) = 1, n # A vector is treated as a n×1 matrix
+nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices
+nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) # In case of convolution kernels
 
+"""
+    glorot_uniform(dims...)
+
+Return an `Array` of size `dims` containing random variables taken from a uniform
+distribution in the interval ``[-x, x]``, where `x = sqrt(24 / sum(dims)) / 2`.
+
+# Examples
+```jldoctest; setup = :(using Random; Random.seed!(0))
+julia> Flux.glorot_uniform(2, 3)
+2×3 Array{Float32,2}:
+  0.601094  -0.57414   -0.814925
+  0.900868   0.805994   0.057514
+```
+"""
 glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...)))
 
+"""
+    glorot_normal(dims...)
+
+Return an `Array` of size `dims` containing random variables taken from a normal
+distribution with mean 0 and standard deviation `sqrt(2 / sum(dims))`.
+
+# Examples
+```jldoctest; setup = :(using Random; Random.seed!(0))
+julia> Flux.glorot_normal(3, 2)
+3×2 Array{Float32,2}:
+  0.429505  -0.0852891
+  0.523935   0.371009
+ -0.223261   0.188052
+```
+"""
 glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...)))
 
 ones(T::Type, dims...) = Base.ones(T, dims...)
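As a sanity check of the formula quoted in the new docstrings (a sketch, assuming a Dense-style kernel so that `sum(nfan(dims...))` reduces to `fan_in + fan_out`): samples from `glorot_uniform` should have standard deviation near `sqrt(2 / (fan_in + fan_out))`.

```julia
using Flux, Statistics

fan_in, fan_out = 100, 200
W = Flux.glorot_uniform(fan_out, fan_in)   # Dense-style kernel: (n_out, n_in)

# A uniform distribution on [-x, x] has std x / sqrt(3); with
# x = sqrt(24 / (fan_in + fan_out)) / 2 this works out to sqrt(2 / (fan_in + fan_out)).
target = sqrt(2 / (fan_in + fan_out))
isapprox(std(W), target; rtol = 0.1)        # true up to sampling noise
```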
@@ -13,9 +43,81 @@ zeros(T::Type, dims...) = Base.zeros(T, dims...)
 ones(dims...) = Base.ones(Float32, dims...)
 zeros(dims...) = Base.zeros(Float32, dims...)
 
+"""
+    unsqueeze(xs, dim)
+
+Return `xs` reshaped into an `Array` one dimensionality higher than `xs`,
+where `dim` indicates in which dimension `xs` is extended.
+
+# Examples
+```jldoctest
+julia> xs = [[1, 2], [3, 4], [5, 6]]
+3-element Array{Array{Int64,1},1}:
+ [1, 2]
+ [3, 4]
+ [5, 6]
+
+julia> Flux.unsqueeze(xs, 1)
+1×3 Array{Array{Int64,1},2}:
+ [1, 2]  [3, 4]  [5, 6]
+
+julia> Flux.unsqueeze([1 2; 3 4], 2)
+2×1×2 Array{Int64,3}:
+[:, :, 1] =
+ 1
+ 3
+
+[:, :, 2] =
+ 2
+ 4
+```
+"""
 unsqueeze(xs, dim) = reshape(xs, (size(xs)[1:dim-1]..., 1, size(xs)[dim:end]...))
 
+"""
+    stack(xs, dim)
+
+Concatenate the given `Array` of `Array`s `xs` into a single `Array` along the
+given dimension `dim`.
+
+# Examples
+```jldoctest
+julia> xs = [[1, 2], [3, 4], [5, 6]]
+3-element Array{Array{Int64,1},1}:
+ [1, 2]
+ [3, 4]
+ [5, 6]
+
+julia> Flux.stack(xs, 1)
+3×2 Array{Int64,2}:
+ 1  2
+ 3  4
+ 5  6
+
+julia> cat(xs, dims=1)
+3-element Array{Array{Int64,1},1}:
+ [1, 2]
+ [3, 4]
+ [5, 6]
+```
+"""
 stack(xs, dim) = cat(unsqueeze.(xs, dim)..., dims=dim)
 
+"""
+    unstack(xs, dim)
+
+Unroll the given `xs` into an `Array` of `Array`s along the given dimension `dim`.
+
+# Examples
+```jldoctest
+julia> Flux.unstack([1 3 5 7; 2 4 6 8], 2)
+4-element Array{Array{Int64,1},1}:
+ [1, 2]
+ [3, 4]
+ [5, 6]
+ [7, 8]
+```
+"""
 unstack(xs, dim) = [copy(selectdim(xs, dim, i)) for i in 1:size(xs, dim)]
 
 """
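A short follow-up to the docstrings above: stacking equal-length vectors and unstacking along the same dimension recovers the original collection.

```julia
using Flux

xs = [[1, 2], [3, 4], [5, 6]]
A  = Flux.stack(xs, 1)        # 3×2 matrix: each vector becomes a row
Flux.unstack(A, 1) == xs      # true: unstack inverts stack along the same dimension
```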
@@ -23,9 +125,16 @@ unstack(xs, dim) = [copy(selectdim(xs, dim, i)) for i in 1:size(xs, dim)]
 
 Split `xs` into `n` parts.
 
-```julia
-julia> chunk(1:10, 3)
-3-element Array{Array{Int64,1},1}:
+# Examples
+```jldoctest
+julia> Flux.chunk(1:10, 3)
+3-element Array{UnitRange{Int64},1}:
+ 1:4
+ 5:8
+ 9:10
+
+julia> Flux.chunk(collect(1:10), 3)
+3-element Array{SubArray{Int64,1,Array{Int64,1},Tuple{UnitRange{Int64}},true},1}:
  [1, 2, 3, 4]
  [5, 6, 7, 8]
  [9, 10]
@@ -40,11 +149,12 @@ batchindex(xs, i) = (reverse(Base.tail(reverse(axes(xs))))..., i)
 
 Count the number of times that each element of `xs` appears.
 
-```julia
-julia> frequencies(['a','b','b'])
+# Examples
+```jldoctest
+julia> Flux.frequencies(['a','b','b'])
 Dict{Char,Int64} with 2 entries:
-  'b' => 2
   'a' => 1
+  'b' => 2
 ```
 """
 function frequencies(xs)
@@ -60,12 +170,13 @@ head(x::Tuple) = reverse(Base.tail(reverse(x)))
 squeezebatch(x) = reshape(x, head(size(x)))
 
 """
     batch(xs)
 
 Batch the arrays in `xs` into a single array.
 
-```julia
-julia> batch([[1,2,3],[4,5,6]])
+# Examples
+```jldoctest
+julia> Flux.batch([[1,2,3],[4,5,6]])
 3×2 Array{Int64,2}:
  1  4
  2  5
@@ -82,6 +193,25 @@ function batch(xs)
   return data
 end
 
+"""
+Return the given sequence padded with `p` up to a maximum length of `n`.
+
+# Examples
+```jldoctest
+julia> rpad([1, 2], 4, 0)
+4-element Array{Int64,1}:
+ 1
+ 2
+ 0
+ 0
+
+julia> rpad([1, 2, 3], 2, 0)
+3-element Array{Int64,1}:
+ 1
+ 2
+ 3
+```
+"""
 Base.rpad(v::AbstractVector, n::Integer, p) = [v; fill(p, max(n - length(v), 0))]
 
 """
@@ -90,8 +220,9 @@ Base.rpad(v::AbstractVector, n::Integer, p) = [v; fill(p, max(n - length(v), 0))]
 Take a list of `N` sequences, and turn them into a single sequence where each
 item is a batch of `N`. Short sequences will be padded by `pad`.
 
-```julia
-julia> batchseq([[1, 2, 3], [4, 5]], 0)
+# Examples
+```jldoctest
+julia> Flux.batchseq([[1, 2, 3], [4, 5]], 0)
 3-element Array{Array{Int64,1},1}:
  [1, 4]
  [2, 5]
@@ -148,11 +279,15 @@ end
 # Other
 
 """
-Returns a function that when invoked, will only be triggered at most once
-during `timeout` seconds. Normally, the throttled function will run
-as much as it can, without ever going more than once per `wait` duration;
-but if you'd like to disable the execution on the leading edge, pass
-`leading=false`. To enable execution on the trailing edge, ditto.
+    throttle(f, timeout; leading=true, trailing=false)
+
+Return a function that when invoked, will only be triggered at most once
+during `timeout` seconds.
+
+Normally, the throttled function will run as much as it can, without ever
+going more than once per `wait` duration; but if you'd like to disable the
+execution on the leading edge, pass `leading=false`. To enable execution on
+the trailing edge, pass `trailing=true`.
 """
 function throttle(f, timeout; leading=true, trailing=false)
   cooldown = true
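A minimal sketch of the behaviour described in the reworked `throttle` docstring; the callback and timings are illustrative only:

```julia
using Flux

log_progress() = println("still training")

# At most one call every 10 seconds, firing on the leading edge by default.
cb = Flux.throttle(log_progress, 10)

for step in 1:1_000
    cb()   # prints once, then stays silent until 10 seconds have passed
end
```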
@@ -58,6 +58,13 @@ end
   @test y[3,:] isa CuArray
 end
 
+@testset "restructure gpu" begin
+  dudt = Dense(1,1) |> gpu
+  p,re = Flux.destructure(dudt)
+  foo(x) = sum(re(p)(x))
+  @test gradient(foo, cu(rand(1)))[1] isa CuArray
+end
+
 if CuArrays.has_cudnn()
   @info "Testing Flux/CUDNN"
   include("cudnn.jl")
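For context on the helper exercised by the new GPU test: `Flux.destructure` flattens a model's parameters into one vector and returns a function that rebuilds the model from such a vector. A CPU-only sketch with made-up values:

```julia
using Flux

m = Dense(1, 1)
p, re = Flux.destructure(m)   # p: flat parameter vector, re: rebuild function

m2 = re(p)                    # same architecture, parameters taken from p
m2([1.0]) == m([1.0])         # true: rebuilding from the original p reproduces m
```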
92 test/data.jl
@@ -1,22 +1,86 @@
-using Flux.Data
-using Test
-
-@test cmudict()["CATASTROPHE"] == :[K,AH0,T,AE1,S,T,R,AH0,F,IY0].args
-
-@test length(CMUDict.phones()) == 39
-
-@test length(CMUDict.symbols()) == 84
-
-@test MNIST.images()[1] isa Matrix
-@test MNIST.labels() isa Vector{Int64}
-
-@test FashionMNIST.images()[1] isa Matrix
-@test FashionMNIST.labels() isa Vector{Int64}
-
-@test Data.Sentiment.train() isa Vector{Data.Tree{Any}}
-
-@test Iris.features() isa Matrix
-@test size(Iris.features()) == (4,150)
-
-@test Iris.labels() isa Vector{String}
-@test size(Iris.labels()) == (150,)
+@testset "DataLoader" begin
+    X = reshape([1:10;], (2, 5))
+    Y = [1:5;]
+
+    d = DataLoader(X, batchsize=2)
+    batches = collect(d)
+    @test length(batches) == 3
+    @test batches[1] == X[:,1:2]
+    @test batches[2] == X[:,3:4]
+    @test batches[3] == X[:,5:5]
+
+    d = DataLoader(X, batchsize=2, partial=false)
+    batches = collect(d)
+    @test length(batches) == 2
+    @test batches[1] == X[:,1:2]
+    @test batches[2] == X[:,3:4]
+
+    d = DataLoader(X, Y, batchsize=2)
+    batches = collect(d)
+    @test length(batches) == 3
+    @test length(batches[1]) == 2
+    @test length(batches[2]) == 2
+    @test length(batches[3]) == 2
+    @test batches[1][1] == X[:,1:2]
+    @test batches[1][2] == Y[1:2]
+    @test batches[2][1] == X[:,3:4]
+    @test batches[2][2] == Y[3:4]
+    @test batches[3][1] == X[:,5:5]
+    @test batches[3][2] == Y[5:5]
+
+    # test interaction with `train!`
+    θ = ones(2)
+    X = zeros(2, 10)
+    loss(x) = sum((x .- θ).^2)
+    d = DataLoader(X)
+    Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1))
+    @test norm(θ) < 1e-4
+
+    # test interaction with `train!`
+    θ = zeros(2)
+    X = ones(2, 10)
+    Y = fill(2, 10)
+    loss(x, y) = sum((y - x'*θ).^2)
+    d = DataLoader(X, Y)
+    Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1))
+    @test norm(θ .- 1) < 1e-10
+end
+
+@testset "CMUDict" begin
+    @test cmudict()["CATASTROPHE"] == :[K,AH0,T,AE1,S,T,R,AH0,F,IY0].args
+
+    @test length(CMUDict.phones()) == 39
+
+    @test length(CMUDict.symbols()) == 84
+end
+
+@testset "MNIST" begin
+    @test MNIST.images()[1] isa Matrix
+    @test MNIST.labels() isa Vector{Int64}
+end
+
+@testset "FashionMNIST" begin
+    @test FashionMNIST.images()[1] isa Matrix
+    @test FashionMNIST.labels() isa Vector{Int64}
+end
+
+@testset "Sentiment" begin
+    @test Data.Sentiment.train() isa Vector{Data.Tree{Any}}
+end
+
+@testset "Iris" begin
+    @test Iris.features() isa Matrix
+    @test size(Iris.features()) == (4,150)
+
+    @test Iris.labels() isa Vector{String}
+    @test size(Iris.labels()) == (150,)
+end
+
+@testset "Housing" begin
+    @test Housing.features() isa Matrix # test broken due to SSL certifate expiration problem
+    @test size(Housing.features()) == (506, 13)
+
+    @test Housing.targets() isa Array{Float64}
+    @test size(Housing.targets()) == (506, 1)
+end
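The new `DataLoader` tests above double as usage notes; condensed into a small sketch (shapes follow the tests, with observations stored along the last dimension):

```julia
using Flux
using Flux.Data: DataLoader

X = rand(Float32, 2, 5)                 # 5 observations with 2 features each
Y = rand(Float32, 5)

for (x, y) in DataLoader(X, Y, batchsize=2)
    # x is 2×2 (or 2×1 for the final partial batch); y is sliced to match
    @assert size(x, 1) == 2
end

# Drop the trailing partial batch instead of returning it:
loader = DataLoader(X, Y, batchsize=2, partial=false)
length(collect(loader)) == 2            # true
```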
@@ -4,6 +4,10 @@ using Flux: gradient
 
 @testset "Pooling" begin
   x = randn(Float32, 10, 10, 3, 2)
+  gmp = GlobalMaxPool()
+  @test size(gmp(x)) == (1, 1, 3, 2)
+  gmp = GlobalMeanPool()
+  @test size(gmp(x)) == (1, 1, 3, 2)
   mp = MaxPool((2, 2))
   @test mp(x) == maxpool(x, PoolDims(x, 2))
   mp = MeanPool((2, 2))
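What the new global-pooling assertions check, in one short sketch: the spatial dimensions collapse to 1×1 while channel and batch dimensions are preserved.

```julia
using Flux

x = randn(Float32, 10, 10, 3, 2)    # width × height × channels × batch

size(GlobalMaxPool()(x))             # (1, 1, 3, 2): max over each 10×10 feature map
size(GlobalMeanPool()(x))            # (1, 1, 3, 2): mean over each 10×10 feature map
```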
@@ -188,3 +192,27 @@ end
   m = MeanPool((2, 2); stride = 2, pad = 3)
   @test Flux.outdims(m, (5, 5)) == (5, 5)
 end
+
+@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, CrossCor), k in ( (1,), (2,), (3,), (4,5), (6,7,8))
+  data = ones(Float32, (k .+ 3)..., 1,1)
+  l = ltype(k, 1=>1, pad=SamePad())
+  @test size(l(data)) == size(data)
+
+  l = ltype(k, 1=>1, pad=SamePad(), dilation = k .÷ 2)
+  @test size(l(data)) == size(data)
+
+  stride = 3
+  l = ltype(k, 1=>1, pad=SamePad(), stride = stride)
+  if ltype == ConvTranspose
+    @test size(l(data))[1:end-2] == stride .* size(data)[1:end-2] .- stride .+ 1
+  else
+    @test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ stride)
+  end
+end
+
+@testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), k in ( (1,), (2,), (3,), (4,5), (6,7,8))
+  data = ones(Float32, (k .+ 3)..., 1,1)
+
+  l = ltype(k, pad=SamePad())
+  @test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ k)
+end
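The `SamePad` tests above encode the sizing rule: at stride 1 the output keeps the input's spatial size, and at stride `s` it becomes `ceil(input / s)`. A brief sketch with an illustrative input size:

```julia
using Flux

x = ones(Float32, 7, 7, 1, 1)

c1 = Conv((3, 3), 1=>1, pad=SamePad())             # stride 1: output matches input
size(c1(x))                                         # (7, 7, 1, 1)

c2 = Conv((3, 3), 1=>1, pad=SamePad(), stride=2)    # stride 2: ceil(7 / 2) = 4
size(c2(x))                                         # (4, 4, 1, 1)
```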
@@ -1,30 +1,32 @@
 using Flux, Test, Statistics
 using Zygote: pullback
 
-trainmode(f, x...) = pullback(f, x...)[1]
-trainmode(f) = (x...) -> trainmode(f, x...)
+evalwgrad(f, x...) = pullback(f, x...)[1]
 
 @testset "Dropout" begin
   x = [1.,2.,3.]
   @test x == Dropout(0.1)(x)
-  @test x == trainmode(Dropout(0), x)
-  @test zero(x) == trainmode(Dropout(1), x)
+  @test x == evalwgrad(Dropout(0), x)
+  @test zero(x) == evalwgrad(Dropout(1), x)
 
   x = rand(100)
   m = Dropout(0.9)
-  y = trainmode(m, x)
+  y = evalwgrad(m, x)
   @test count(a->a==0, y) > 50
-  y = m(x)
+  testmode!(m, true)
+  y = evalwgrad(m, x) # should override istraining
   @test count(a->a==0, y) == 0
-  y = trainmode(m, x)
+  testmode!(m, false)
+  y = evalwgrad(m, x)
   @test count(a->a==0, y) > 50
 
   x = rand(Float32, 100)
   m = Chain(Dense(100,100),
             Dropout(0.9))
-  y = trainmode(m, x)
+  y = evalwgrad(m, x)
   @test count(a->a == 0, y) > 50
-  y = m(x)
+  testmode!(m, true)
+  y = evalwgrad(m, x) # should override istraining
   @test count(a->a == 0, y) == 0
 
   x = rand(100, 50)
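The rename to `evalwgrad` and the new `testmode!` calls reflect the mode handling exercised here: `evalwgrad` runs the layer inside a `pullback` (so the layer sees a gradient context), while `testmode!(m, true)` forces evaluation behaviour regardless. A minimal sketch:

```julia
using Flux
using Zygote: pullback

m = Dropout(0.5)
x = ones(Float32, 1000)

evalwgrad(f, x...) = pullback(f, x...)[1]   # forward pass inside a gradient context

y_train = evalwgrad(m, x)    # roughly half the entries are zeroed
testmode!(m, true)           # force test mode: dropout acts as the identity
y_test = evalwgrad(m, x)     # no entries are zeroed, even under pullback
count(iszero, y_test) == 0   # true
```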
@@ -49,7 +51,7 @@ end
     # initial m.σ is 1
     # initial m.μ is 0
 
-    y = trainmode(m, x)
+    y = evalwgrad(m, x)
     @test isapprox(y, [-1.22474 0 1.22474; -1.22474 0 1.22474], atol = 1.0e-5)
     # julia> x
     #  2×3 Array{Float64,2}:
@@ -82,19 +84,19 @@ end
     @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7)
   end
 
-  let m = trainmode(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1)
+  let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1)
     y = reshape(permutedims(x, [2, 1, 3]), 2, :)
     y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3])
     @test m(x) == y
   end
 
-  let m = trainmode(BatchNorm(2)), x = reshape(Float32.(1:12), 2, 3, 2, 1)
+  let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:12), 2, 3, 2, 1)
    y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :)
    y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4])
    @test m(x) == y
   end
 
-  let m = trainmode(BatchNorm(2)), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1)
+  let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1)
     y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :)
     y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5])
     @test m(x) == y
@@ -117,7 +119,7 @@ end
     x = Float64.(x)
     @test m.β == [0, 0]  # initβ(2)
     @test m.γ == [1, 1]  # initγ(2)
-    y = trainmode(m, x)
+    y = evalwgrad(m, x)
 
     #julia> x
     #[:, :, 1] =
@@ -162,7 +164,7 @@ end
    @test isapprox(y, sigmoid.((x .- expand_inst(m.μ, affine_shape)) ./ sqrt.(expand_inst(m.σ², affine_shape) .+ m.ϵ)), atol = 1.0e-7)
   end
 
-  let m = trainmode(InstanceNorm(2)), sizes = (2, 4, 1, 2, 3),
+  let m = trainmode!(InstanceNorm(2)), sizes = (2, 4, 1, 2, 3),
       x = Float32.(reshape(collect(1:prod(sizes)), sizes))
     y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3)
     y = reshape(m(y), sizes...)
@@ -172,14 +174,14 @@ end
   # check that μ, σ², and the output are the correct size for higher rank tensors
   let m = InstanceNorm(2), sizes = (5, 5, 3, 4, 2, 6),
       x = reshape(Float32.(collect(1:prod(sizes))), sizes)
-    y = trainmode(m, x)
+    y = evalwgrad(m, x)
     @test size(m.μ) == (sizes[end - 1], )
     @test size(m.σ²) == (sizes[end - 1], )
     @test size(y) == sizes
   end
 
   # show that instance norm is equal to batch norm when channel and batch dims are squashed
-  let m_inorm = trainmode(InstanceNorm(2)), m_bnorm = trainmode(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6),
+  let m_inorm = trainmode!(InstanceNorm(2)), m_bnorm = trainmode!(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6),
       x = reshape(Float32.(collect(1:prod(sizes))), sizes)
     @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:end - 2]..., :, 1))), sizes)
   end
@@ -204,7 +206,7 @@ if VERSION >= v"1.1"
     @test m.β == [0, 0, 0, 0]  # initβ(32)
     @test m.γ == [1, 1, 1, 1]  # initγ(32)
 
-    y = trainmode(m, x)
+    y = evalwgrad(m, x)
 
     #julia> x
     #[:, :, 1] =
@@ -263,7 +265,7 @@ if VERSION >= v"1.1"
     @test isapprox(y, out, atol = 1.0e-7)
   end
 
-  let m = trainmode(GroupNorm(2,2)), sizes = (2, 4, 1, 2, 3),
+  let m = trainmode!(GroupNorm(2,2)), sizes = (2, 4, 1, 2, 3),
       x = Float32.(reshape(collect(1:prod(sizes)), sizes))
     y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3)
     y = reshape(m(y), sizes...)
@@ -273,20 +275,20 @@ if VERSION >= v"1.1"
   # check that μ, σ², and the output are the correct size for higher rank tensors
   let m = GroupNorm(4,2), sizes = (5, 5, 3, 4, 4, 6),
       x = Float32.(reshape(collect(1:prod(sizes)), sizes))
-    y = trainmode(m, x)
+    y = evalwgrad(m, x)
     @test size(m.μ) == (m.G,1)
    @test size(m.σ²) == (m.G,1)
    @test size(y) == sizes
   end
 
   # show that group norm is the same as instance norm when the group size is the same as the number of channels
-  let IN = trainmode(InstanceNorm(4)), GN = trainmode(GroupNorm(4,4)), sizes = (2,2,3,4,5),
+  let IN = trainmode!(InstanceNorm(4)), GN = trainmode!(GroupNorm(4,4)), sizes = (2,2,3,4,5),
       x = Float32.(reshape(collect(1:prod(sizes)), sizes))
     @test IN(x) ≈ GN(x)
   end
 
   # show that group norm is the same as batch norm for a group of size 1 and batch of size 1
-  let BN = trainmode(BatchNorm(4)), GN = trainmode(GroupNorm(4,4)), sizes = (2,2,3,4,1),
+  let BN = trainmode!(BatchNorm(4)), GN = trainmode!(GroupNorm(4,4)), sizes = (2,2,3,4,1),
      x = Float32.(reshape(collect(1:prod(sizes)), sizes))
    @test BN(x) ≈ GN(x)
   end
@@ -1,6 +1,6 @@
 using Test
 using Flux: onehotbatch, mse, crossentropy, logitcrossentropy,
-  σ, binarycrossentropy, logitbinarycrossentropy
+  σ, binarycrossentropy, logitbinarycrossentropy, flatten
 
 const ϵ = 1e-7
 
@@ -13,6 +13,20 @@ const ϵ = 1e-7
   @test mse(ŷ, y) ≈ (.1^2 + .9^2)/2
 end
 
+@testset "mae" begin
+  @test Flux.mae(ŷ, y) ≈ 1/2
+end
+
+@testset "huber_loss" begin
+  @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002
+end
+
+y = [123.0,456.0,789.0]
+ŷ = [345.0,332.0,789.0]
+@testset "msle" begin
+  @test Flux.msle(ŷ, y) ≈ 0.38813985859136585
+end
+
 # Now onehot y's
 y = onehotbatch([1, 1, 0, 0], 0:1)
 ŷ = [.1 .9; .9 .1; .9 .1; .1 .9]'
@@ -51,31 +65,50 @@ const ϵ = 1e-7
 end
 
 y = [1 2 3]
-y1 = [4.0 5.0 6.0]
+ŷ = [4.0 5.0 6.0]
 @testset "kldivergence" begin
-  @test Flux.kldivergence(y, y1) ≈ 4.761838062403337
+  @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457
   @test Flux.kldivergence(y, y) ≈ 0
 end
 
 y = [1 2 3 4]
-y1 = [5.0 6.0 7.0 8.0]
+ŷ = [5.0 6.0 7.0 8.0]
 @testset "hinge" begin
-  @test Flux.hinge(y, y1) ≈ 0
+  @test Flux.hinge(ŷ, y) ≈ 0
   @test Flux.hinge(y, 0.5 .* y) ≈ 0.125
 end
 
+@testset "squared_hinge" begin
+  @test Flux.squared_hinge(ŷ, y) ≈ 0
+  @test Flux.squared_hinge(y, 0.5 .* y) ≈ 0.0625
+end
+
 y = [0.1 0.2 0.3]
-y1 = [0.4 0.5 0.6]
+ŷ = [0.4 0.5 0.6]
 @testset "poisson" begin
-  @test Flux.poisson(y, y1) ≈ 1.0160455586700767
+  @test Flux.poisson(ŷ, y) ≈ 0.6278353988097339
   @test Flux.poisson(y, y) ≈ 0.5044459776946685
 end
 
+y = [1.0 0.5 0.3 2.4]
+ŷ = [0 1.4 0.5 1.2]
+@testset "dice_coeff_loss" begin
+  @test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999
+  @test Flux.dice_coeff_loss(y, y) ≈ 0.0
+end
+
+@testset "tversky_loss" begin
+  @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383
+  @test Flux.tversky_loss(ŷ, y, β = 0.8) ≈ -0.09490740740740744
+  @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075
+end
+
 @testset "no spurious promotions" begin
   for T in (Float32, Float64)
     y = rand(T, 2)
     ŷ = rand(T, 2)
-    for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson)
+    for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson,
+          Flux.mae, Flux.huber_loss, Flux.msle, Flux.squared_hinge, Flux.dice_coeff_loss, Flux.tversky_loss)
       fwd, back = Flux.pullback(f, ŷ, y)
       @test fwd isa T
       @test eltype(back(one(T))[1]) == T
@@ -83,3 +116,10 @@ const ϵ = 1e-7
     end
   end
 end
+
+@testset "helpers" begin
+  @testset "flatten" begin
+    x = randn(Float32, 10, 10, 3, 2)
+    @test size(flatten(x)) == (300, 2)
+  end
+end
@@ -1,32 +1,50 @@
-using Flux, Test, Random, Statistics, Documenter
-using Random
+using Flux
+using Flux.Data
+using Test
+using Random, Statistics, LinearAlgebra
+using Documenter
+using IterTools: ncycle
 
 Random.seed!(0)
 
 @testset "Flux" begin
 
-@info "Testing Basics"
+  @testset "Utils" begin
+    include("utils.jl")
+  end
 
-include("utils.jl")
-include("onehot.jl")
-include("optimise.jl")
-include("data.jl")
+  @testset "Onehot" begin
+    include("onehot.jl")
+  end
 
-@info "Testing Layers"
+  @testset "Optimise" begin
+    include("optimise.jl")
+  end
 
-include("layers/basic.jl")
-include("layers/normalisation.jl")
-include("layers/stateless.jl")
-include("layers/conv.jl")
+  @testset "Data" begin
+    include("data.jl")
+  end
 
-if Flux.use_cuda[]
-  include("cuda/cuda.jl")
-else
-  @warn "CUDA unavailable, not testing GPU support"
-end
+  @testset "Layers" begin
+    include("layers/basic.jl")
+    include("layers/normalisation.jl")
+    include("layers/stateless.jl")
+    include("layers/conv.jl")
+  end
 
-if VERSION >= v"1.2"
-  doctest(Flux)
-end
+  @testset "CUDA" begin
+    if Flux.use_cuda[]
+      include("cuda/cuda.jl")
+    else
+      @warn "CUDA unavailable, not testing GPU support"
+    end
+  end
 
-end
+  @testset "Docs" begin
+    if VERSION >= v"1.4"
+      DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
+      doctest(Flux)
+    end
+  end
+
+end # testset Flux