Compare commits


5 Commits

Author SHA1 Message Date
CarloLucibello ba92f9a140 Merge branch 'cl-docs' of https://github.com/FluxML/Flux.jl into cl-docs 2020-03-03 18:36:33 +01:00
CarloLucibello 4516978caa deprecate 2020-03-03 18:25:46 +01:00
Carlo Lucibello 19df897de7 Merge pull request #1059 from findmyway/add_doc_for_functor: Make really good clear examples and explination of @functor in docs 2020-03-03 10:36:00 +01:00
CarloLucibello 94d95442ab docs for functor.jl 2020-03-03 09:39:06 +01:00
Jun Tian 64b4a6a80c add doc for functor 2020-03-01 09:44:06 +08:00
51 changed files with 902 additions and 1998 deletions

View File

@ -1,12 +0,0 @@
[Please delete this text and describe your change here.
For bugfixes, please detail the bug and include a test case which your patch fixes.
If you are adding a new feature, please clearly describe the design, its rationale, the possible alternatives considered.
It is easiest to merge new features when there is clear precedent in other systems; we need to know we're taking
the right direction since it can be hard to change later.]
### PR Checklist
- [ ] Tests are added
- [ ] Entry in NEWS.md
- [ ] Documentation, if applicable
- [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes).

View File

@ -6,8 +6,16 @@ on:
jobs: jobs:
CompatHelper: CompatHelper:
runs-on: ubuntu-latest runs-on: ${{ matrix.os }}
strategy:
matrix:
julia-version: [1.3]
julia-arch: [x64]
os: [ubuntu-latest]
steps: steps:
- uses: julia-actions/setup-julia@latest
with:
version: ${{ matrix.julia-version }}
- name: Pkg.add("CompatHelper") - name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")' run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main() - name: CompatHelper.main()

View File

@ -7,7 +7,6 @@ os:
julia: julia:
- 1.3 - 1.3
- 1
- nightly - nightly
notifications: notifications:

View File

@ -8,77 +8,71 @@ version = "0.5.0"
[[AbstractTrees]] [[AbstractTrees]]
deps = ["Markdown"] deps = ["Markdown"]
git-tree-sha1 = "33e450545eaf7699da1a6e755f9ea65f14077a45" git-tree-sha1 = "86d092c2599f1f7bb01668bf8eb3412f98d61e47"
uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
version = "0.3.3" version = "0.3.2"
[[Adapt]] [[Adapt]]
deps = ["LinearAlgebra"] deps = ["LinearAlgebra"]
git-tree-sha1 = "fd04049c7dd78cfef0b06cdc1f0f181467655712" git-tree-sha1 = "c88cfc7f9c1f9f8633cddf0b56e86302b70f64c5"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "1.1.0" version = "1.0.1"
[[ArrayLayouts]] [[ArrayLayouts]]
deps = ["FillArrays", "LinearAlgebra"] deps = ["FillArrays", "LinearAlgebra"]
git-tree-sha1 = "a504dca2ac7eda8761c8f7c1ed52427a1be75a3c" git-tree-sha1 = "bc779df8d73be70e4e05a63727d3a4dfb4c52b1f"
uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a" uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
version = "0.2.6" version = "0.1.5"
[[Base64]] [[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinaryProvider]] [[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"] deps = ["Libdl", "SHA"]
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.10" version = "0.5.8"
[[CEnum]] [[CEnum]]
git-tree-sha1 = "1b77a77c3b28e0b3f413f7567c9bb8dd9bdccd14" git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.3.0" version = "0.2.0"
[[CUDAapi]] [[CUDAapi]]
deps = ["Libdl", "Logging"] deps = ["Libdl", "Logging"]
git-tree-sha1 = "831b825d10104bd29e28f6da93312a976830717b" git-tree-sha1 = "d7ceadd8f821177d05b897c0517e94633db535fe"
uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
version = "4.0.0" version = "3.1.0"
[[CUDAdrv]] [[CUDAdrv]]
deps = ["CEnum", "CUDAapi", "Printf"] deps = ["CEnum", "CUDAapi", "Printf"]
git-tree-sha1 = "f56bbf18c86bcff7a961a32a4947a5abb2963a29" git-tree-sha1 = "01e90fa34e25776bc7c8661183d4519149ebfe59"
uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
version = "6.3.0" version = "6.0.0"
[[CUDAnative]] [[CUDAnative]]
deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "ExprTools", "GPUCompiler", "LLVM", "Libdl", "Pkg", "Printf"] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"]
git-tree-sha1 = "ac86db2b05fdfec96b011e25a504ffe7476e8a68" git-tree-sha1 = "f86269ff60ebe082a2806ecbce51f3cadc68afe9"
uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
version = "3.1.0" version = "2.10.2"
[[CodeTracking]]
deps = ["InteractiveUtils", "UUIDs"]
git-tree-sha1 = "cab4da992adc0a64f63fa30d2db2fd8bec40cab4"
uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
version = "0.5.11"
[[CodecZlib]] [[CodecZlib]]
deps = ["TranscodingStreams", "Zlib_jll"] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"]
git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da" git-tree-sha1 = "05916673a2627dd91b4969ff8ba6941bc85a960e"
uuid = "944b1d66-785c-5afd-91f1-9de20f533193" uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
version = "0.7.0" version = "0.6.0"
[[ColorTypes]] [[ColorTypes]]
deps = ["FixedPointNumbers", "Random"] deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "c73d9cfc2a9d8433dc77f5bff4bddf46b1d78c20" git-tree-sha1 = "b9de8dc6106e09c79f3f776c27c62360d30e5eb8"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.10.3" version = "0.9.1"
[[Colors]] [[Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"] deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"]
git-tree-sha1 = "1e9bba7984e78aa8cdeea7f9f7cc984ad4e4b1c7" git-tree-sha1 = "177d8b959d3c103a6d57574c38ee79c81059c31b"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.2" version = "0.11.2"
[[CommonSubexpressions]] [[CommonSubexpressions]]
deps = ["Test"] deps = ["Test"]
@ -88,32 +82,26 @@ version = "0.2.0"
[[CompilerSupportLibraries_jll]] [[CompilerSupportLibraries_jll]]
deps = ["Libdl", "Pkg"] deps = ["Libdl", "Pkg"]
git-tree-sha1 = "7c4f882c41faa72118841185afc58a2eb00ef612" git-tree-sha1 = "b57c5d019367c90f234a7bc7e24ff0a84971da5d"
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "0.3.3+0" version = "0.2.0+1"
[[Cthulhu]]
deps = ["CodeTracking", "InteractiveUtils", "REPL", "UUIDs", "Unicode"]
git-tree-sha1 = "f3643e78353199d3097821e806348bd83f364155"
uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f"
version = "1.1.1"
[[CuArrays]] [[CuArrays]]
deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"]
git-tree-sha1 = "1582b74d2322df7dd94549d4ac9d095e0f20e884" git-tree-sha1 = "7c20c5a45bb245cf248f454d26966ea70255b271"
uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
version = "2.2.1" version = "1.7.2"
[[DataAPI]] [[DataAPI]]
git-tree-sha1 = "176e23402d80e7743fc26c19c681bfb11246af32" git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.3.0" version = "1.1.0"
[[DataStructures]] [[DataStructures]]
deps = ["InteractiveUtils", "OrderedCollections"] deps = ["InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "af6d9c86e191c917c2276fbede1137e8ea20157f" git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.17.17" version = "0.17.10"
[[Dates]] [[Dates]]
deps = ["Printf"] deps = ["Printf"]
@ -139,55 +127,52 @@ version = "1.0.1"
deps = ["Random", "Serialization", "Sockets"] deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[ExprTools]] [[FFTW]]
git-tree-sha1 = "6f0517056812fd6aa3af23d4b70d5325a2ae4e95" deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"]
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" git-tree-sha1 = "109d82fa4b00429f9afcce873e9f746f11f018d3"
version = "0.1.1" uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
version = "1.2.0"
[[FFTW_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "ddb57f4cf125243b4aa4908c94d73a805f3cbf2c"
uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a"
version = "3.3.9+4"
[[FillArrays]] [[FillArrays]]
deps = ["LinearAlgebra", "Random", "SparseArrays"] deps = ["LinearAlgebra", "Random", "SparseArrays"]
git-tree-sha1 = "44f561e293987ffc84272cd3d2b14b0b93123d63" git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66"
uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
version = "0.8.10" version = "0.8.5"
[[FixedPointNumbers]] [[FixedPointNumbers]]
git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238" git-tree-sha1 = "4aaea64dd0c30ad79037084f8ca2b94348e65eaa"
uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
version = "0.8.0" version = "0.7.1"
[[ForwardDiff]] [[ForwardDiff]]
deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "869540e4367122fbffaace383a5bdc34d6e5e5ac" git-tree-sha1 = "88b082d492be6b63f967b6c96b352e25ced1a34c"
uuid = "f6369f11-7733-5829-9624-2563aa707210" uuid = "f6369f11-7733-5829-9624-2563aa707210"
version = "0.10.10" version = "0.10.9"
[[Functors]]
deps = ["MacroTools"]
git-tree-sha1 = "f40adc6422f548176bb4351ebd29e4abf773040a"
uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
version = "0.1.0"
[[Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[GPUArrays]] [[GPUArrays]]
deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
git-tree-sha1 = "d887693eb1bd5e1fd573262a978745481895ec7d" git-tree-sha1 = "e756da6cee76a5f1436a05827fa8fdf3badc577f"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "3.4.1" version = "2.0.1"
[[GPUCompiler]]
deps = ["Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "TimerOutputs"]
git-tree-sha1 = "5275aa268ecd09640b32560e1eae90c78816e4d1"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.2.0"
[[IRTools]] [[IRTools]]
deps = ["InteractiveUtils", "MacroTools", "Test"] deps = ["InteractiveUtils", "MacroTools", "Test"]
git-tree-sha1 = "90ee39f9beaaa186e4968417ea2b8ed5673c91c0" git-tree-sha1 = "1a4355e4b5b50be2311ebb644f34f3306dbd0410"
uuid = "7869d1d1-7146-5819-86e3-90919afe41df" uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
version = "0.3.3" version = "0.3.1"
[[IntelOpenMP_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "fb8e1c7a5594ba56f9011310790e03b5384998d6"
uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0"
version = "2018.0.3+0"
[[InteractiveUtils]] [[InteractiveUtils]]
deps = ["Markdown"] deps = ["Markdown"]
@ -195,18 +180,17 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[Juno]] [[Juno]]
deps = ["Base64", "Logging", "Media", "Profile"] deps = ["Base64", "Logging", "Media", "Profile"]
git-tree-sha1 = "a686b0cf235fa3e491b79b4783c2d2382292b436" git-tree-sha1 = "4f2249fb58cfb140eeb89428e31791e2f8959d8c"
uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
version = "0.8.2" version = "0.8.0"
[[LLVM]] [[LLVM]]
deps = ["CEnum", "Libdl", "Printf", "Unicode"] deps = ["CEnum", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "dd3f584c3dbefe39b2a8fbafa1a3b77e31e21255" git-tree-sha1 = "1d08d7e4250f452f6cb20e4574daaebfdbee0ff7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0" uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "1.5.1" version = "1.3.3"
[[LibGit2]] [[LibGit2]]
deps = ["Printf"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[Libdl]] [[Libdl]]
@ -219,11 +203,17 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[Logging]] [[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MKL_jll]]
deps = ["IntelOpenMP_jll", "Libdl", "Pkg"]
git-tree-sha1 = "720629cc8cbd12c146ca01b661fd1a6cf66e2ff4"
uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
version = "2019.0.117+2"
[[MacroTools]] [[MacroTools]]
deps = ["Markdown", "Random"] deps = ["DataStructures", "Markdown", "Random"]
git-tree-sha1 = "f7d2e3f654af75f01ec49be82c231c382214223a" git-tree-sha1 = "07ee65e03e28ca88bc9a338a3726ae0c3efaa94b"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.5" version = "0.5.4"
[[Markdown]] [[Markdown]]
deps = ["Base64"] deps = ["Base64"]
@ -257,17 +247,18 @@ version = "0.3.3"
[[OpenSpecFun_jll]] [[OpenSpecFun_jll]]
deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"]
git-tree-sha1 = "d51c416559217d974a1113522d5919235ae67a87" git-tree-sha1 = "d110040968b9afe95c6bd9c6233570b0fe8abd22"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.3+3" version = "0.5.3+2"
[[OrderedCollections]] [[OrderedCollections]]
git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3" deps = ["Random", "Serialization", "Test"]
git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.2.0" version = "1.1.0"
[[Pkg]] [[Pkg]]
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Printf]] [[Printf]]
@ -319,15 +310,15 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]] [[SpecialFunctions]]
deps = ["OpenSpecFun_jll"] deps = ["OpenSpecFun_jll"]
git-tree-sha1 = "d8d8b8a9f4119829410ecd706da4cc8594a1e020" git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b" uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "0.10.3" version = "0.10.0"
[[StaticArrays]] [[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"] deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "5c06c0aeb81bef54aed4b3f446847905eb6cbda0" git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c"
uuid = "90137ffa-7385-5640-81b9-e52037218182" uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "0.12.3" version = "0.12.1"
[[Statistics]] [[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"] deps = ["LinearAlgebra", "SparseArrays"]
@ -335,9 +326,9 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[StatsBase]] [[StatsBase]]
deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"]
git-tree-sha1 = "a6102b1f364befdb05746f386b67c6b7e3262c45" git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9"
uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
version = "0.33.0" version = "0.32.1"
[[Test]] [[Test]]
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
@ -345,9 +336,9 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TimerOutputs]] [[TimerOutputs]]
deps = ["Printf"] deps = ["Printf"]
git-tree-sha1 = "f458ca23ff80e46a630922c555d838303e4b9603" git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.6" version = "0.5.3"
[[TranscodingStreams]] [[TranscodingStreams]]
deps = ["Random", "Test"] deps = ["Random", "Test"]
@ -364,21 +355,21 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[ZipFile]] [[ZipFile]]
deps = ["Libdl", "Printf", "Zlib_jll"] deps = ["Libdl", "Printf", "Zlib_jll"]
git-tree-sha1 = "254975fef2fc526583bb9b7c9420fe66ffe09f2f" git-tree-sha1 = "8748302cfdec02c4ae9c97b112cf10003f7f767f"
uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
version = "0.9.2" version = "0.9.1"
[[Zlib_jll]] [[Zlib_jll]]
deps = ["Libdl", "Pkg"] deps = ["Libdl", "Pkg"]
git-tree-sha1 = "a2e0d558f6031002e380a90613b199e37a8565bf" git-tree-sha1 = "fd36a6739e256527287c5444960d0266712cd49e"
uuid = "83775a58-1f1d-513f-b197-d71354ab007a" uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.11+10" version = "1.2.11+8"
[[Zygote]] [[Zygote]]
deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "Future", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] deps = ["ArrayLayouts", "DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
git-tree-sha1 = "707ceea58e2bd0ff3077ab13a92f8355181d3ee4" git-tree-sha1 = "7dc5fdb4917ac5a84e199ae654316a01cd4a278b"
uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
version = "0.4.20" version = "0.4.9"
[[ZygoteRules]] [[ZygoteRules]]
deps = ["MacroTools"] deps = ["MacroTools"]

NEWS.md
View File

@ -1,19 +1,3 @@
# v0.11
* Change to `DataLoader`'s constructor [https://github.com/FluxML/Flux.jl/pull/1152]
* Use `DataLoader` with `NamedTuple`s, so that tensors can be accessed by name [https://github.com/FluxML/Flux.jl/pull/1221].
* Error if Dense layers weights and biases are not arrays [https://github.com/FluxML/Flux.jl/pull/1218].
# v0.10.5
* Add option for [same padding](https://github.com/FluxML/Flux.jl/pull/901) to conv and pooling layers by setting `pad=SamePad()`.
* Added option to set `bias` to [Flux.Zeros](https://github.com/FluxML/Flux.jl/pull/873) to eliminating `bias` from being trained.
* Added `GlobalMaxPool` and `GlobalMeanPool` [layers](https://github.com/FluxML/Flux.jl/pull/950) for performing global pooling operations.
* Added `ClipValue` and `ClipNorm` in this [pr](https://github.com/FluxML/Flux.jl/pull/1133) to `Flux.Optimise` to provide a cleaner API for gradient clipping.
* Added new kwarg-only [constructors](https://github.com/FluxML/Flux.jl/pull/873) for the various convolutional layers.
* Documented the convolutional layer constructors accepting `weight` and `bias` keyword arguments to supply custom arrays for those fields.
* Testing suite improvements now test for gradients of all layers along with GPU support.
* Functors have now moved to [Functors.jl](https://github.com/FluxML/Flux.jl/pull/1174) to allow for their use outside of Flux.
* Added [helper functions](https://github.com/FluxML/Flux.jl/pull/873) `Flux.convfilter` and `Flux.depthwiseconvfilter` to construct weight arrays for convolutions outside of layer constructors so as to not have to depend on the default layers for custom implementations.
# v0.10.0 # v0.10.0
* The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669) * The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669)
- The dependency on Tracker.jl has been removed. - The dependency on Tracker.jl has been removed.

View File

@ -1,6 +1,6 @@
name = "Flux" name = "Flux"
uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
version = "0.11.0-DEV" version = "0.10.2"
[deps] [deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@ -9,9 +9,7 @@ CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d" Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@ -27,19 +25,18 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
[compat] [compat]
AbstractTrees = "0.2, 0.3" AbstractTrees = "0.2, 0.3"
Adapt = "1, 2.0" Adapt = "1"
CodecZlib = "0.5, 0.6, 0.7" CodecZlib = "0.5, 0.6"
Colors = "0.8, 0.9, 0.10, 0.11, 0.12" Colors = "0.8, 0.9, 0.10, 0.11"
CuArrays = "2" CuArrays = "1.6"
Functors = "0.1"
Juno = "0.5, 0.6, 0.7, 0.8" Juno = "0.5, 0.6, 0.7, 0.8"
MacroTools = "0.3, 0.4, 0.5" MacroTools = "0.3, 0.4, 0.5"
NNlib = "0.6" NNlib = "0.6"
Reexport = "0.2" Reexport = "0.2"
StatsBase = "0" StatsBase = "0"
ZipFile = "0.7, 0.8, 0.9" ZipFile = "0.7, 0.8, 0.9"
Zygote = "0.4.13" Zygote = "0.4"
julia = "1.3" julia = "1"
[extras] [extras]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"

View File

@ -1,8 +1,6 @@
using Documenter, Flux, NNlib using Documenter, Flux, NNlib
DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
makedocs(modules=[Flux, NNlib], makedocs(modules=[Flux, NNlib],
doctest = VERSION >= v"1.4",
sitename = "Flux", sitename = "Flux",
pages = ["Home" => "index.md", pages = ["Home" => "index.md",
"Building Models" => "Building Models" =>
@ -10,7 +8,6 @@ makedocs(modules=[Flux, NNlib],
"Recurrence" => "models/recurrence.md", "Recurrence" => "models/recurrence.md",
"Regularisation" => "models/regularisation.md", "Regularisation" => "models/regularisation.md",
"Model Reference" => "models/layers.md", "Model Reference" => "models/layers.md",
"Advanced Model Building" => "models/advanced.md",
"NNlib" => "models/nnlib.md"], "NNlib" => "models/nnlib.md"],
"Handling Data" => "Handling Data" =>
["One-Hot Encoding" => "data/onehot.md", ["One-Hot Encoding" => "data/onehot.md",
@ -21,16 +18,12 @@ makedocs(modules=[Flux, NNlib],
"GPU Support" => "gpu.md", "GPU Support" => "gpu.md",
"Saving & Loading" => "saving.md", "Saving & Loading" => "saving.md",
"The Julia Ecosystem" => "ecosystem.md", "The Julia Ecosystem" => "ecosystem.md",
"Utility Functions" => "utilities.md",
"Performance Tips" => "performance.md", "Performance Tips" => "performance.md",
"Datasets" => "datasets.md",
"Community" => "community.md"], "Community" => "community.md"],
format = Documenter.HTML( format = Documenter.HTML(assets = ["assets/flux.css"],
analytics = "UA-36890222-9", analytics = "UA-36890222-9",
assets = ["assets/flux.css"], prettyurls = haskey(ENV, "CI")))
prettyurls = get(ENV, "CI", nothing) == "true"),
)
deploydocs(repo = "github.com/FluxML/Flux.jl.git", deploydocs(repo = "github.com/FluxML/Flux.jl.git",
target = "build", target = "build",
push_preview = true) push_preview = true)

View File

@ -3,4 +3,4 @@ Flux provides the `DataLoader` type in the `Flux.Data` module to handle iteratio
```@docs ```@docs
Flux.Data.DataLoader Flux.Data.DataLoader
``` ```

View File

@ -7,15 +7,15 @@ julia> using Flux: onehot, onecold
julia> onehot(:b, [:a, :b, :c]) julia> onehot(:b, [:a, :b, :c])
3-element Flux.OneHotVector: 3-element Flux.OneHotVector:
0 false
1 true
0 false
julia> onehot(:c, [:a, :b, :c]) julia> onehot(:c, [:a, :b, :c])
3-element Flux.OneHotVector: 3-element Flux.OneHotVector:
0 false
0 false
1 true
``` ```
The inverse is `onecold` (which can take a general probability distribution, as well as just booleans). The inverse is `onecold` (which can take a general probability distribution, as well as just booleans).
@ -31,11 +31,6 @@ julia> onecold([0.3, 0.2, 0.5], [:a, :b, :c])
:c :c
``` ```
```@docs
Flux.onehot
Flux.onecold
```
## Batches ## Batches
`onehotbatch` creates a batch (matrix) of one-hot vectors, and `onecold` treats matrices as batches. `onehotbatch` creates a batch (matrix) of one-hot vectors, and `onecold` treats matrices as batches.
@ -57,7 +52,3 @@ julia> onecold(ans, [:a, :b, :c])
``` ```
Note that these operations returned `OneHotVector` and `OneHotMatrix` rather than `Array`s. `OneHotVector`s behave like normal vectors but avoid any unnecessary cost compared to using an integer index directly. For example, multiplying a matrix with a one-hot vector simply slices out the relevant row of the matrix under the hood. Note that these operations returned `OneHotVector` and `OneHotMatrix` rather than `Array`s. `OneHotVector`s behave like normal vectors but avoid any unnecessary cost compared to using an integer index directly. For example, multiplying a matrix with a one-hot vector simply slices out the relevant row of the matrix under the hood.
```@docs
Flux.onehotbatch
```

View File

@ -1,20 +0,0 @@
# Datasets
Flux includes several standard machine learning datasets.
```@docs
Flux.Data.Iris.features()
Flux.Data.Iris.labels()
Flux.Data.MNIST.images()
Flux.Data.MNIST.labels()
Flux.Data.FashionMNIST.images()
Flux.Data.FashionMNIST.labels()
Flux.Data.CMUDict.phones()
Flux.Data.CMUDict.symbols()
Flux.Data.CMUDict.rawdict()
Flux.Data.CMUDict.cmudict()
Flux.Data.Sentiment.train()
Flux.Data.Sentiment.test()
Flux.Data.Sentiment.dev()
```

View File

@ -38,6 +38,40 @@ m = fmap(cu, m)
d(cu(rand(10))) d(cu(rand(10)))
``` ```
However, if you create a customized model, `fmap` may not work out of the box.
```julia
julia> struct ActorCritic{A, C}
actor::A
critic::C
end
julia> m = ActorCritic(ones(2,2), ones(2))
ActorCritic{Array{Float64,2},Array{Float64,1}}([1.0 1.0; 1.0 1.0], [1.0, 1.0])
julia> fmap(cu, m)
ActorCritic{Array{Float64,2},Array{Float64,1}}([1.0 1.0; 1.0 1.0], [1.0, 1.0])
```
As you can see, nothing changed after `fmap(cu, m)`. The reason is that `Flux` doesn't know the structure of your custom model. To make it work as expected, you need the `@functor` macro.
```julia
julia> Flux.@functor ActorCritic
julia> fmap(cu, m)
ActorCritic{CuArray{Float32,2,Nothing},CuArray{Float32,1,Nothing}}(Float32[1.0 1.0; 1.0 1.0], Float32[1.0, 1.0])
```
Now you can see that the `actor` and `critic` fields have been transformed into `CuArray`s. So what does the `@functor` macro do here? Basically, it creates a function like this:
```julia
Flux.functor(m::ActorCritic) = (actor = m.actor, critic=m.critic), fields -> ActorCritic(fields...)
```
`functor` is called recursively by `fmap`. As you can see, its result has two parts: a *destructuring* part and a *reconstruction* part. The first turns the customized model into a data structure that `Flux` knows how to traverse (here a `NamedTuple`); the goal is to turn `m` into `(actor=cu(ones(2,2)), critic=cu(ones(2)))`. The second turns that result back into an `ActorCritic`, so that we end up with `ActorCritic(cu(ones(2,2)), cu(ones(2)))`.
By default, the `@functor` macro marks all the fields of your custom structure. In some cases you may want to transform only some of them. To do so, list those fields explicitly, e.g. `Flux.@functor ActorCritic (actor,)` (note that the fields must be given as a tuple), and make sure a matching `ActorCritic(actor)` constructor is also implemented.
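For instance, a minimal sketch of this restricted form, assuming a one-argument `ActorCritic(actor)` constructor that fills in a default `critic` (that constructor is an illustrative assumption, not something Flux provides):
```julia
# Assumed example constructor: rebuilds an ActorCritic from the marked field only.
ActorCritic(actor) = ActorCritic(actor, ones(2))

# Mark only `actor`; `critic` is no longer traversed by `fmap` (or `gpu`).
Flux.@functor ActorCritic (actor,)

# Roughly what the macro generates in this case:
# Flux.functor(m::ActorCritic) = (actor = m.actor,), fields -> ActorCritic(fields...)
```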
As a convenience, Flux provides the `gpu` function to convert models and data to the GPU if one is available. By default, it'll do nothing, but loading `CuArrays` will cause it to move data to the GPU instead. As a convenience, Flux provides the `gpu` function to convert models and data to the GPU if one is available. By default, it'll do nothing, but loading `CuArrays` will cause it to move data to the GPU instead.
```julia ```julia
@ -73,4 +107,4 @@ julia> x |> cpu
0.235164 0.235164
0.192538 0.192538
``` ```

View File

@ -1,73 +0,0 @@
# Advanced Model Building and Customisation
Here we will try and describe usage of some more advanced features that Flux provides to give more control over model building.
## Customising Parameter Collection for a Model
Taking reference from our example `Affine` layer from the [basics](basics.md#Building-Layers-1).
By default all the fields in the `Affine` type are collected as its parameters, however, in some cases it may be desired to hold other metadata in our "layers" that may not be needed for training, and are hence supposed to be ignored while the parameters are collected. With Flux, it is possible to mark the fields of our layers that are trainable in two ways.
The first way of achieving this is through overloading the `trainable` function.
```julia-repl
julia> @functor Affine
julia> a = Affine(rand(3,3), rand(3))
Affine{Array{Float64,2},Array{Float64,1}}([0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955])
julia> Flux.params(a) # default behavior
Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]])
julia> Flux.trainable(a::Affine) = (a.W,)
julia> Flux.params(a)
Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297]])
```
Only the fields returned by `trainable` will be collected as trainable parameters of the layer when calling `Flux.params`.
Another way of achieving this is through the `@functor` macro directly. Here, we can mark the fields we are interested in by grouping them in the second argument:
```julia
Flux.@functor Affine (W,)
```
However, doing this requires the `struct` to have a corresponding constructor that accepts those parameters.
## Freezing Layer Parameters
When it is desired to not include all the model parameters (for e.g. transfer learning), we can simply not pass in those layers into our call to `params`.
Consider a simple multi-layer perceptron model where we want to avoid optimising the first two `Dense` layers. We can obtain
this using the slicing features `Chain` provides:
```julia
m = Chain(
Dense(784, 64, relu),
Dense(64, 64, relu),
Dense(32, 10)
)
ps = Flux.params(m[3:end])
```
The `Zygote.Params` object `ps` now holds a reference to only the parameters of the layers passed to it.
During training, the gradients will only be computed for (and applied to) the last `Dense` layer, therefore only that would have its parameters changed.
`Flux.params` also takes multiple inputs to make it easy to collect parameters from heterogenous models with a single call. A simple demonstration would be if we wanted to omit optimising the second `Dense` layer in the previous example. It would look something like this:
```julia
Flux.params(m[1], m[3:end])
```
Sometimes, a more fine-tuned control is needed.
We can freeze a specific parameter of a specific layer which already entered a `Params` object `ps`,
by simply deleting it from `ps`:
```julia
ps = params(m)
delete!(ps, m[2].b)
```

View File

@ -32,6 +32,8 @@ julia> gradient(f, [2, 1], [2, 0])
But machine learning models can have *hundreds* of parameters! To handle this, Flux lets you work with collections of parameters, via `params`. You can get the gradient of all parameters used in a program without explicitly passing them in. But machine learning models can have *hundreds* of parameters! To handle this, Flux lets you work with collections of parameters, via `params`. You can get the gradient of all parameters used in a program without explicitly passing them in.
```jldoctest basics ```jldoctest basics
julia> using Flux
julia> x = [2, 1]; julia> x = [2, 1];
julia> y = [2, 0]; julia> y = [2, 0];
@ -67,8 +69,8 @@ b = rand(2)
predict(x) = W*x .+ b predict(x) = W*x .+ b
function loss(x, y) function loss(x, y)
ŷ = predict(x) ŷ = predict(x)
sum((y .- ŷ).^2) sum((y .- ŷ).^2)
end end
x, y = rand(5), rand(2) # Dummy data x, y = rand(5), rand(2) # Dummy data
@ -218,8 +220,6 @@ Flux.@functor Affine
This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md). This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md).
For some more helpful tricks, including parameter freezing, please checkout the [advanced usage guide](advanced.md).
## Utility functions ## Utility functions
Flux provides some utility functions to help you generate models in an automated fashion. Flux provides some utility functions to help you generate models in an automated fashion.
@ -238,5 +238,5 @@ Currently limited to the following layers:
- `MeanPool` - `MeanPool`
```@docs ```@docs
Flux.outdims outdims
``` ```

View File

@ -14,17 +14,10 @@ These layers are used to build convolutional neural networks (CNNs).
```@docs ```@docs
Conv Conv
MaxPool MaxPool
GlobalMaxPool
MeanPool MeanPool
GlobalMeanPool
DepthwiseConv DepthwiseConv
ConvTranspose ConvTranspose
CrossCor CrossCor
SamePad
flatten
Flux.Zeros
Flux.convfilter
Flux.depthwiseconvfilter
``` ```
## Recurrent Layers ## Recurrent Layers
@ -36,7 +29,6 @@ RNN
LSTM LSTM
GRU GRU
Flux.Recur Flux.Recur
Flux.reset!
``` ```
## Other General Purpose Layers ## Other General Purpose Layers
@ -54,31 +46,26 @@ SkipConnection
These layers don't affect the structure of the network but may improve training times or reduce overfitting. These layers don't affect the structure of the network but may improve training times or reduce overfitting.
```@docs ```@docs
Flux.normalise
BatchNorm BatchNorm
Flux.dropout
Dropout Dropout
Flux.dropout
AlphaDropout AlphaDropout
LayerNorm LayerNorm
InstanceNorm
GroupNorm GroupNorm
``` ```
### Testmode ### Testmode
Many normalisation layers behave differently under training and inference (testing). By default, Flux will automatically determine when a layer evaluation is part of training or inference. Still, depending on your use case, it may be helpful to manually specify when these layers should be treated as being trained or not. For this, Flux provides `Flux.testmode!`. When called on a model (e.g. a layer or chain of layers), this function will place the model into the mode specified. Many normalisation layers behave differently under training and inference (testing). By default, Flux will automatically determine when a layer evaluation is part of training or inference. Still, depending on your use case, it may be helpful to manually specify when these layers should be treated as being trained or not. For this, Flux provides `testmode!`. When called on a model (e.g. a layer or chain of layers), this function will place the model into the mode specified.
```@docs ```@docs
Flux.testmode! testmode!
trainmode! trainmode!
``` ```
## Cost Functions ## Cost Functions
```@docs ```@docs
Flux.mae
Flux.mse Flux.mse
Flux.msle
Flux.huber_loss
Flux.crossentropy Flux.crossentropy
Flux.logitcrossentropy Flux.logitcrossentropy
Flux.binarycrossentropy Flux.binarycrossentropy
@ -86,7 +73,4 @@ Flux.logitbinarycrossentropy
Flux.kldivergence Flux.kldivergence
Flux.poisson Flux.poisson
Flux.hinge Flux.hinge
Flux.squared_hinge
Flux.dice_coeff_loss
Flux.tversky_loss
``` ```

View File

@ -7,27 +7,17 @@ Flux re-exports all of the functions exported by the [NNlib](https://github.com/
Non-linearities that go between layers of your model. Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on. Non-linearities that go between layers of your model. Note that, unless otherwise stated, activation functions operate on scalars. To apply them to an array you can call `σ.(xs)`, `relu.(xs)` and so on.
```@docs ```@docs
NNlib.celu
NNlib.elu NNlib.elu
NNlib.gelu NNlib.gelu
NNlib.hardsigmoid
NNlib.hardtanh
NNlib.leakyrelu NNlib.leakyrelu
NNlib.lisht
NNlib.logcosh NNlib.logcosh
NNlib.logsigmoid NNlib.logsigmoid
NNlib.mish
NNlib.relu NNlib.relu
NNlib.relu6
NNlib.rrelu
NNlib.selu NNlib.selu
NNlib.sigmoid NNlib.sigmoid
NNlib.softplus NNlib.softplus
NNlib.softshrink
NNlib.softsign NNlib.softsign
NNlib.swish NNlib.swish
NNlib.tanhshrink
NNlib.trelu
``` ```
## Softmax ## Softmax
@ -58,4 +48,4 @@ NNlib.batched_mul
NNlib.batched_mul! NNlib.batched_mul!
NNlib.batched_adjoint NNlib.batched_adjoint
NNlib.batched_transpose NNlib.batched_transpose
``` ```

View File

@ -64,7 +64,3 @@ julia> activations(c, rand(10))
julia> sum(norm, ans) julia> sum(norm, ans)
2.1166067f0 2.1166067f0
``` ```
```@docs
Flux.activations
```

View File

@ -39,7 +39,7 @@ E.g. the following will have run into the same problem as above:
leaky_tanh(x) = 0.01*x + tanh(x) leaky_tanh(x) = 0.01*x + tanh(x)
``` ```
While one could change the activation function (e.g. to use `0.01f0*x`), the idiomatic (and safe way) to avoid type casts whenever inputs changes is to use `oftype`: While one could change the activation function (e.g. to use `0.01f0x`), the idiomatic (and safe way) to avoid type casts whenever inputs changes is to use `oftype`:
``` ```
leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x) leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x)
``` ```
@ -52,7 +52,7 @@ e.g.
```julia ```julia
function loss_total(xs::AbstractVector{<:Vector}, ys::AbstractVector{<:Vector}) function loss_total(xs::AbstractVector{<:Vector}, ys::AbstractVector{<:Vector})
sum(zip(xs, ys)) do (x, y_target) sum(zip(xs, ys)) do (x, y_target)
y_pred = model(x) # evaluate the model y_pred = model(x) # evaluate the model
return loss(y_pred, y_target) return loss(y_pred, y_target)
end end
end end

View File

@ -52,7 +52,6 @@ Momentum
Nesterov Nesterov
RMSProp RMSProp
ADAM ADAM
RADAM
AdaMax AdaMax
ADAGrad ADAGrad
ADADelta ADADelta
@ -80,7 +79,7 @@ Momentum(eta::Real, rho::Real) = Momentum(eta, rho, IdDict())
The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked. The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked.
```julia ```julia
function Flux.Optimise.apply!(o::Momentum, x, Δ) function apply!(o::Momentum, x, Δ)
η, ρ = o.eta, o.rho η, ρ = o.eta, o.rho
v = get!(o.velocity, x, zero(x))::typeof(x) v = get!(o.velocity, x, zero(x))::typeof(x)
@. v = ρ * v - η * Δ @. v = ρ * v - η * Δ
@ -140,16 +139,3 @@ ExpDecay
InvDecay InvDecay
WeightDecay WeightDecay
``` ```
## Gradient Clipping
Gradient clipping is useful for training recurrent neural networks, which have a tendency to suffer from the exploding gradient problem. An example usage is
```julia
opt = Optimiser(ClipValue(1e-3), ADAM(1e-3))
```
```@docs
ClipValue
ClipNorm
```

View File

@ -32,7 +32,6 @@ Flux.train!(loss, ps, data, opt)
``` ```
The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want. The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want.
For a list of all built-in loss functions, check out the [layer reference](../models/layers.md).
At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimizing the calculations. At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimizing the calculations.
@ -42,8 +41,6 @@ The model to be trained must have a set of tracked parameters that are used to c
Such an object contains a reference to the model's parameters, not a copy, such that after their training, the model behaves according to their updated values. Such an object contains a reference to the model's parameters, not a copy, such that after their training, the model behaves according to their updated values.
Handling all the parameters on a layer by layer basis is explained in the [Layer Helpers](../models/basics.md) section. Also, for freezing model parameters, see the [Advanced Usage Guide](../models/advanced.md).
## Datasets ## Datasets
The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point: The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point:
@ -95,10 +92,6 @@ julia> @epochs 2 Flux.train!(...)
# Train for two epochs # Train for two epochs
``` ```
```@docs
Flux.@epochs
```
## Callbacks ## Callbacks
`train!` takes an additional argument, `cb`, that's used for callbacks so that you can observe the training process. For example: `train!` takes an additional argument, `cb`, that's used for callbacks so that you can observe the training process. For example:
@ -142,7 +135,7 @@ function my_custom_train!(loss, ps, data, opt)
for d in data for d in data
gs = gradient(ps) do gs = gradient(ps) do
training_loss = loss(d...) training_loss = loss(d...)
# Insert whatever code you want here that needs Training loss, e.g. logging # Insert what ever code you want here that needs Training loss, e.g. logging
return training_loss return training_loss
end end
# insert what ever code you want here that needs gradient # insert what ever code you want here that needs gradient

View File

@ -1,49 +0,0 @@
# Utility Functions
Flux contains some utility functions for working with data; these functions
help create inputs for your models or batch your dataset.
Other functions can be used to initialize your layers or to regularly execute
callback functions.
## Working with Data
```@docs
Flux.unsqueeze
Flux.stack
Flux.unstack
Flux.chunk
Flux.frequencies
Flux.batch
Flux.batchseq
Base.rpad(v::AbstractVector, n::Integer, p)
```
## Layer Initialization
These are primarily useful if you are planning to write your own layers.
Flux initializes convolutional layers and recurrent cells with `glorot_uniform`
by default.
To change the default on an applicable layer, pass the desired function with the
`init` keyword. For example:
```jldoctest; setup = :(using Flux)
julia> conv = Conv((3, 3), 1 => 8, relu; init=Flux.glorot_normal)
Conv((3, 3), 1=>8, relu)
```
```@docs
Flux.glorot_uniform
Flux.glorot_normal
```
## Model Abstraction
```@docs
Flux.destructure
```
## Callback Helpers
```@docs
Flux.throttle
Flux.stop
```

View File

@ -3,33 +3,29 @@ module Flux
# Zero Flux Given # Zero Flux Given
using Base: tail using Base: tail
using Statistics, Random, LinearAlgebra using Zygote, MacroTools, Juno, Reexport, Statistics, Random
using Zygote, MacroTools, Juno, Reexport
using MacroTools: @forward using MacroTools: @forward
@reexport using NNlib @reexport using NNlib
using Zygote: Params, @adjoint, gradient, pullback, @nograd using Zygote: Params, @adjoint, gradient, pullback, @nograd
export gradient export gradient
export Chain, Dense, Maxout, RNN, LSTM, GRU, SamePad, Conv, CrossCor, ConvTranspose, export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool,
GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, flatten,
DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode! SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode!
include("optimise/Optimise.jl") include("optimise/Optimise.jl")
using .Optimise using .Optimise
using .Optimise: @epochs using .Optimise: @epochs
export Descent, ADAM, Momentum, Nesterov, RMSProp, export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay, ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay
ClipValue, ClipNorm
using CuArrays using CuArrays
const use_cuda = Ref(false) const use_cuda = Ref(false)
include("utils.jl") include("utils.jl")
include("zeros.jl")
include("onehot.jl") include("onehot.jl")
include("functor.jl") include("functor.jl")
@ -41,15 +37,26 @@ include("layers/normalise.jl")
include("data/Data.jl") include("data/Data.jl")
include("deprecations.jl") include("deprecated.jl")
include("cuda/cuda.jl")
function __init__() function __init__()
use_cuda[] = CuArrays.functional() # Can be overridden after load with `Flux.use_cuda[] = false` precompiling = ccall(:jl_generating_output, Cint, ()) != 0
if CuArrays.functional()
if !CuArrays.has_cudnn() # we don't want to include the CUDA module when precompiling,
@warn "CuArrays.jl found cuda, but did not find libcudnn. Some functionality will not be available." # or we could end up replacing it at run time (triggering a warning)
precompiling && return
if !CuArrays.functional()
# nothing to do here, and either CuArrays or one of its dependencies will have warned
else
use_cuda[] = true
# FIXME: this functionality should be conditional at run time by checking `use_cuda`
# (or even better, get moved to CuArrays.jl as much as possible)
if CuArrays.has_cudnn()
include(joinpath(@__DIR__, "cuda/cuda.jl"))
else
@warn "CuArrays.jl did not find libcudnn. Some functionality will not be available."
end end
end end
end end

View File

@ -51,6 +51,4 @@ export Iris
include("housing.jl") include("housing.jl")
export Housing export Housing
@deprecate DataLoader(x...; kws...) DataLoader(x; kws...)
end end

View File

@ -24,35 +24,18 @@ function load()
end end
end end
"""
phones()
Return a `Vector` containing the phones used in the CMU Pronouncing Dictionary.
"""
function phones() function phones()
load() load()
Symbol.(first.(split.(split(read(deps("cmudict", "cmudict.phones"),String), Symbol.(first.(split.(split(read(deps("cmudict", "cmudict.phones"),String),
"\n", keepempty = false), "\t"))) "\n", keepempty = false), "\t")))
end end
"""
symbols()
Return a `Vector` containing the symbols used in the CMU Pronouncing Dictionary.
A symbol is a phone with optional auxiliary symbols, indicating for example the
amount of stress on the phone.
"""
function symbols() function symbols()
load() load()
Symbol.(split(read(deps("cmudict", "cmudict.symbols"),String), Symbol.(split(read(deps("cmudict", "cmudict.symbols"),String),
"\n", keepempty = false)) "\n", keepempty = false))
end end
"""
rawdict()
Return the unfiltered CMU Pronouncing Dictionary.
"""
function rawdict() function rawdict()
load() load()
Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in
@ -61,14 +44,6 @@ end
validword(s) = isascii(s) && occursin(r"^[\w\-\.]+$", s) validword(s) = isascii(s) && occursin(r"^[\w\-\.]+$", s)
"""
cmudict()
Return a filtered CMU Pronouncing Dictionary.
It is filtered so each word contains only ASCII characters and a combination of
word characters (as determined by the regex engine using `\\w`), '-' and '.'.
"""
cmudict() = filter(p -> validword(p.first), rawdict()) cmudict() = filter(p -> validword(p.first), rawdict())
alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.'] alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.']

View File

@ -1,7 +1,7 @@
# Adapted from Knet's src/data.jl (author: Deniz Yuret) # Adapted from Knet's src/data.jl (author: Deniz Yuret)
struct DataLoader{D} struct DataLoader
data::D data
batchsize::Int batchsize::Int
nobs::Int nobs::Int
partial::Bool partial::Bool
@ -11,43 +11,37 @@ struct DataLoader{D}
end end
""" """
DataLoader(data; batchsize=1, shuffle=false, partial=true) DataLoader(data...; batchsize=1, shuffle=false, partial=true)
An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations
(except possibly the last one). (except possibly the last one).
Takes as input a single data tensor, or a tuple (or a named tuple) of tensors. Takes as input one or more data tensors, e.g. X in unsupervised learning, X and Y in
The last dimension in each tensor is considered to be the observation dimension. supervised learning. The last dimension in each tensor is considered to be the observation
dimension.
If `shuffle=true`, shuffles the observations each time iterations are re-started. If `shuffle=true`, shuffles the observations each time iterations are re-started.
If `partial=false`, drops the last mini-batch if it is smaller than the batchsize. If `partial=false`, drops the last mini-batch if it is smaller than the batchsize.
The original data is preserved in the `data` field of the DataLoader. The original data is preserved as a tuple in the `data` field of the DataLoader.
Usage example: Example usage:
Xtrain = rand(10, 100) Xtrain = rand(10, 100)
train_loader = DataLoader(Xtrain, batchsize=2) train_loader = DataLoader(Xtrain, batchsize=2)
# iterate over 50 mini-batches of size 2 # iterate over 50 mini-batches of size 2
for x in train_loader for x in train_loader:
@assert size(x) == (10, 2) @assert size(x) == (10, 2)
... ...
end end
train_loader.data # original dataset train_loader.data # original dataset
# similar, but yielding tuples
train_loader = DataLoader((Xtrain,), batchsize=2)
for (x,) in train_loader
@assert size(x) == (10, 2)
...
end
Xtrain = rand(10, 100) Xtrain = rand(10, 100)
Ytrain = rand(100) Ytrain = rand(100)
train_loader = DataLoader((Xtrain, Ytrain), batchsize=2, shuffle=true) train_loader = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true)
for epoch in 1:100 for epoch in 1:100
for (x, y) in train_loader for (x, y) in train_loader:
@assert size(x) == (10, 2) @assert size(x) == (10, 2)
@assert size(y) == (2,) @assert size(y) == (2,)
... ...
@ -57,26 +51,26 @@ Usage example:
# train for 10 epochs # train for 10 epochs
using IterTools: ncycle using IterTools: ncycle
Flux.train!(loss, ps, ncycle(train_loader, 10), opt) Flux.train!(loss, ps, ncycle(train_loader, 10), opt)
# can use NamedTuple to name tensors
train_loader = DataLoader((images=Xtrain, labels=Ytrain), batchsize=2, shuffle=true)
for datum in train_loader
@assert size(datum.images) == (10, 2)
@assert size(datum.labels) == (2,)
end
""" """
function DataLoader(data; batchsize=1, shuffle=false, partial=true) function DataLoader(data...; batchsize=1, shuffle=false, partial=true)
length(data) > 0 || throw(ArgumentError("Need at least one data input"))
batchsize > 0 || throw(ArgumentError("Need positive batchsize")) batchsize > 0 || throw(ArgumentError("Need positive batchsize"))
n = _nobs(data) nx = size(data[1])[end]
if n < batchsize for i=2:length(data)
@warn "Number of observations less than batchsize, decreasing the batchsize to $n" nx != size(data[i])[end] && throw(DimensionMismatch("All data should contain same number of observations"))
batchsize = n
end end
imax = partial ? n : n - batchsize + 1 if nx < batchsize
DataLoader(data, batchsize, n, partial, imax, [1:n;], shuffle) @warn "Number of data points less than batchsize, decreasing the batchsize to $nx"
batchsize = nx
end
imax = partial ? nx : nx - batchsize + 1
ids = 1:min(nx, batchsize)
DataLoader(data, batchsize, nx, partial, imax, [1:nx;], shuffle)
end end
getdata(x::AbstractArray, ids) = x[(Base.Colon() for _=1:ndims(x)-1)..., ids]
@propagate_inbounds function Base.iterate(d::DataLoader, i=0) # returns data in d.indices[i+1:i+batchsize] @propagate_inbounds function Base.iterate(d::DataLoader, i=0) # returns data in d.indices[i+1:i+batchsize]
i >= d.imax && return nothing i >= d.imax && return nothing
if d.shuffle && i == 0 if d.shuffle && i == 0
@ -84,27 +78,15 @@ end
end end
nexti = min(i + d.batchsize, d.nobs) nexti = min(i + d.batchsize, d.nobs)
ids = d.indices[i+1:nexti] ids = d.indices[i+1:nexti]
batch = _getobs(d.data, ids) if length(d.data) == 1
batch = getdata(d.data[1], ids)
else
batch = ((getdata(x, ids) for x in d.data)...,)
end
return (batch, nexti) return (batch, nexti)
end end
function Base.length(d::DataLoader) function Base.length(d::DataLoader)
n = d.nobs / d.batchsize n = d.nobs / d.batchsize
d.partial ? ceil(Int,n) : floor(Int,n) d.partial ? ceil(Int,n) : floor(Int,n)
end end
_nobs(data::AbstractArray) = size(data)[end]
function _nobs(data::Union{Tuple, NamedTuple})
length(data) > 0 || throw(ArgumentError("Need at least one data input"))
n = _nobs(data[1])
if !all(x -> _nobs(x) == n, Base.tail(data))
throw(DimensionMismatch("All data should contain same number of observations"))
end
return n
end
_getobs(data::AbstractArray, i) = data[ntuple(i -> Colon(), Val(ndims(data) - 1))..., i]
_getobs(data::Union{Tuple, NamedTuple}, i) = map(Base.Fix2(_getobs, i), data)
Base.eltype(::DataLoader{D}) where D = D

View File

@ -33,10 +33,9 @@ const TESTLABELS = joinpath(dir, "t10k-labels-idx1-ubyte")
Load the Fashion-MNIST images. Load the Fashion-MNIST images.
Each image is a 28×28 array of `Gray` colour values Each image is a 28×28 array of `Gray` colour values (see Colors.jl).
(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)).
Return the 60,000 training images by default; pass `:test` to retrieve the Returns the 60,000 training images by default; pass `:test` to retreive the
10,000 test images. 10,000 test images.
""" """
function images(set = :train) function images(set = :train)
@ -50,10 +49,10 @@ end
labels() labels()
labels(:test) labels(:test)
Load the labels corresponding to each of the images returned from [`images()`](@ref). Load the labels corresponding to each of the images returned from `images()`.
Each label is a number from 0-9. Each label is a number from 0-9.
Return the 60,000 training labels by default; pass `:test` to retrieve the Returns the 60,000 training labels by default; pass `:test` to retreive the
10,000 test labels. 10,000 test labels.
""" """
function labels(set = :train) function labels(set = :train)

View File

@ -50,7 +50,7 @@ function load()
isfile(deps("housing.data")) && return isfile(deps("housing.data")) && return
@info "Downloading the Boston housing Dataset" @info "Downloading the Boston housing Dataset"
download_and_verify("$(cache_prefix)http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data", download_and_verify("$(cache_prefix)https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
deps("housing.data"), deps("housing.data"),
"baadf72995725d76efe787b664e1f083388c79ba21ef9a7990d87f774184735a") "baadf72995725d76efe787b664e1f083388c79ba21ef9a7990d87f774184735a")

View File

@ -2,12 +2,13 @@
Fisher's classic iris dataset. Fisher's classic iris dataset.
Measurements from 3 different species of iris: setosa, versicolor and Measurements from 3 different species of iris: setosa, versicolor and
virginica. There are 50 examples of each species. virginica. There are 50 examples of each species.
There are 4 measurements for each example: sepal length, sepal width, There are 4 measurements for each example: sepal length, sepal width, petal
petal length and petal width. The measurements are in centimeters. length and petal width. The measurements are in centimeters.
The module retrieves the data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/iris). The module retrieves the data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/iris).
""" """
module Iris module Iris
@ -32,7 +33,9 @@ end
Get the labels of the iris dataset, a 150 element array of strings listing the Get the labels of the iris dataset, a 150 element array of strings listing the
species of each example. species of each example.
```jldoctest; setup = :(Flux.Data.Iris.load()) ```jldoctest
julia> using Flux
julia> labels = Flux.Data.Iris.labels(); julia> labels = Flux.Data.Iris.labels();
julia> summary(labels) julia> summary(labels)
@ -51,11 +54,13 @@ end
""" """
features() features()
Get the features of the iris dataset. This is a 4x150 matrix of Float64 Get the features of the iris dataset. This is a 4x150 matrix of Float64
elements. It has a row for each feature (sepal length, sepal width, elements. It has a row for each feature (sepal length, sepal width,
petal length, petal width) and a column for each example. petal length, petal width) and a column for each example.
```jldoctest; setup = :(Flux.Data.Iris.load()) ```jldoctest
julia> using Flux
julia> features = Flux.Data.Iris.features(); julia> features = Flux.Data.Iris.features();
julia> summary(features) julia> summary(features)

View File

@ -83,10 +83,9 @@ getfeatures(io::IO, index::Integer) = vec(getimage(io, index))
Load the MNIST images. Load the MNIST images.
Each image is a 28×28 array of `Gray` colour values Each image is a 28×28 array of `Gray` colour values (see Colors.jl).
(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)).
Return the 60,000 training images by default; pass `:test` to retrieve the Returns the 60,000 training images by default; pass `:test` to retreive the
10,000 test images. 10,000 test images.
""" """
function images(set = :train) function images(set = :train)
@ -100,10 +99,10 @@ end
labels() labels()
labels(:test) labels(:test)
Load the labels corresponding to each of the images returned from [`images()`](@ref). Load the labels corresponding to each of the images returned from `images()`.
Each label is a number from 0-9. Each label is a number from 0-9.
Return the 60,000 training labels by default; pass `:test` to retrieve the Returns the 60,000 training labels by default; pass `:test` to retreive the
10,000 test labels. 10,000 test labels.
""" """
function labels(set = :train) function labels(set = :train)

View File

@ -1,4 +1,3 @@
"Stanford Sentiment Treebank dataset."
module Sentiment module Sentiment
using ZipFile using ZipFile
@ -40,28 +39,8 @@ function gettrees(name)
return parsetree.(ss) return parsetree.(ss)
end end
"""
train()
Return the train split of the Stanford Sentiment Treebank.
The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
"""
train() = gettrees("train") train() = gettrees("train")
"""
test()
Return the test split of the Stanford Sentiment Treebank.
The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
"""
test() = gettrees("test") test() = gettrees("test")
"""
dev()
Return the dev split of the Stanford Sentiment Treebank.
The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
"""
dev() = gettrees("dev") dev() = gettrees("dev")
end end

src/deprecated.jl Normal file
View File

@ -0,0 +1,14 @@
import Base: @deprecate
#### remove in v 0.11 #####
@deprecate param(x) x
@deprecate data(x) x
@deprecate mapleaves(f, x) fmap(f, x)
macro treelike(args...)
functorm(args...)
end
#############################

View File

@ -1,2 +0,0 @@
@deprecate param(x) x
@deprecate data(x) x

View File

@ -1,6 +1,86 @@
import Adapt: adapt, adapt_storage import Adapt: adapt, adapt_storage
using Zygote: IdSet using Zygote: IdSet
import Functors: @functor, functor, fmap
"""
functor(x) -> func, re
We have `x == re(func)`.
Return `func = ()` and `re = _ -> x` for leaf objects.
"""
function functor end
# by default, every object is a leaf
functor(x) = (), _ -> x
functor(x::Tuple) = x, y -> y
functor(x::NamedTuple) = x, y -> y
functor(x::AbstractArray) = x, y -> y
functor(x::AbstractArray{<:Number}) = (), _ -> x
function makefunctor(m::Module, T, fs = fieldnames(T))
@eval m begin
Flux.functor(x::$T) = ($([:($f=x.$f) for f in fs]...),), y -> $T(y...)
end
end
function functorm(T, fs = nothing)
fs == nothing || isexpr(fs, :tuple) || error("@functor T (a, b)")
fs = fs == nothing ? [] : [:($(map(QuoteNode, fs.args)...),)]
:(makefunctor(@__MODULE__, $(esc(T)), $(fs...)))
end
"""
@functor T fields...
Given a type `T` and a subset of its fieldnames `fields`,
create a [`functor`](@ref) function :
functor(x::T) -> func, re
where
func: (field1 = x.field1, field2 = x.field2, ....)
re: y -> T(y...)
If no `fields` argument is given, all internal fields will be considered.
"""
macro functor(args...)
functorm(args...)
end
"""
isleaf(x)
Check if variable `x` is a *leaf* according to the definition:
isleaf(x) = functor(x)[1] === ()
See [`functor`](@ref).
"""
isleaf(x) = functor(x)[1] === ()
function fmap1(f, x)
func, re = functor(x)
re(map(f, func))
end
"""
fmap(f, m)
Applies function `f` to each leaf (see [`isleaf`](@ref)) in `m` and reconstructs
`m` from the transformed leaves.
Example:
gpu(m) = fmap(CuArrays.cu, m)
"""
function fmap(f, x; cache = IdDict())
haskey(cache, x) && return cache[x]
cache[x] = isleaf(x) ? f(x) : fmap1(x -> fmap(f, x, cache = cache), x)
end
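To make the `@functor`/`fmap` docs above concrete, here is a small sketch. The `Affine` type is invented for illustration; only `Flux.@functor`, `Flux.fmap` and `Flux.params` are assumed, as documented above.

```julia
using Flux

# A custom layer with two trainable arrays.
struct Affine
    W
    b
end
(a::Affine)(x) = a.W * x .+ a.b

# Register the fields as functor children, so fmap/params can reach them.
Flux.@functor Affine

a = Affine(rand(3, 2), rand(3))

# fmap visits the leaves (the two arrays) and rebuilds the struct around
# the transformed values.
a32 = Flux.fmap(x -> Float32.(x), a)

Flux.params(a)  # collects W and b
```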
trainable(m) = functor(m)[1] trainable(m) = functor(m)[1]
@ -24,7 +104,7 @@ testmode!(m, mode = true) = m
trainmode!(m, mode = true) trainmode!(m, mode = true)
Set a layer of model's train mode (see below). Set a layer of model's train mode (see below).
Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`). Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)).
_Note_: if you manually set a model into train mode, you need to manually place _Note_: if you manually set a model into train mode, you need to manually place
it into test mode during testing phase. it into test mode during testing phase.
@ -46,18 +126,43 @@ function params!(p::Params, x, seen = IdSet())
end end
end end
function params(m...) """
params(x...)
Recursively scans the inputs for trainable params
and collects them into a `Zygote.Params` object `ps`.
***Usage***
W = rand(5, 3)
b = zeros(5)
m = Dense(W, b)
ps = params(W, b)
ps = params([W, b]) # equivalent form
ps = params(m) # equivalent form
x = rand(3)
y = rand(5)
loss(W, b) = sum(((W*x + b) - y).^2)
loss(m) = sum((m(x) - y).^2)
# Gradient computation.
# Returns a tuple of 2 of arrays containing the gradients.
gs = gradient((W, b) -> loss(W, b), W, b)
# Gradient behaves differently with Params.
# ps is not fed as an argument to the loss.
# Returns a Zygote.Grads object.
gs = gradient(() -> loss(m), ps)
"""
function params(x...)
ps = Params() ps = Params()
params!(ps, m) params!(ps, x)
return ps return ps
end end
# Deprecated stuff
macro treelike(args...)
functorm(args...)
end
mapleaves(f, x) = fmap(f, x)
function loadparams!(m, xs) function loadparams!(m, xs)
for (p, x) in zip(params(m), xs) for (p, x) in zip(params(m), xs)
size(p) == size(x) || size(p) == size(x) ||
@ -67,10 +172,21 @@ function loadparams!(m, xs)
end end
# CPU/GPU movement conveniences # CPU/GPU movement conveniences
"""
cpu(m)
Move model or data `m` to the cpu. Makes
copies only if needed.
"""
cpu(m) = fmap(x -> adapt(Array, x), m) cpu(m) = fmap(x -> adapt(Array, x), m)
gpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) : x """
gpu(m)
Move model or data `m` to the gpu device if available,
otherwise do nothing. Makes copies only if needed.
"""
gpu(m) = use_cuda[] ? fmap(CuArrays.cu, m) : m
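A minimal sketch of the `cpu`/`gpu` helpers documented above; with no functional GPU (`use_cuda[] == false`) the `gpu` call is simply a no-op.

```julia
using Flux

m = Chain(Dense(10, 5, relu), Dense(5, 2))
x = rand(Float32, 10)

m_gpu = gpu(m)     # copies parameters to the GPU if CuArrays is usable
x_gpu = gpu(x)
y = m_gpu(x_gpu)

cpu(y)             # bring the result back to an ordinary Array
```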
# Precision # Precision

View File

@ -4,23 +4,17 @@
Chain multiple layers / functions together, so that they are called in sequence Chain multiple layers / functions together, so that they are called in sequence
on a given input. on a given input.
```julia
m = Chain(x -> x^2, x -> x+1)
m(5) == 26
m = Chain(Dense(10, 5), Dense(5, 2))
x = rand(10)
m(x) == m[2](m[1](x))
```
`Chain` also supports indexing and slicing, e.g. `m[2]` or `m[1:end-1]`. `Chain` also supports indexing and slicing, e.g. `m[2]` or `m[1:end-1]`.
`m[1:3](x)` will calculate the output of the first three layers. `m[1:3](x)` will calculate the output of the first three layers.
# Examples
```jldoctest
julia> m = Chain(x -> x^2, x -> x+1);
julia> m(5) == 26
true
julia> m = Chain(Dense(10, 5), Dense(5, 2));
julia> x = rand(10);
julia> m(x) == m[2](m[1](x))
true
```
""" """
struct Chain{T<:Tuple} struct Chain{T<:Tuple}
layers::T layers::T
@ -30,7 +24,7 @@ end
@forward Chain.layers Base.getindex, Base.length, Base.first, Base.last, @forward Chain.layers Base.getindex, Base.length, Base.first, Base.last,
Base.iterate, Base.lastindex Base.iterate, Base.lastindex
functor(::Type{<:Chain}, c) = c.layers, ls -> Chain(ls...) functor(c::Chain) = c.layers, ls -> Chain(ls...)
applychain(::Tuple{}, x) = x applychain(::Tuple{}, x) = x
applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x))
@ -66,7 +60,6 @@ outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers))(
# only slightly changed to better handle interaction with Zygote @dsweber2 # only slightly changed to better handle interaction with Zygote @dsweber2
""" """
activations(c::Chain, input) activations(c::Chain, input)
Calculate the forward results of each layer in Chain `c` with `input` as model input. Calculate the forward results of each layer in Chain `c` with `input` as model input.
""" """
function activations(c::Chain, input) function activations(c::Chain, input)
@ -85,24 +78,24 @@ extraChain(::Tuple{}, x) = ()
""" """
Dense(in::Integer, out::Integer, σ = identity) Dense(in::Integer, out::Integer, σ = identity)
Create a traditional `Dense` layer with parameters `W` and `b`. Creates a traditional `Dense` layer with parameters `W` and `b`.
y = σ.(W * x .+ b) y = σ.(W * x .+ b)
The input `x` must be a vector of length `in`, or a batch of vectors represented The input `x` must be a vector of length `in`, or a batch of vectors represented
as an `in × N` matrix. The out `y` will be a vector or batch of length `out`. as an `in × N` matrix. The out `y` will be a vector or batch of length `out`.
# Examples ```julia
```jldoctest; setup = :(using Random; Random.seed!(0))
julia> d = Dense(5, 2) julia> d = Dense(5, 2)
Dense(5, 2) Dense(5, 2)
julia> d(rand(5)) julia> d(rand(5))
2-element Array{Float32,1}: Tracked 2-element Array{Float64,1}:
-0.16210233 0.00257447
0.12311903``` -0.00449443
```
""" """
struct Dense{F,S<:AbstractArray,T<:AbstractArray} struct Dense{F,S,T}
W::S W::S
b::T b::T
σ::F σ::F
@ -152,7 +145,7 @@ outdims(l::Dense, isize) = (size(l.W)[1],)
""" """
Diagonal(in::Integer) Diagonal(in::Integer)
Create an element-wise linear transformation layer with learnable Creates an element-wise linear transformation layer with learnable
vectors `α` and `β`: vectors `α` and `β`:
y = α .* x .+ β y = α .* x .+ β
@ -183,11 +176,18 @@ outdims(l::Diagonal, isize) = (length(l.α),)
""" """
Maxout(over) Maxout(over)
The [Maxout](https://arxiv.org/pdf/1302.4389.pdf) layer has a number of `Maxout` is a neural network layer, which has a number of internal layers,
internal layers which all receive the same input. It returns the elementwise which all have the same input, and the maxout returns the elementwise maximium
maximum of the internal layers' outputs. of the internal layers' outputs.
Maxout over linear dense layers satisfies the universal approximation theorem. Maxout over linear dense layers satisfies the universal approximation theorem.
Reference:
Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville, and Yoshua Bengio.
2013. Maxout networks.
In Proceedings of the 30th International Conference on International Conference on Machine Learning - Volume 28 (ICML'13),
Sanjoy Dasgupta and David McAllester (Eds.), Vol. 28. JMLR.org III-1319-III-1327.
https://arxiv.org/pdf/1302.4389.pdf
""" """
struct Maxout{FS<:Tuple} struct Maxout{FS<:Tuple}
over::FS over::FS
@ -196,18 +196,17 @@ end
""" """
Maxout(f, n_alts) Maxout(f, n_alts)
Construct a Maxout layer over `n_alts` instances of the layer given by `f`. Constructs a Maxout layer over `n_alts` instances of the layer given by `f`.
The function takes no arguments and should return some callable layer. The function takes no arguement and should return some callable layer.
Conventionally, this is a linear dense layer. Conventionally this is a linear dense layer.
# Examples For example the following example which
will construct a `Maxout` layer over 4 internal dense linear layers,
This constructs a `Maxout` layer over 4 internal dense linear layers, each each identical in structure (784 inputs, 128 outputs).
identical in structure (784 inputs, 128 outputs):
```julia ```julia
insize = 784 insize = 784
outsize = 128 outsize = 128
Maxout(()->Dense(insize, outsize), 4) Maxout(()->Dense(insize, outsize), 4)
``` ```
""" """
function Maxout(f, n_alts) function Maxout(f, n_alts)
@ -224,18 +223,16 @@ end
outdims(l::Maxout, isize) = outdims(first(l.over), isize) outdims(l::Maxout, isize) = outdims(first(l.over), isize)
""" """
SkipConnection(layer, connection) SkipConnection(layers, connection)
Create a skip connection which consists of a layer or `Chain` of consecutive Creates a Skip Connection, of a layer or `Chain` of consecutive layers
layers and a shortcut connection linking the block's input to the output plus a shortcut connection. The connection function will combine the result of the layers
through a user-supplied 2-argument callable. The first argument to the callable with the original input, to give the final output.
will be propagated through the given `layer` while the second is the unchanged,
"skipped" input.
The simplest "ResNet"-type connection is just `SkipConnection(layer, +)`, The simplest 'ResNet'-type connection is just `SkipConnection(layer, +)`,
and requires the output of the layers to be the same shape as the input. and requires the output of the layers to be the same shape as the input.
Here is a more complicated example: Here is a more complicated example:
```julia ```
m = Conv((3,3), 4=>7, pad=(1,1)) m = Conv((3,3), 4=>7, pad=(1,1))
x = ones(5,5,4,10); x = ones(5,5,4,10);
size(m(x)) == (5, 5, 7, 10) size(m(x)) == (5, 5, 7, 10)
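Beyond the `Conv` snippet above, a short sketch of `SkipConnection`; the shapes and the anonymous concatenating connection are chosen only for illustration.

```julia
using Flux

# Residual-style block: the output is layer(x) + x, so the shapes must match.
block = SkipConnection(Dense(4, 4, relu), +)
x = rand(Float32, 4)
block(x)                 # == relu.(W * x .+ b) + x

# The connection is any 2-argument callable; here the two paths are concatenated.
cat_block = SkipConnection(Dense(4, 4), (mx, x) -> vcat(mx, x))
size(cat_block(x))       # (8,)
```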

View File

@ -7,60 +7,26 @@ _convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+
expand(N, i::Tuple) = i expand(N, i::Tuple) = i
expand(N, i::Integer) = ntuple(_ -> i, N) expand(N, i::Integer) = ntuple(_ -> i, N)
""" """
SamePad Conv(size, in=>out)
Conv(size, in=>out, relu)
Padding for convolutional layers will be calculated so that outputshape == inputshape when stride = 1. Standard convolutional layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively.
For stride > 1 the output shape depends on the type of convolution layer. Example: Applying Conv layer to a 1-channel input using a 2x2 window size,
""" giving us a 16-channel output. Output is activated with ReLU.
struct SamePad end
calc_padding(pad, k::NTuple{N,T}, dilation, stride) where {T,N}= expand(Val(2*N), pad) size = (2,2)
function calc_padding(::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T}
#Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/pdf/1603.07285
# Effective kernel size, including dilation
k_eff = @. k + (k - 1) * (dilation - 1)
# How much total padding needs to be applied?
pad_amt = @. k_eff - 1
# In case amount of padding is odd we need to apply different amounts to each side.
return Tuple(mapfoldl(i -> [ceil(Int, i/2), floor(Int, i/2)], vcat, pad_amt))
end
"""
Conv(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1)
filter = (2,2)
in = 1 in = 1
out = 16 out = 16
Conv((2, 2), 1=>16, relu) Conv((2, 2), 1=>16, relu)
Standard convolutional layer. `filter` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively.
Data should be stored in WHCN order (width, height, # channels, batch size). Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array, In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array. and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`. Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
# Examples
Apply a `Conv` layer to a 1-channel input using a 2×2 window filter size, giving us a
16-channel output. Output is activated with ReLU.
```julia
filter = (2,2)
in = 1
out = 16
Conv(filter, in => out, relu)
```
""" """
struct Conv{N,M,F,A,V} struct Conv{N,M,F,A,V}
σ::F σ::F
@ -71,68 +37,25 @@ struct Conv{N,M,F,A,V}
dilation::NTuple{N,Int} dilation::NTuple{N,Int}
end end
""" function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
Conv(weight::AbstractArray, bias::AbstractArray)
Conv(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the convolutional layer with user defined weight and bias arrays.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
There is also a keyword-only constructor available for all convolutional
layers.
```julia
weight = rand(Float32, 3, 3, 5)
bias = zeros(Float32, 5)
Conv(weight = weight,
bias = bias,
σ = sigmoid)
```
"""
function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N} stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride) stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation) dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return Conv(σ, w, b, stride, pad, dilation) return Conv(σ, w, b, stride, pad, dilation)
end end
function Conv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) Conv(init(k..., ch...), zeros(ch[2]), σ,
end stride = stride, pad = pad, dilation = dilation)
"""
convfilter(filter::Tuple, in=>out)
Constructs a standard convolutional weight matrix with given `filter` and
channels from `in` to `out`.
Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
distribution.
See also: [`depthwiseconvfilter`](@ref)
"""
convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
init = glorot_uniform) where N = init(filter..., ch...)
function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
Conv(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation)
end
@functor Conv @functor Conv
function (c::Conv)(x::AbstractArray) function (c::Conv)(x::AbstractArray)
# TODO: breaks gpu broadcast :( # TODO: breaks gpu broadcast :(
# ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
σ, b = c.σ, reshape(c.bias, ntuple(_->1, length(c.stride))..., :, 1) σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation) cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
σ.(conv(x, c.weight, cdims) .+ b) σ.(conv(x, c.weight, cdims) .+ b)
end end
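As a quick check of the `Conv` docstrings above, a sketch applying a small layer to a dummy WHCN batch; the sizes are arbitrary.

```julia
using Flux

layer = Conv((2, 2), 3 => 16, relu; pad = 1, stride = 2)

x = rand(Float32, 28, 28, 3, 5)   # 5 images, 28×28 pixels, 3 channels (WHCN)
y = layer(x)
size(y)                            # (15, 15, 16, 5)
```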
@ -153,8 +76,8 @@ end
""" """
outdims(l::Conv, isize::Tuple) outdims(l::Conv, isize::Tuple)
Calculate the output dimensions given the input dimensions `isize`. Calculate the output dimensions given the input dimensions, `isize`.
Batch size and channel size are ignored as per [NNlib.jl](https://github.com/FluxML/NNlib.jl). Batch size and channel size are ignored as per `NNlib.jl`.
```julia ```julia
m = Conv((3, 3), 3 => 16) m = Conv((3, 3), 3 => 16)
@ -166,23 +89,16 @@ outdims(l::Conv, isize) =
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
""" """
ConvTranspose(filter, in=>out) ConvTranspose(size, in=>out)
ConvTranspose(filter, in=>out, activation) ConvTranspose(size, in=>out, relu)
ConvTranspose(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1)
Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`. Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
Data should be stored in WHCN order (width, height, # channels, batch size). Data should be stored in WHCN order. In other words, a 100×100 RGB image would
In other words, a 100×100 RGB image would be a `100×100×3×1` array, be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array.
and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`. Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == stride * inputsize - stride + 1.
""" """
struct ConvTranspose{N,M,F,A,V} struct ConvTranspose{N,M,F,A,V}
σ::F σ::F
@ -193,39 +109,18 @@ struct ConvTranspose{N,M,F,A,V}
dilation::NTuple{N,Int} dilation::NTuple{N,Int}
end end
""" function ConvTranspose(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
ConvTranspose(weight::AbstractArray, bias::AbstractArray) stride = 1, pad = 0, dilation = 1) where {T,N}
ConvTranspose(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the convolutional transpose layer with user-defined weight and bias arrays.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
For the keyword-only constructor, see also [`Conv`](@ref)
"""
function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride) stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation) dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return ConvTranspose(σ, w, b, stride, pad, dilation) return ConvTranspose(σ, w, b, stride, pad, dilation)
end end
function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
ConvTranspose(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) ConvTranspose(init(k..., reverse(ch)...), zeros(ch[2]), σ,
end
function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N
ConvTranspose(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation) stride = stride, pad = pad, dilation = dilation)
end
@functor ConvTranspose @functor ConvTranspose
@ -237,9 +132,9 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
batch_size = size(x)[end] batch_size = size(x)[end]
# Create DenseConvDims() that looks like the corresponding conv() # Create DenseConvDims() that looks like the corresponding conv()
return DenseConvDims((I..., C_in, batch_size), size(c.weight); return DenseConvDims((I..., C_in, batch_size), size(c.weight);
stride=c.stride, stride=c.stride,
padding=c.pad, padding=c.pad,
dilation=c.dilation, dilation=c.dilation,
) )
end end
@ -250,7 +145,7 @@ function (c::ConvTranspose)(x::AbstractArray)
# ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
cdims = conv_transpose_dims(c, x) cdims = conv_transpose_dims(c, x)
σ.(∇conv_data(x, c.weight, cdims) .+ b) return σ.(∇conv_data(x, c.weight, cdims) .+ b)
end end
function Base.show(io::IO, l::ConvTranspose) function Base.show(io::IO, l::ConvTranspose)
@ -269,24 +164,17 @@ end
outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad) outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad)
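Analogously for `ConvTranspose`, a small upsampling sketch; the numbers are illustrative only.

```julia
using Flux

up = ConvTranspose((3, 3), 16 => 8, relu; stride = 2)

x = rand(Float32, 7, 7, 16, 1)
size(up(x))   # (15, 15, 8, 1): roughly stride*(input - 1) + filter
```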
""" """
DepthwiseConv(filter::Tuple, in=>out) DepthwiseConv(size, in=>out)
DepthwiseConv(filter::Tuple, in=>out, activation) DepthwiseConv(size, in=>out, relu)
DepthwiseConv(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1)
Depthwise convolutional layer. `filter` should be a tuple like `(2, 2)`. Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
Note that `out` must be an integer multiple of `in`. Note that `out` must be an integer multiple of `in`.
Data should be stored in WHCN order (width, height, # channels, batch size). Data should be stored in WHCN order. In other words, a 100×100 RGB image would
In other words, a 100×100 RGB image would be a `100×100×3×1` array, be a `100×100×3` array, and a batch of 50 would be a `100×100×3×50` array.
and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`. Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
""" """
struct DepthwiseConv{N,M,F,A,V} struct DepthwiseConv{N,M,F,A,V}
σ::F σ::F
@ -297,54 +185,20 @@ struct DepthwiseConv{N,M,F,A,V}
dilation::NTuple{N,Int} dilation::NTuple{N,Int}
end end
""" function DepthwiseConv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
DepthwiseConv(weight::AbstractArray, bias::AbstractArray) stride = 1, pad = 0, dilation = 1) where {T,N}
DepthwiseConv(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the `DepthwiseConv` layer with user-defined weight and bias arrays.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
For the keyword-only constructor, see also [`Conv`](@ref)
"""
function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride) stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation) dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return DepthwiseConv(σ, w, b, stride, pad, dilation) return DepthwiseConv(σ, w, b, stride, pad, dilation)
end end
function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
"""
depthwiseconvfilter(filter::Tuple, in=>out)
Constructs a depthwise convolutional weight array defined by `filter` and channels
from `in` to `out`.
Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
distribution.
See also: [`convfilter`](@ref)
"""
depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1])
function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1, init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N
weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N
@assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels" @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels"
return DepthwiseConv( return DepthwiseConv(
weight, init(k..., div(ch[2], ch[1]), ch[1]),
bias, zeros(ch[2]),
σ; σ;
stride = stride, stride = stride,
pad = pad, pad = pad,
@ -377,34 +231,25 @@ outdims(l::DepthwiseConv, isize) =
output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
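A sketch of `DepthwiseConv` usage; note that the output channel count must be an integer multiple of the input channel count, as the docstring above states.

```julia
using Flux

# 3 input channels, 3 filters per input channel -> 9 output channels.
dw = DepthwiseConv((3, 3), 3 => 9, relu; pad = 1)

x = rand(Float32, 32, 32, 3, 1)
size(dw(x))   # (32, 32, 9, 1)
```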
""" """
CrossCor(filter, in=>out) CrossCor(size, in=>out)
CrossCor(filter, in=>out, activation) CrossCor(size, in=>out, relu)
CrossCor(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1)
Standard cross convolutional layer. `filter` should be a tuple like `(2, 2)`. Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
Data should be stored in WHCN order (width, height, # channels, batch size). Example: Applying CrossCor layer to a 1-channel input using a 2x2 window size,
giving us a 16-channel output. Output is activated with ReLU.
size = (2,2)
in = 1
out = 16
CrossCor((2, 2), 1=>16, relu)
Data should be stored in WHCN order (width, height, # channels, # batches).
In other words, a 100×100 RGB image would be a `100×100×3×1` array, In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array. and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`. Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
# Examples
Apply a `CrossCor` layer to a 1-channel input using a 2×2 window filter size, giving us a
16-channel output. Output is activated with ReLU.
```julia
filter = (2,2)
in = 1
out = 16
CrossCor((2, 2), 1=>16, relu)
```
""" """
struct CrossCor{N,M,F,A,V} struct CrossCor{N,M,F,A,V}
σ::F σ::F
@ -415,39 +260,18 @@ struct CrossCor{N,M,F,A,V}
dilation::NTuple{N,Int} dilation::NTuple{N,Int}
end end
""" function CrossCor(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
CrossCor(weight::AbstractArray, bias::AbstractArray) stride = 1, pad = 0, dilation = 1) where {T,N}
CrossCor(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the standard cross convolutional layer with user defined weight and bias
arrays.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
For the keyword-only constructor, see also [`Conv`](@ref)
"""
function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride) stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation) dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return CrossCor(σ, w, b, stride, pad, dilation) return CrossCor(σ, w, b, stride, pad, dilation)
end end
function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) CrossCor(init(k..., ch...), zeros(ch[2]), σ,
end
function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
CrossCor(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation) stride = stride, pad = pad, dilation = dilation)
end
@functor CrossCor @functor CrossCor
@ -481,62 +305,11 @@ outdims(l::CrossCor, isize) =
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
""" """
GlobalMaxPool() MaxPool(k)
Global max pooling layer. Max pooling layer. `k` stands for the size of the window for each dimension of the input.
Transforms (w,h,c,b)-shaped input into (1,1,c,b)-shaped output, Takes the keyword arguments `pad` and `stride`.
by performing max pooling on the complete (w,h)-shaped feature maps.
"""
struct GlobalMaxPool end
function (g::GlobalMaxPool)(x)
# Input size
x_size = size(x)
# Kernel size
k = x_size[1:end-2]
# Pooling dimensions
pdims = PoolDims(x, k)
return maxpool(x, pdims)
end
function Base.show(io::IO, g::GlobalMaxPool)
print(io, "GlobalMaxPool()")
end
"""
GlobalMeanPool()
Global mean pooling layer.
Transforms (w,h,c,b)-shaped input into (1,1,c,b)-shaped output,
by performing mean pooling on the complete (w,h)-shaped feature maps.
"""
struct GlobalMeanPool end
function (g::GlobalMeanPool)(x)
# Input size
x_size = size(x)
# Kernel size
k = x_size[1:end-2]
# Pooling dimensions
pdims = PoolDims(x, k)
return meanpool(x, pdims)
end
function Base.show(io::IO, g::GlobalMeanPool)
print(io, "GlobalMeanPool()")
end
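A sketch of the global pooling layers described above, assuming a Flux version that provides `GlobalMaxPool`/`GlobalMeanPool` (they appear on one side of this diff).

```julia
using Flux

x = rand(Float32, 6, 6, 8, 2)     # (w, h, channels, batch)

size(GlobalMaxPool()(x))          # (1, 1, 8, 2)

gap = GlobalMeanPool()
reshape(gap(x), 8, 2)             # one mean feature per channel and sample
```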
"""
MaxPool(k; pad = 0, stride = k)
Max pooling layer. `k` is the size of the window for each dimension of the input.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
""" """
struct MaxPool{N,M} struct MaxPool{N,M}
k::NTuple{N,Int} k::NTuple{N,Int}
@ -546,7 +319,8 @@ end
function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
stride = expand(Val(N), stride) stride = expand(Val(N), stride)
pad = calc_padding(pad, k, 1, stride) pad = expand(Val(2*N), pad)
return MaxPool(k, pad, stride) return MaxPool(k, pad, stride)
end end
@ -562,11 +336,11 @@ end
outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))
""" """
MeanPool(k; pad = 0, stride = k) MeanPool(k)
Mean pooling layer. `k` is the size of the window for each dimension of the input. Mean pooling layer. `k` stands for the size of the window for each dimension of the input.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride. Takes the keyword arguments `pad` and `stride`.
""" """
struct MeanPool{N,M} struct MeanPool{N,M}
k::NTuple{N,Int} k::NTuple{N,Int}
@ -576,7 +350,7 @@ end
function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
stride = expand(Val(N), stride) stride = expand(Val(N), stride)
pad = calc_padding(pad, k, 1, stride) pad = expand(Val(2*N), pad)
return MeanPool(k, pad, stride) return MeanPool(k, pad, stride)
end end
@ -589,4 +363,4 @@ function Base.show(io::IO, m::MeanPool)
print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")")
end end
outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad)) outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))
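And the windowed pooling layers, for comparison; window and input sizes are arbitrary.

```julia
using Flux

x = rand(Float32, 28, 28, 3, 1)

mp = MaxPool((2, 2))               # stride defaults to the window size
size(mp(x))                        # (14, 14, 3, 1)

ap = MeanPool((2, 2); pad = 0, stride = 2)
size(ap(x))                        # (14, 14, 3, 1)
```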

View File

@ -10,14 +10,14 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(s
_dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0) _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0)
""" """
dropout(x, p; dims = :) dropout(p, dims = :)
The dropout function. For each input, either sets that input to `0` (with probability Dropout function. For each input, either sets that input to `0` (with probability
`p`) or scales it by `1 / (1 - p)`. `dims` specifies the unbroadcasted dimensions, `p`) or scales it by `1/(1-p)`. The `dims` argument is to specify the unbroadcasted
e.g. `dims=1` applies dropout along columns and `dims=2` along rows. dimensions, i.e. `dims=1` does dropout along columns and `dims=2` along rows. This is
This is used as a regularisation, i.e. it reduces overfitting during training. used as a regularisation, i.e. it reduces overfitting during training.
See also the [`Dropout`](@ref) layer. See also [`Dropout`](@ref).
""" """
dropout(x, p; dims = :) = x dropout(x, p; dims = :) = x
@ -30,9 +30,9 @@ end
""" """
Dropout(p, dims = :) Dropout(p, dims = :)
Dropout layer. In the forward pass, apply the [`Flux.dropout`](@ref) function on the input. A Dropout layer. In the forward pass, applies the [`dropout`](@ref) function on the input.
Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. Does nothing to the input once [`testmode!`](@ref) is false.
""" """
mutable struct Dropout{F,D} mutable struct Dropout{F,D}
p::F p::F
@ -40,9 +40,6 @@ mutable struct Dropout{F,D}
active::Union{Bool, Nothing} active::Union{Bool, Nothing}
end end
# TODO: deprecate in v0.11
Dropout(p, dims) = Dropout(p, dims, nothing)
function Dropout(p; dims = :) function Dropout(p; dims = :)
@assert 0 p 1 @assert 0 p 1
Dropout{typeof(p),typeof(dims)}(p, dims, nothing) Dropout{typeof(p),typeof(dims)}(p, dims, nothing)
@ -64,13 +61,12 @@ end
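Before moving on to `AlphaDropout`, a sketch of how the `Dropout` layer behaves with the adjoint-based `dropout` shown above: outside of differentiation the fallback `dropout(x, p) = x` is used, so the random mask only appears inside a gradient call; `testmode!` switches the layer off entirely.

```julia
using Flux

d = Dropout(0.5)
x = ones(Float32, 4)

# Plain forward pass: the fallback `dropout(x, p) = x` applies.
d(x) == x                               # true

# Inside a gradient call the adjoint applies the mask: each gradient entry is
# either 0 (dropped) or 1/(1 - p) = 2 (kept and rescaled).
g = gradient(x -> sum(d(x)), x)[1]

Flux.testmode!(d)                        # force inference mode: d is the identity
g0 = gradient(x -> sum(d(x)), x)[1]      # all ones now
```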
""" """
AlphaDropout(p) AlphaDropout(p)
A dropout layer. It is used in Self-Normalizing Neural Networks.
(https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf)
The AlphaDropout layer ensures that mean and variance of activations remains the same as before.
A dropout layer. Used in Does nothing to the input once [`testmode!`](@ref) is false.
[Self-Normalizing Neural Networks](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf).
The AlphaDropout layer ensures that mean and variance of activations
remain the same as before.
Does nothing to the input once [`testmode!`](@ref) is true.
""" """
mutable struct AlphaDropout{F} mutable struct AlphaDropout{F}
p::F p::F
@ -101,8 +97,8 @@ testmode!(m::AlphaDropout, mode = true) =
LayerNorm(h::Integer) LayerNorm(h::Integer)
A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be A [normalisation layer](https://arxiv.org/pdf/1607.06450.pdf) designed to be
used with recurrent hidden states of size `h`. Normalises the mean and standard used with recurrent hidden states of size `h`. Normalises the mean/stddev of
deviation of each input before applying a per-neuron gain/bias. each input before applying a per-neuron gain/bias.
""" """
struct LayerNorm{T} struct LayerNorm{T}
diag::Diagonal{T} diag::Diagonal{T}
@ -124,8 +120,8 @@ end
initβ = zeros, initγ = ones, initβ = zeros, initγ = ones,
ϵ = 1e-8, momentum = .1) ϵ = 1e-8, momentum = .1)
[Batch Normalization](https://arxiv.org/pdf/1502.03167.pdf) layer. Batch Normalization layer. The `channels` input should be the size of the
`channels` should be the size of the channel dimension in your data (see below). channel dimension in your data (see below).
Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For
a batch of feature vectors this is just the data dimension, for `WHCN` images a batch of feature vectors this is just the data dimension, for `WHCN` images
@ -137,7 +133,10 @@ per-channel `bias` and `scale` parameters).
Use [`testmode!`](@ref) during inference. Use [`testmode!`](@ref) during inference.
# Examples See [Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift](https://arxiv.org/pdf/1502.03167.pdf).
Example:
```julia ```julia
m = Chain( m = Chain(
Dense(28^2, 64), Dense(28^2, 64),
@ -158,9 +157,6 @@ mutable struct BatchNorm{F,V,W,N}
active::Union{Bool, Nothing} active::Union{Bool, Nothing}
end end
# TODO: deprecate in v0.11
BatchNorm(λ, β, γ, μ, σ², ϵ, momentum) = BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing)
BatchNorm(chs::Integer, λ = identity; BatchNorm(chs::Integer, λ = identity;
initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) =
BatchNorm(λ, initβ(chs), initγ(chs), BatchNorm(λ, initβ(chs), initγ(chs),
@ -211,6 +207,37 @@ function Base.show(io::IO, l::BatchNorm)
print(io, ")") print(io, ")")
end end
"""
InstanceNorm(channels::Integer, σ = identity;
initβ = zeros, initγ = ones,
ϵ = 1e-8, momentum = .1)
Instance Normalization layer. The `channels` input should be the size of the
channel dimension in your data (see below).
Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For
a batch of feature vectors this is just the data dimension, for `WHCN` images
it's the usual channel dimension.)
`InstanceNorm` computes the mean and variance for each `W×H×1×1` slice and
shifts them to have a new mean and variance (corresponding to the learnable,
per-channel `bias` and `scale` parameters).
Use [`testmode!`](@ref) during inference.
See [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022).
Example:
```julia
m = Chain(
Dense(28^2, 64),
InstanceNorm(64, relu),
Dense(64, 10),
InstanceNorm(10),
softmax)
```
"""
expand_inst = (x, as) -> reshape(repeat(x, outer=[1, as[length(as)]]), as...) expand_inst = (x, as) -> reshape(repeat(x, outer=[1, as[length(as)]]), as...)
mutable struct InstanceNorm{F,V,W,N} mutable struct InstanceNorm{F,V,W,N}
@ -224,37 +251,6 @@ mutable struct InstanceNorm{F,V,W,N}
active::Union{Bool, Nothing} active::Union{Bool, Nothing}
end end
# TODO: deprecate in v0.11
"""
InstanceNorm(channels::Integer, σ = identity;
initβ = zeros, initγ = ones,
ϵ = 1e-8, momentum = .1)
[Instance Normalization](https://arxiv.org/abs/1607.08022) layer.
`channels` should be the size of the channel dimension in your data (see below).
Given an array with `N` dimensions, call the `N-1`th the channel dimension. (For
a batch of feature vectors this is just the data dimension, for `WHCN` images
it's the usual channel dimension.)
`InstanceNorm` computes the mean and variance for each `W×H×1×1` slice and
shifts them to have a new mean and variance (corresponding to the learnable,
per-channel `bias` and `scale` parameters).
Use [`testmode!`](@ref) during inference.
# Examples
```julia
m = Chain(
Dense(28^2, 64),
InstanceNorm(64, relu),
Dense(64, 10),
InstanceNorm(10),
softmax)
```
"""
InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum) = InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, nothing)
InstanceNorm(chs::Integer, λ = identity; InstanceNorm(chs::Integer, λ = identity;
initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) =
InstanceNorm(λ, initβ(chs), initγ(chs), InstanceNorm(λ, initβ(chs), initγ(chs),
@ -311,27 +307,28 @@ function Base.show(io::IO, l::InstanceNorm)
end end
""" """
GroupNorm(chs::Integer, G::Integer, λ = identity; Group Normalization.
initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), This layer can outperform Batch-Normalization and Instance-Normalization.
ϵ = 1f-5, momentum = 0.1f0)
[Group Normalization](https://arxiv.org/pdf/1803.08494.pdf) layer. GroupNorm(chs::Integer, G::Integer, λ = identity;
This layer can outperform Batch Normalization and Instance Normalization. initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i),
ϵ = 1f-5, momentum = 0.1f0)
`chs` is the number of channels, the channel dimension of your input. ``chs`` is the number of channels, the channel dimension of your input.
For an array of N dimensions, the `N-1`th index is the channel dimension. For an array of N dimensions, the (N-1)th index is the channel dimension.
`G` is the number of groups along which the statistics are computed. ``G`` is the number of groups along which the statistics would be computed.
The number of channels must be an integer multiple of the number of groups. The number of channels must be an integer multiple of the number of groups.
Use [`testmode!`](@ref) during inference. Use [`testmode!`](@ref) during inference.
# Examples Example:
```julia
m = Chain(Conv((3,3), 1=>32, leakyrelu;pad = 1),
GroupNorm(32,16))
# 32 channels, 16 groups (G = 16), thus 2 channels per group used
``` ```
m = Chain(Conv((3,3), 1=>32, leakyrelu;pad = 1),
GroupNorm(32,16)) # 32 channels, 16 groups (G = 16), thus 2 channels per group used
```
Link : https://arxiv.org/pdf/1803.08494.pdf
""" """
mutable struct GroupNorm{F,V,W,N,T} mutable struct GroupNorm{F,V,W,N,T}
G::T # number of groups G::T # number of groups
@ -345,9 +342,6 @@ mutable struct GroupNorm{F,V,W,N,T}
active::Union{Bool, Nothing} active::Union{Bool, Nothing}
end end
# TODO: deprecate in v0.11
GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum) = GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum, nothing)
GroupNorm(chs::Integer, G::Integer, λ = identity; GroupNorm(chs::Integer, G::Integer, λ = identity;
initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) = initβ = (i) -> zeros(Float32, i), initγ = (i) -> ones(Float32, i), ϵ = 1f-5, momentum = 0.1f0) =
GroupNorm(G, λ, initβ(chs), initγ(chs), GroupNorm(G, λ, initβ(chs), initγ(chs),

View File

@ -12,16 +12,16 @@ in the background. `cell` should be a model of the form:
h, y = cell(h, x...) h, y = cell(h, x...)
For example, here's a recurrent network that keeps a running total of its inputs: For example, here's a recurrent network that keeps a running total of its inputs.
```julia ```julia
accum(h, x) = (h + x, x) accum(h, x) = (h+x, x)
rnn = Flux.Recur(accum, 0) rnn = Flux.Recur(accum, 0)
rnn(2) # 2 rnn(2) # 2
rnn(3) # 3 rnn(3) # 3
rnn.state # 5 rnn.state # 5
rnn.(1:10) # apply to a sequence rnn.(1:10) # apply to a sequence
rnn.state # 60 rnn.state # 60
``` ```
""" """
mutable struct Recur{T} mutable struct Recur{T}
@ -47,10 +47,9 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")")
Reset the hidden state of a recurrent layer back to its original value. Reset the hidden state of a recurrent layer back to its original value.
Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to: Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to
```julia
rnn.state = hidden(rnn.cell) rnn.state = hidden(rnn.cell)
```
""" """
reset!(m::Recur) = (m.state = m.init) reset!(m::Recur) = (m.state = m.init)
reset!(m) = foreach(reset!, functor(m)[1]) reset!(m) = foreach(reset!, functor(m)[1])
@ -136,8 +135,8 @@ Base.show(io::IO, l::LSTMCell) =
""" """
LSTM(in::Integer, out::Integer) LSTM(in::Integer, out::Integer)
[Long Short Term Memory](https://www.researchgate.net/publication/13853244_Long_Short-term_Memory) Long Short Term Memory recurrent layer. Behaves like an RNN but generally
recurrent layer. Behaves like an RNN but generally exhibits a longer memory span over sequences. exhibits a longer memory span over sequences.
See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
for a good overview of the internals. for a good overview of the internals.
@ -177,8 +176,8 @@ Base.show(io::IO, l::GRUCell) =
""" """
GRU(in::Integer, out::Integer) GRU(in::Integer, out::Integer)
[Gated Recurrent Unit](https://arxiv.org/abs/1406.1078) layer. Behaves like an Gated Recurrent Unit layer. Behaves like an RNN but generally
RNN but generally exhibits a longer memory span over sequences. exhibits a longer memory span over sequences.
See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
for a good overview of the internals. for a good overview of the internals.
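A minimal recurrent sketch for the `LSTM`/`GRU` layers documented above; the sequence is random and only the shapes matter.

```julia
using Flux

lstm = LSTM(10, 5)                          # 10 input features -> 5 hidden units

seq = [rand(Float32, 10) for _ in 1:7]      # 7 timesteps
hs  = [lstm(x) for x in seq]                # 7 hidden states, each of length 5

Flux.reset!(lstm)                           # clear the state before the next sequence
```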

View File

@ -1,106 +1,41 @@
# Cost functions # Cost functions
"""
mae(, y)
Return the mean of absolute error; calculated as
`sum(abs.(ŷ .- y)) / length(y)`.
"""
mae(, y) = sum(abs.( .- y)) * 1 // length(y)
""" """
mse(, y) mse(, y)
Return the mean squared error between and y; calculated as Return the mean squared error `sum((ŷ .- y).^2) / length(y)`.
`sum((ŷ .- y).^2) / length(y)`.
# Examples
```jldoctest
julia> Flux.mse([0, 2], [1, 1])
1//1
```
""" """
mse(, y) = sum(( .- y).^2) * 1 // length(y) mse(, y) = sum(( .- y).^2) * 1 // length(y)
"""
msle(, y; ϵ=eps(eltype()))
Return the mean of the squared logarithmic errors; calculated as
`sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) / length(y)`.
The `ϵ` term provides numerical stability.
Penalizes an under-predicted estimate greater than an over-predicted estimate.
"""
msle(, y; ϵ=eps(eltype())) = sum((log.( .+ ϵ) .- log.(y .+ ϵ)).^2) * 1 // length(y)
"""
huber_loss(, y; δ=1.0)
Return the mean of the [Huber loss](https://en.wikipedia.org/wiki/Huber_loss)
given the prediction `` and true values `y`.
| 0.5 * |ŷ - y|^2,            for |ŷ - y| <= δ
Huber loss = |
| δ * (|ŷ - y| - 0.5 * δ),  otherwise
"""
#TODO: remove dropgrad when Zygote can handle this function with CuArrays
function huber_loss(, y; δ=eltype()(1))
abs_error = abs.( .- y)
temp = Zygote.dropgrad(abs_error .< δ)
x = eltype()(0.5)
hub_loss = sum(((abs_error.^2) .* temp) .* x .+ δ*(abs_error .- x*δ) .* (1 .- temp)) * 1 // length(y)
end
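A quick numeric sketch of the regression losses above (`mse`, `mae` and `huber_loss` as they appear in this diff); the vectors are arbitrary.

```julia
using Flux

ŷ = [0.9, 2.1, 3.2]
y = [1.0, 2.0, 3.0]

Flux.mse(ŷ, y)         # ≈ 0.02   : mean of squared errors
Flux.mae(ŷ, y)         # ≈ 0.1333 : mean of absolute errors
Flux.huber_loss(ŷ, y)  # quadratic below δ = 1, linear above it
```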
function _crossentropy(::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing) function _crossentropy(::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
return -sum(xlogy.(y, )) * 1 // size(y, 2) return -sum(y .* log.()) * 1 // size(y, 2)
end end
function _crossentropy(::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number) function _crossentropy(::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number)
return -sum(xlogy.(y, )) .* weight * 1 // size(y, 2) return -sum(y .* log.()) .* weight * 1 // size(y, 2)
end end
function _crossentropy(::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector) function _crossentropy(::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector)
return -sum(xlogy.(y, ) .* weight) * 1 // size(y, 2) return -sum(y .* log.() .* weight) * 1 // size(y, 2)
end end
""" """
crossentropy(, y; weight = nothing) crossentropy(, y; weight=1)
Return the cross entropy between the given probability distributions; Return the crossentropy computed as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`.
calculated as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`.
`weight` can be `Nothing`, a `Number` or an `AbstractVector`. See also [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref).
`weight=nothing` acts like `weight=1` but is faster.
See also: [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
# Examples
```jldoctest
julia> Flux.crossentropy(softmax([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
3.085467254747739
```
""" """
crossentropy(::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(, y, weight) crossentropy(::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(, y, weight)
""" """
logitcrossentropy(, y; weight = 1) logitcrossentropy(, y; weight=1)
Return the crossentropy computed after a [`Flux.logsoftmax`](@ref) operation; Return the crossentropy computed after a [softmax](@ref) operation:
calculated as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`.
`logitcrossentropy(ŷ, y)` is mathematically equivalent to -sum(y .* logsoftmax() .* weight) / size(y, 2)
[`Flux.crossentropy(softmax(ŷ), y)`](@ref) but it is more numerically stable.
See also: [`Flux.crossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref) See also [`crossentropy`](@ref), [`binarycrossentropy`](@ref).
# Examples
```jldoctest
julia> Flux.logitcrossentropy([-1.1491, 0.8619, 0.3127], [1, 1, 0])
3.085467254747738
```
""" """
function logitcrossentropy(::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) function logitcrossentropy(::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1)
return -sum(y .* logsoftmax() .* weight) * 1 // size(y, 2) return -sum(y .* logsoftmax() .* weight) * 1 // size(y, 2)
@ -109,22 +44,11 @@ end
""" """
binarycrossentropy(, y; ϵ=eps()) binarycrossentropy(, y; ϵ=eps())
Return ``-y*\\log( + ϵ) - (1-y)*\\log(1- + ϵ)``. The `ϵ` term provides numerical stability. Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerical stability.
Typically, the prediction `` is given by the output of a [`sigmoid`](@ref) activation. Typically, the prediction `` is given by the output of a [`sigmoid`](@ref) activation.
See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
# Examples
```jldoctest
julia> Flux.binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
3-element Array{Float64,1}:
1.424397097347566
0.35231664672364077
0.8616703662235441
```
""" """
binarycrossentropy(, y; ϵ=eps()) = -xlogy(y, + ϵ) - xlogy(1 - y, 1 - + ϵ) binarycrossentropy(, y; ϵ=eps()) = -y*log( + ϵ) - (1 - y)*log(1 - + ϵ)
# Re-definition to fix interaction with CuArrays. # Re-definition to fix interaction with CuArrays.
CuArrays.@cufunc binarycrossentropy(, y; ϵ=eps()) = -y*log( + ϵ) - (1 - y)*log(1 - + ϵ) CuArrays.@cufunc binarycrossentropy(, y; ϵ=eps()) = -y*log( + ϵ) - (1 - y)*log(1 - + ϵ)
@ -132,19 +56,10 @@ CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1
""" """
logitbinarycrossentropy(ŷ, y) logitbinarycrossentropy(ŷ, y)
`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to `logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to `binarycrossentropy(σ(ŷ), y)`
[`Flux.binarycrossentropy(σ(ŷ), y)`](@ref) but it is more numerically stable. but it is more numerically stable.
See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref) See also [`binarycrossentropy`](@ref), [`sigmoid`](@ref), [`logsigmoid`](@ref).
# Examples
```jldoctest
julia> Flux.logitbinarycrossentropy.([-1.1491, 0.8619, 0.3127], [1, 1, 0])
3-element Array{Float64,1}:
1.4243970973475661
0.35231664672364094
0.8616703662235443
```
""" """
logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ() logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ()
@ -154,27 +69,26 @@ CuArrays.@cufunc logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ)
""" """
normalise(x; dims=1) normalise(x; dims=1)
Normalise `x` to mean 0 and standard deviation 1 across the dimensions given by `dims`. Normalises `x` to mean 0 and standard deviation 1, across the dimensions given by `dims`. Defaults to normalising over columns.
Defaults to normalising over columns.
```jldoctest ```julia-repl
julia> a = reshape(collect(1:9), 3, 3) julia> a = reshape(collect(1:9), 3, 3)
3×3 Array{Int64,2}: 3×3 Array{Int64,2}:
1 4 7 1 4 7
2 5 8 2 5 8
3 6 9 3 6 9
julia> Flux.normalise(a) julia> normalise(a)
3×3 Array{Float64,2}: 3×3 Array{Float64,2}:
-1.22474 -1.22474 -1.22474 -1.22474 -1.22474 -1.22474
0.0 0.0 0.0 0.0 0.0 0.0
1.22474 1.22474 1.22474 1.22474 1.22474 1.22474
julia> Flux.normalise(a, dims=2) julia> normalise(a, dims=2)
3×3 Array{Float64,2}: 3×3 Array{Float64,2}:
-1.22474 0.0 1.22474 -1.22474 0.0 1.22474
-1.22474 0.0 1.22474 -1.22474 0.0 1.22474
-1.22474 0.0 1.22474 -1.22474 0.0 1.22474
``` ```
""" """
function normalise(x::AbstractArray; dims=1) function normalise(x::AbstractArray; dims=1)
@ -186,17 +100,12 @@ end
""" """
kldivergence(ŷ, y) kldivergence(ŷ, y)
Return the KLDivergence is a measure of how much one probability distribution is different from the other.
[Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) It is always non-negative and zero only when both the distributions are equal everywhere.
between the given probability distributions. [KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).
KL divergence is a measure of how much one probability distribution is different
from the other.
It is always non-negative and zero only when both the distributions are equal
everywhere.
""" """
function kldivergence(ŷ, y) function kldivergence(ŷ, y)
entropy = sum(xlogx.(y)) * 1 //size(y,2) entropy = sum(y .* log.(y)) *1 //size(y,2)
cross_entropy = crossentropy(ŷ, y) cross_entropy = crossentropy(ŷ, y)
return entropy + cross_entropy return entropy + cross_entropy
end end
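A minimal sketch of `kldivergence` on column-wise probability distributions (an assumption of this example; each column sums to 1):

```julia
using Flux

p = [0.1 0.4; 0.9 0.6]    # two distributions, one per column
q = [0.3 0.5; 0.7 0.5]

Flux.kldivergence(q, p)   # strictly positive when the distributions differ
Flux.kldivergence(p, p)   # ≈ 0 when they are identical
```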
@ -204,93 +113,15 @@ end
""" """
poisson(ŷ, y) poisson(ŷ, y)
Return how much the predicted distribution `ŷ` diverges from the expected Poisson Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution.
distribution `y`; calculated as `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`. [Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
[More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
""" """
poisson(ŷ, y) = sum(ŷ .- xlogy.(y, ŷ)) * 1 // size(y,2) poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2)
""" """
hinge(ŷ, y) hinge(ŷ, y)
Return the [hinge loss](https://en.wikipedia.org/wiki/Hinge_loss) given the Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1).
prediction `ŷ` and true labels `y` (containing 1 or -1); calculated as [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss).
`sum(max.(0, 1 .- ŷ .* y)) / size(y, 2)`.
See also: [`squared_hinge`](@ref)
""" """
hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) * 1 // size(y, 2) hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2)
"""
squared_hinge(ŷ, y)
Return the squared hinge loss given the prediction `ŷ` and true labels `y`
(containing 1 or -1); calculated as `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)`.
See also: [`hinge`](@ref)
"""
squared_hinge(ŷ, y) = sum((max.(0, 1 .- ŷ .* y)).^2) * 1 // size(y, 2)
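A small sketch contrasting `hinge` and `squared_hinge` on ±1 labels (values are illustrative only):

```julia
using Flux

y = [1 -1 1 -1]            # true labels in {-1, +1}
ŷ = [0.8 -0.3 1.2 -2.0]    # raw predictions

Flux.hinge(ŷ, y)           # mean of max(0, 1 - ŷ*y)
Flux.squared_hinge(ŷ, y)   # mean of max(0, 1 - ŷ*y)^2, penalising larger margin violations more
```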
"""
dice_coeff_loss(ŷ, y; smooth=1)
Return a loss based on the dice coefficient.
Used in the [V-Net](https://arxiv.org/pdf/1606.04797v1.pdf) image segmentation
architecture.
Similar to the F1_score. Calculated as:
1 - 2*sum(|ŷ .* y| + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth)
"""
dice_coeff_loss(ŷ, y; smooth=eltype(ŷ)(1.0)) = 1 - (2*sum(y .* ŷ) + smooth) / (sum(y.^2) + sum(ŷ.^2) + smooth)
"""
tversky_loss(ŷ, y; β=0.7)
Return the [Tversky loss](https://arxiv.org/pdf/1706.05721.pdf).
Used with imbalanced data to give more weight to false negatives.
A larger β weighs recall higher than precision (by placing more emphasis on false negatives).
Calculated as:
1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)
"""
tversky_loss(ŷ, y; β=eltype(ŷ)(0.7)) = 1 - (sum(y .* ŷ) + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)
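A sketch of the two segmentation-style losses above on toy masks (the values are illustrative, not from the test suite):

```julia
using Flux

y = Float32[1, 1, 0, 0]           # ground-truth mask
ŷ = Float32[0.9, 0.6, 0.2, 0.1]   # predicted mask

Flux.dice_coeff_loss(ŷ, y)           # 0 only for a perfect overlap
Flux.tversky_loss(ŷ, y)              # default β = 0.7 weighs false negatives more
Flux.tversky_loss(ŷ, y, β = 0.8f0)   # larger β: recall is weighted even higher
```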
"""
flatten(x::AbstractArray)
Transform (w, h, c, b)-shaped input into (w × h × c, b)-shaped output
by linearizing all values for each element in the batch.
"""
function flatten(x::AbstractArray)
return reshape(x, :, size(x)[end])
end
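A sketch of `flatten` in the usual WHCB layout, e.g. ahead of a `Dense` head (sizes are illustrative):

```julia
using Flux

x = rand(Float32, 28, 28, 3, 16)   # width × height × channels × batch
size(Flux.flatten(x))              # (2352, 16): one column per batch element
```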
"""
xlogx(x)
Return `x * log(x)` for `x ≥ 0`, handling `x = 0` by taking the downward limit.
"""
function xlogx(x)
result = x * log(x)
ifelse(iszero(x), zero(result), result)
end
CuArrays.@cufunc function xlogx(x)
result = x * log(x)
ifelse(iszero(x), zero(result), result)
end
"""
xlogy(x, y)
Return `x * log(y)` for `y > 0` with correct limit at `x = 0`.
"""
function xlogy(x, y)
result = x * log(y)
ifelse(iszero(x), zero(result), result)
end
CuArrays.@cufunc function xlogy(x, y)
result = x * log(y)
ifelse(iszero(x), zero(result), result)
end
@adjoint function broadcasted(::typeof(xlogy), x::Zygote.Numeric, y::Zygote.Numeric)
res = xlogy.(x, y)
res, Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y))
end
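A sketch of why the helpers above matter for the losses: they give the conventional `0 * log(0) = 0` limit instead of `NaN`:

```julia
using Flux: xlogx, xlogy

xlogx(0.0)        # 0.0, whereas 0.0 * log(0.0) would be NaN
xlogy(0.0, 0.0)   # 0.0, so zero targets contribute nothing to crossentropy-style sums
xlogy(2.0, 3.0)   # 2.0 * log(3.0), unchanged away from the limit
```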

View File

@ -27,8 +27,7 @@ Base.getindex(xs::OneHotMatrix, ::Colon, ::Colon) = OneHotMatrix(xs.height, copy
Base.getindex(xs::OneHotMatrix, i::Integer, ::Colon) = map(x -> x[i], xs.data) Base.getindex(xs::OneHotMatrix, i::Integer, ::Colon) = map(x -> x[i], xs.data)
# remove workaround when https://github.com/JuliaGPU/CuArrays.jl/issues/676 is fixed A::AbstractMatrix * B::OneHotMatrix = A[:, map(x->x.ix, B.data)]
A::AbstractMatrix * B::OneHotMatrix = A[:, cpu(map(x->x.ix, B.data))]
Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs...]) Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs...])
@ -38,28 +37,30 @@ import Adapt: adapt, adapt_structure
adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data)) adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data))
import .CuArrays: CuArray, CuArrayStyle, cudaconvert import .CuArrays: CuArray, cudaconvert
import Base.Broadcast: BroadcastStyle, ArrayStyle import Base.Broadcast: BroadcastStyle, ArrayStyle
BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = CuArrayStyle{2}() BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}()
cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data)) cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
""" """
onehot(l, labels[, unk]) onehot(l, labels[, unk])
Create a `OneHotVector` with its `l`-th element `true` based on the Create a [`OneHotVector`](@ref) with the `l`-th element set to `true`, based on the possible `labels` set.
possible set of `labels`. If `unk` is given, it returns `onehot(unk, labels)` if the input label `l` is not found in `labels`; otherwise
If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found it will error.
in `labels`; otherwise, it will raise an error.
## Examples
# Examples
```jldoctest ```jldoctest
julia> Flux.onehot(:b, [:a, :b, :c]) julia> using Flux: onehot
julia> onehot(:b, [:a, :b, :c])
3-element Flux.OneHotVector: 3-element Flux.OneHotVector:
0 0
1 1
0 0
julia> Flux.onehot(:c, [:a, :b, :c]) julia> onehot(:c, [:a, :b, :c])
3-element Flux.OneHotVector: 3-element Flux.OneHotVector:
0 0
0 0
@ -81,14 +82,15 @@ end
""" """
onehotbatch(ls, labels[, unk...]) onehotbatch(ls, labels[, unk...])
Create a `OneHotMatrix` with a batch of labels based on the Create a [`OneHotMatrix`](@ref) with a batch of labels based on the possible `labels` set; returns
possible set of `labels`. `onehot(unk, labels)` if a given label in `ls` is not found in `labels`.
If `unk` is given, return [`onehot(unk, labels)`](@ref) if one of the input
labels `ls` is not found in `labels`; otherwise it will error. ## Examples
# Examples
```jldoctest ```jldoctest
julia> Flux.onehotbatch([:b, :a, :b], [:a, :b, :c]) julia> using Flux: onehotbatch
julia> onehotbatch([:b, :a, :b], [:a, :b, :c])
3×3 Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}: 3×3 Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}:
0 1 0 0 1 0
1 0 1 1 0 1
@ -105,12 +107,13 @@ Base.argmax(xs::OneHotVector) = xs.ix
Inverse operations of [`onehot`](@ref). Inverse operations of [`onehot`](@ref).
# Examples
```jldoctest ```jldoctest
julia> Flux.onecold([true, false, false], [:a, :b, :c]) julia> using Flux: onecold
julia> onecold([true, false, false], [:a, :b, :c])
:a :a
julia> Flux.onecold([0.3, 0.2, 0.5], [:a, :b, :c]) julia> onecold([0.3, 0.2, 0.5], [:a, :b, :c])
:c :c
``` ```
""" """

View File

@ -1,12 +1,9 @@
module Optimise module Optimise
using LinearAlgebra
export train!, update!, export train!, update!,
Descent, ADAM, Momentum, Nesterov, RMSProp, SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM,
InvDecay, ExpDecay, WeightDecay, stop, Optimiser, InvDecay, ExpDecay, WeightDecay, stop, Optimiser
ClipValue, ClipNorm
include("optimisers.jl") include("optimisers.jl")
include("train.jl") include("train.jl")

View File

@ -6,25 +6,24 @@ const ϵ = 1e-8
# TODO: should use weak refs # TODO: should use weak refs
""" """
Descent(η = 0.1) Descent(η)
Classic gradient descent optimiser with learning rate `η`. Classic gradient descent optimiser with learning rate `η`.
For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`
# Parameters ## Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating - Learning Rate (η): The amount by which the gradients are discounted before updating the weights. Defaults to `0.1`.
the weights.
# Examples ## Example
```julia ```julia-repl
opt = Descent() opt = Descent() # uses default η (0.1)
opt = Descent(0.3) opt = Descent(0.3) # use provided η
ps = params(model) ps = params(model)
gs = gradient(ps) do gs = gradient(ps) do
loss(x, y) loss(x, y)
end end
Flux.Optimise.update!(opt, ps, gs) Flux.Optimise.update!(opt, ps, gs)
@ -41,19 +40,17 @@ function apply!(o::Descent, x, Δ)
end end
""" """
Momentum(η = 0.01, ρ = 0.9) Momentum(η, ρ)
Gradient descent optimizer with learning rate `η` and momentum `ρ`. Gradient descent with learning rate `η` and momentum `ρ`.
# Parameters ## Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating - Learning Rate (`η`): Amount by which gradients are discounted before updating the weights. Defaults to `0.01`.
the weights. - Momentum (`ρ`): Parameter that accelerates descent in the relevant direction and dampens oscillations. Defaults to `0.9`.
- Momentum (`ρ`): Controls the acceleration of gradient descent in the
prominent direction, in effect dampening oscillations.
# Examples ## Examples
```julia ```julia
opt = Momentum() opt = Momentum() # uses defaults of η = 0.01 and ρ = 0.9
opt = Momentum(0.01, 0.99) opt = Momentum(0.01, 0.99)
``` ```
@ -74,19 +71,17 @@ function apply!(o::Momentum, x, Δ)
end end
""" """
Nesterov(η = 0.001, ρ = 0.9) Nesterov(η, ρ)
Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`. Gradient descent with learning rate `η` and Nesterov momentum `ρ`.
# Parameters ## Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating - Learning Rate (η): Amount by which the gradients are discounted before updating the weights. Defaults to `0.001`.
the weights. - Nesterov Momentum (ρ): Parameter controlling the amount of Nesterov momentum to be applied. Defaults to `0.9`.
- Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the
prominent direction, in effect dampening oscillations.
# Examples ## Examples
```julia ```julia
opt = Nesterov() opt = Nesterov() # uses defaults η = 0.001 and ρ = 0.9
opt = Nesterov(0.003, 0.95) opt = Nesterov(0.003, 0.95)
``` ```
@ -108,25 +103,23 @@ function apply!(o::Nesterov, x, Δ)
end end
""" """
RMSProp(η = 0.001, ρ = 0.9) RMSProp(η, ρ)
Optimizer using the Implements the RMSProp algorithm. Often a good choice for recurrent networks. Parameters other than learning rate generally don't need tuning.
[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
algorithm. Often a good choice for recurrent networks. Parameters other than learning rate
generally don't need tuning.
# Parameters ## Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating - Learning Rate (η): Defaults to `0.001`.
the weights. - Rho (ρ): Defaults to `0.9`.
- Momentum (`ρ`): Controls the acceleration of gradient descent in the
prominent direction, in effect dampening oscillations.
# Examples ## Examples
```julia ```julia
opt = RMSProp() opt = RMSProp() # uses default η = 0.001 and ρ = 0.9
opt = RMSProp(0.002, 0.95) opt = RMSProp(0.002, 0.95)
``` ```
## References
[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
""" """
mutable struct RMSProp mutable struct RMSProp
eta::Float64 eta::Float64
@ -144,22 +137,23 @@ function apply!(o::RMSProp, x, Δ)
end end
""" """
ADAM(η = 0.001, β::Tuple = (0.9, 0.999)) ADAM(η, β::Tuple)
[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. Implements the ADAM optimiser.
# Parameters ## Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating - Learning Rate (`η`): Defaults to `0.001`.
the weights. - Beta (`β::Tuple`): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate. ## Examples
# Examples
```julia ```julia
opt = ADAM() opt = ADAM() # uses the default η = 0.001 and β = (0.9, 0.999)
opt = ADAM(0.001, (0.9, 0.8)) opt = ADAM(0.001, (0.9, 0.8))
``` ```
## References
[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.
""" """
mutable struct ADAM mutable struct ADAM
eta::Float64 eta::Float64
@ -180,22 +174,24 @@ function apply!(o::ADAM, x, Δ)
end end
""" """
RADAM(η = 0.001, β::Tuple = (0.9, 0.999)) RADAM(η, β::Tuple)
[Rectified ADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimizer. Implements the rectified ADAM optimizer.
# Parameters ## Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating - Learning Rate (η): Defaults to `0.001`
the weights. - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate. ## Examples
# Examples
```julia ```julia
opt = RADAM() opt = RADAM() # uses the default η = 0.001 and β = (0.9, 0.999)
opt = RADAM(0.001, (0.9, 0.8)) opt = RADAM(0.001, (0.9, 0.8))
``` ```
## References
[RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM).
""" """
mutable struct RADAM mutable struct RADAM
eta::Float64 eta::Float64
@ -223,22 +219,22 @@ function apply!(o::RADAM, x, Δ)
end end
""" """
AdaMax(η = 0.001, β::Tuple = (0.9, 0.999)) AdaMax(η, β::Tuple)
[AdaMax](https://arxiv.org/abs/1412.6980v9) is a variant of ADAM based on the ∞-norm. Variant of ADAM based on the ∞-norm.
# Parameters ## Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating - Learning Rate (η): Defaults to `0.001`
the weights. - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate.
# Examples ## Examples
```julia ```julia
opt = AdaMax() opt = AdaMax() # uses default η and β
opt = AdaMax(0.001, (0.9, 0.995)) opt = AdaMax(0.001, (0.9, 0.995))
``` ```
## References
[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser.
""" """
mutable struct AdaMax mutable struct AdaMax
eta::Float64 eta::Float64
@ -259,22 +255,23 @@ function apply!(o::AdaMax, x, Δ)
end end
""" """
ADAGrad(η = 0.1) ADAGrad(η)
[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has Implements AdaGrad. It has parameter specific learning rates based on how frequently it is updated.
parameter specific learning rates based on how frequently it is updated.
Parameters don't need tuning.
# Parameters ## Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating - Learning Rate (η): Defaults to `0.1`
the weights.
# Examples ## Examples
```julia ```julia
opt = ADAGrad() opt = ADAGrad() # uses default η = 0.1
opt = ADAGrad(0.001) opt = ADAGrad(0.001)
``` ```
## References
[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser.
Parameters don't need tuning.
""" """
mutable struct ADAGrad mutable struct ADAGrad
eta::Float64 eta::Float64
@ -291,21 +288,21 @@ function apply!(o::ADAGrad, x, Δ)
end end
""" """
ADADelta(ρ = 0.9) ADADelta(ρ)
[ADADelta](https://arxiv.org/abs/1212.5701) is a version of ADAGrad adapting its learning Version of ADAGrad that adapts learning rate based on a window of past gradient updates. Parameters don't need tuning.
rate based on a window of past gradient updates.
Parameters don't need tuning.
# Parameters ## Parameters
- Rho (`ρ`): Factor by which the gradient is decayed at each time step. - Rho (ρ): Factor by which gradient is decayed at each time step. Defaults to `0.9`.
# Examples ## Examples
```julia ```julia
opt = ADADelta() opt = ADADelta() # uses default ρ = 0.9
opt = ADADelta(0.89) opt = ADADelta(0.89)
``` ```
## References
[ADADelta](https://arxiv.org/abs/1212.5701) optimiser.
""" """
mutable struct ADADelta mutable struct ADADelta
rho::Float64 rho::Float64
@ -324,23 +321,22 @@ function apply!(o::ADADelta, x, Δ)
end end
""" """
AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999)) AMSGrad(η, β::Tuple)
The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM Implements AMSGrad version of the ADAM optimiser. Parameters don't need tuning.
optimiser. Parameters don't need tuning.
# Parameters ## Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating - Learning Rate (η): Defaults to `0.001`.
the weights. - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate.
# Examples ## Examples
```julia ```julia
opt = AMSGrad() opt = AMSGrad() # uses default η and β
opt = AMSGrad(0.001, (0.89, 0.995)) opt = AMSGrad(0.001, (0.89, 0.995))
``` ```
## References
[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser.
""" """
mutable struct AMSGrad mutable struct AMSGrad
eta::Float64 eta::Float64
@ -360,23 +356,22 @@ function apply!(o::AMSGrad, x, Δ)
end end
""" """
NADAM(η = 0.001, β::Tuple = (0.9, 0.999)) NADAM(η, β::Tuple)
[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) is a Nesterov variant of ADAM. Nesterov variant of ADAM. Parameters don't need tuning.
Parameters don't need tuning.
# Parameters ## Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating - Learning Rate (η): Defaults to `0.001`.
the weights. - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate.
# Examples ## Examples
```julia ```julia
opt = NADAM() opt = NADAM() # uses default η and β
opt = NADAM(0.002, (0.89, 0.995)) opt = NADAM(0.002, (0.89, 0.995))
``` ```
## References
[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser.
""" """
mutable struct NADAM mutable struct NADAM
eta::Float64 eta::Float64
@ -397,24 +392,23 @@ function apply!(o::NADAM, x, Δ)
end end
""" """
ADAMW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0) ADAMW(η, β::Tuple, decay)
[ADAMW](https://arxiv.org/abs/1711.05101) is a variant of ADAM fixing (as in repairing) its Variant of ADAM defined by fixing weight decay regularization.
weight decay regularization.
# Parameters ## Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating - Learning Rate (η): Defaults to `0.001`.
the weights. - Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to (0.9, 0.999).
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - decay: Decay applied to weights during optimisation. Defaults to 0.
second (β2) momentum estimate.
- `decay`: Decay applied to weights during optimisation.
# Examples ## Examples
```julia ```julia
opt = ADAMW() opt = ADAMW() # uses default η, β and decay
opt = ADAMW(0.001, (0.89, 0.995), 0.1) opt = ADAMW(0.001, (0.89, 0.995), 0.1)
``` ```
## References
[ADAMW](https://arxiv.org/abs/1711.05101)
""" """
ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) = ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
Optimiser(ADAM(η, β), WeightDecay(decay)) Optimiser(ADAM(η, β), WeightDecay(decay))
@ -447,13 +441,14 @@ function apply!(o::Optimiser, x, Δ)
end end
""" """
InvDecay(γ = 0.001) InvDecay(γ)
Apply inverse time decay to an optimiser, so that the effective step size at Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified.
iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size.
The wrapped optimiser's step size is not modified.
# Examples ## Parameters
- gamma (γ): Defaults to `0.001`
## Example
```julia ```julia
Optimiser(InvDecay(..), Opt(..)) Optimiser(InvDecay(..), Opt(..))
``` ```
@ -474,24 +469,20 @@ function apply!(o::InvDecay, x, Δ)
end end
""" """
ExpDecay(η = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4) ExpDecay(eta, decay, decay_step, clip)
Discount the learning rate `η` by the factor `decay` every `decay_step` steps till Discount the learning rate `eta` by a multiplicative factor `decay` every `decay_step` till a minimum of `clip`.
a minimum of `clip`.
# Parameters ## Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating - Learning Rate (eta): Defaults to `0.001`.
the weights. - decay: Factor by which the learning rate is discounted. Defaults to `0.1`.
- `decay`: Factor by which the learning rate is discounted. - decay_step: Schedules decay operations by setting number of steps between two decay operations. Defaults to `1000`.
- `decay_step`: Schedule decay operations by setting the number of steps between - clip: Minimum value of learning rate. Defaults to `1e-4`.
two decay operations.
- `clip`: Minimum value of learning rate.
# Examples ## Example
To apply exponential decay to an optimiser: To apply exponential decay to an optimiser:
```julia ```julia
Optimiser(ExpDecay(..), Opt(..)) Optimiser(ExpDecay(..), Opt(..))
opt = Optimiser(ExpDecay(), ADAM()) opt = Optimiser(ExpDecay(), ADAM())
``` ```
""" """
@ -509,19 +500,19 @@ function apply!(o::ExpDecay, x, Δ)
η, s, decay = o.eta, o.step, o.decay η, s, decay = o.eta, o.step, o.decay
n = o.current[x] = get(o.current, x, 0) + 1 n = o.current[x] = get(o.current, x, 0) + 1
if o.current[x]%s == 0 && count(x -> x%s == 0, values(o.current)) == 1 if o.current[x]%s == 0 && count(x -> x%s == 0, values(o.current)) == 1
η = max(η * decay, o.clip) η = max(η * decay^(s / n), o.clip)
o.eta = η o.eta = η
end end
@. Δ *= η @. Δ *= η
end end
""" """
WeightDecay(wd = 0) WeightDecay(wd)
Decay weights by `wd`. Decays the weights by `wd`.
# Parameters ## Parameters
- Weight decay (`wd`) - Weight decay (`wd`): Defaults to `0`.
""" """
mutable struct WeightDecay mutable struct WeightDecay
wd::Real wd::Real
@ -533,31 +524,3 @@ function apply!(o::WeightDecay, x, Δ)
wd = o.wd wd = o.wd
@. Δ += wd * x @. Δ += wd * x
end end
"""
ClipValue(thresh)
Clip gradients when their absolute value exceeds `thresh`.
"""
mutable struct ClipValue{T}
thresh::T
end
apply!(o::ClipValue, x, Δ) = clamp!(Δ, -o.thresh, o.thresh)
"""
ClipNorm(thresh)
Clip gradients when their L2 norm exceeds `thresh`.
"""
mutable struct ClipNorm{T}
thresh::T
end
function apply!(o::ClipNorm, x, Δ)
Δnrm = norm(Δ)
if Δnrm > o.thresh
rmul!(Δ, o.thresh / Δnrm)
end
return Δ
end
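A usage sketch for the clipping optimisers above; they are meant to be composed with a regular optimiser via `Optimiser`:

```julia
using Flux
using Flux.Optimise: Optimiser, ClipValue, ClipNorm

# Clip elementwise gradient values to [-1, 1] before the ADAM update.
opt = Optimiser(ClipValue(1.0), ADAM(0.001))

# Alternatively, rescale whole gradients whose L2 norm exceeds 1.
opt_norm = Optimiser(ClipNorm(1.0), ADAM(0.001))
```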

View File

@ -2,25 +2,23 @@ using Juno
import Zygote: Params, gradient import Zygote: Params, gradient
""" """
update!(x, ) update!(opt, p, g)
update!(opt, ps::Params, gs)
Perform an update step of the parameters `ps` (or the single parameter `p`)
according to optimizer `opt` and the gradients `gs` (the gradient `g`).
As a result, the parameters are mutated and the optimizer's internal state may change.
update!(x, )
Update the array `x` according to `x .-= x̄`. Update the array `x` according to `x .-= x̄`.
""" """
function update!(x::AbstractArray, ) function update!(x::AbstractArray, )
x .-= x .-=
end end
"""
update!(opt, p, g)
update!(opt, ps::Params, gs)
Perform an update step of the parameters `ps` (or the single parameter `p`)
according to optimizer `opt` and the gradients `gs` (the gradient `g`).
As a result, the parameters are mutated and the optimizer's internal state may change.
"""
function update!(opt, x, ) function update!(opt, x, )
x .-= apply!(opt, x, ) x .-= apply!(opt, x, )
end end
@ -43,10 +41,11 @@ struct StopException <: Exception end
stop() stop()
Call `Flux.stop()` in a callback to indicate when a callback condition is met. Call `Flux.stop()` in a callback to indicate when a callback condition is met.
This will trigger the train loop to stop and exit. This would trigger the train loop to stop and exit.
# Examples
```julia ```julia
# Example callback:
cb = function () cb = function ()
accuracy() > 0.9 && Flux.stop() accuracy() > 0.9 && Flux.stop()
end end
@ -59,18 +58,19 @@ end
""" """
train!(loss, params, data, opt; cb) train!(loss, params, data, opt; cb)
For each datapoint `d` in `data` compute the gradient of `loss(d...)` through For each datapoint `d` in `data` computes the gradient of `loss(d...)` through
backpropagation and call the optimizer `opt`. backpropagation and calls the optimizer `opt`.
In case datapoints `d` are of numeric array type, assume no splatting is needed In case datapoints `d` are of numeric array type, assumes no splatting is needed
and compute the gradient of `loss(d)`. and computes the gradient of `loss(d)`.
A callback is given with the keyword argument `cb`. For example, this will print Takes a callback as keyword argument `cb`. For example, this will print "training"
"training" every 10 seconds (using [`Flux.throttle`](@ref)): every 10 seconds:
train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10)) train!(loss, params, data, opt,
cb = throttle(() -> println("training"), 10))
The callback can call [`Flux.stop`](@ref) to interrupt the training loop. The callback can call `Flux.stop()` to interrupt the training loop.
Multiple optimisers and callbacks can be passed to `opt` and `cb` as arrays. Multiple optimisers and callbacks can be passed to `opt` and `cb` as arrays.
""" """
@ -106,12 +106,11 @@ end
Run `body` `N` times. Mainly useful for quickly doing multiple epochs of Run `body` `N` times. Mainly useful for quickly doing multiple epochs of
training in a REPL. training in a REPL.
# Examples ```julia
```jldoctest julia> @epochs 2 println("hello")
julia> Flux.@epochs 2 println("hello") INFO: Epoch 1
[ Info: Epoch 1
hello hello
[ Info: Epoch 2 INFO: Epoch 2
hello hello
``` ```
""" """

View File

@ -1,40 +1,10 @@
# Arrays # Arrays
nfan() = 1, 1 # fan_in, fan_out nfan() = 1, 1 #fan_in, fan_out
nfan(n) = 1, n # A vector is treated as a n×1 matrix nfan(n) = 1, n #A vector is treated as a n×1 matrix
nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices nfan(n_out, n_in) = n_in, n_out #In case of Dense kernels: arranged as matrices
nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) # In case of convolution kernels nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of convolution kernels
"""
glorot_uniform(dims...)
Return an `Array` of size `dims` containing random variables taken from a uniform
distribution in the interval ``[-x, x]``, where `x = sqrt(24 / sum(dims)) / 2`.
# Examples
```jldoctest; setup = :(using Random; Random.seed!(0))
julia> Flux.glorot_uniform(2, 3)
2×3 Array{Float32,2}:
0.601094 -0.57414 -0.814925
0.900868 0.805994 0.057514
```
"""
glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...))) glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...)))
"""
glorot_normal(dims...)
Return an `Array` of size `dims` containing random variables taken from a normal
distribution with mean 0 and standard deviation `sqrt(2 / sum(dims))`.
# Examples
```jldoctest; setup = :(using Random; Random.seed!(0))
julia> Flux.glorot_normal(3, 2)
3×2 Array{Float32,2}:
0.429505 -0.0852891
0.523935 0.371009
-0.223261 0.188052
```
"""
glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...))) glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...)))
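A small sketch of the fan-in/fan-out scaling used by the two initialisers above (a statistical check, so the comparison is approximate):

```julia
using Flux, Statistics

W = Flux.glorot_uniform(1000, 500)    # uniform on [-x, x] with x = sqrt(24 / 1500) / 2
isapprox(std(W), sqrt(2 / (1000 + 500)); atol = 1e-3)   # variance ≈ 2 / (fan_in + fan_out)

G = Flux.glorot_normal(1000, 500)
isapprox(std(G), sqrt(2 / (1000 + 500)); atol = 1e-3)
```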
ones(T::Type, dims...) = Base.ones(T, dims...) ones(T::Type, dims...) = Base.ones(T, dims...)
@ -43,81 +13,9 @@ zeros(T::Type, dims...) = Base.zeros(T, dims...)
ones(dims...) = Base.ones(Float32, dims...) ones(dims...) = Base.ones(Float32, dims...)
zeros(dims...) = Base.zeros(Float32, dims...) zeros(dims...) = Base.zeros(Float32, dims...)
"""
unsqueeze(xs, dim)
Return `xs` reshaped into an `Array` one dimensionality higher than `xs`,
where `dim` indicates in which dimension `xs` is extended.
# Examples
```jldoctest
julia> xs = [[1, 2], [3, 4], [5, 6]]
3-element Array{Array{Int64,1},1}:
[1, 2]
[3, 4]
[5, 6]
julia> Flux.unsqueeze(xs, 1)
1×3 Array{Array{Int64,1},2}:
[1, 2] [3, 4] [5, 6]
julia> Flux.unsqueeze([1 2; 3 4], 2)
2×1×2 Array{Int64,3}:
[:, :, 1] =
1
3
[:, :, 2] =
2
4
```
"""
unsqueeze(xs, dim) = reshape(xs, (size(xs)[1:dim-1]..., 1, size(xs)[dim:end]...)) unsqueeze(xs, dim) = reshape(xs, (size(xs)[1:dim-1]..., 1, size(xs)[dim:end]...))
"""
stack(xs, dim)
Concatenate the given `Array` of `Array`s `xs` into a single `Array` along the
given dimension `dim`.
# Examples
```jldoctest
julia> xs = [[1, 2], [3, 4], [5, 6]]
3-element Array{Array{Int64,1},1}:
[1, 2]
[3, 4]
[5, 6]
julia> Flux.stack(xs, 1)
3×2 Array{Int64,2}:
1 2
3 4
5 6
julia> cat(xs, dims=1)
3-element Array{Array{Int64,1},1}:
[1, 2]
[3, 4]
[5, 6]
```
"""
stack(xs, dim) = cat(unsqueeze.(xs, dim)..., dims=dim) stack(xs, dim) = cat(unsqueeze.(xs, dim)..., dims=dim)
"""
unstack(xs, dim)
Unroll the given `xs` into an `Array` of `Array`s along the given dimension `dim`.
# Examples
```jldoctest
julia> Flux.unstack([1 3 5 7; 2 4 6 8], 2)
4-element Array{Array{Int64,1},1}:
[1, 2]
[3, 4]
[5, 6]
[7, 8]
```
"""
unstack(xs, dim) = [copy(selectdim(xs, dim, i)) for i in 1:size(xs, dim)] unstack(xs, dim) = [copy(selectdim(xs, dim, i)) for i in 1:size(xs, dim)]
""" """
@ -125,16 +23,9 @@ unstack(xs, dim) = [copy(selectdim(xs, dim, i)) for i in 1:size(xs, dim)]
Split `xs` into `n` parts. Split `xs` into `n` parts.
# Examples ```julia
```jldoctest julia> chunk(1:10, 3)
julia> Flux.chunk(1:10, 3) 3-element Array{Array{Int64,1},1}:
3-element Array{UnitRange{Int64},1}:
1:4
5:8
9:10
julia> Flux.chunk(collect(1:10), 3)
3-element Array{SubArray{Int64,1,Array{Int64,1},Tuple{UnitRange{Int64}},true},1}:
[1, 2, 3, 4] [1, 2, 3, 4]
[5, 6, 7, 8] [5, 6, 7, 8]
[9, 10] [9, 10]
@ -149,12 +40,11 @@ batchindex(xs, i) = (reverse(Base.tail(reverse(axes(xs))))..., i)
Count the number of times that each element of `xs` appears. Count the number of times that each element of `xs` appears.
# Examples ```julia
```jldoctest julia> frequencies(['a','b','b'])
julia> Flux.frequencies(['a','b','b'])
Dict{Char,Int64} with 2 entries: Dict{Char,Int64} with 2 entries:
'a' => 1
'b' => 2 'b' => 2
'a' => 1
``` ```
""" """
function frequencies(xs) function frequencies(xs)
@ -174,9 +64,8 @@ squeezebatch(x) = reshape(x, head(size(x)))
Batch the arrays in `xs` into a single array. Batch the arrays in `xs` into a single array.
# Examples ```julia
```jldoctest julia> batch([[1,2,3],[4,5,6]])
julia> Flux.batch([[1,2,3],[4,5,6]])
3×2 Array{Int64,2}: 3×2 Array{Int64,2}:
1 4 1 4
2 5 2 5
@ -193,25 +82,6 @@ function batch(xs)
return data return data
end end
"""
Return the given sequence padded with `p` up to a maximum length of `n`.
# Examples
```jldoctest
julia> rpad([1, 2], 4, 0)
4-element Array{Int64,1}:
1
2
0
0
julia> rpad([1, 2, 3], 2, 0)
3-element Array{Int64,1}:
1
2
3
```
"""
Base.rpad(v::AbstractVector, n::Integer, p) = [v; fill(p, max(n - length(v), 0))] Base.rpad(v::AbstractVector, n::Integer, p) = [v; fill(p, max(n - length(v), 0))]
""" """
@ -220,9 +90,8 @@ Base.rpad(v::AbstractVector, n::Integer, p) = [v; fill(p, max(n - length(v), 0))
Take a list of `N` sequences, and turn them into a single sequence where each Take a list of `N` sequences, and turn them into a single sequence where each
item is a batch of `N`. Short sequences will be padded by `pad`. item is a batch of `N`. Short sequences will be padded by `pad`.
# Examples ```julia
```jldoctest julia> batchseq([[1, 2, 3], [4, 5]], 0)
julia> Flux.batchseq([[1, 2, 3], [4, 5]], 0)
3-element Array{Array{Int64,1},1}: 3-element Array{Array{Int64,1},1}:
[1, 4] [1, 4]
[2, 5] [2, 5]
@ -246,10 +115,6 @@ function _restructure(m, xs)
end end
end end
@adjoint function _restructure(m, xs)
_restructure(m, xs), dm -> (nothing,destructure(dm)[1])
end
""" """
destructure(m) destructure(m)
@ -283,15 +148,11 @@ end
# Other # Other
""" """
throttle(f, timeout; leading=true, trailing=false) Returns a function that when invoked, will only be triggered at most once
during `timeout` seconds. Normally, the throttled function will run
Return a function that when invoked, will only be triggered at most once as much as it can, without ever going more than once per `wait` duration;
during `timeout` seconds. but if you'd like to disable the execution on the leading edge, pass
`leading=false`. To enable execution on the trailing edge, ditto.
Normally, the throttled function will run as much as it can, without ever
going more than once per `wait` duration; but if you'd like to disable the
execution on the leading edge, pass `leading=false`. To enable execution on
the trailing edge, pass `trailing=true`.
""" """
function throttle(f, timeout; leading=true, trailing=false) function throttle(f, timeout; leading=true, trailing=false)
cooldown = true cooldown = true
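A usage sketch for `throttle` as described above (the callback here is purely illustrative):

```julia
using Flux: throttle

noisy() = println("tick")
tick = throttle(noisy, 2)   # at most one call every 2 seconds

for _ in 1:1_000_000
    tick()                  # prints only on the leading edge of each 2 s window
end
```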

View File

@ -1,106 +0,0 @@
import Base: +, -, *, reshape, size
import Base.Broadcast: broadcasted, Broadcasted, BroadcastStyle
"""
Zeros()
Zeros(size...)
Zeros(Type, size...)
Acts as a stand-in for an array of zeros that can be
used during training which is ignored by the optimisers.
Useful to turn bias off for a forward pass of a layer.
## Examples
```julia
julia> Flux.Zeros(3,3)
3×3 Flux.Zeros{Bool,2}:
false false false
false false false
false false false
julia> Flux.Zeros(Float32, 3,3)
3×3 Flux.Zeros{Float32,2}:
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
julia> rand(3,3) .+ Flux.Zeros()
3×3 Array{Float64,2}:
0.198739 0.490459 0.785386
0.779074 0.39986 0.66383
0.854981 0.447292 0.314497
julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros())
Conv((2, 2), 1=>3)
```
"""
struct Zeros{T,N} <: AbstractArray{T,N}
size::Tuple
end
Zeros(::Type{T}, sz...) where T = Zeros{T,length(sz)}(sz)
Zeros(sz::Integer...) = Zeros(Bool, sz...)
Base.size(xs::Zeros) = xs.size
Base.axes(xs::Zeros) = Base.OneTo.(size(xs))
Base.IndexStyle(::Type{<:Zeros}) = IndexLinear()
Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T)
Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} =
Zeros(T, length(inds))
Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs))
@adjoint reshape(xs::Zeros{T}, dims...) where T =
reshape(xs, dims...), _ -> nothing
# Define basic ops
for f in (:+, :-)
@eval @inline function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros)
@assert size(a) == size(b) throw(DimensionMismatch("dimensions must match"))
a
end
end
+(a::Zeros, b::AbstractArray) = b + a
-(a::Zeros, b::AbstractArray) = -b + a
Base.copy(xs::Zeros{T,N}) where {T,N} = xs
# Define broadcasting behaviour
for op in (:+, :-)
@eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros)
bs = Broadcast.broadcast_shape(size(a), size(b))
size(a) == bs && return a
sz = similar(a, bs)
sz .= a
end
end
broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a)
broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a)
function broadcasted(::typeof(*), a::AbstractArray, b::Zeros)
Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end
broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a)
for op in (:+, :-, :*)
@eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end
# Some opportunities to avoid scalar indexing, intermediaries
# Since it replicates a little of what we expect Base to do,
# it should be possible to remove in the future, but for now,
# these help with performance.
broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b
broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b
broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a)
broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)

View File

@ -69,7 +69,6 @@ if CuArrays.has_cudnn()
@info "Testing Flux/CUDNN" @info "Testing Flux/CUDNN"
include("cudnn.jl") include("cudnn.jl")
include("curnn.jl") include("curnn.jl")
include("layers.jl")
else else
@warn "CUDNN unavailable, not testing GPU DNN support" @warn "CUDNN unavailable, not testing GPU DNN support"
end end

View File

@ -1,98 +0,0 @@
# Test layers and data/model movements on and off the GPU
# Add tests for layers and their gradients on the GPU
# Most of the forward passes should be fine being applied
# to bitstype objects, but this gives higher coverage for our use-cases
# Check that getting the gradients does not throw
# generic movement tests
@testset "Basic GPU Movement" begin
@test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple
@test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple
end
# TODO: These layers get into scalar indexing
# `AlphaDropout` throws a compilation error on GPUs,
# whereas, the rest are scalar indexing issues.
const BROKEN_LAYERS = [DepthwiseConv,
AlphaDropout,
InstanceNorm,
GroupNorm]
function gradtest(name::String, layers::Vector, xs = nothing, args...)
isnothing(xs) && error("Missing input to test the layers against.")
@testset "$name GPU grad tests" begin
for layer in layers
@testset "$layer GPU grad test" begin
l = gpu(layer(args...))
xs = gpu(xs)
if any(x -> isa(l, x), BROKEN_LAYERS)
ps = Flux.params(l)
@test_broken gradient(() -> sum(l(xs)), ps) isa Flux.Zygote.Grads
else
ps = Flux.params(l)
@test gradient(() -> sum(l(xs)), ps) isa Flux.Zygote.Grads
gs = gradient(() -> sum(l(xs)), ps)
# Handle pooling layers
if !isempty(ps)
@test gs[first(ps)] isa Flux.CuArrays.CuArray
end
end
end
end
end
end
# Repeats from Conv, CrossCor
r = rand(Float32, 28, 28, 1, 1)
conv_layers = [Conv, ConvTranspose, CrossCor, DepthwiseConv]
gradtest("Conv", conv_layers, r, (2,2), 1=>3)
pooling_layers = [MaxPool, MeanPool]
gradtest("Pooling", pooling_layers, r, (2,2))
dropout_layers = [Dropout, AlphaDropout]
gradtest("Dropout", dropout_layers, r, 0.5f0)
norm_layers = [LayerNorm, BatchNorm]
gradtest("Normalising", norm_layers, rand(Float32, 28,28,3,1), 1)
instancenorm = [InstanceNorm]
gradtest("InstanceNorm", instancenorm, r, 1)
groupnorm = [GroupNorm]
gradtest("GroupNorm", groupnorm, rand(Float32, 28,28,3,1), 3, 1)
const stateless_layers = [Flux.mse,
Flux.crossentropy,
Flux.logitcrossentropy,
Flux.normalise]
const stateless_layers_broadcasted = [Flux.binarycrossentropy,
Flux.logitbinarycrossentropy]
function stateless_gradtest(f, args...)
@test gradient((args...) -> sum(f(args...)), args...)[1] isa CuArray
end
function stateless_gradtest_broadcasted(f, args...)
@test gradient((args...) -> sum(f.(args...)), args...)[1] isa CuArray
end
@testset "Stateless GPU grad tests" begin
x = gpu(rand(3,3))
y = gpu(rand(3,3))
for layer in stateless_layers
if layer == Flux.normalise
stateless_gradtest(layer, x)
else
stateless_gradtest(layer, x, y)
end
end
for layer in stateless_layers_broadcasted
stateless_gradtest_broadcasted(layer, x, y)
end
end

View File

@ -3,34 +3,20 @@
Y = [1:5;] Y = [1:5;]
d = DataLoader(X, batchsize=2) d = DataLoader(X, batchsize=2)
@inferred first(d)
batches = collect(d) batches = collect(d)
@test eltype(batches) == eltype(d) == typeof(X)
@test length(batches) == 3 @test length(batches) == 3
@test batches[1] == X[:,1:2] @test batches[1] == X[:,1:2]
@test batches[2] == X[:,3:4] @test batches[2] == X[:,3:4]
@test batches[3] == X[:,5:5] @test batches[3] == X[:,5:5]
d = DataLoader(X, batchsize=2, partial=false) d = DataLoader(X, batchsize=2, partial=false)
@inferred first(d)
batches = collect(d) batches = collect(d)
@test eltype(batches) == eltype(d) == typeof(X)
@test length(batches) == 2 @test length(batches) == 2
@test batches[1] == X[:,1:2] @test batches[1] == X[:,1:2]
@test batches[2] == X[:,3:4] @test batches[2] == X[:,3:4]
d = DataLoader((X,), batchsize=2, partial=false) d = DataLoader(X, Y, batchsize=2)
@inferred first(d)
batches = collect(d) batches = collect(d)
@test eltype(batches) == eltype(d) == Tuple{typeof(X)}
@test length(batches) == 2
@test batches[1] == (X[:,1:2],)
@test batches[2] == (X[:,3:4],)
d = DataLoader((X, Y), batchsize=2)
@inferred first(d)
batches = collect(d)
@test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)}
@test length(batches) == 3 @test length(batches) == 3
@test length(batches[1]) == 2 @test length(batches[1]) == 2
@test length(batches[2]) == 2 @test length(batches[2]) == 2
@ -42,22 +28,6 @@
@test batches[3][1] == X[:,5:5] @test batches[3][1] == X[:,5:5]
@test batches[3][2] == Y[5:5] @test batches[3][2] == Y[5:5]
# test with NamedTuple
d = DataLoader((x=X, y=Y), batchsize=2)
@inferred first(d)
batches = collect(d)
@test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}}
@test length(batches) == 3
@test length(batches[1]) == 2
@test length(batches[2]) == 2
@test length(batches[3]) == 2
@test batches[1][1] == batches[1].x == X[:,1:2]
@test batches[1][2] == batches[1].y == Y[1:2]
@test batches[2][1] == batches[2].x == X[:,3:4]
@test batches[2][2] == batches[2].y == Y[3:4]
@test batches[3][1] == batches[3].x == X[:,5:5]
@test batches[3][2] == batches[3].y == Y[5:5]
# test interaction with `train!` # test interaction with `train!`
θ = ones(2) θ = ones(2)
X = zeros(2, 10) X = zeros(2, 10)
@ -71,7 +41,7 @@
X = ones(2, 10) X = ones(2, 10)
Y = fill(2, 10) Y = fill(2, 10)
loss(x, y) = sum((y - x'*θ).^2) loss(x, y) = sum((y - x'*θ).^2)
d = DataLoader((X, Y)) d = DataLoader(X, Y)
Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1)) Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1))
@test norm(θ .- 1) < 1e-10 @test norm(θ .- 1) < 1e-10
end end
@ -106,9 +76,8 @@ end
@test size(Iris.labels()) == (150,) @test size(Iris.labels()) == (150,)
end end
@testset "Housing" begin @testset "Housing" begin
@test Housing.features() isa Matrix # test broken due to SSL certificate expiration problem @test Housing.features() isa Matrix
@test size(Housing.features()) == (506, 13) @test size(Housing.features()) == (506, 13)
@test Housing.targets() isa Array{Float64} @test Housing.targets() isa Array{Float64}

View File

@ -28,14 +28,6 @@ import Flux: activations
end end
@testset "Dense" begin @testset "Dense" begin
@testset "constructors" begin
@test size(Dense(10, 100).W) == (100, 10)
@test Dense(rand(100,10), rand(10)).σ == identity
@test_throws MethodError Dense(10, 10.5)
@test_throws MethodError Dense(10, 10.5, tanh)
end
@test length(Dense(10, 5)(randn(10))) == 5 @test length(Dense(10, 5)(randn(10))) == 5
@test_throws DimensionMismatch Dense(10, 5)(randn(1)) @test_throws DimensionMismatch Dense(10, 5)(randn(1))
@test_throws MethodError Dense(10, 5)(1) # avoid broadcasting @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting
@ -45,6 +37,7 @@ import Flux: activations
@test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2) @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2)
@test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1) @test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1)
@test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] @test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]
end end
@testset "Diagonal" begin @testset "Diagonal" begin

View File

@ -4,10 +4,6 @@ using Flux: gradient
@testset "Pooling" begin @testset "Pooling" begin
x = randn(Float32, 10, 10, 3, 2) x = randn(Float32, 10, 10, 3, 2)
gmp = GlobalMaxPool()
@test size(gmp(x)) == (1, 1, 3, 2)
gmp = GlobalMeanPool()
@test size(gmp(x)) == (1, 1, 3, 2)
mp = MaxPool((2, 2)) mp = MaxPool((2, 2))
@test mp(x) == maxpool(x, PoolDims(x, 2)) @test mp(x) == maxpool(x, PoolDims(x, 2))
mp = MeanPool((2, 2)) mp = MeanPool((2, 2))
@ -25,35 +21,6 @@ end
Dense(288, 10), softmax) Dense(288, 10), softmax)
@test size(m(r)) == (10, 5) @test size(m(r)) == (10, 5)
# Test bias switch
bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3))
ip = zeros(Float32, 28,28,1,1)
op = bias(ip)
@test sum(op) == prod(size(op))
bias = Conv((2,2), 1=>3, bias = Flux.Zeros())
op = bias(ip)
@test sum(op) === 0.f0
gs = gradient(() -> sum(bias(ip)), Flux.params(bias))
@test gs[bias.bias] == nothing
# Train w/o bias and make sure no convergence happens
# when only bias can be converged
bias = Conv((2, 2), 1=>3, bias = Flux.Zeros());
ip = zeros(Float32, 28,28,1,1)
op = zeros(Float32, 27,27,3,1) .+ 2.f0
opt = Descent()
for _ = 1:10^3
gs = gradient(params(bias)) do
Flux.mse(bias(ip), op)
end
Flux.Optimise.update!(opt, params(bias), gs)
end
@test Flux.mse(bias(ip), op) ≈ 4.f0
end end
@testset "asymmetric padding" begin @testset "asymmetric padding" begin
@ -191,28 +158,4 @@ end
@test Flux.outdims(m, (5, 5)) == (4, 4) @test Flux.outdims(m, (5, 5)) == (4, 4)
m = MeanPool((2, 2); stride = 2, pad = 3) m = MeanPool((2, 2); stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5) @test Flux.outdims(m, (5, 5)) == (5, 5)
end end
@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, CrossCor), k in ( (1,), (2,), (3,), (4,5), (6,7,8))
data = ones(Float32, (k .+ 3)..., 1,1)
l = ltype(k, 1=>1, pad=SamePad())
@test size(l(data)) == size(data)
l = ltype(k, 1=>1, pad=SamePad(), dilation = k .÷ 2)
@test size(l(data)) == size(data)
stride = 3
l = ltype(k, 1=>1, pad=SamePad(), stride = stride)
if ltype == ConvTranspose
@test size(l(data))[1:end-2] == stride .* size(data)[1:end-2] .- stride .+ 1
else
@test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ stride)
end
end
@testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), k in ( (1,), (2,), (3,), (4,5), (6,7,8))
data = ones(Float32, (k .+ 3)..., 1,1)
l = ltype(k, pad=SamePad())
@test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ k)
end

View File

@ -1,26 +1,9 @@
using Test using Test
using Flux: onehotbatch, mse, crossentropy, logitcrossentropy, using Flux: onehotbatch, mse, crossentropy, logitcrossentropy,
σ, binarycrossentropy, logitbinarycrossentropy, flatten, σ, binarycrossentropy, logitbinarycrossentropy
xlogx, xlogy
const ϵ = 1e-7 const ϵ = 1e-7
@testset "xlogx & xlogy" begin
@test iszero(xlogx(0))
@test isnan(xlogx(NaN))
@test xlogx(2) ≈ 2.0 * log(2.0)
@inferred xlogx(2)
@inferred xlogx(0)
@test iszero(xlogy(0, 1))
@test isnan(xlogy(NaN, 1))
@test isnan(xlogy(1, NaN))
@test isnan(xlogy(NaN, NaN))
@test xlogy(2, 3) ≈ 2.0 * log(3.0)
@inferred xlogy(2, 3)
@inferred xlogy(0, 1)
end
@testset "losses" begin @testset "losses" begin
# First, regression-style y's # First, regression-style y's
y = [1, 1, 0, 0] y = [1, 1, 0, 0]
@ -30,20 +13,6 @@ end
@test mse(ŷ, y) ≈ (.1^2 + .9^2)/2 @test mse(ŷ, y) ≈ (.1^2 + .9^2)/2
end end
@testset "mae" begin
@test Flux.mae(ŷ, y) ≈ 1/2
end
@testset "huber_loss" begin
@test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002
end
y = [123.0,456.0,789.0]
ŷ = [345.0,332.0,789.0]
@testset "msle" begin
@test Flux.msle(ŷ, y) ≈ 0.38813985859136585
end
# Now onehot y's # Now onehot y's
y = onehotbatch([1, 1, 0, 0], 0:1) y = onehotbatch([1, 1, 0, 0], 0:1)
ŷ = [.1 .9; .9 .1; .9 .1; .1 .9]' ŷ = [.1 .9; .9 .1; .9 .1; .1 .9]'
@ -52,7 +21,6 @@ end
lossvalue = 1.203972804325936 lossvalue = 1.203972804325936
@testset "crossentropy" begin @testset "crossentropy" begin
@test crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ crossentropy([0.1,0.9], [0.1,0.9])
@test crossentropy(ŷ, y) ≈ lossvalue @test crossentropy(ŷ, y) ≈ lossvalue
end end
@ -81,53 +49,33 @@ end
@testset "logitbinarycrossentropy" begin @testset "logitbinarycrossentropy" begin
@test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0) @test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0)
end end
y = [1 2 3] y = [1 2 3]
ŷ = [4.0 5.0 6.0] y1 = [4.0 5.0 6.0]
@testset "kldivergence" begin @testset "kldivergence" begin
@test Flux.kldivergence([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ Flux.kldivergence([0.1,0.9], [0.1,0.9]) @test Flux.kldivergence(y, y1) ≈ 4.761838062403337
@test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457 @test Flux.kldivergence(y, y) ≈ 0
@test Flux.kldivergence(y, y) ≈ 0
end end
y = [1 2 3 4] y = [1 2 3 4]
ŷ = [5.0 6.0 7.0 8.0] y1 = [5.0 6.0 7.0 8.0]
@testset "hinge" begin @testset "hinge" begin
@test Flux.hinge(ŷ, y) ≈ 0 @test Flux.hinge(y, y1) ≈ 0
@test Flux.hinge(y, 0.5 .* y) ≈ 0.125 @test Flux.hinge(y, 0.5 .* y) ≈ 0.125
end end
@testset "squared_hinge" begin
@test Flux.squared_hinge(ŷ, y) ≈ 0
@test Flux.squared_hinge(y, 0.5 .* y) ≈ 0.0625
end
y = [0.1 0.2 0.3] y = [0.1 0.2 0.3]
ŷ = [0.4 0.5 0.6] y1 = [0.4 0.5 0.6]
@testset "poisson" begin @testset "poisson" begin
@test Flux.poisson(ŷ, y) ≈ 0.6278353988097339 @test Flux.poisson(y, y1) ≈ 1.0160455586700767
@test Flux.poisson(y, y) ≈ 0.5044459776946685 @test Flux.poisson(y, y) ≈ 0.5044459776946685
end end
y = [1.0 0.5 0.3 2.4]
ŷ = [0 1.4 0.5 1.2]
@testset "dice_coeff_loss" begin
@test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999
@test Flux.dice_coeff_loss(y, y) ≈ 0.0
end
@testset "tversky_loss" begin
@test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383
@test Flux.tversky_loss(ŷ, y, β = 0.8) ≈ -0.09490740740740744
@test Flux.tversky_loss(y, y) ≈ -0.5576923076923075
end
@testset "no spurious promotions" begin @testset "no spurious promotions" begin
for T in (Float32, Float64) for T in (Float32, Float64)
y = rand(T, 2) y = rand(T, 2)
ŷ = rand(T, 2) ŷ = rand(T, 2)
for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson, for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson)
Flux.mae, Flux.huber_loss, Flux.msle, Flux.squared_hinge, Flux.dice_coeff_loss, Flux.tversky_loss)
fwd, back = Flux.pullback(f, , y) fwd, back = Flux.pullback(f, , y)
@test fwd isa T @test fwd isa T
@test eltype(back(one(T))[1]) == T @test eltype(back(one(T))[1]) == T
@ -135,10 +83,3 @@ end
end end
end end
end end
@testset "helpers" begin
@testset "flatten" begin
x = randn(Float32, 10, 10, 3, 2)
@test size(flatten(x)) == (300, 2)
end
end

View File

@ -57,57 +57,35 @@ end
end end
@testset "ExpDecay" begin @testset "ExpDecay" begin
@testset "Sanity Check" begin
o = ExpDecay(0.2, 0.5, 1, 1e-3)
p = [0.0]
steps = 1:8
eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip)
eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps]
@test eta_actual == eta_expected
end
w = randn(10, 10)
o = ExpDecay(0.1, 0.1, 1000, 1e-4)
w1 = randn(10,10)
loss(x) = Flux.mse(w*x, w1*x)
flag = 1
decay_steps = []
for t = 1:10^5
prev_eta = o.eta
θ = Params([w1])
x = rand(10)
θ̄ = gradient(() -> loss(x), θ)
prev_grad = collect(θ̄[w1])
delta = Optimise.apply!(o, w1, θ̄[w1])
w1 .-= delta
new_eta = o.eta
if new_eta != prev_eta
push!(decay_steps, t)
end
array = fill(o.eta, size(prev_grad))
if array .* prev_grad != delta
flag = 0
end
end
@test flag == 1
# Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1).
ground_truth = []
for i in 1:4
push!(ground_truth, 1000*i) # Expected decay steps for this example.
end
@test decay_steps == ground_truth
@test o.eta == o.clip
end
@testset "Clipping" begin
w = randn(10, 10) w = randn(10, 10)
loss(x) = sum(w * x) o = ExpDecay(0.1, 0.1, 1000, 1e-4)
θ = Params([w]) w1 = randn(10,10)
x = 1000 * randn(10) loss(x) = Flux.mse(w*x, w1*x)
w̄ = gradient(() -> loss(x), θ)[w] flag = 1
w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄)) decay_steps = []
@test all(w̄_value .<= 1) for t = 1:10^5
w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄)) prev_eta = o.eta
@test norm(w̄_norm) <= 1 θ = Params([w1])
end x = rand(10)
θ̄ = gradient(() -> loss(x), θ)
prev_grad = collect(θ̄[w1])
delta = Optimise.apply!(o, w1, θ̄[w1])
w1 .-= delta
new_eta = o.eta
if new_eta != prev_eta
push!(decay_steps, t)
end
array = fill(o.eta, size(prev_grad))
if array .* prev_grad != delta
flag = 0
end
end
@test flag == 1
# Test to check if decay happens at decay steps. Eta reaches clip value eventually.
ground_truth = []
for i in 1:11
push!(ground_truth, 1000*i) # Expected decay steps for this example.
end
@test decay_steps == ground_truth
@test o.eta == o.clip
end

View File

@ -2,45 +2,48 @@ using Flux
using Flux.Data using Flux.Data
using Test using Test
using Random, Statistics, LinearAlgebra using Random, Statistics, LinearAlgebra
using Documenter
using IterTools: ncycle using IterTools: ncycle
Random.seed!(0) Random.seed!(0)
@testset "Utils" begin @testset "Flux" begin
include("utils.jl")
end
@testset "Onehot" begin @testset "Utils" begin
include("onehot.jl") include("utils.jl")
end end
@testset "Optimise" begin @testset "Onehot" begin
include("optimise.jl") include("onehot.jl")
end end
@testset "Data" begin @testset "Optimise" begin
include("data.jl") include("optimise.jl")
end end
@testset "Layers" begin @testset "Data" begin
include("layers/basic.jl") include("data.jl")
include("layers/normalisation.jl") end
include("layers/stateless.jl")
include("layers/conv.jl") @testset "Layers" begin
end include("layers/basic.jl")
include("layers/normalisation.jl")
@testset "CUDA" begin include("layers/stateless.jl")
if Flux.use_cuda[] include("layers/conv.jl")
include("cuda/cuda.jl") end
else
@warn "CUDA unavailable, not testing GPU support" @testset "CUDA" begin
if Flux.use_cuda[]
include("cuda/cuda.jl")
else
@warn "CUDA unavailable, not testing GPU support"
end
end end
end
@static if VERSION >= v"1.4"
using Documenter
@testset "Docs" begin @testset "Docs" begin
DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true) if VERSION >= v"1.2"
doctest(Flux) doctest(Flux)
end
end end
end
end # testset Flux