Compare commits


No commits in common. "master" and "functors" have entirely different histories.

34 changed files with 262 additions and 915 deletions

View File

@ -1,12 +0,0 @@
[Please delete this text and describe your change here.
For bugfixes, please detail the bug and include a test case which your patch fixes.
If you are adding a new feature, please clearly describe the design, its rationale, the possible alternatives considered.
It is easiest to merge new features when there is clear precedent in other systems; we need to know we're taking
the right direction since it can be hard to change later.]
### PR Checklist
- [ ] Tests are added
- [ ] Entry in NEWS.md
- [ ] Documentation, if applicable
- [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes).

View File

@ -6,8 +6,16 @@ on:
jobs: jobs:
CompatHelper: CompatHelper:
runs-on: ubuntu-latest runs-on: ${{ matrix.os }}
strategy:
matrix:
julia-version: [1.3]
julia-arch: [x64]
os: [ubuntu-latest]
steps: steps:
- uses: julia-actions/setup-julia@latest
with:
version: ${{ matrix.julia-version }}
- name: Pkg.add("CompatHelper") - name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")' run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main() - name: CompatHelper.main()

View File

@ -16,7 +16,7 @@ notifications:
jobs: jobs:
include: include:
- stage: "Documentation" - stage: "Documentation"
julia: 1.3 julia: 1
os: linux os: linux
script: script:
- julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd()));

View File

@ -8,35 +8,35 @@ version = "0.5.0"
[[AbstractTrees]] [[AbstractTrees]]
deps = ["Markdown"] deps = ["Markdown"]
git-tree-sha1 = "33e450545eaf7699da1a6e755f9ea65f14077a45" git-tree-sha1 = "86d092c2599f1f7bb01668bf8eb3412f98d61e47"
uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
version = "0.3.3" version = "0.3.2"
[[Adapt]] [[Adapt]]
deps = ["LinearAlgebra"] deps = ["LinearAlgebra"]
git-tree-sha1 = "fd04049c7dd78cfef0b06cdc1f0f181467655712" git-tree-sha1 = "c88cfc7f9c1f9f8633cddf0b56e86302b70f64c5"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "1.1.0" version = "1.0.1"
[[ArrayLayouts]] [[ArrayLayouts]]
deps = ["FillArrays", "LinearAlgebra"] deps = ["FillArrays", "LinearAlgebra"]
git-tree-sha1 = "a504dca2ac7eda8761c8f7c1ed52427a1be75a3c" git-tree-sha1 = "41956a49a8a4fefa1bf6664bca4a3035aba4c3a0"
uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a" uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
version = "0.2.6" version = "0.2.3"
[[Base64]] [[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinaryProvider]] [[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"] deps = ["Libdl", "SHA"]
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.10" version = "0.5.8"
[[CEnum]] [[CEnum]]
git-tree-sha1 = "1b77a77c3b28e0b3f413f7567c9bb8dd9bdccd14" git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.3.0" version = "0.2.0"
[[CUDAapi]] [[CUDAapi]]
deps = ["Libdl", "Logging"] deps = ["Libdl", "Logging"]
@ -46,21 +46,21 @@ version = "4.0.0"
[[CUDAdrv]] [[CUDAdrv]]
deps = ["CEnum", "CUDAapi", "Printf"] deps = ["CEnum", "CUDAapi", "Printf"]
git-tree-sha1 = "f56bbf18c86bcff7a961a32a4947a5abb2963a29" git-tree-sha1 = "e650cbaee92b60433313157926b1e80d0c3a0e2e"
uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
version = "6.3.0" version = "6.2.2"
[[CUDAnative]] [[CUDAnative]]
deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "ExprTools", "GPUCompiler", "LLVM", "Libdl", "Pkg", "Printf"] deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "MacroTools", "Pkg", "Printf", "TimerOutputs"]
git-tree-sha1 = "ac86db2b05fdfec96b011e25a504ffe7476e8a68" git-tree-sha1 = "d1fc99635d0002c8a819b78cb1f441eb44310725"
uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
version = "3.1.0" version = "3.0.2"
[[CodeTracking]] [[CodeTracking]]
deps = ["InteractiveUtils", "UUIDs"] deps = ["InteractiveUtils", "UUIDs"]
git-tree-sha1 = "cab4da992adc0a64f63fa30d2db2fd8bec40cab4" git-tree-sha1 = "0becdab7e6fbbcb7b88d8de5b72e5bb2f28239f3"
uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
version = "0.5.11" version = "0.5.8"
[[CodecZlib]] [[CodecZlib]]
deps = ["TranscodingStreams", "Zlib_jll"] deps = ["TranscodingStreams", "Zlib_jll"]
@ -70,15 +70,15 @@ version = "0.7.0"
[[ColorTypes]] [[ColorTypes]]
deps = ["FixedPointNumbers", "Random"] deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "c73d9cfc2a9d8433dc77f5bff4bddf46b1d78c20" git-tree-sha1 = "c4c1cca28748906265ed62c788d6fe6f0134d264"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.10.3" version = "0.10.0"
[[Colors]] [[Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"] deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"]
git-tree-sha1 = "1e9bba7984e78aa8cdeea7f9f7cc984ad4e4b1c7" git-tree-sha1 = "2fdeb981ebcf52cd800ddb6a0aa5eac34153552d"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.2" version = "0.12.0"
[[CommonSubexpressions]] [[CommonSubexpressions]]
deps = ["Test"] deps = ["Test"]
@ -93,27 +93,27 @@ uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "0.3.3+0" version = "0.3.3+0"
[[Cthulhu]] [[Cthulhu]]
deps = ["CodeTracking", "InteractiveUtils", "REPL", "UUIDs", "Unicode"] deps = ["CodeTracking", "InteractiveUtils", "REPL", "Unicode"]
git-tree-sha1 = "f3643e78353199d3097821e806348bd83f364155" git-tree-sha1 = "484790098c85c26f8e59051f8ff1a0745c034a7d"
uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f" uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f"
version = "1.1.1" version = "1.0.1"
[[CuArrays]] [[CuArrays]]
deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"]
git-tree-sha1 = "1582b74d2322df7dd94549d4ac9d095e0f20e884" git-tree-sha1 = "e8c55b38dcca955f5aed8ec4479cdc95810db1e1"
uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
version = "2.2.1" version = "2.0.1"
[[DataAPI]] [[DataAPI]]
git-tree-sha1 = "176e23402d80e7743fc26c19c681bfb11246af32" git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.3.0" version = "1.1.0"
[[DataStructures]] [[DataStructures]]
deps = ["InteractiveUtils", "OrderedCollections"] deps = ["InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "af6d9c86e191c917c2276fbede1137e8ea20157f" git-tree-sha1 = "73eb18320fe3ba58790c8b8f6f89420f0a622773"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.17.17" version = "0.17.11"
[[Dates]] [[Dates]]
deps = ["Printf"] deps = ["Printf"]
@ -139,16 +139,11 @@ version = "1.0.1"
deps = ["Random", "Serialization", "Sockets"] deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[ExprTools]]
git-tree-sha1 = "6f0517056812fd6aa3af23d4b70d5325a2ae4e95"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.1"
[[FillArrays]] [[FillArrays]]
deps = ["LinearAlgebra", "Random", "SparseArrays"] deps = ["LinearAlgebra", "Random", "SparseArrays"]
git-tree-sha1 = "44f561e293987ffc84272cd3d2b14b0b93123d63" git-tree-sha1 = "51cc2f9bc4eb9c6c0e81ec2f779d1085583cc956"
uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
version = "0.8.10" version = "0.8.7"
[[FixedPointNumbers]] [[FixedPointNumbers]]
git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238" git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238"
@ -167,27 +162,17 @@ git-tree-sha1 = "f40adc6422f548176bb4351ebd29e4abf773040a"
uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196" uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
version = "0.1.0" version = "0.1.0"
[[Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[GPUArrays]] [[GPUArrays]]
deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
git-tree-sha1 = "d887693eb1bd5e1fd573262a978745481895ec7d" git-tree-sha1 = "d586762b08dcda13228df8967119b9cb6f22ade5"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "3.4.1" version = "3.1.0"
[[GPUCompiler]]
deps = ["Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "TimerOutputs"]
git-tree-sha1 = "5275aa268ecd09640b32560e1eae90c78816e4d1"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.2.0"
[[IRTools]] [[IRTools]]
deps = ["InteractiveUtils", "MacroTools", "Test"] deps = ["InteractiveUtils", "MacroTools", "Test"]
git-tree-sha1 = "90ee39f9beaaa186e4968417ea2b8ed5673c91c0" git-tree-sha1 = "1a4355e4b5b50be2311ebb644f34f3306dbd0410"
uuid = "7869d1d1-7146-5819-86e3-90919afe41df" uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
version = "0.3.3" version = "0.3.1"
[[InteractiveUtils]] [[InteractiveUtils]]
deps = ["Markdown"] deps = ["Markdown"]
@ -195,15 +180,15 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[Juno]] [[Juno]]
deps = ["Base64", "Logging", "Media", "Profile"] deps = ["Base64", "Logging", "Media", "Profile"]
git-tree-sha1 = "a686b0cf235fa3e491b79b4783c2d2382292b436" git-tree-sha1 = "e1ba2a612645b3e07c773c3a208f215745081fe6"
uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
version = "0.8.2" version = "0.8.1"
[[LLVM]] [[LLVM]]
deps = ["CEnum", "Libdl", "Printf", "Unicode"] deps = ["CEnum", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "dd3f584c3dbefe39b2a8fbafa1a3b77e31e21255" git-tree-sha1 = "b6b86801ae2f2682e0a4889315dc76b68db2de71"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0" uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "1.5.1" version = "1.3.4"
[[LibGit2]] [[LibGit2]]
deps = ["Printf"] deps = ["Printf"]
@ -262,9 +247,10 @@ uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.3+3" version = "0.5.3+3"
[[OrderedCollections]] [[OrderedCollections]]
git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3" deps = ["Random", "Serialization", "Test"]
git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.2.0" version = "1.1.0"
[[Pkg]] [[Pkg]]
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
@ -319,15 +305,15 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]] [[SpecialFunctions]]
deps = ["OpenSpecFun_jll"] deps = ["OpenSpecFun_jll"]
git-tree-sha1 = "d8d8b8a9f4119829410ecd706da4cc8594a1e020" git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b" uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "0.10.3" version = "0.10.0"
[[StaticArrays]] [[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"] deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "5c06c0aeb81bef54aed4b3f446847905eb6cbda0" git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c"
uuid = "90137ffa-7385-5640-81b9-e52037218182" uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "0.12.3" version = "0.12.1"
[[Statistics]] [[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"] deps = ["LinearAlgebra", "SparseArrays"]
@ -345,9 +331,9 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TimerOutputs]] [[TimerOutputs]]
deps = ["Printf"] deps = ["Printf"]
git-tree-sha1 = "f458ca23ff80e46a630922c555d838303e4b9603" git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.6" version = "0.5.3"
[[TranscodingStreams]] [[TranscodingStreams]]
deps = ["Random", "Test"] deps = ["Random", "Test"]
@ -364,21 +350,21 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[ZipFile]] [[ZipFile]]
deps = ["Libdl", "Printf", "Zlib_jll"] deps = ["Libdl", "Printf", "Zlib_jll"]
git-tree-sha1 = "254975fef2fc526583bb9b7c9420fe66ffe09f2f" git-tree-sha1 = "8748302cfdec02c4ae9c97b112cf10003f7f767f"
uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
version = "0.9.2" version = "0.9.1"
[[Zlib_jll]] [[Zlib_jll]]
deps = ["Libdl", "Pkg"] deps = ["Libdl", "Pkg"]
git-tree-sha1 = "a2e0d558f6031002e380a90613b199e37a8565bf" git-tree-sha1 = "2f6c3e15e20e036ee0a0965879b31442b7ec50fa"
uuid = "83775a58-1f1d-513f-b197-d71354ab007a" uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.11+10" version = "1.2.11+9"
[[Zygote]] [[Zygote]]
deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "Future", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
git-tree-sha1 = "707ceea58e2bd0ff3077ab13a92f8355181d3ee4" git-tree-sha1 = "1ccbfbe8930376e31752b812daa2532c723dc332"
uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
version = "0.4.20" version = "0.4.13"
[[ZygoteRules]] [[ZygoteRules]]
deps = ["MacroTools"] deps = ["MacroTools"]

NEWS.md
View File

@ -1,19 +1,3 @@
# v0.11
* Change to `DataLoader`'s constructor [https://github.com/FluxML/Flux.jl/pull/1152]
* Use `DataLoader` with `NamedTuple`s, so that tensors can be accessed by name [https://github.com/FluxML/Flux.jl/pull/1221].
* Error if a Dense layer's weights and biases are not arrays [https://github.com/FluxML/Flux.jl/pull/1218].
# v0.10.5
* Add option for [same padding](https://github.com/FluxML/Flux.jl/pull/901) to conv and pooling layers by setting `pad=SamePad()`.
* Added option to set `bias` to [Flux.Zeros](https://github.com/FluxML/Flux.jl/pull/873) to exclude `bias` from being trained.
* Added `GlobalMaxPool` and `GlobalMeanPool` [layers](https://github.com/FluxML/Flux.jl/pull/950) for performing global pooling operations.
* Added `ClipValue` and `ClipNorm` in this [pr](https://github.com/FluxML/Flux.jl/pull/1133) to `Flux.Optimise` to provide a cleaner API for gradient clipping.
* Added new kwarg-only [constructors](https://github.com/FluxML/Flux.jl/pull/873) for the various convolutional layers.
* Documented the convolutional layer constructors accepting `weight` and `bias` keyword arguments to supply custom arrays for those fields.
* Testing suite improvements now test for gradients of all layers along with GPU support.
* Functors have now moved to [Functors.jl](https://github.com/FluxML/Flux.jl/pull/1174) to allow for their use outside of Flux.
* Added [helper functions](https://github.com/FluxML/Flux.jl/pull/873) `Flux.convfilter` and `Flux.depthwiseconvfilter` to construct weight arrays for convolutions outside of layer constructors so as to not have to depend on the default layers for custom implementations.
# v0.10.0 # v0.10.0
* The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669) * The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669)
- The dependency on Tracker.jl has been removed. - The dependency on Tracker.jl has been removed.

View File

@ -1,6 +1,6 @@
name = "Flux" name = "Flux"
uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
version = "0.11.0-DEV" version = "0.10.4"
[deps] [deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@ -11,7 +11,6 @@ CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d" Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@ -27,11 +26,10 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
[compat] [compat]
AbstractTrees = "0.2, 0.3" AbstractTrees = "0.2, 0.3"
Adapt = "1, 2.0" Adapt = "1"
CodecZlib = "0.5, 0.6, 0.7" CodecZlib = "0.5, 0.6, 0.7"
Colors = "0.8, 0.9, 0.10, 0.11, 0.12" Colors = "0.8, 0.9, 0.10, 0.11, 0.12"
CuArrays = "2" CuArrays = "2"
Functors = "0.1"
Juno = "0.5, 0.6, 0.7, 0.8" Juno = "0.5, 0.6, 0.7, 0.8"
MacroTools = "0.3, 0.4, 0.5" MacroTools = "0.3, 0.4, 0.5"
NNlib = "0.6" NNlib = "0.6"

View File

@ -7,15 +7,15 @@ julia> using Flux: onehot, onecold
julia> onehot(:b, [:a, :b, :c]) julia> onehot(:b, [:a, :b, :c])
3-element Flux.OneHotVector: 3-element Flux.OneHotVector:
0 false
1 true
0 false
julia> onehot(:c, [:a, :b, :c]) julia> onehot(:c, [:a, :b, :c])
3-element Flux.OneHotVector: 3-element Flux.OneHotVector:
0 false
0 false
1 true
``` ```
The inverse is `onecold` (which can take a general probability distribution, as well as just booleans). The inverse is `onecold` (which can take a general probability distribution, as well as just booleans).
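As a quick illustration of the inverse mentioned above, a minimal sketch (not part of this diff) assuming the same `onehot`/`onecold` imports used in the doctest:
```julia
julia> using Flux: onehot, onecold

julia> onecold(onehot(:b, [:a, :b, :c]), [:a, :b, :c])
:b

julia> onecold([0.1, 0.7, 0.2], [:a, :b, :c])  # also works on a probability vector
:b
```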

View File

@ -19,7 +19,7 @@ Affine{Array{Float64,2},Array{Float64,1}}([0.66722 0.774872 0.249809; 0.843321 0
julia> Flux.params(a) # default behavior julia> Flux.params(a) # default behavior
Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]]) Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]])
julia> Flux.trainable(a::Affine) = (a.W,) julia> Flux.trainable(a::Affine) = (a.W, a.b,)
julia> Flux.params(a) julia> Flux.params(a)
Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297]]) Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297]])
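For context, a minimal sketch of how `trainable` interacts with `params`; the `Affine` layer is assumed from the surrounding guide and is not part of this diff:
```julia
using Flux

struct Affine
  W
  b
end
(a::Affine)(x) = a.W * x .+ a.b
Flux.@functor Affine                 # both fields participate in fmap/gpu/cpu

Flux.trainable(a::Affine) = (a.W,)   # but only W is collected for training

a = Affine(rand(3, 3), rand(3))
length(Flux.params(a))               # == 1: only the weight matrix
```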

View File

@ -32,6 +32,8 @@ julia> gradient(f, [2, 1], [2, 0])
But machine learning models can have *hundreds* of parameters! To handle this, Flux lets you work with collections of parameters, via `params`. You can get the gradient of all parameters used in a program without explicitly passing them in. But machine learning models can have *hundreds* of parameters! To handle this, Flux lets you work with collections of parameters, via `params`. You can get the gradient of all parameters used in a program without explicitly passing them in.
```jldoctest basics ```jldoctest basics
julia> using Flux
julia> x = [2, 1]; julia> x = [2, 1];
julia> y = [2, 0]; julia> y = [2, 0];
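A short, self-contained sketch of the implicit-parameter style this doctest introduces; `W`, `b` and `predict` are placeholder names, not part of the diff:
```julia
using Flux

W = rand(2, 5)
b = rand(2)
predict(x) = W * x .+ b
loss(x, y) = sum((predict(x) .- y) .^ 2)

x, y = rand(5), rand(2)
gs = gradient(() -> loss(x, y), params(W, b))  # no explicit parameter passing
gs[W]                                          # gradient of the loss w.r.t. W
```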

View File

@ -20,11 +20,7 @@ GlobalMeanPool
DepthwiseConv DepthwiseConv
ConvTranspose ConvTranspose
CrossCor CrossCor
SamePad
flatten flatten
Flux.Zeros
Flux.convfilter
Flux.depthwiseconvfilter
``` ```
## Recurrent Layers ## Recurrent Layers

View File

@ -39,7 +39,7 @@ E.g. the following will have run into the same problem as above:
leaky_tanh(x) = 0.01*x + tanh(x) leaky_tanh(x) = 0.01*x + tanh(x)
``` ```
While one could change the activation function (e.g. to use `0.01f0*x`), the idiomatic (and safe) way to avoid type casts whenever inputs change is to use `oftype`: While one could change the activation function (e.g. to use `0.01f0x`), the idiomatic (and safe) way to avoid type casts whenever inputs change is to use `oftype`:
``` ```
leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x) leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x)
``` ```
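A rough check of the promotion behaviour described above, assuming a `Float32` input (illustrative only):
```julia
leaky_tanh_bad(x)  = 0.01 * x + tanh(x)                 # 0.01 is Float64, so the result promotes
leaky_tanh_good(x) = oftype(x / 1, 0.01) * x + tanh(x)  # constant converted to the input's float type

x = 0.5f0
typeof(leaky_tanh_bad(x))   # Float64
typeof(leaky_tanh_good(x))  # Float32
```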

View File

@ -80,7 +80,7 @@ Momentum(eta::Real, rho::Real) = Momentum(eta, rho, IdDict())
The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked. The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked.
```julia ```julia
function Flux.Optimise.apply!(o::Momentum, x, Δ) function apply!(o::Momentum, x, Δ)
η, ρ = o.eta, o.rho η, ρ = o.eta, o.rho
v = get!(o.velocity, x, zero(x))::typeof(x) v = get!(o.velocity, x, zero(x))::typeof(x)
@. v = ρ * v - η * Δ @. v = ρ * v - η * Δ
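As an aside, a sketch of how an optimiser with this `apply!` interface is driven; it uses Flux's built-in `Momentum`, which takes the same `(eta, rho)` arguments as the hand-written one above, and the model and loss here are placeholders, not part of the diff:
```julia
using Flux

m = Dense(5, 2)
loss(x, y) = Flux.mse(m(x), y)
x, y = rand(Float32, 5, 10), rand(Float32, 2, 10)

opt = Momentum(0.01, 0.9)
ps  = params(m)
gs  = gradient(() -> loss(x, y), ps)
Flux.Optimise.update!(opt, ps, gs)   # calls apply!(opt, p, gs[p]) for each parameter p
```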
@ -140,16 +140,3 @@ ExpDecay
InvDecay InvDecay
WeightDecay WeightDecay
``` ```
## Gradient Clipping
Gradient clipping is useful for training recurrent neural networks, which have a tendency to suffer from the exploding gradient problem. An example usage is
```julia
opt = Optimiser(ClipValue(1e-3), ADAM(1e-3))
```
```@docs
ClipValue
ClipNorm
```

View File

@ -3,15 +3,14 @@ module Flux
# Zero Flux Given # Zero Flux Given
using Base: tail using Base: tail
using Statistics, Random, LinearAlgebra using Zygote, MacroTools, Juno, Reexport, Statistics, Random
using Zygote, MacroTools, Juno, Reexport
using MacroTools: @forward using MacroTools: @forward
@reexport using NNlib @reexport using NNlib
using Zygote: Params, @adjoint, gradient, pullback, @nograd using Zygote: Params, @adjoint, gradient, pullback, @nograd
export gradient export gradient
export Chain, Dense, Maxout, RNN, LSTM, GRU, SamePad, Conv, CrossCor, ConvTranspose, export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose,
GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, flatten, GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, flatten,
DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode! SkipConnection, params, fmap, cpu, gpu, f32, f64, testmode!, trainmode!
@ -19,17 +18,15 @@ export Chain, Dense, Maxout, RNN, LSTM, GRU, SamePad, Conv, CrossCor, ConvTransp
include("optimise/Optimise.jl") include("optimise/Optimise.jl")
using .Optimise using .Optimise
using .Optimise: @epochs using .Optimise: @epochs
export Descent, ADAM, Momentum, Nesterov, RMSProp, export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay, ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay
ClipValue, ClipNorm
using CuArrays using CuArrays
const use_cuda = Ref(false) const use_cuda = Ref(false)
include("utils.jl") include("utils.jl")
include("zeros.jl")
include("onehot.jl") include("onehot.jl")
include("functor.jl") include("functor.jl")

View File

@ -51,6 +51,4 @@ export Iris
include("housing.jl") include("housing.jl")
export Housing export Housing
@deprecate DataLoader(x...; kws...) DataLoader(x; kws...)
end end

View File

@ -1,7 +1,7 @@
# Adapted from Knet's src/data.jl (author: Deniz Yuret) # Adapted from Knet's src/data.jl (author: Deniz Yuret)
struct DataLoader{D} struct DataLoader
data::D data
batchsize::Int batchsize::Int
nobs::Int nobs::Int
partial::Bool partial::Bool
@ -11,20 +11,21 @@ struct DataLoader{D}
end end
""" """
DataLoader(data; batchsize=1, shuffle=false, partial=true) DataLoader(data...; batchsize=1, shuffle=false, partial=true)
An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations
(except possibly the last one). (except possibly the last one).
Takes as input a single data tensor, or a tuple (or a named tuple) of tensors. Takes as input one or more data tensors, e.g. X in unsupervised learning, X and Y in
The last dimension in each tensor is considered to be the observation dimension. supervised learning. The last dimension in each tensor is considered to be the observation
dimension.
If `shuffle=true`, shuffles the observations each time iterations are re-started. If `shuffle=true`, shuffles the observations each time iterations are re-started.
If `partial=false`, drops the last mini-batch if it is smaller than the batchsize. If `partial=false`, drops the last mini-batch if it is smaller than the batchsize.
The original data is preserved in the `data` field of the DataLoader. The original data is preserved as a tuple in the `data` field of the DataLoader.
Usage example: Example usage:
Xtrain = rand(10, 100) Xtrain = rand(10, 100)
train_loader = DataLoader(Xtrain, batchsize=2) train_loader = DataLoader(Xtrain, batchsize=2)
@ -36,16 +37,9 @@ Usage example:
train_loader.data # original dataset train_loader.data # original dataset
# similar, but yielding tuples
train_loader = DataLoader((Xtrain,), batchsize=2)
for (x,) in train_loader
@assert size(x) == (10, 2)
...
end
Xtrain = rand(10, 100) Xtrain = rand(10, 100)
Ytrain = rand(100) Ytrain = rand(100)
train_loader = DataLoader((Xtrain, Ytrain), batchsize=2, shuffle=true) train_loader = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true)
for epoch in 1:100 for epoch in 1:100
for (x, y) in train_loader for (x, y) in train_loader
@assert size(x) == (10, 2) @assert size(x) == (10, 2)
@ -57,25 +51,25 @@ Usage example:
# train for 10 epochs # train for 10 epochs
using IterTools: ncycle using IterTools: ncycle
Flux.train!(loss, ps, ncycle(train_loader, 10), opt) Flux.train!(loss, ps, ncycle(train_loader, 10), opt)
# can use NamedTuple to name tensors
train_loader = DataLoader((images=Xtrain, labels=Ytrain), batchsize=2, shuffle=true)
for datum in train_loader
@assert size(datum.images) == (10, 2)
@assert size(datum.labels) == (2,)
end
""" """
function DataLoader(data; batchsize=1, shuffle=false, partial=true) function DataLoader(data...; batchsize=1, shuffle=false, partial=true)
length(data) > 0 || throw(ArgumentError("Need at least one data input"))
batchsize > 0 || throw(ArgumentError("Need positive batchsize")) batchsize > 0 || throw(ArgumentError("Need positive batchsize"))
n = _nobs(data) nx = size(data[1])[end]
if n < batchsize for i=2:length(data)
@warn "Number of observations less than batchsize, decreasing the batchsize to $n" nx != size(data[i])[end] && throw(DimensionMismatch("All data should contain same number of observations"))
batchsize = n
end end
imax = partial ? n : n - batchsize + 1 if nx < batchsize
DataLoader(data, batchsize, n, partial, imax, [1:n;], shuffle) @warn "Number of data points less than batchsize, decreasing the batchsize to $nx"
batchsize = nx
end end
imax = partial ? nx : nx - batchsize + 1
ids = 1:min(nx, batchsize)
DataLoader(data, batchsize, nx, partial, imax, [1:nx;], shuffle)
end
getdata(x::AbstractArray, ids) = x[(Base.Colon() for _=1:ndims(x)-1)..., ids]
@propagate_inbounds function Base.iterate(d::DataLoader, i=0) # returns data in d.indices[i+1:i+batchsize] @propagate_inbounds function Base.iterate(d::DataLoader, i=0) # returns data in d.indices[i+1:i+batchsize]
i >= d.imax && return nothing i >= d.imax && return nothing
@ -84,7 +78,11 @@ end
end end
nexti = min(i + d.batchsize, d.nobs) nexti = min(i + d.batchsize, d.nobs)
ids = d.indices[i+1:nexti] ids = d.indices[i+1:nexti]
batch = _getobs(d.data, ids) if length(d.data) == 1
batch = getdata(d.data[1], ids)
else
batch = ((getdata(x, ids) for x in d.data)...,)
end
return (batch, nexti) return (batch, nexti)
end end
@ -92,19 +90,3 @@ function Base.length(d::DataLoader)
n = d.nobs / d.batchsize n = d.nobs / d.batchsize
d.partial ? ceil(Int,n) : floor(Int,n) d.partial ? ceil(Int,n) : floor(Int,n)
end end
_nobs(data::AbstractArray) = size(data)[end]
function _nobs(data::Union{Tuple, NamedTuple})
length(data) > 0 || throw(ArgumentError("Need at least one data input"))
n = _nobs(data[1])
if !all(x -> _nobs(x) == n, Base.tail(data))
throw(DimensionMismatch("All data should contain same number of observations"))
end
return n
end
_getobs(data::AbstractArray, i) = data[ntuple(i -> Colon(), Val(ndims(data) - 1))..., i]
_getobs(data::Union{Tuple, NamedTuple}, i) = map(Base.Fix2(_getobs, i), data)
Base.eltype(::DataLoader{D}) where D = D
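For orientation, a sketch of the `DataLoader` API on the master side of this diff, where the constructor takes one array, a tuple, or a `NamedTuple` (illustrative, assuming that version of `dataloader.jl`):
```julia
using Flux: DataLoader

X = rand(10, 100)
Y = rand(100)

dl = DataLoader(X, batchsize=25)                       # single array: batches are arrays
size(first(dl))                                        # (10, 25)

dl2 = DataLoader((X, Y), batchsize=25, shuffle=true)   # tuple: batches are tuples
x, y = first(dl2)
size(x), size(y)                                       # ((10, 25), (25,))

dl3 = DataLoader((images=X, labels=Y), batchsize=25)   # NamedTuple: access batches by name
size(first(dl3).images)                                # (10, 25)
```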

View File

@ -24,7 +24,7 @@ testmode!(m, mode = true) = m
trainmode!(m, mode = true) trainmode!(m, mode = true)
Set a layer of model's train mode (see below). Set a layer of model's train mode (see below).
Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`). Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)).
_Note_: if you manually set a model into train mode, you need to manually place _Note_: if you manually set a model into train mode, you need to manually place
it into test mode during testing phase. it into test mode during testing phase.
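For reference, a small sketch of the calls this docstring pair describes (illustrative, not part of the diff):
```julia
using Flux

m = Chain(Dense(10, 5), Dropout(0.5), Dense(5, 2))

Flux.testmode!(m)    # whole model in test mode: dropout disabled
Flux.trainmode!(m)   # back to train mode; equivalent to testmode!(m, false)
```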

View File

@ -102,7 +102,7 @@ julia> d(rand(5))
-0.16210233 -0.16210233
0.12311903``` 0.12311903```
""" """
struct Dense{F,S<:AbstractArray,T<:AbstractArray} struct Dense{F,S,T}
W::S W::S
b::T b::T
σ::F σ::F
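The master side of this hunk constrains `Dense`'s fields to arrays. A sketch of the explicit-array constructor that remains valid either way (illustrative, not part of the diff):
```julia
using Flux

W = rand(Float32, 3, 5)
b = rand(Float32, 3)
d = Dense(W, b, tanh)     # fine: W and b are AbstractArrays
d(rand(Float32, 5))       # 3-element output

# With the constrained struct, a call like Dense(1.0, 2.0) fails with a
# MethodError at construction time instead of breaking later in the forward pass.
```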

View File

@ -7,59 +7,26 @@ _convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+
expand(N, i::Tuple) = i expand(N, i::Tuple) = i
expand(N, i::Integer) = ntuple(_ -> i, N) expand(N, i::Integer) = ntuple(_ -> i, N)
""" """
SamePad Conv(size, in => out, σ = identity; init = glorot_uniform,
Padding for convolutional layers will be calculated so that outputshape == inputshape when stride = 1.
For stride > 1 the output shape depends on the type of convolution layer.
"""
struct SamePad end
calc_padding(pad, k::NTuple{N,T}, dilation, stride) where {T,N}= expand(Val(2*N), pad)
function calc_padding(::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T}
#Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/pdf/1603.07285
# Effective kernel size, including dilation
k_eff = @. k + (k - 1) * (dilation - 1)
# How much total padding needs to be applied?
pad_amt = @. k_eff - 1
# In case amount of padding is odd we need to apply different amounts to each side.
return Tuple(mapfoldl(i -> [ceil(Int, i/2), floor(Int, i/2)], vcat, pad_amt))
end
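A worked instance of the `SamePad` arithmetic above, assuming a 3×5 kernel with no dilation (illustrative, not part of the diff):
```julia
k, dilation = (3, 5), (1, 1)

k_eff   = @. k + (k - 1) * (dilation - 1)   # effective kernel size: (3, 5)
pad_amt = @. k_eff - 1                      # total padding needed:  (2, 4)

# split each total into (ceil, floor) halves, one pair per spatial dimension
pad = Tuple(mapfoldl(i -> [cld(i, 2), fld(i, 2)], vcat, pad_amt))   # (1, 1, 2, 2)
```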
"""
Conv(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1) stride = 1, pad = 0, dilation = 1)
filter = (2,2) Standard convolutional layer. `size` should be a tuple like `(2, 2)`.
in = 1
out = 16
Conv((2, 2), 1=>16, relu)
Standard convolutional layer. `filter` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
Data should be stored in WHCN order (width, height, # channels, batch size). Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array, In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array. and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
# Examples # Examples
Apply a `Conv` layer to a 1-channel input using a 2×2 window filter size, giving us a Apply a `Conv` layer to a 1-channel input using a 2×2 window size, giving us a
16-channel output. Output is activated with ReLU. 16-channel output. Output is activated with ReLU.
```julia ```julia
filter = (2,2) size = (2,2)
in = 1 in = 1
out = 16 out = 16
Conv(filter, in => out, relu) Conv(size, in => out, relu)
``` ```
""" """
struct Conv{N,M,F,A,V} struct Conv{N,M,F,A,V}
@ -71,68 +38,25 @@ struct Conv{N,M,F,A,V}
dilation::NTuple{N,Int} dilation::NTuple{N,Int}
end end
""" function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
Conv(weight::AbstractArray, bias::AbstractArray)
Conv(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the convolutional layer with user defined weight and bias arrays.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
There is also a keyword-only constructor available for all convolutional
layers.
```julia
weight = rand(Float32, 3, 3, 5)
bias = zeros(Float32, 5)
Conv(weight = weight,
bias = bias,
σ = sigmoid)
```
"""
function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N} stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride) stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation) dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return Conv(σ, w, b, stride, pad, dilation) return Conv(σ, w, b, stride, pad, dilation)
end end
function Conv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) Conv(init(k..., ch...), zeros(ch[2]), σ,
end
"""
convfilter(filter::Tuple, in=>out)
Constructs a standard convolutional weight matrix with given `filter` and
channels from `in` to `out`.
Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
distribution.
See also: [`depthwiseconvfilter`](@ref)
"""
convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
init = glorot_uniform) where N = init(filter..., ch...)
function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
Conv(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation) stride = stride, pad = pad, dilation = dilation)
end
@functor Conv @functor Conv
function (c::Conv)(x::AbstractArray) function (c::Conv)(x::AbstractArray)
# TODO: breaks gpu broadcast :( # TODO: breaks gpu broadcast :(
# ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
σ, b = c.σ, reshape(c.bias, ntuple(_->1, length(c.stride))..., :, 1) σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation) cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
σ.(conv(x, c.weight, cdims) .+ b) σ.(conv(x, c.weight, cdims) .+ b)
end end
@ -166,23 +90,15 @@ outdims(l::Conv, isize) =
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
""" """
ConvTranspose(filter, in=>out) ConvTranspose(size, in => out, σ = identity; init = glorot_uniform,
ConvTranspose(filter, in=>out, activation)
ConvTranspose(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1) stride = 1, pad = 0, dilation = 1)
Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`. Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
Data should be stored in WHCN order (width, height, # channels, batch size). Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array, In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array. and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == stride * inputsize - stride + 1.
""" """
struct ConvTranspose{N,M,F,A,V} struct ConvTranspose{N,M,F,A,V}
σ::F σ::F
@ -193,39 +109,18 @@ struct ConvTranspose{N,M,F,A,V}
dilation::NTuple{N,Int} dilation::NTuple{N,Int}
end end
""" function ConvTranspose(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
ConvTranspose(weight::AbstractArray, bias::AbstractArray)
ConvTranspose(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the convolutional transpose layer with user defined weight and bias arrays used for the forward pass.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
For the keyword-only constructor, see also [`Conv`](@ref)
"""
function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N} stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride) stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation) dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return ConvTranspose(σ, w, b, stride, pad, dilation) return ConvTranspose(σ, w, b, stride, pad, dilation)
end end
function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
ConvTranspose(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) ConvTranspose(init(k..., reverse(ch)...), zeros(ch[2]), σ,
end
function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N
ConvTranspose(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation) stride = stride, pad = pad, dilation = dilation)
end
@functor ConvTranspose @functor ConvTranspose
@ -250,7 +145,7 @@ function (c::ConvTranspose)(x::AbstractArray)
# ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
cdims = conv_transpose_dims(c, x) cdims = conv_transpose_dims(c, x)
σ.(∇conv_data(x, c.weight, cdims) .+ b) return σ.(∇conv_data(x, c.weight, cdims) .+ b)
end end
function Base.show(io::IO, l::ConvTranspose) function Base.show(io::IO, l::ConvTranspose)
@ -269,24 +164,16 @@ end
outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad) outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad)
""" """
DepthwiseConv(filter::Tuple, in=>out) DepthwiseConv(size, in => out, σ = identity; init = glorot_uniform,
DepthwiseConv(filter::Tuple, in=>out, activation)
DepthwiseConv(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1) stride = 1, pad = 0, dilation = 1)
Depthwise convolutional layer. `filter` should be a tuple like `(2, 2)`. Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
Note that `out` must be an integer multiple of `in`. Note that `out` must be an integer multiple of `in`.
Data should be stored in WHCN order (width, height, # channels, batch size). Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array, In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array. and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
""" """
struct DepthwiseConv{N,M,F,A,V} struct DepthwiseConv{N,M,F,A,V}
σ::F σ::F
@ -297,54 +184,20 @@ struct DepthwiseConv{N,M,F,A,V}
dilation::NTuple{N,Int} dilation::NTuple{N,Int}
end end
""" function DepthwiseConv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
DepthwiseConv(weight::AbstractArray, bias::AbstractArray)
DepthwiseConv(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the `DepthwiseConv` layer with user defined weight and bias arrays used for the forward pass.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
For the keyword-only constructor, see also [`Conv`](@ref)
"""
function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N} stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride) stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation) dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return DepthwiseConv(σ, w, b, stride, pad, dilation) return DepthwiseConv(σ, w, b, stride, pad, dilation)
end end
function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
"""
depthwiseconvfilter(filter::Tuple, in=>out)
Constructs a depthwise convolutional weight array defined by `filter` and channels
from `in` to `out`.
Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
distribution.
See also: [`convfilter`](@ref)
"""
depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1])
function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1, init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N
weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N
@assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels" @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels"
return DepthwiseConv( return DepthwiseConv(
weight, init(k..., div(ch[2], ch[1]), ch[1]),
bias, zeros(ch[2]),
σ; σ;
stride = stride, stride = stride,
pad = pad, pad = pad,
@ -377,30 +230,22 @@ outdims(l::DepthwiseConv, isize) =
output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation)) output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
""" """
CrossCor(filter, in=>out) CrossCor(size, in => out, σ = identity; init = glorot_uniform,
CrossCor(filter, in=>out, activation)
CrossCor(filter, in => out, σ = identity; init = glorot_uniform,
stride = 1, pad = 0, dilation = 1) stride = 1, pad = 0, dilation = 1)
Standard cross convolutional layer. `filter` should be a tuple like `(2, 2)`. Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
Data should be stored in WHCN order (width, height, # channels, batch size). Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array, In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array. and a batch of 50 would be a `100×100×3×50` array.
Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
# Examples # Examples
Apply a `CrossCor` layer to a 1-channel input using a 2×2 window filter size, giving us a Apply a `CrossCor` layer to a 1-channel input using a 2×2 window size, giving us a
16-channel output. Output is activated with ReLU. 16-channel output. Output is activated with ReLU.
```julia ```julia
filter = (2,2) size = (2,2)
in = 1 in = 1
out = 16 out = 16
CrossCor((2, 2), 1=>16, relu) CrossCor((2, 2), 1=>16, relu)
@ -415,39 +260,18 @@ struct CrossCor{N,M,F,A,V}
dilation::NTuple{N,Int} dilation::NTuple{N,Int}
end end
""" function CrossCor(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
CrossCor(weight::AbstractArray, bias::AbstractArray)
CrossCor(weight::AbstractArray, bias::AbstractArray, activation)
Constructs the standard cross convolutional layer with user defined weight and bias
arrays.
Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
Takes the keyword arguments `pad`, `stride` and `dilation`.
For the keyword-only constructor, see also [`Conv`](@ref)
"""
function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N} stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride) stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation) dilation = expand(Val(N-2), dilation)
pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
return CrossCor(σ, w, b, stride, pad, dilation) return CrossCor(σ, w, b, stride, pad, dilation)
end end
function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}}, CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N} init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation) CrossCor(init(k..., ch...), zeros(ch[2]), σ,
end
function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
CrossCor(weight, bias, σ,
stride = stride, pad = pad, dilation = dilation) stride = stride, pad = pad, dilation = dilation)
end
@functor CrossCor @functor CrossCor
@ -534,9 +358,6 @@ end
MaxPool(k; pad = 0, stride = k) MaxPool(k; pad = 0, stride = k)
Max pooling layer. `k` is the size of the window for each dimension of the input. Max pooling layer. `k` is the size of the window for each dimension of the input.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
=======
""" """
struct MaxPool{N,M} struct MaxPool{N,M}
k::NTuple{N,Int} k::NTuple{N,Int}
@ -546,7 +367,8 @@ end
function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
stride = expand(Val(N), stride) stride = expand(Val(N), stride)
pad = calc_padding(pad, k, 1, stride) pad = expand(Val(2*N), pad)
return MaxPool(k, pad, stride) return MaxPool(k, pad, stride)
end end
@ -565,8 +387,6 @@ outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.
MeanPool(k; pad = 0, stride = k) MeanPool(k; pad = 0, stride = k)
Mean pooling layer. `k` is the size of the window for each dimension of the input. Mean pooling layer. `k` is the size of the window for each dimension of the input.
Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
""" """
struct MeanPool{N,M} struct MeanPool{N,M}
k::NTuple{N,Int} k::NTuple{N,Int}
@ -576,7 +396,7 @@ end
function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N
stride = expand(Val(N), stride) stride = expand(Val(N), stride)
pad = calc_padding(pad, k, 1, stride) pad = expand(Val(2*N), pad)
return MeanPool(k, pad, stride) return MeanPool(k, pad, stride)
end end

View File

@ -46,24 +46,23 @@ given the prediction `ŷ` and true values `y`.
             | 0.5 * |ŷ - y|^2,           for |ŷ - y| <= δ
Huber loss = |
             | δ * (|ŷ - y| - 0.5 * δ),   otherwise
""" """
#TODO: remove dropgrad when Zygote can handle this function with CuArrays
function huber_loss(ŷ, y; δ=eltype(ŷ)(1)) function huber_loss(ŷ, y; δ=eltype(ŷ)(1))
abs_error = abs.(ŷ .- y) abs_error = abs.(ŷ .- y)
temp = Zygote.dropgrad(abs_error .< δ) temp = abs_error .< δ
x = eltype(ŷ)(0.5) x = eltype(ŷ)(0.5)
hub_loss = sum(((abs_error.^2) .* temp) .* x .+ δ*(abs_error .- x*δ) .* (1 .- temp)) * 1 // length(y) hub_loss = sum(((abs_error.^2) .* temp) .* x .+ δ*(abs_error .- x*δ) .* (1 .- temp)) * 1 // length(y)
end end
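A quick numeric check of `huber_loss` as defined above, with the default δ = 1 (illustrative; assumes a Flux version that provides it):
```julia
using Flux: huber_loss

ŷ = [1.0, 2.0, 5.0]
y = [1.0, 2.0, 2.0]     # one residual of 3, the rest exact

# per element: 0 + 0 + 1 * (3 - 0.5) = 2.5, averaged over length(y) = 3
huber_loss(ŷ, y)        # ≈ 0.8333
```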
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing) function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
return -sum(xlogy.(y, ŷ)) * 1 // size(y, 2) return -sum(y .* log.(ŷ)) * 1 // size(y, 2)
end end
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number) function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number)
return -sum(xlogy.(y, ŷ)) .* weight * 1 // size(y, 2) return -sum(y .* log.(ŷ)) .* weight * 1 // size(y, 2)
end end
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector) function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector)
return -sum(xlogy.(y, ŷ) .* weight) * 1 // size(y, 2) return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2)
end end
""" """
@ -92,7 +91,7 @@ Return the crossentropy computed after a [`Flux.logsoftmax`](@ref) operation;
calculated as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`. calculated as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`.
`logitcrossentropy(ŷ, y)` is mathematically equivalent to `logitcrossentropy(ŷ, y)` is mathematically equivalent to
[`Flux.crossentropy(softmax(ŷ), y)`](@ref) but it is more numerically stable. [`Flux.crossentropy(softmax(log(ŷ)), y)`](@ref) but it is more numerically stable.
See also: [`Flux.crossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref) See also: [`Flux.crossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
@ -124,7 +123,7 @@ julia> Flux.binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
0.8616703662235441 0.8616703662235441
``` ```
""" """
binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ) binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
# Re-definition to fix interaction with CuArrays. # Re-definition to fix interaction with CuArrays.
CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
@ -133,7 +132,7 @@ CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1
logitbinarycrossentropy(ŷ, y) logitbinarycrossentropy(ŷ, y)
`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to `logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to
[`Flux.binarycrossentropy(σ(ŷ), y)`](@ref) but it is more numerically stable. [`Flux.binarycrossentropy(σ(log(ŷ)), y)`](@ref) but it is more numerically stable.
See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref) See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref)
@ -196,7 +195,7 @@ It is always non-negative and zero only when both the distributions are equal
everywhere. everywhere.
""" """
function kldivergence(ŷ, y) function kldivergence(ŷ, y)
entropy = sum(xlogx.(y)) * 1 //size(y,2) entropy = sum(y .* log.(y)) * 1 //size(y,2)
cross_entropy = crossentropy(ŷ, y) cross_entropy = crossentropy(ŷ, y)
return entropy + cross_entropy return entropy + cross_entropy
end end
@ -209,7 +208,7 @@ distribution `y`; calculated as `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`.
[More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). [More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
""" """
poisson(ŷ, y) = sum(ŷ .- xlogy.(y, ŷ)) * 1 // size(y,2) poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) * 1 // size(y,2)
""" """
hinge(ŷ, y) hinge(ŷ, y)
@ -263,34 +262,3 @@ by linearizing all values for each element in the batch.
function flatten(x::AbstractArray) function flatten(x::AbstractArray)
return reshape(x, :, size(x)[end]) return reshape(x, :, size(x)[end])
end end
"""
xlogx(x)
Return `x * log(x)` for `x ≥ 0`, handling `x = 0` by taking the downward limit.
"""
function xlogx(x)
result = x * log(x)
ifelse(iszero(x), zero(result), result)
end
CuArrays.@cufunc function xlogx(x)
result = x * log(x)
ifelse(iszero(x), zero(result), result)
end
"""
xlogy(x, y)
Return `x * log(y)` for `y > 0` with correct limit at `x = 0`.
"""
function xlogy(x, y)
result = x * log(y)
ifelse(iszero(x), zero(result), result)
end
CuArrays.@cufunc function xlogy(x, y)
result = x * log(y)
ifelse(iszero(x), zero(result), result)
end
@adjoint function broadcasted(::typeof(xlogy), x::Zygote.Numeric, y::Zygote.Numeric)
res = xlogy.(x, y)
res, Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y))
end
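The two helpers removed in this hunk exist to make `0 * log(0)` terms well defined; a self-contained sketch of that boundary behaviour, using the same definitions:
```julia
function xlogx(x)
  result = x * log(x)
  ifelse(iszero(x), zero(result), result)
end

function xlogy(x, y)
  result = x * log(y)
  ifelse(iszero(x), zero(result), result)
end

xlogx(0.0)        # 0.0 rather than NaN, taking the downward limit
xlogy(0.0, 0.0)   # 0.0, so crossentropy terms with y == 0 drop out cleanly
xlogy(1.0, 0.5)   # log(0.5) ≈ -0.693
```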

View File

@ -27,8 +27,7 @@ Base.getindex(xs::OneHotMatrix, ::Colon, ::Colon) = OneHotMatrix(xs.height, copy
Base.getindex(xs::OneHotMatrix, i::Integer, ::Colon) = map(x -> x[i], xs.data) Base.getindex(xs::OneHotMatrix, i::Integer, ::Colon) = map(x -> x[i], xs.data)
# remove workaround when https://github.com/JuliaGPU/CuArrays.jl/issues/676 is fixed A::AbstractMatrix * B::OneHotMatrix = A[:, map(x->x.ix, B.data)]
A::AbstractMatrix * B::OneHotMatrix = A[:, cpu(map(x->x.ix, B.data))]
Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs...]) Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs...])
@ -49,7 +48,7 @@ cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.d
Create a `OneHotVector` with its `l`-th element `true` based on the Create a `OneHotVector` with its `l`-th element `true` based on the
possible set of `labels`. possible set of `labels`.
If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found
in `labels`; otherwise, it will raise an error. in `labels`; otherwise it will error.
# Examples # Examples
```jldoctest ```jldoctest

View File

@ -1,12 +1,9 @@
module Optimise module Optimise
using LinearAlgebra
export train!, update!, export train!, update!,
Descent, ADAM, Momentum, Nesterov, RMSProp, SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM, ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM,
InvDecay, ExpDecay, WeightDecay, stop, Optimiser, InvDecay, ExpDecay, WeightDecay, stop, Optimiser
ClipValue, ClipNorm
include("optimisers.jl") include("optimisers.jl")
include("train.jl") include("train.jl")

View File

@ -509,7 +509,7 @@ function apply!(o::ExpDecay, x, Δ)
η, s, decay = o.eta, o.step, o.decay η, s, decay = o.eta, o.step, o.decay
n = o.current[x] = get(o.current, x, 0) + 1 n = o.current[x] = get(o.current, x, 0) + 1
if o.current[x]%s == 0 && count(x -> x%s == 0, values(o.current)) == 1 if o.current[x]%s == 0 && count(x -> x%s == 0, values(o.current)) == 1
η = max(η * decay, o.clip) η = max(η * decay^(s / n), o.clip)
o.eta = η o.eta = η
end end
@. Δ *= η @. Δ *= η
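`ExpDecay` scales the effective learning rate by `decay` once every `s` steps until it reaches `clip`, and it is normally composed with another optimiser. A usage sketch:

```julia
using Flux

# Scale the step by 0.1 every 1000 updates, but never below 1e-4, then apply ADAM.
opt = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), ADAM())
```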
@ -533,31 +533,3 @@ function apply!(o::WeightDecay, x, Δ)
wd = o.wd wd = o.wd
@. Δ += wd * x @. Δ += wd * x
end end
"""
ClipValue(thresh)
Clip gradients when their absolute value exceeds `thresh`.
"""
mutable struct ClipValue{T}
thresh::T
end
apply!(o::ClipValue, x, Δ) = clamp!(Δ, -o.thresh, o.thresh)
"""
ClipNorm(thresh)
Clip gradients when their L2 norm exceeds `thresh`.
"""
mutable struct ClipNorm{T}
thresh::T
end
function apply!(o::ClipNorm, x, Δ)
Δnrm = norm(Δ)
if Δnrm > o.thresh
rmul!(Δ, o.thresh / Δnrm)
end
return Δ
end
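Both clip the gradient before it reaches whatever optimiser follows them in an `Optimiser` chain. A usage sketch:

```julia
using Flux

# Clamp every gradient entry to [-1e-3, 1e-3] before the ADAM step,
opt = Optimiser(ClipValue(1e-3), ADAM(1e-3))
# or bound the overall L2 norm of each gradient array instead.
opt = Optimiser(ClipNorm(1.0), ADAM(1e-3))
```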

View File

@ -68,7 +68,8 @@ and compute the gradient of `loss(d)`.
A callback is given with the keyword argument `cb`. For example, this will print A callback is given with the keyword argument `cb`. For example, this will print
"training" every 10 seconds (using [`Flux.throttle`](@ref)): "training" every 10 seconds (using [`Flux.throttle`](@ref)):
train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10)) train!(loss, params, data, opt,
cb = throttle(() -> println("training"), 10))
The callback can call [`Flux.stop`](@ref) to interrupt the training loop. The callback can call [`Flux.stop`](@ref) to interrupt the training loop.
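For instance, a throttled callback can also stop training once the loss is good enough (a sketch; `loss`, `ps`, `data`, `opt`, `xtest` and `ytest` are hypothetical names standing in for the surrounding training setup):

```julia
function evalcb()
    # `loss`, `xtest`, `ytest` are placeholders for whatever is being trained
    l = loss(xtest, ytest)
    @show l
    l < 0.01 && Flux.stop()   # interrupt the training loop early
end

Flux.train!(loss, ps, data, opt, cb = Flux.throttle(evalcb, 10))
```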

View File

@ -24,7 +24,7 @@ glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum
glorot_normal(dims...) glorot_normal(dims...)
Return an `Array` of size `dims` containing random variables taken from a normal Return an `Array` of size `dims` containing random variables taken from a normal
distribution with mean 0 and standard deviation `sqrt(2 / sum(dims))`. distribution with mean 0 and standard deviation `(2 / sum(dims))`.
# Examples # Examples
```jldoctest; setup = :(using Random; Random.seed!(0)) ```jldoctest; setup = :(using Random; Random.seed!(0))
@ -246,10 +246,6 @@ function _restructure(m, xs)
end end
end end
@adjoint function _restructure(m, xs)
_restructure(m, xs), dm -> (nothing,destructure(dm)[1])
end
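This adjoint is what lets gradients flow back through the rebuilt model, so a flat parameter vector can be optimised directly (a minimal sketch):

```julia
using Flux

m = Dense(2, 3)
θ, re = Flux.destructure(m)   # flat parameter vector plus a rebuilding closure
# Differentiating through re(θ) relies on the _restructure adjoint above.
g = gradient(θ -> sum(re(θ)(ones(Float32, 2))), θ)[1]
length(g) == length(θ)
```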
""" """
destructure(m) destructure(m)

View File

@ -1,106 +0,0 @@
import Base: +, -, *, reshape, size
import Base.Broadcast: broadcasted, Broadcasted, BroadcastStyle
"""
Zeros()
Zeros(size...)
Zeros(Type, size...)
Acts as a stand-in for an array of zeros that can be
used during training and is ignored by the optimisers.
Useful for turning the bias off in a layer's forward pass.
## Examples
```julia
julia> Flux.Zeros(3,3)
3×3 Flux.Zeros{Bool,2}:
false false false
false false false
false false false
julia> Flux.Zeros(Float32, 3,3)
3×3 Flux.Zeros{Float32,2}:
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
julia> rand(3,3) .+ Flux.Zeros()
3×3 Array{Float64,2}:
0.198739 0.490459 0.785386
0.779074 0.39986 0.66383
0.854981 0.447292 0.314497
julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros())
Conv((2, 2), 1=>3)
```
"""
struct Zeros{T,N} <: AbstractArray{T,N}
size::Tuple
end
Zeros(::Type{T}, sz...) where T = Zeros{T,length(sz)}(sz)
Zeros(sz::Integer...) = Zeros(Bool, sz...)
Base.size(xs::Zeros) = xs.size
Base.axes(xs::Zeros) = Base.OneTo.(size(xs))
Base.IndexStyle(::Type{<:Zeros}) = IndexLinear()
Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T)
Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} =
Zeros(T, length(inds))
Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs))
@adjoint reshape(xs::Zeros{T}, dims...) where T =
reshape(xs, dims...), _ -> nothing
# Define basic ops
for f in (:+, :-)
@eval @inline function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros)
@assert size(a) == size(b) throw(DimensionMismatch("dimensions must match"))
a
end
end
+(a::Zeros, b::AbstractArray) = b + a
-(a::Zeros, b::AbstractArray) = -b + a
Base.copy(xs::Zeros{T,N}) where {T,N} = xs
# Define broadcasting behaviour
for op in (:+, :-)
@eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros)
bs = Broadcast.broadcast_shape(size(a), size(b))
size(a) == bs && return a
sz = similar(a, bs)
sz .= a
end
end
broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a)
broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a)
function broadcasted(::typeof(*), a::AbstractArray, b::Zeros)
Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end
broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a)
for op in (:+, :-, :*)
@eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end
# Some opportunities to avoid scalar indexing, intermediaries
# Since it replicates a little of what we expect Base to do,
# it should be possible to remove in the future, but for now,
# these help with performance.
broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b
broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b
broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a)
broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
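Taken together, these methods make `Zeros` an additive identity (and a multiplicative annihilator) that never materialises an actual array, which is how a `Zeros()` bias drops out of both the forward pass and the optimiser state. A few identities implied by the definitions above (a sketch):

```julia
using Flux

a = rand(3, 3)

a .+ Flux.Zeros()                      # returns `a` itself; nothing is allocated
a .* Flux.Zeros(3, 3)                  # a Zeros of the broadcast shape
Flux.Zeros(3, 3) .+ Flux.Zeros(3, 3)   # still a Zeros
```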

View File

@ -69,7 +69,6 @@ if CuArrays.has_cudnn()
@info "Testing Flux/CUDNN" @info "Testing Flux/CUDNN"
include("cudnn.jl") include("cudnn.jl")
include("curnn.jl") include("curnn.jl")
include("layers.jl")
else else
@warn "CUDNN unavailable, not testing GPU DNN support" @warn "CUDNN unavailable, not testing GPU DNN support"
end end

View File

@ -1,98 +0,0 @@
# Test layers and data/model movements on and off the GPU
# Add tests for layers and their gradients on the GPU
# Most of the forward passes should be fine being applied
# to bitstype objects, but this gives higher coverage for our use-cases
# Check that getting the gradients does not throw
# generic movement tests
@testset "Basic GPU Movement" begin
@test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple
@test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple
end
# TODO: These layers get into scalar indexing
# `AlphaDropout` throws a compilation error on GPUs,
# whereas the rest are scalar indexing issues.
const BROKEN_LAYERS = [DepthwiseConv,
AlphaDropout,
InstanceNorm,
GroupNorm]
function gradtest(name::String, layers::Vector, xs = nothing, args...)
isnothing(xs) && error("Missing input to test the layers against.")
@testset "$name GPU grad tests" begin
for layer in layers
@testset "$layer GPU grad test" begin
l = gpu(layer(args...))
xs = gpu(xs)
if any(x -> isa(l, x), BROKEN_LAYERS)
ps = Flux.params(l)
@test_broken gradient(() -> sum(l(xs)), ps) isa Flux.Zygote.Grads
else
ps = Flux.params(l)
@test gradient(() -> sum(l(xs)), ps) isa Flux.Zygote.Grads
gs = gradient(() -> sum(l(xs)), ps)
# Handle pooling layers
if !isempty(ps)
@test gs[first(ps)] isa Flux.CuArrays.CuArray
end
end
end
end
end
end
# Repeats from Conv, CrossCor
r = rand(Float32, 28, 28, 1, 1)
conv_layers = [Conv, ConvTranspose, CrossCor, DepthwiseConv]
gradtest("Conv", conv_layers, r, (2,2), 1=>3)
pooling_layers = [MaxPool, MeanPool]
gradtest("Pooling", pooling_layers, r, (2,2))
dropout_layers = [Dropout, AlphaDropout]
gradtest("Dropout", dropout_layers, r, 0.5f0)
norm_layers = [LayerNorm, BatchNorm]
gradtest("Normalising", norm_layers, rand(Float32, 28,28,3,1), 1)
instancenorm = [InstanceNorm]
gradtest("InstanceNorm", instancenorm, r, 1)
groupnorm = [GroupNorm]
gradtest("GroupNorm", groupnorm, rand(Float32, 28,28,3,1), 3, 1)
const stateless_layers = [Flux.mse,
Flux.crossentropy,
Flux.logitcrossentropy,
Flux.normalise]
const stateless_layers_broadcasted = [Flux.binarycrossentropy,
Flux.logitbinarycrossentropy]
function stateless_gradtest(f, args...)
@test gradient((args...) -> sum(f(args...)), args...)[1] isa CuArray
end
function stateless_gradtest_broadcasted(f, args...)
@test gradient((args...) -> sum(f.(args...)), args...)[1] isa CuArray
end
@testset "Stateless GPU grad tests" begin
x = gpu(rand(3,3))
y = gpu(rand(3,3))
for layer in stateless_layers
if layer == Flux.normalise
stateless_gradtest(layer, x)
else
stateless_gradtest(layer, x, y)
end
end
for layer in stateless_layers_broadcasted
stateless_gradtest_broadcasted(layer, x, y)
end
end

View File

@ -3,34 +3,20 @@
Y = [1:5;] Y = [1:5;]
d = DataLoader(X, batchsize=2) d = DataLoader(X, batchsize=2)
@inferred first(d)
batches = collect(d) batches = collect(d)
@test eltype(batches) == eltype(d) == typeof(X)
@test length(batches) == 3 @test length(batches) == 3
@test batches[1] == X[:,1:2] @test batches[1] == X[:,1:2]
@test batches[2] == X[:,3:4] @test batches[2] == X[:,3:4]
@test batches[3] == X[:,5:5] @test batches[3] == X[:,5:5]
d = DataLoader(X, batchsize=2, partial=false) d = DataLoader(X, batchsize=2, partial=false)
@inferred first(d)
batches = collect(d) batches = collect(d)
@test eltype(batches) == eltype(d) == typeof(X)
@test length(batches) == 2 @test length(batches) == 2
@test batches[1] == X[:,1:2] @test batches[1] == X[:,1:2]
@test batches[2] == X[:,3:4] @test batches[2] == X[:,3:4]
d = DataLoader((X,), batchsize=2, partial=false) d = DataLoader(X, Y, batchsize=2)
@inferred first(d)
batches = collect(d) batches = collect(d)
@test eltype(batches) == eltype(d) == Tuple{typeof(X)}
@test length(batches) == 2
@test batches[1] == (X[:,1:2],)
@test batches[2] == (X[:,3:4],)
d = DataLoader((X, Y), batchsize=2)
@inferred first(d)
batches = collect(d)
@test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)}
@test length(batches) == 3 @test length(batches) == 3
@test length(batches[1]) == 2 @test length(batches[1]) == 2
@test length(batches[2]) == 2 @test length(batches[2]) == 2
@ -42,22 +28,6 @@
@test batches[3][1] == X[:,5:5] @test batches[3][1] == X[:,5:5]
@test batches[3][2] == Y[5:5] @test batches[3][2] == Y[5:5]
# test with NamedTuple
d = DataLoader((x=X, y=Y), batchsize=2)
@inferred first(d)
batches = collect(d)
@test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}}
@test length(batches) == 3
@test length(batches[1]) == 2
@test length(batches[2]) == 2
@test length(batches[3]) == 2
@test batches[1][1] == batches[1].x == X[:,1:2]
@test batches[1][2] == batches[1].y == Y[1:2]
@test batches[2][1] == batches[2].x == X[:,3:4]
@test batches[2][2] == batches[2].y == Y[3:4]
@test batches[3][1] == batches[3].x == X[:,5:5]
@test batches[3][2] == batches[3].y == Y[5:5]
# test interaction with `train!` # test interaction with `train!`
θ = ones(2) θ = ones(2)
X = zeros(2, 10) X = zeros(2, 10)
@ -71,7 +41,7 @@
X = ones(2, 10) X = ones(2, 10)
Y = fill(2, 10) Y = fill(2, 10)
loss(x, y) = sum((y - x'*θ).^2) loss(x, y) = sum((y - x'*θ).^2)
d = DataLoader((X, Y)) d = DataLoader(X, Y)
Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1)) Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1))
@test norm(θ .- 1) < 1e-10 @test norm(θ .- 1) < 1e-10
end end
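With the tuple (or `NamedTuple`) form, each batch is itself a tuple, so it destructures naturally in a training loop (a usage sketch):

```julia
using Flux
using Flux.Data: DataLoader

X, Y   = rand(Float32, 10, 100), rand(Float32, 1, 100)
loader = DataLoader((X, Y), batchsize = 16, shuffle = true)

for (x, y) in loader   # x is 10×16, y is 1×16 (the last batch may be smaller)
    # run one training step on (x, y) here
end
```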

View File

@ -28,14 +28,6 @@ import Flux: activations
end end
@testset "Dense" begin @testset "Dense" begin
@testset "constructors" begin
@test size(Dense(10, 100).W) == (100, 10)
@test Dense(rand(100,10), rand(10)).σ == identity
@test_throws MethodError Dense(10, 10.5)
@test_throws MethodError Dense(10, 10.5, tanh)
end
@test length(Dense(10, 5)(randn(10))) == 5 @test length(Dense(10, 5)(randn(10))) == 5
@test_throws DimensionMismatch Dense(10, 5)(randn(1)) @test_throws DimensionMismatch Dense(10, 5)(randn(1))
@test_throws MethodError Dense(10, 5)(1) # avoid broadcasting @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting
@ -45,6 +37,7 @@ import Flux: activations
@test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2) @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2)
@test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1) @test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1)
@test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] @test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]
end end
@testset "Diagonal" begin @testset "Diagonal" begin

View File

@ -25,35 +25,6 @@ end
Dense(288, 10), softmax) Dense(288, 10), softmax)
@test size(m(r)) == (10, 5) @test size(m(r)) == (10, 5)
# Test bias switch
bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3))
ip = zeros(Float32, 28,28,1,1)
op = bias(ip)
@test sum(op) == prod(size(op))
bias = Conv((2,2), 1=>3, bias = Flux.Zeros())
op = bias(ip)
@test sum(op) === 0.f0
gs = gradient(() -> sum(bias(ip)), Flux.params(bias))
@test gs[bias.bias] == nothing
# Train w/o bias and make sure no convergence happens
# when only bias can be converged
bias = Conv((2, 2), 1=>3, bias = Flux.Zeros());
ip = zeros(Float32, 28,28,1,1)
op = zeros(Float32, 27,27,3,1) .+ 2.f0
opt = Descent()
for _ = 1:10^3
gs = gradient(params(bias)) do
Flux.mse(bias(ip), op)
end
Flux.Optimise.update!(opt, params(bias), gs)
end
@test Flux.mse(bias(ip), op) ≈ 4.f0
end end
@testset "asymmetric padding" begin @testset "asymmetric padding" begin
@ -192,27 +163,3 @@ end
m = MeanPool((2, 2); stride = 2, pad = 3) m = MeanPool((2, 2); stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5) @test Flux.outdims(m, (5, 5)) == (5, 5)
end end
@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, CrossCor), k in ( (1,), (2,), (3,), (4,5), (6,7,8))
data = ones(Float32, (k .+ 3)..., 1,1)
l = ltype(k, 1=>1, pad=SamePad())
@test size(l(data)) == size(data)
l = ltype(k, 1=>1, pad=SamePad(), dilation = k .÷ 2)
@test size(l(data)) == size(data)
stride = 3
l = ltype(k, 1=>1, pad=SamePad(), stride = stride)
if ltype == ConvTranspose
@test size(l(data))[1:end-2] == stride .* size(data)[1:end-2] .- stride .+ 1
else
@test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ stride)
end
end
@testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), k in ( (1,), (2,), (3,), (4,5), (6,7,8))
data = ones(Float32, (k .+ 3)..., 1,1)
l = ltype(k, pad=SamePad())
@test size(l(data))[1:end-2] == ceil.(Int, size(data)[1:end-2] ./ k)
end

View File

@ -1,26 +1,9 @@
using Test using Test
using Flux: onehotbatch, mse, crossentropy, logitcrossentropy, using Flux: onehotbatch, mse, crossentropy, logitcrossentropy,
σ, binarycrossentropy, logitbinarycrossentropy, flatten, σ, binarycrossentropy, logitbinarycrossentropy, flatten
xlogx, xlogy
const ϵ = 1e-7 const ϵ = 1e-7
@testset "xlogx & xlogy" begin
@test iszero(xlogx(0))
@test isnan(xlogx(NaN))
@test xlogx(2) ≈ 2.0 * log(2.0)
@inferred xlogx(2)
@inferred xlogx(0)
@test iszero(xlogy(0, 1))
@test isnan(xlogy(NaN, 1))
@test isnan(xlogy(1, NaN))
@test isnan(xlogy(NaN, NaN))
@test xlogy(2, 3) ≈ 2.0 * log(3.0)
@inferred xlogy(2, 3)
@inferred xlogy(0, 1)
end
@testset "losses" begin @testset "losses" begin
# First, regression-style y's # First, regression-style y's
y = [1, 1, 0, 0] y = [1, 1, 0, 0]
@ -52,7 +35,6 @@ end
lossvalue = 1.203972804325936 lossvalue = 1.203972804325936
@testset "crossentropy" begin @testset "crossentropy" begin
@test crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ crossentropy([0.1,0.9], [0.1,0.9])
@test crossentropy(ŷ, y) ≈ lossvalue @test crossentropy(ŷ, y) ≈ lossvalue
end end
@ -85,7 +67,6 @@ end
y = [1 2 3] y = [1 2 3]
ŷ = [4.0 5.0 6.0] ŷ = [4.0 5.0 6.0]
@testset "kldivergence" begin @testset "kldivergence" begin
@test Flux.kldivergence([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ Flux.kldivergence([0.1,0.9], [0.1,0.9])
@test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457 @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457
@test Flux.kldivergence(y, y) ≈ 0 @test Flux.kldivergence(y, y) ≈ 0
end end

View File

@ -57,16 +57,6 @@ end
end end
@testset "ExpDecay" begin @testset "ExpDecay" begin
@testset "Sanity Check" begin
o = ExpDecay(0.2, 0.5, 1, 1e-3)
p = [0.0]
steps = 1:8
eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip)
eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps]
@test eta_actual == eta_expected
end
w = randn(10, 10) w = randn(10, 10)
o = ExpDecay(0.1, 0.1, 1000, 1e-4) o = ExpDecay(0.1, 0.1, 1000, 1e-4)
w1 = randn(10,10) w1 = randn(10,10)
@ -91,23 +81,11 @@ end
end end
end end
@test flag == 1 @test flag == 1
# Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). # Test to check if decay happens at decay steps. Eta reaches clip value eventually.
ground_truth = [] ground_truth = []
for i in 1:4 for i in 1:11
push!(ground_truth, 1000*i) # Expected decay steps for this example. push!(ground_truth, 1000*i) # Expected decay steps for this example.
end end
@test decay_steps == ground_truth @test decay_steps == ground_truth
@test o.eta == o.clip @test o.eta == o.clip
end end
@testset "Clipping" begin
w = randn(10, 10)
loss(x) = sum(w * x)
θ = Params([w])
x = 1000 * randn(10)
w̄ = gradient(() -> loss(x), θ)[w]
w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄))
@test all(w̄_value .<= 1)
w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄))
@test norm(w̄_norm) <= 1
end

View File

@ -2,10 +2,13 @@ using Flux
using Flux.Data using Flux.Data
using Test using Test
using Random, Statistics, LinearAlgebra using Random, Statistics, LinearAlgebra
using Documenter
using IterTools: ncycle using IterTools: ncycle
Random.seed!(0) Random.seed!(0)
@testset "Flux" begin
@testset "Utils" begin @testset "Utils" begin
include("utils.jl") include("utils.jl")
end end
@ -37,10 +40,11 @@ end
end end
end end
@static if VERSION >= v"1.4"
using Documenter
@testset "Docs" begin @testset "Docs" begin
if VERSION >= v"1.4"
DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true) DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
doctest(Flux) doctest(Flux)
end end
end end
end # testset Flux