Compare commits
8 Commits
| Author | SHA1 | Date |
|---|---|---|
|  | 2290ced09a |  |
|  | 79391beca0 |  |
|  | b44ba162b1 |  |
|  | 654b100ce3 |  |
|  | 508b392204 |  |
|  | 20ed5c5622 |  |
|  | 5f1604d25d |  |
|  | fd64f4e18e |  |
@ -1,12 +0,0 @@
-[Please delete this text and describe your change here.
-For bugfixes, please detail the bug and include a test case which your patch fixes.
-If you are adding a new feature, please clearly describe the design, its rationale, the possible alternatives considered.
-It is easiest to merge new features when there is clear precedent in other systems; we need to know we're taking
-the right direction since it can be hard to change later.]
-
-### PR Checklist
-
-- [ ] Tests are added
-- [ ] Entry in NEWS.md
-- [ ] Documentation, if applicable
-- [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes).
@ -6,8 +6,16 @@ on:
 jobs:
   CompatHelper:
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        julia-version: [1.3]
+        julia-arch: [x64]
+        os: [ubuntu-latest]
     steps:
       - uses: julia-actions/setup-julia@latest
         with:
           version: ${{ matrix.julia-version }}
       - name: Pkg.add("CompatHelper")
         run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
       - name: CompatHelper.main()
@ -16,7 +16,7 @@ notifications:
 jobs:
   include:
     - stage: "Documentation"
-      julia: 1.3
+      julia: 1
       os: linux
       script:
         - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd()));
Manifest.toml
@ -14,29 +14,29 @@ version = "0.3.3"
|
|||
|
||||
[[Adapt]]
|
||||
deps = ["LinearAlgebra"]
|
||||
git-tree-sha1 = "fd04049c7dd78cfef0b06cdc1f0f181467655712"
|
||||
git-tree-sha1 = "c88cfc7f9c1f9f8633cddf0b56e86302b70f64c5"
|
||||
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
|
||||
version = "1.1.0"
|
||||
version = "1.0.1"
|
||||
|
||||
[[ArrayLayouts]]
|
||||
deps = ["FillArrays", "LinearAlgebra"]
|
||||
git-tree-sha1 = "a504dca2ac7eda8761c8f7c1ed52427a1be75a3c"
|
||||
git-tree-sha1 = "5a57a6158c1d340635a89d19beb34b0f325a4431"
|
||||
uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
|
||||
version = "0.2.6"
|
||||
version = "0.2.5"
|
||||
|
||||
[[Base64]]
|
||||
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
|
||||
|
||||
[[BinaryProvider]]
|
||||
deps = ["Libdl", "Logging", "SHA"]
|
||||
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
|
||||
git-tree-sha1 = "428e9106b1ff27593cbd979afac9b45b82372b8c"
|
||||
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
|
||||
version = "0.5.10"
|
||||
version = "0.5.9"
|
||||
|
||||
[[CEnum]]
|
||||
git-tree-sha1 = "1b77a77c3b28e0b3f413f7567c9bb8dd9bdccd14"
|
||||
git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb"
|
||||
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
|
||||
version = "0.3.0"
|
||||
version = "0.2.0"
|
||||
|
||||
[[CUDAapi]]
|
||||
deps = ["Libdl", "Logging"]
|
||||
|
@ -46,21 +46,21 @@ version = "4.0.0"
|
|||
|
||||
[[CUDAdrv]]
|
||||
deps = ["CEnum", "CUDAapi", "Printf"]
|
||||
git-tree-sha1 = "f56bbf18c86bcff7a961a32a4947a5abb2963a29"
|
||||
git-tree-sha1 = "17248da4169c0cdd1699da542f8e110fe4168af6"
|
||||
uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
|
||||
version = "6.3.0"
|
||||
version = "6.2.3"
|
||||
|
||||
[[CUDAnative]]
|
||||
deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "ExprTools", "GPUCompiler", "LLVM", "Libdl", "Pkg", "Printf"]
|
||||
git-tree-sha1 = "ac86db2b05fdfec96b011e25a504ffe7476e8a68"
|
||||
deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "Cthulhu", "DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Pkg", "Printf", "TimerOutputs"]
|
||||
git-tree-sha1 = "0da071ed49a6f5f62d5164de071daa07cedaa1e6"
|
||||
uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
|
||||
version = "3.1.0"
|
||||
version = "3.0.4"
|
||||
|
||||
[[CodeTracking]]
|
||||
deps = ["InteractiveUtils", "UUIDs"]
|
||||
git-tree-sha1 = "cab4da992adc0a64f63fa30d2db2fd8bec40cab4"
|
||||
git-tree-sha1 = "c8f94de86731698373f3c82a8aa40d8ab765c50c"
|
||||
uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
|
||||
version = "0.5.11"
|
||||
version = "0.5.9"
|
||||
|
||||
[[CodecZlib]]
|
||||
deps = ["TranscodingStreams", "Zlib_jll"]
|
||||
|
@ -70,15 +70,15 @@ version = "0.7.0"
|
|||
|
||||
[[ColorTypes]]
|
||||
deps = ["FixedPointNumbers", "Random"]
|
||||
git-tree-sha1 = "c73d9cfc2a9d8433dc77f5bff4bddf46b1d78c20"
|
||||
git-tree-sha1 = "f746d4fc892fdf683b5c22064c8e99b2f5b990e7"
|
||||
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
|
||||
version = "0.10.3"
|
||||
version = "0.10.2"
|
||||
|
||||
[[Colors]]
|
||||
deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"]
|
||||
git-tree-sha1 = "1e9bba7984e78aa8cdeea7f9f7cc984ad4e4b1c7"
|
||||
git-tree-sha1 = "2fdeb981ebcf52cd800ddb6a0aa5eac34153552d"
|
||||
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
|
||||
version = "0.12.2"
|
||||
version = "0.12.0"
|
||||
|
||||
[[CommonSubexpressions]]
|
||||
deps = ["Test"]
|
||||
|
@ -93,16 +93,16 @@ uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
|
|||
version = "0.3.3+0"
|
||||
|
||||
[[Cthulhu]]
|
||||
deps = ["CodeTracking", "InteractiveUtils", "REPL", "UUIDs", "Unicode"]
|
||||
git-tree-sha1 = "f3643e78353199d3097821e806348bd83f364155"
|
||||
deps = ["CodeTracking", "InteractiveUtils", "REPL", "Unicode"]
|
||||
git-tree-sha1 = "a4849ec61df9659423cc63b298ed895904ee9743"
|
||||
uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f"
|
||||
version = "1.1.1"
|
||||
version = "1.0.2"
|
||||
|
||||
[[CuArrays]]
|
||||
deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"]
|
||||
git-tree-sha1 = "1582b74d2322df7dd94549d4ac9d095e0f20e884"
|
||||
git-tree-sha1 = "ad04351946e2ee59a0f1295de28a750dc4917704"
|
||||
uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
|
||||
version = "2.2.1"
|
||||
version = "2.1.0"
|
||||
|
||||
[[DataAPI]]
|
||||
git-tree-sha1 = "176e23402d80e7743fc26c19c681bfb11246af32"
|
||||
|
@ -111,9 +111,9 @@ version = "1.3.0"
|
|||
|
||||
[[DataStructures]]
|
||||
deps = ["InteractiveUtils", "OrderedCollections"]
|
||||
git-tree-sha1 = "af6d9c86e191c917c2276fbede1137e8ea20157f"
|
||||
git-tree-sha1 = "6166ecfaf2b8bbf2b68d791bc1d54501f345d314"
|
||||
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
|
||||
version = "0.17.17"
|
||||
version = "0.17.15"
|
||||
|
||||
[[Dates]]
|
||||
deps = ["Printf"]
|
||||
|
@ -146,9 +146,9 @@ version = "0.1.1"
|
|||
|
||||
[[FillArrays]]
|
||||
deps = ["LinearAlgebra", "Random", "SparseArrays"]
|
||||
git-tree-sha1 = "44f561e293987ffc84272cd3d2b14b0b93123d63"
|
||||
git-tree-sha1 = "51cc2f9bc4eb9c6c0e81ec2f779d1085583cc956"
|
||||
uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
|
||||
version = "0.8.10"
|
||||
version = "0.8.7"
|
||||
|
||||
[[FixedPointNumbers]]
|
||||
git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238"
|
||||
|
@ -161,33 +161,17 @@ git-tree-sha1 = "869540e4367122fbffaace383a5bdc34d6e5e5ac"
|
|||
uuid = "f6369f11-7733-5829-9624-2563aa707210"
|
||||
version = "0.10.10"
|
||||
|
||||
[[Functors]]
|
||||
deps = ["MacroTools"]
|
||||
git-tree-sha1 = "f40adc6422f548176bb4351ebd29e4abf773040a"
|
||||
uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
|
||||
version = "0.1.0"
|
||||
|
||||
[[Future]]
|
||||
deps = ["Random"]
|
||||
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
|
||||
|
||||
[[GPUArrays]]
|
||||
deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
|
||||
git-tree-sha1 = "d887693eb1bd5e1fd573262a978745481895ec7d"
|
||||
git-tree-sha1 = "c63cb01e3b6f48ab39f1e35c31ba870650814a18"
|
||||
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
|
||||
version = "3.4.1"
|
||||
|
||||
[[GPUCompiler]]
|
||||
deps = ["Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "TimerOutputs"]
|
||||
git-tree-sha1 = "5275aa268ecd09640b32560e1eae90c78816e4d1"
|
||||
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
|
||||
version = "0.2.0"
|
||||
version = "3.2.0"
|
||||
|
||||
[[IRTools]]
|
||||
deps = ["InteractiveUtils", "MacroTools", "Test"]
|
||||
git-tree-sha1 = "90ee39f9beaaa186e4968417ea2b8ed5673c91c0"
|
||||
git-tree-sha1 = "8845400bd2d9815d37720251f1b53d27a335e1f4"
|
||||
uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
|
||||
version = "0.3.3"
|
||||
version = "0.3.2"
|
||||
|
||||
[[InteractiveUtils]]
|
||||
deps = ["Markdown"]
|
||||
|
@ -195,15 +179,15 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
|
|||
|
||||
[[Juno]]
|
||||
deps = ["Base64", "Logging", "Media", "Profile"]
|
||||
git-tree-sha1 = "a686b0cf235fa3e491b79b4783c2d2382292b436"
|
||||
git-tree-sha1 = "e1ba2a612645b3e07c773c3a208f215745081fe6"
|
||||
uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
|
||||
version = "0.8.2"
|
||||
version = "0.8.1"
|
||||
|
||||
[[LLVM]]
|
||||
deps = ["CEnum", "Libdl", "Printf", "Unicode"]
|
||||
git-tree-sha1 = "dd3f584c3dbefe39b2a8fbafa1a3b77e31e21255"
|
||||
git-tree-sha1 = "b6b86801ae2f2682e0a4889315dc76b68db2de71"
|
||||
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
|
||||
version = "1.5.1"
|
||||
version = "1.3.4"
|
||||
|
||||
[[LibGit2]]
|
||||
deps = ["Printf"]
|
||||
|
@ -262,9 +246,10 @@ uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
|
|||
version = "0.5.3+3"
|
||||
|
||||
[[OrderedCollections]]
|
||||
git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3"
|
||||
deps = ["Random", "Serialization", "Test"]
|
||||
git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
|
||||
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
|
||||
version = "1.2.0"
|
||||
version = "1.1.0"
|
||||
|
||||
[[Pkg]]
|
||||
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
|
||||
|
@ -319,15 +304,15 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
|
|||
|
||||
[[SpecialFunctions]]
|
||||
deps = ["OpenSpecFun_jll"]
|
||||
git-tree-sha1 = "d8d8b8a9f4119829410ecd706da4cc8594a1e020"
|
||||
git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c"
|
||||
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
|
||||
version = "0.10.3"
|
||||
version = "0.10.0"
|
||||
|
||||
[[StaticArrays]]
|
||||
deps = ["LinearAlgebra", "Random", "Statistics"]
|
||||
git-tree-sha1 = "5c06c0aeb81bef54aed4b3f446847905eb6cbda0"
|
||||
git-tree-sha1 = "4118cba3529e99af61aea9a83f7bfd3cff5ffb28"
|
||||
uuid = "90137ffa-7385-5640-81b9-e52037218182"
|
||||
version = "0.12.3"
|
||||
version = "0.12.2"
|
||||
|
||||
[[Statistics]]
|
||||
deps = ["LinearAlgebra", "SparseArrays"]
|
||||
|
@ -345,9 +330,9 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
|||
|
||||
[[TimerOutputs]]
|
||||
deps = ["Printf"]
|
||||
git-tree-sha1 = "f458ca23ff80e46a630922c555d838303e4b9603"
|
||||
git-tree-sha1 = "0cc8db57cb537191b02948d4fabdc09eb7f31f98"
|
||||
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
|
||||
version = "0.5.6"
|
||||
version = "0.5.5"
|
||||
|
||||
[[TranscodingStreams]]
|
||||
deps = ["Random", "Test"]
|
||||
|
@ -364,21 +349,21 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
|
|||
|
||||
[[ZipFile]]
|
||||
deps = ["Libdl", "Printf", "Zlib_jll"]
|
||||
git-tree-sha1 = "254975fef2fc526583bb9b7c9420fe66ffe09f2f"
|
||||
git-tree-sha1 = "8748302cfdec02c4ae9c97b112cf10003f7f767f"
|
||||
uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
|
||||
version = "0.9.2"
|
||||
version = "0.9.1"
|
||||
|
||||
[[Zlib_jll]]
|
||||
deps = ["Libdl", "Pkg"]
|
||||
git-tree-sha1 = "a2e0d558f6031002e380a90613b199e37a8565bf"
|
||||
git-tree-sha1 = "2f6c3e15e20e036ee0a0965879b31442b7ec50fa"
|
||||
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
|
||||
version = "1.2.11+10"
|
||||
version = "1.2.11+9"
|
||||
|
||||
[[Zygote]]
|
||||
deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "Future", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
|
||||
git-tree-sha1 = "707ceea58e2bd0ff3077ab13a92f8355181d3ee4"
|
||||
deps = ["AbstractFFTs", "ArrayLayouts", "DiffRules", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
|
||||
git-tree-sha1 = "f7b0f77a86d2434abf693e3c0330e4682deed28d"
|
||||
uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
|
||||
version = "0.4.20"
|
||||
version = "0.4.18"
|
||||
|
||||
[[ZygoteRules]]
|
||||
deps = ["MacroTools"]
|
||||
|
|
NEWS.md
@ -1,18 +1,5 @@
# v0.11
* Change to `DataLoader`'s constructor [https://github.com/FluxML/Flux.jl/pull/1152]
* Use `DataLoader` with `NamedTuple`s, so that tensors can be accessed by name [https://github.com/FluxML/Flux.jl/pull/1221].
* Error if Dense layers weights and biases are not arrays [https://github.com/FluxML/Flux.jl/pull/1218].

# v0.10.5
* Add option for [same padding](https://github.com/FluxML/Flux.jl/pull/901) to conv and pooling layers by setting `pad=SamePad()`.
* Added option to set `bias` to [Flux.Zeros](https://github.com/FluxML/Flux.jl/pull/873) to eliminating `bias` from being trained.
* Added `GlobalMaxPool` and `GlobalMeanPool` [layers](https://github.com/FluxML/Flux.jl/pull/950) for performing global pooling operations.
* Added `ClipValue` and `ClipNorm` in this [pr](https://github.com/FluxML/Flux.jl/pull/1133) to `Flux.Optimise` to provide a cleaner API for gradient clipping.
* Added new kwarg-only [constructors](https://github.com/FluxML/Flux.jl/pull/873) for the various convolutional layers.
* Documented the convolutional layer constructors accepting `weight` and `bias` keyword arguments to supply custom arrays for those fields.
* Testing suite improvements now test for gradients of all layers along with GPU support.
* Functors have now moved to [Functors.jl](https://github.com/FluxML/Flux.jl/pull/1174) to allow for their use outside of Flux.
* Added [helper functions](https://github.com/FluxML/Flux.jl/pull/873) `Flux.convfilter` and `Flux.depthwiseconvfilter` to construct weight arrays for convolutions outside of layer constructors so as to not have to depend on the default layers for custom implementations.

# v0.10.0
* The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669)
@ -1,6 +1,6 @@
 name = "Flux"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.11.0-DEV"
+version = "0.10.4"

 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@ -9,9 +9,7 @@ CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@ -27,11 +25,10 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

 [compat]
 AbstractTrees = "0.2, 0.3"
-Adapt = "1, 2.0"
+Adapt = "1"
 CodecZlib = "0.5, 0.6, 0.7"
 Colors = "0.8, 0.9, 0.10, 0.11, 0.12"
 CuArrays = "2"
-Functors = "0.1"
 Juno = "0.5, 0.6, 0.7, 0.8"
 MacroTools = "0.3, 0.4, 0.5"
 NNlib = "0.6"
@ -8,8 +8,9 @@ makedocs(modules=[Flux, NNlib],
     "Building Models" =>
         ["Basics" => "models/basics.md",
          "Recurrence" => "models/recurrence.md",
-         "Regularisation" => "models/regularisation.md",
          "Model Reference" => "models/layers.md",
+         "Loss Functions" => "models/losses.md",
+         "Regularisation" => "models/regularisation.md",
          "Advanced Model Building" => "models/advanced.md",
          "NNlib" => "models/nnlib.md"],
     "Handling Data" =>
@ -7,15 +7,15 @@ julia> using Flux: onehot, onecold
|
|||
|
||||
julia> onehot(:b, [:a, :b, :c])
|
||||
3-element Flux.OneHotVector:
|
||||
0
|
||||
1
|
||||
0
|
||||
false
|
||||
true
|
||||
false
|
||||
|
||||
julia> onehot(:c, [:a, :b, :c])
|
||||
3-element Flux.OneHotVector:
|
||||
0
|
||||
0
|
||||
1
|
||||
false
|
||||
false
|
||||
true
|
||||
```
|
||||
|
||||
The inverse is `onecold` (which can take a general probability distribution, as well as just booleans).
|
||||
|
|
|
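For reference, a minimal sketch of the `onecold` inverse mentioned above (same labels assumed; illustrative, not part of the diff):

```julia
julia> using Flux: onehot, onecold

julia> onecold(onehot(:b, [:a, :b, :c]), [:a, :b, :c])
:b

julia> onecold([0.3, 0.2, 0.5], [:a, :b, :c])  # also works on probabilities
:c
```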
@ -19,7 +19,7 @@ Affine{Array{Float64,2},Array{Float64,1}}([0.66722 0.774872 0.249809; 0.843321 0
 julia> Flux.params(a) # default behavior
 Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297], [0.42394, 0.0170927, 0.544955]])

-julia> Flux.trainable(a::Affine) = (a.W,)
+julia> Flux.trainable(a::Affine) = (a.W, a.b,)

 julia> Flux.params(a)
 Params([[0.66722 0.774872 0.249809; 0.843321 0.403843 0.429232; 0.683525 0.662455 0.065297]])
@ -32,6 +32,8 @@ julia> gradient(f, [2, 1], [2, 0])
|
|||
But machine learning models can have *hundreds* of parameters! To handle this, Flux lets you work with collections of parameters, via `params`. You can get the gradient of all parameters used in a program without explicitly passing them in.
|
||||
|
||||
```jldoctest basics
|
||||
julia> using Flux
|
||||
|
||||
julia> x = [2, 1];
|
||||
|
||||
julia> y = [2, 0];
|
||||
|
|
|
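As a companion to the doctest above, a minimal sketch of taking gradients over an implicit parameter collection (the names `W`, `b`, `f` are illustrative):

```julia
using Flux

W, b = rand(2, 3), rand(2)
f(x) = W*x .+ b

# Collect W and b implicitly, then differentiate a scalar loss with respect to them.
gs = gradient(() -> sum(f([1.0, 2.0, 3.0])), Flux.params(W, b))
gs[W]  # gradient array with the same shape as W
```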
@ -20,11 +20,7 @@ GlobalMeanPool
 DepthwiseConv
 ConvTranspose
 CrossCor
-SamePad
 flatten
-Flux.Zeros
-Flux.convfilter
-Flux.depthwiseconvfilter
 ```

 ## Recurrent Layers
@ -71,22 +67,4 @@ Many normalisation layers behave differently under training and inference (testi
 ```@docs
 Flux.testmode!
 trainmode!
 ```
-
-## Cost Functions
-```@docs
-Flux.mae
-Flux.mse
-Flux.msle
-Flux.huber_loss
-Flux.crossentropy
-Flux.logitcrossentropy
-Flux.binarycrossentropy
-Flux.logitbinarycrossentropy
-Flux.kldivergence
-Flux.poisson
-Flux.hinge
-Flux.squared_hinge
-Flux.dice_coeff_loss
-Flux.tversky_loss
-```
@ -0,0 +1,36 @@
+## Loss Functions
+Flux provides a large number of common loss functions used for training machine learning models.
+
+Loss functions for supervised learning typically expect as inputs a target `y`, and a prediction `ŷ`.
+In Flux's convention, the order of the arguments is the following
+```julia
+loss(ŷ, y)
+```
+
+Most loss functions in Flux have an optional argument `agg`, denoting the type of aggregation performed over the
+batch:
+```julia
+loss(ŷ, y)                         # defaults to `mean`
+loss(ŷ, y, agg=sum)                # use `sum` for reduction
+loss(ŷ, y, agg=x->sum(x, dims=2))  # partial reduction
+loss(ŷ, y, agg=x->mean(w .* x))    # weighted mean
+loss(ŷ, y, agg=identity)           # no aggregation.
+```
+
+### Losses Reference
+```@docs
+Flux.mae
+Flux.mse
+Flux.msle
+Flux.huber_loss
+Flux.crossentropy
+Flux.logitcrossentropy
+Flux.binarycrossentropy
+Flux.logitbinarycrossentropy
+Flux.kldivergence
+Flux.poisson_loss
+Flux.hinge
+Flux.squared_hinge
+Flux.dice_coeff_loss
+Flux.tversky_loss
+```
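A short usage sketch of the `agg` keyword documented in this new file (assumes the "+" side of the diff, where losses accept `agg`):

```julia
using Flux

ŷ = softmax(rand(Float32, 10, 5))           # predictions for a batch of 5
y = Flux.onehotbatch(rand(1:10, 5), 1:10)   # one-hot targets

Flux.crossentropy(ŷ, y)                     # default: mean over the batch
Flux.crossentropy(ŷ, y, agg = sum)          # summed instead of averaged
Flux.crossentropy(ŷ, y, agg = identity)     # per-example losses, no reduction
```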
@ -7,9 +7,10 @@ add the result to the overall loss.
 For example, say we have a simple regression.

 ```julia
-using Flux: crossentropy
+using Flux
+using Flux: logitcrossentropy
 m = Dense(10, 5)
-loss(x, y) = crossentropy(softmax(m(x)), y)
+loss(x, y) = logitcrossentropy(m(x), y)
 ```

 We can regularise this by taking the (L2) norm of the parameters, `m.W` and `m.b`.
@ -18,19 +19,19 @@ We can regularise this by taking the (L2) norm of the parameters, `m.W` and `m.b
 using LinearAlgebra

 penalty() = norm(m.W) + norm(m.b)
-loss(x, y) = crossentropy(softmax(m(x)), y) + penalty()
+loss(x, y) = logitcrossentropy(m(x), y) + penalty()
 ```

 When working with layers, Flux provides the `params` function to grab all
-parameters at once. We can easily penalise everything with `sum(norm, params)`.
+parameters at once. We can easily penalise everything with `sum`:

 ```julia
-julia> params(m)
+julia> Flux.params(m)
 2-element Array{Any,1}:
  param([0.355408 0.533092; … 0.430459 0.171498])
  param([0.0, 0.0, 0.0, 0.0, 0.0])

-julia> sum(norm, params(m))
+julia> sum(norm, Flux.params(m))
 26.01749952921026
 ```
|
|||
m = Chain(
|
||||
Dense(28^2, 128, relu),
|
||||
Dense(128, 32, relu),
|
||||
Dense(32, 10), softmax)
|
||||
Dense(32, 10))
|
||||
|
||||
loss(x, y) = crossentropy(m(x), y) + sum(norm, params(m))
|
||||
loss(x, y) = logitcrossentropy(m(x), y) + sum(norm, Flux.params(m))
|
||||
|
||||
loss(rand(28^2), rand(10))
|
||||
```
|
||||
|
|
|
@ -39,7 +39,7 @@ E.g. the following will have run into the same problem as above:
 leaky_tanh(x) = 0.01*x + tanh(x)
 ```

-While one could change the activation function (e.g. to use `0.01f0*x`), the idiomatic (and safe way) to avoid type casts whenever inputs changes is to use `oftype`:
+While one could change the activation function (e.g. to use `0.01f0x`), the idiomatic (and safe way) to avoid type casts whenever inputs changes is to use `oftype`:
 ```
 leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x)
 ```
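A quick check of the `oftype` idiom above (editor's sketch; run in a fresh session):

```julia
leaky_tanh(x) = oftype(x/1, 0.01)*x + tanh(x)

# `oftype(x/1, 0.01)` converts the constant to x's float type,
# so Float32 inputs are not promoted to Float64:
typeof(leaky_tanh(0.5f0))  # Float32
typeof(leaky_tanh(0.5))    # Float64
```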
@ -80,7 +80,7 @@ Momentum(eta::Real, rho::Real) = Momentum(eta, rho, IdDict())
 The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked.

 ```julia
-function Flux.Optimise.apply!(o::Momentum, x, Δ)
+function apply!(o::Momentum, x, Δ)
   η, ρ = o.eta, o.rho
   v = get!(o.velocity, x, zero(x))::typeof(x)
   @. v = ρ * v - η * Δ
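To connect this with the training loop, a minimal sketch of how a custom rule like the `Momentum` above gets exercised (assumes the struct and `apply!` from the surrounding docs, plus a `model`, `loss`, `x`, `y`):

```julia
using Flux

opt = Momentum(0.01, 0.99)
ps = Flux.params(model)
gs = gradient(() -> loss(x, y), ps)
Flux.Optimise.update!(opt, ps, gs)  # calls apply! for each parameter's gradient
```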
@ -140,16 +140,3 @@ ExpDecay
 InvDecay
 WeightDecay
 ```
-
-## Gradient Clipping
-
-Gradient clipping is useful for training recurrent neural networks, which have a tendency to suffer from the exploding gradient problem. An example usage is
-
-```julia
-opt = Optimiser(ClipValue(1e-3), ADAM(1e-3))
-```
-
-```@docs
-ClipValue
-ClipNorm
-```
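For completeness, a sketch of the clipped optimiser in a training call (only valid on the side of the diff that has `ClipValue`):

```julia
using Flux
using Flux.Optimise: Optimiser, ClipValue, ADAM

opt = Optimiser(ClipValue(1e-3), ADAM(1e-3))
# Flux.train!(loss, Flux.params(model), data, opt)  # loss/model/data assumed
```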
@ -142,7 +142,7 @@ function my_custom_train!(loss, ps, data, opt)
   for d in data
     gs = gradient(ps) do
       training_loss = loss(d...)
-      # Insert whatever code you want here that needs Training loss, e.g. logging
+      # Insert what ever code you want here that needs Training loss, e.g. logging
       return training_loss
     end
     # insert what ever code you want here that needs gradient
@ -3,8 +3,7 @@ module Flux
 # Zero Flux Given

 using Base: tail
-using Statistics, Random, LinearAlgebra
-using Zygote, MacroTools, Juno, Reexport
+using Zygote, MacroTools, Juno, Reexport, Statistics, Random
 using MacroTools: @forward
 @reexport using NNlib
 using Zygote: Params, @adjoint, gradient, pullback, @nograd
@ -21,19 +20,18 @@ using .Optimise
 using .Optimise: @epochs
 export Descent, ADAM, Momentum, Nesterov, RMSProp,
   ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
-  ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay,
-  ClipValue, ClipNorm
+  ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay


 using CuArrays
 const use_cuda = Ref(false)

 include("utils.jl")
-include("zeros.jl")
 include("onehot.jl")
 include("functor.jl")

 include("layers/stateless.jl")
+include("layers/losses.jl")
 include("layers/basic.jl")
 include("layers/conv.jl")
 include("layers/recurrent.jl")
@ -51,6 +51,4 @@ export Iris
 include("housing.jl")
 export Housing

-@deprecate DataLoader(x...; kws...) DataLoader(x; kws...)
-
 end
@ -1,7 +1,7 @@
 # Adapted from Knet's src/data.jl (author: Deniz Yuret)

-struct DataLoader{D}
-  data::D
+struct DataLoader
+  data
   batchsize::Int
   nobs::Int
   partial::Bool
@ -11,20 +11,21 @@ struct DataLoader{D}
 end

 """
-    DataLoader(data; batchsize=1, shuffle=false, partial=true)
+    DataLoader(data...; batchsize=1, shuffle=false, partial=true)

 An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations
 (except possibly the last one).

-Takes as input a single data tensor, or a tuple (or a named tuple) of tensors.
-The last dimension in each tensor is considered to be the observation dimension.
+Takes as input one or more data tensors, e.g. X in unsupervised learning, X and Y in
+supervised learning. The last dimension in each tensor is considered to be the observation
+dimension.

 If `shuffle=true`, shuffles the observations each time iterations are re-started.
 If `partial=false`, drops the last mini-batch if it is smaller than the batchsize.

-The original data is preserved in the `data` field of the DataLoader.
+The original data is preserved as a tuple in the `data` field of the DataLoader.

-Usage example:
+Example usage:

     Xtrain = rand(10, 100)
     train_loader = DataLoader(Xtrain, batchsize=2)
@ -36,16 +37,9 @@ Usage example:

     train_loader.data # original dataset

-    # similar, but yielding tuples
-    train_loader = DataLoader((Xtrain,), batchsize=2)
-    for (x,) in train_loader
-      @assert size(x) == (10, 2)
-      ...
-    end
-
     Xtrain = rand(10, 100)
     Ytrain = rand(100)
-    train_loader = DataLoader((Xtrain, Ytrain), batchsize=2, shuffle=true)
+    train_loader = DataLoader(Xtrain, Ytrain, batchsize=2, shuffle=true)
     for epoch in 1:100
       for (x, y) in train_loader
         @assert size(x) == (10, 2)
@ -57,26 +51,26 @@ Usage example:
     # train for 10 epochs
     using IterTools: ncycle
     Flux.train!(loss, ps, ncycle(train_loader, 10), opt)
-
-    # can use NamedTuple to name tensors
-    train_loader = DataLoader((images=Xtrain, labels=Ytrain), batchsize=2, shuffle=true)
-    for datum in train_loader
-      @assert size(datum.images) == (10, 2)
-      @assert size(datum.labels) == (2,)
-    end
 """
-function DataLoader(data; batchsize=1, shuffle=false, partial=true)
+function DataLoader(data...; batchsize=1, shuffle=false, partial=true)
+  length(data) > 0 || throw(ArgumentError("Need at least one data input"))
   batchsize > 0 || throw(ArgumentError("Need positive batchsize"))

-  n = _nobs(data)
-  if n < batchsize
-    @warn "Number of observations less than batchsize, decreasing the batchsize to $n"
-    batchsize = n
+  nx = size(data[1])[end]
+  for i=2:length(data)
+    nx != size(data[i])[end] && throw(DimensionMismatch("All data should contain same number of observations"))
   end
-  imax = partial ? n : n - batchsize + 1
-  DataLoader(data, batchsize, n, partial, imax, [1:n;], shuffle)
+  if nx < batchsize
+    @warn "Number of data points less than batchsize, decreasing the batchsize to $nx"
+    batchsize = nx
+  end
+  imax = partial ? nx : nx - batchsize + 1
+  ids = 1:min(nx, batchsize)
+  DataLoader(data, batchsize, nx, partial, imax, [1:nx;], shuffle)
 end

+getdata(x::AbstractArray, ids) = x[(Base.Colon() for _=1:ndims(x)-1)..., ids]
+
 @propagate_inbounds function Base.iterate(d::DataLoader, i=0) # returns data in d.indices[i+1:i+batchsize]
   i >= d.imax && return nothing
   if d.shuffle && i == 0
@ -84,7 +78,11 @@ end
   end
   nexti = min(i + d.batchsize, d.nobs)
   ids = d.indices[i+1:nexti]
-  batch = _getobs(d.data, ids)
+  if length(d.data) == 1
+    batch = getdata(d.data[1], ids)
+  else
+    batch = ((getdata(x, ids) for x in d.data)...,)
+  end
   return (batch, nexti)
 end
@ -92,19 +90,3 @@ function Base.length(d::DataLoader)
   n = d.nobs / d.batchsize
   d.partial ? ceil(Int,n) : floor(Int,n)
 end
-
-_nobs(data::AbstractArray) = size(data)[end]
-
-function _nobs(data::Union{Tuple, NamedTuple})
-  length(data) > 0 || throw(ArgumentError("Need at least one data input"))
-  n = _nobs(data[1])
-  if !all(x -> _nobs(x) == n, Base.tail(data))
-    throw(DimensionMismatch("All data should contain same number of observations"))
-  end
-  return n
-end
-
-_getobs(data::AbstractArray, i) = data[ntuple(i -> Colon(), Val(ndims(data) - 1))..., i]
-_getobs(data::Union{Tuple, NamedTuple}, i) = map(Base.Fix2(_getobs, i), data)
-
-Base.eltype(::DataLoader{D}) where D = D
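A minimal usage sketch matching the docstring above (tuple-style constructor, i.e. the "-" side; the deprecation in the next hunk maps the vararg form onto it):

```julia
using Flux.Data: DataLoader

Xtrain, Ytrain = rand(10, 100), rand(100)
loader = DataLoader((Xtrain, Ytrain), batchsize = 2, shuffle = true)

for (x, y) in loader
    @assert size(x) == (10, 2) && size(y) == (2,)
end
```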
@ -1,2 +1,3 @@
 @deprecate param(x) x
 @deprecate data(x) x
+@deprecate poisson poisson_loss
@ -1,6 +1,41 @@
 import Adapt: adapt, adapt_storage
 using Zygote: IdSet
-import Functors: @functor, functor, fmap
+
+functor(x) = (), _ -> x
+
+functor(x::Tuple) = x, y -> y
+functor(x::NamedTuple) = x, y -> y
+
+functor(x::AbstractArray) = x, y -> y
+functor(x::AbstractArray{<:Number}) = (), _ -> x
+
+function makefunctor(m::Module, T, fs = fieldnames(T))
+  @eval m begin
+    Flux.functor(x::$T) = ($([:($f=x.$f) for f in fs]...),), y -> $T(y...)
+  end
+end
+
+function functorm(T, fs = nothing)
+  fs == nothing || isexpr(fs, :tuple) || error("@functor T (a, b)")
+  fs = fs == nothing ? [] : [:($(map(QuoteNode, fs.args)...),)]
+  :(makefunctor(@__MODULE__, $(esc(T)), $(fs...)))
+end
+
+macro functor(args...)
+  functorm(args...)
+end
+
+isleaf(x) = functor(x)[1] === ()
+
+function fmap1(f, x)
+  func, re = functor(x)
+  re(map(f, func))
+end
+
+function fmap(f, x; cache = IdDict())
+  haskey(cache, x) && return cache[x]
+  cache[x] = isleaf(x) ? f(x) : fmap1(x -> fmap(f, x, cache = cache), x)
+end

 trainable(m) = functor(m)[1]
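What this machinery buys you, as a sketch: `@functor` registers a struct so `fmap` can rebuild it while transforming its array fields (the `Affine` type here is illustrative):

```julia
using Flux

struct Affine; W; b; end
Flux.@functor Affine

a = Affine(rand(3, 3), rand(3))
a32 = Flux.fmap(x -> Float32.(x), a)  # new Affine with Float32 fields
Flux.trainable(a)                     # defaults to functor(a)[1], i.e. (W, b)
```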
@ -24,7 +59,7 @@ testmode!(m, mode = true) = m
     trainmode!(m, mode = true)

 Set a layer of model's train mode (see below).
-Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`).
+Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)).

 _Note_: if you manually set a model into train mode, you need to manually place
 it into test mode during testing phase.
@ -30,7 +30,7 @@ end
 @forward Chain.layers Base.getindex, Base.length, Base.first, Base.last,
   Base.iterate, Base.lastindex

-functor(::Type{<:Chain}, c) = c.layers, ls -> Chain(ls...)
+functor(c::Chain) = c.layers, ls -> Chain(ls...)

 applychain(::Tuple{}, x) = x
 applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x))
@ -102,7 +102,7 @@ julia> d(rand(5))
|
|||
-0.16210233
|
||||
0.12311903```
|
||||
"""
|
||||
struct Dense{F,S<:AbstractArray,T<:AbstractArray}
|
||||
struct Dense{F,S,T}
|
||||
W::S
|
||||
b::T
|
||||
σ::F
|
||||
|
|
|
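The effect of the tighter bounds on the "-" side, sketched with a stand-in type (`DenseStrict` is hypothetical; cf. the NEWS entry for #1218):

```julia
struct DenseStrict{F,S<:AbstractArray,T<:AbstractArray}
  W::S
  b::T
  σ::F
end

DenseStrict(rand(2, 3), rand(2), identity)  # ok: weights and bias are arrays
# DenseStrict(1.0, 2.0, identity)           # MethodError: scalars are rejected
```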
@ -30,36 +30,27 @@ function calc_padding(::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T}
 end

 """
-    Conv(filter, in => out, σ = identity; init = glorot_uniform,
+    Conv(size, in => out, σ = identity; init = glorot_uniform,
          stride = 1, pad = 0, dilation = 1)

-    filter = (2,2)
-    in = 1
-    out = 16
-    Conv((2, 2), 1=>16, relu)
-
-Standard convolutional layer. `filter` should be a tuple like `(2, 2)`.
+Standard convolutional layer. `size` should be a tuple like `(2, 2)`.
 `in` and `out` specify the number of input and output channels respectively.

 Data should be stored in WHCN order (width, height, # channels, batch size).
 In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.

-Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
-Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
-
 Takes the keyword arguments `pad`, `stride` and `dilation`.
-Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.

 # Examples

-Apply a `Conv` layer to a 1-channel input using a 2×2 window filter size, giving us a
+Apply a `Conv` layer to a 1-channel input using a 2×2 window size, giving us a
 16-channel output. Output is activated with ReLU.
 ```julia
-filter = (2,2)
+size = (2,2)
 in = 1
 out = 16
-Conv(filter, in => out, relu)
+Conv(size, in => out, relu)
 ```
 """
 struct Conv{N,M,F,A,V}
@ -71,28 +62,7 @@ struct Conv{N,M,F,A,V}
   dilation::NTuple{N,Int}
 end

-"""
-    Conv(weight::AbstractArray, bias::AbstractArray)
-    Conv(weight::AbstractArray, bias::AbstractArray, activation)
-
-Constructs the convolutional layer with user defined weight and bias arrays.
-
-Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
-
-Takes the keyword arguments `pad`, `stride` and `dilation`.
-
-There is also a keyword-only constuctor available for all convoultional
-layers.
-
-```julia
-weight = rand(Float32, 3, 3, 5)
-bias = zeros(Float32, 5)
-Conv(weight = weight,
-    bias = bias,
-    σ = sigmoid)
-```
-"""
-function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
               stride = 1, pad = 0, dilation = 1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
@ -100,39 +70,17 @@ function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = id
   return Conv(σ, w, b, stride, pad, dilation)
 end

-function Conv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
-              activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
-  Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
-end
-
-"""
-    convfilter(filter::Tuple, in=>out)
-
-Constructs a standard convolutional weight matrix with given `filter` and
-channels from `in` to `out`.
-
-Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
-distribution.
-
-See also: [`depthwiseconvfilter`](@ref)
-"""
-convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
-           init = glorot_uniform) where N = init(filter..., ch...)
-
-function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-              init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
-              weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
-
-  Conv(weight, bias, σ,
-       stride = stride, pad = pad, dilation = dilation)
-end
+Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+     init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
+  Conv(init(k..., ch...), zeros(ch[2]), σ,
+       stride = stride, pad = pad, dilation = dilation)

 @functor Conv

 function (c::Conv)(x::AbstractArray)
   # TODO: breaks gpu broadcast :(
   # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
-  σ, b = c.σ, reshape(c.bias, ntuple(_->1, length(c.stride))..., :, 1)
+  σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
   cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
   σ.(conv(x, c.weight, cdims) .+ b)
 end
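Both sides of this hunk keep the tuple-based constructor, so a sketch like the following runs on either (sizes are illustrative):

```julia
using Flux

c = Conv((2, 2), 1 => 16, relu)
x = rand(28, 28, 1, 8)   # WHCN: 28×28, 1 channel, batch of 8
size(c(x))               # (27, 27, 16, 8)
```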
@ -166,22 +114,16 @@ outdims(l::Conv, isize) =
 output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))

 """
-    ConvTranspose(filter, in=>out)
-    ConvTranspose(filter, in=>out, activation)
-    ConvTranspose(filter, in => out, σ = identity; init = glorot_uniform,
+    ConvTranspose(size, in => out, σ = identity; init = glorot_uniform,
                   stride = 1, pad = 0, dilation = 1)

-Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`.
+Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`.
 `in` and `out` specify the number of input and output channels respectively.

 Data should be stored in WHCN order (width, height, # channels, batch size).
 In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.

-Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
-Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
-
 Takes the keyword arguments `pad`, `stride` and `dilation`.
-Use `pad=SamePad()` to apply padding so that outputsize == stride * inputsize - stride + 1.
 """
 struct ConvTranspose{N,M,F,A,V}
@ -193,39 +135,18 @@ struct ConvTranspose{N,M,F,A,V}
   dilation::NTuple{N,Int}
 end

-"""
-    ConvTranspose(weight::AbstractArray, bias::AbstractArray)
-    ConvTranspose(weight::AbstractArray, bias::AbstractArray, activation)
-
-Constructs the convolutional transpose layer with user defined weight and bias arrays.
-forward pass.
-
-Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
-
-Takes the keyword arguments `pad`, `stride` and `dilation`.
-
-For keyword-only constuctor, see also [`Conv`](@ref)
-"""
-function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
-                       stride = 1, pad = 0, dilation = 1) where {T,N}
+function ConvTranspose(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
+                       stride = 1, pad = 0, dilation = 1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
   return ConvTranspose(σ, w, b, stride, pad, dilation)
 end

-function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
-                       activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
-  ConvTranspose(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
-end
-
-function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-                       init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
-                       weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N
-
-  ConvTranspose(weight, bias, σ,
+ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+              init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
+  ConvTranspose(init(k..., reverse(ch)...), zeros(ch[2]), σ,
                 stride = stride, pad = pad, dilation = dilation)
-end

 @functor ConvTranspose
@ -237,9 +158,9 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
   batch_size = size(x)[end]
   # Create DenseConvDims() that looks like the corresponding conv()
   return DenseConvDims((I..., C_in, batch_size), size(c.weight);
-                      stride=c.stride,
-                      padding=c.pad,
-                      dilation=c.dilation,
+                       stride=c.stride,
+                       padding=c.pad,
+                       dilation=c.dilation,
   )
 end
@ -250,7 +171,7 @@ function (c::ConvTranspose)(x::AbstractArray)
   # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
   σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
   cdims = conv_transpose_dims(c, x)
-  σ.(∇conv_data(x, c.weight, cdims) .+ b)
+  return σ.(∇conv_data(x, c.weight, cdims) .+ b)
 end

 function Base.show(io::IO, l::ConvTranspose)
@ -269,12 +190,10 @@ end
 outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad)

 """
-    DepthwiseConv(filter::Tuple, in=>out)
-    DepthwiseConv(filter::Tuple, in=>out, activation)
-    DepthwiseConv(filter, in => out, σ = identity; init = glorot_uniform,
+    DepthwiseConv(size, in => out, σ = identity; init = glorot_uniform,
                   stride = 1, pad = 0, dilation = 1)

-Depthwise convolutional layer. `filter` should be a tuple like `(2, 2)`.
+Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`.
 `in` and `out` specify the number of input and output channels respectively.
 Note that `out` must be an integer multiple of `in`.
@ -282,10 +201,6 @@ Data should be stored in WHCN order (width, height, # channels, batch size).
 In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.

-Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
-Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
-
 Takes the keyword arguments `pad`, `stride` and `dilation`.
-Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
 """
 struct DepthwiseConv{N,M,F,A,V}
@ -297,54 +212,20 @@ struct DepthwiseConv{N,M,F,A,V}
   dilation::NTuple{N,Int}
 end

-"""
-    DepthwiseConv(weight::AbstractArray, bias::AbstractArray)
-    DepthwiseConv(weight::AbstractArray, bias::AbstractArray, activation)
-
-Constructs the `DepthwiseConv` layer with user defined weight and bias arrays.
-forward pass.
-
-Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
-
-Takes the keyword arguments `pad`, `stride` and `dilation`.
-
-For keyword-only constuctor, see also [`Conv`](@ref)
-"""
-function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
-                       stride = 1, pad = 0, dilation = 1) where {T,N}
+function DepthwiseConv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
+                       stride = 1, pad = 0, dilation = 1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
   return DepthwiseConv(σ, w, b, stride, pad, dilation)
 end

-function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
-                       activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
-  DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
-end
-
-"""
-    depthwiseconvfilter(filter::Tuple, in=>out)
-
-Constructs a depthwise convolutional weight array defined by `filter` and channels
-from `in` to `out`.
-
-Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
-distribution.
-
-See also: [`convfilter`](@ref)
-"""
-depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
-                    init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1])
-
 function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-                       init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
-                       weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N
+                       init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N
   @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels"

   return DepthwiseConv(
-    weight,
-    bias,
+    init(k..., div(ch[2], ch[1]), ch[1]),
+    zeros(ch[2]),
     σ;
     stride = stride,
     pad = pad,
@ -377,30 +258,24 @@ outdims(l::DepthwiseConv, isize) =
 output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))

 """
-    CrossCor(filter, in=>out)
-    CrossCor(filter, in=>out, activation)
-    CrossCor(filter, in => out, σ = identity; init = glorot_uniform,
+    CrossCor(size, in => out, σ = identity; init = glorot_uniform,
              stride = 1, pad = 0, dilation = 1)

-Standard cross convolutional layer. `filter` should be a tuple like `(2, 2)`.
+Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`.
 `in` and `out` specify the number of input and output channels respectively.

 Data should be stored in WHCN order (width, height, # channels, batch size).
 In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.

-Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
-Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
-
 Takes the keyword arguments `pad`, `stride` and `dilation`.
-Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.

 # Examples

-Apply a `CrossCor` layer to a 1-channel input using a 2×2 window filter size, giving us a
+Apply a `CrossCor` layer to a 1-channel input using a 2×2 window size, giving us a
 16-channel output. Output is activated with ReLU.
 ```julia
-filter = (2,2)
+size = (2,2)
 in = 1
 out = 16
 CrossCor((2, 2), 1=>16, relu)
@ -415,39 +290,18 @@ struct CrossCor{N,M,F,A,V}
   dilation::NTuple{N,Int}
 end

-"""
-    CrossCor(weight::AbstractArray, bias::AbstractArray)
-    CrossCor(weight::AbstractArray, bias::AbstractArray, activation)
-
-Constructs the standard cross convolutional layer with user defined weight and bias
-arrays.
-
-Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
-
-Takes the keyword arguments `pad`, `stride` and `dilation`.
-
-For keyword-only constuctor, see also [`Conv`](@ref)
-"""
-function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
-                  stride = 1, pad = 0, dilation = 1) where {T,N}
+function CrossCor(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
+                  stride = 1, pad = 0, dilation = 1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
   return CrossCor(σ, w, b, stride, pad, dilation)
 end

-function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
-                  activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
-  CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
-end
-
-function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-                  init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
-                  weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
-
-  CrossCor(weight, bias, σ,
+CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+         init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
+  CrossCor(init(k..., ch...), zeros(ch[2]), σ,
            stride = stride, pad = pad, dilation = dilation)
-end

 @functor CrossCor
@ -1,43 +1,35 @@
 # Cost functions
 """
-    mae(ŷ, y)
+    mae(ŷ, y; agg=mean)

-Return the mean of absolute error; calculated as
-`sum(abs.(ŷ .- y)) / length(y)`.
+Return the loss corresponding to mean absolute error:
+
+    agg(abs.(ŷ .- y))
 """
-mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y)
+mae(ŷ, y; agg=mean) = agg(abs.(ŷ .- y))

 """
-    mse(ŷ, y)
+    mse(ŷ, y; agg=mean)

-Return the mean squared error between ŷ and y; calculated as
-`sum((ŷ .- y).^2) / length(y)`.
-
-# Examples
-```jldoctest
-julia> Flux.mse([0, 2], [1, 1])
-1//1
-```
+Return the loss corresponding to mean square error:
+
+    agg((ŷ .- y).^2)
 """
-mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)
+mse(ŷ, y; agg=mean) = agg((ŷ .- y).^2)

 """
-    msle(ŷ, y; ϵ=eps(eltype(ŷ)))
+    msle(ŷ, y; agg=mean, ϵ=eps(eltype(ŷ)))

-Return the mean of the squared logarithmic errors; calculated as
-`sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) / length(y)`.
-The `ϵ` term provides numerical stability.
+The loss corresponding to mean squared logarithmic errors, calculated as
+
+    agg((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2)
+
+The `ϵ` term provides numerical stability.

-Penalizes an under-predicted estimate greater than an over-predicted estimate.
+Penalizes an under-predicted estimate more than an over-predicted estimate.
 """
-msle(ŷ, y; ϵ=eps(eltype(ŷ))) = sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) * 1 // length(y)
+msle(ŷ, y; agg=mean, ϵ=epseltype(ŷ)) = agg((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2)

 """
-    huber_loss(ŷ, y; δ=1.0)
+    huber_loss(ŷ, y; δ=1, agg=mean)

 Return the mean of the [Huber loss](https://en.wikipedia.org/wiki/Huber_loss)
 given the prediction `ŷ` and true values `y`.
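A small sanity check on the refactor above: with the default `agg=mean`, the new definitions agree with the old `sum(...) / length(y)` forms (editor's sketch):

```julia
using Statistics

ŷ, y = [1.0, 2.0, 3.0], [2.0, 2.0, 2.0]
mean(abs.(ŷ .- y)) == sum(abs.(ŷ .- y)) / length(y)  # true (mae)
mean((ŷ .- y).^2) == sum((ŷ .- y).^2) / length(y)    # true (mse)
```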
@ -46,111 +38,188 @@ given the prediction `ŷ` and true values `y`.
|
|||
Huber loss = |
|
||||
| δ * (|ŷ - y| - 0.5 * δ), otherwise
|
||||
"""
|
||||
#TODO: remove dropgrad when Zygote can handle this function with CuArrays
|
||||
function huber_loss(ŷ, y; δ=eltype(ŷ)(1))
|
||||
function huber_loss(ŷ, y; agg=mean, δ=ofeltype(ŷ, 1))
|
||||
abs_error = abs.(ŷ .- y)
|
||||
temp = Zygote.dropgrad(abs_error .< δ)
|
||||
x = eltype(ŷ)(0.5)
|
||||
hub_loss = sum(((abs_error.^2) .* temp) .* x .+ δ*(abs_error .- x*δ) .* (1 .- temp)) * 1 // length(y)
|
||||
temp = abs_error .< δ
|
||||
x = ofeltype(ŷ, 0.5)
|
||||
agg(((abs_error.^2) .* temp) .* x .+ δ*(abs_error .- x*δ) .* (1 .- temp))
|
||||
end
|
||||
|
||||
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
|
||||
return -sum(xlogy.(y, ŷ)) * 1 // size(y, 2)
|
||||
end
|
||||
|
||||
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number)
|
||||
return -sum(xlogy.(y, ŷ)) .* weight * 1 // size(y, 2)
|
||||
end
|
||||
|
||||
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector)
|
||||
return -sum(xlogy.(y, ŷ) .* weight) * 1 // size(y, 2)
|
||||
end
|
||||
wsum(w::Nothing, x; dims) = sum(x, dims=dims)
|
||||
wsum(w::Number, x; dims) = w .* sum(x, dims=dims)
|
||||
wsum(w::AbstractArray, x; dims) = sum( w .* x, dims=dims)
|
||||
|
||||
"""
|
||||
crossentropy(ŷ, y; weight = nothing)
|
||||
crossentropy(ŷ, y; weight=nothing, dims=1, ϵ=eps(eltype(ŷ)),
|
||||
logits=false, agg=mean)
|
||||
|
||||
Return the cross entropy between the given probability distributions;
|
||||
calculated as `-sum(y .* log.(ŷ) .* weight) / size(y, 2)`.
|
||||
calculated as
|
||||
|
||||
`weight` can be `Nothing`, a `Number` or an `AbstractVector`.
|
||||
agg(.-sum(weight .* y .* log.(ŷ .+ ϵ); dims=dims))agg=mean,
|
||||
|
||||
`weight` can be `nothing`, a number or an array.
|
||||
`weight=nothing` acts like `weight=1` but is faster.
|
||||
|
||||
If `logits=true`, the input `̂y` is first fed to a [`softmax`](@ref) layer.
|
||||
|
||||
See also: [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
|
||||
|
||||
# Examples
|
||||
```jldoctest
|
||||
julia> Flux.crossentropy(softmax([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
|
||||
3.085467254747739
|
||||
```
|
||||
"""
|
||||
crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight)
|
||||
|
||||
"""
|
||||
logitcrossentropy(ŷ, y; weight = 1)
|
||||
|
||||
Return the crossentropy computed after a [`Flux.logsoftmax`](@ref) operation;
|
||||
calculated as `-sum(y .* logsoftmax(ŷ) .* weight) / size(y, 2)`.
|
||||
|
||||
`logitcrossentropy(ŷ, y)` is mathematically equivalent to
|
||||
[`Flux.crossentropy(softmax(ŷ), y)`](@ref) but it is more numerically stable.
|
||||
|
||||
See also: [`Flux.crossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
|
||||
|
||||
# Examples
|
||||
```jldoctest
|
||||
julia> Flux.logitcrossentropy([-1.1491, 0.8619, 0.3127], [1, 1, 0])
|
||||
3.085467254747738
|
||||
```
|
||||
"""
|
||||
function logitcrossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1)
|
||||
return -sum(y .* logsoftmax(ŷ) .* weight) * 1 // size(y, 2)
|
||||
function crossentropy(ŷ, y; dims=1, agg=mean, ϵ=epseltype(ŷ),
|
||||
weight=nothing, logits=false)
|
||||
if logits
|
||||
return logitcrossentropy(ŷ, y; dims=dims, agg=agg, weight=weight)
|
||||
end
|
||||
agg(.-wsum(weight, y .* log.(ŷ .+ ϵ); dims=dims))
|
||||
end
|
||||
|
||||
"""
|
||||
binarycrossentropy(ŷ, y; ϵ=eps(ŷ))
|
||||
logitcrossentropy(ŷ, y; weight=nothing, agg=mean, dims=1)
|
||||
|
||||
Return the crossentropy computed after a [`Flux.logsoftmax`](@ref) operation;
|
||||
calculated as
|
||||
|
||||
agg(.-sum(weight .* y .* logsoftmax(ŷ; dims=dims); dims=dims))
|
||||
|
||||
`logitcrossentropy(ŷ, y)` is mathematically equivalent to
|
||||
[`Flux.crossentropy(softmax(log.(ŷ)), y)`](@ref) but it is more numerically stable.
|
||||
|
||||
See also: [`Flux.crossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
|
||||
"""
|
||||
function logitcrossentropy(ŷ, y; dims=1, agg=mean, weight=nothing)
|
||||
agg(.-wsum(weight, y .* logsoftmax(ŷ; dims=dims); dims=dims))
|
||||
end
|
||||
|
||||
"""
|
||||
binarycrossentropy(ŷ, y; agg=mean, ϵ=epseltype(ŷ), logits=false)
|
||||
|
||||
Return ``-y*\\log(ŷ + ϵ) - (1-y)*\\log(1-ŷ + ϵ)``. The `ϵ` term provides numerical stability.
|
||||
|
||||
Typically, the prediction `ŷ` is given by the output of a [`sigmoid`](@ref) activation.
|
||||
|
||||
If `logits=true`, the input `̂y` is first fed to a [`sigmoid`](@ref) activation.
|
||||
See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
|
||||
|
||||
# Examples
|
||||
```jldoctest
|
||||
julia> Flux.binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
|
||||
3-element Array{Float64,1}:
|
||||
1.424397097347566
|
||||
0.35231664672364077
|
||||
0.8616703662235441
|
||||
```
|
||||
"""
|
||||
binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ)
|
||||
|
||||
# Re-definition to fix interaction with CuArrays.
|
||||
CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
|
||||
function binarycrossentropy(ŷ, y; agg=mean, ϵ=epseltype(ŷ), logits=false)
|
||||
if logits
|
||||
return logitbinarycrossentropy(ŷ, y; agg=agg)
|
||||
end
|
||||
agg(@.(-y*log(ŷ+ϵ) - (1-y)*log(1-ŷ+ϵ)))
|
||||
end

"""
    logitbinarycrossentropy(ŷ, y)
    logitbinarycrossentropy(ŷ, y; agg=mean)

`logitbinarycrossentropy(ŷ, y)` is mathematically equivalent to
[`Flux.binarycrossentropy(σ(ŷ), y)`](@ref) but it is more numerically stable.

See also: [`Flux.crossentropy`](@ref), [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref)

# Examples
```jldoctest
julia> Flux.logitbinarycrossentropy.([-1.1491, 0.8619, 0.3127], [1, 1, 0])
3-element Array{Float64,1}:
 1.4243970973475661
 0.35231664672364094
 0.8616703662235443
```
"""
logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ)
function logitbinarycrossentropy(ŷ, y; agg=mean)
  agg(@.((1-y)*ŷ - logsigmoid(ŷ)))
end

# Re-definition to fix interaction with CuArrays.
CuArrays.@cufunc logitbinarycrossentropy(ŷ, y) = (1 - y)*ŷ - logσ(ŷ)

"""
    kldivergence(ŷ, y; dims=1, agg=mean, ϵ=eps(eltype(ŷ)))

Return the [Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence)
between the given arrays interpreted as probability distributions.

KL divergence is a measure of how much one probability distribution is different
from the other.
It is always non-negative and zero only when both the distributions are equal
everywhere.
"""
function kldivergence(ŷ, y; dims=1, agg=mean, ϵ=epseltype(ŷ))
  entropy = agg(sum(y .* log.(y .+ ϵ), dims=dims))
  cross_entropy = crossentropy(ŷ, y; dims=dims, agg=agg, ϵ=ϵ)
  return entropy + cross_entropy
end
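
# Editor's sketch: the divergence of a distribution from itself should vanish,
# since the entropy and cross-entropy terms cancel exactly.
let p = [0.1, 0.2, 0.7]
  kldivergence(p, p)  # ≈ 0
end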

"""
    poisson_loss(ŷ, y; agg=mean, ϵ=eps(eltype(ŷ)))

Loss function derived from the likelihood of a Poisson random variable with
mean `ŷ` taking the value `y`. It is given by

    agg(ŷ .- y .* log.(ŷ .+ ϵ))

[More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
"""
poisson_loss(ŷ, y; agg=mean, ϵ=epseltype(ŷ)) = agg(ŷ .- y .* log.(ŷ .+ ϵ))
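
# Editor's sketch: the loss is minimised when the predicted rate matches the
# target, so perturbing the prediction should only increase it.
let y = [1.0, 2.0, 3.0]
  poisson_loss(y, y) < poisson_loss(y .+ 1, y)  # true
end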

"""
    hinge(ŷ, y; agg=mean)

Return the [hinge loss](https://en.wikipedia.org/wiki/Hinge_loss) given the
prediction `ŷ` and true labels `y` (containing 1 or -1); calculated as

    agg(max.(0, 1 .- ŷ .* y))

See also: [`squared_hinge`](@ref)
"""
hinge(ŷ, y; agg=mean) = agg(max.(0, 1 .- ŷ .* y))

"""
    squared_hinge(ŷ, y; agg=mean)

Return the squared hinge loss given the prediction `ŷ` and true labels `y`
(containing 1 or -1); calculated as

    agg(max.(0, 1 .- ŷ .* y).^2)

See also: [`hinge`](@ref)
"""
squared_hinge(ŷ, y; agg=mean) = agg(max.(0, 1 .- ŷ .* y).^2)
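
# Editor's sketch for both margin losses above (labels are ±1): predictions
# outside the margin cost nothing, violations are penalised (squared below).
let y = [1, -1, 1, 1], ŷ = [1.2, -0.7, 0.4, 2.0]
  hinge(ŷ, y)          # mean([0, 0.3, 0.6, 0])   = 0.225
  squared_hinge(ŷ, y)  # mean([0, 0.09, 0.36, 0]) = 0.1125
end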

"""
    dice_coeff_loss(ŷ, y; smooth=1, dims=size(ŷ)[1:end-1], agg=mean)

Return a loss based on the Dice coefficient.
Used in the [V-Net](https://arxiv.org/pdf/1606.04797v1.pdf) architecture
for image segmentation.
Current implementation only works for the binary segmentation case.

The arrays `ŷ` and `y` contain the predicted and true probabilities respectively
for the foreground to be present in a certain pixel.
The loss is computed as

    1 - (2*sum(ŷ .* y; dims) .+ smooth) ./ (sum(ŷ.^2 .+ y.^2; dims) .+ smooth)

and then aggregated with `agg` over the batch.
"""
function dice_coeff_loss(ŷ, y; smooth=ofeltype(ŷ, 1),
                         dims=size(ŷ)[1:end-1],
                         agg=mean)
  f = x -> sum(x, dims=dims)
  agg(1 .- (2 .* f(y .* ŷ) .+ smooth) ./ (f(y.^2 + ŷ.^2) .+ smooth))
end
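
# Editor's sketch: one Dice score per sample (all dims but the batch one),
# then aggregated; a perfect prediction drives the loss to zero.
let y = Float32.(rand(8, 8, 1, 4) .> 0.5), ŷ = rand(Float32, 8, 8, 1, 4)
  dice_coeff_loss(ŷ, y)  # lies in [0, 1]
  dice_coeff_loss(y, y)  # ≈ 0
end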

"""
    tversky_loss(ŷ, y; β=0.7, α=1-β, dims=size(ŷ)[1:end-1], agg=mean)

Return the [Tversky loss](https://arxiv.org/pdf/1706.05721.pdf)
for binary classification.
The arrays `ŷ` and `y` contain the predicted and true probabilities respectively.
Used with imbalanced data to give more weight to false negatives.
A larger `β` weighs recall higher than precision (by placing more emphasis on false negatives).
Calculated as:

    num = sum(y .* ŷ, dims=dims)
    den = sum(@.(ŷ*y + α*ŷ*(1-y) + β*(1-ŷ)*y), dims=dims)
    tversky_loss = 1 - num/den

and then aggregated with `agg` over the batch.

When `α+β=1`, it is equal to `1-F_β`, where `F_β` is an F-score.
"""
function tversky_loss(ŷ, y; β=ofeltype(ŷ, 0.7), α=1-β, dims=size(ŷ)[1:end-1], agg=mean)
  f = x -> sum(x, dims=dims)
  agg(1 .- f(ŷ .* y) ./ f(@.(ŷ*y + α*ŷ*(1-y) + β*(1-ŷ)*y)))
end
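
# Editor's sketch: raising `β` above the 0.7 default puts still more weight on
# false negatives, which is the point of the loss for sparse foregrounds.
let y = Float32.(rand(8, 8, 1, 4) .> 0.9), ŷ = rand(Float32, 8, 8, 1, 4)
  tversky_loss(ŷ, y)             # default β = 0.7
  tversky_loss(ŷ, y; β = 0.9f0)  # heavier false-negative penalty
end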

# TODO normalise over last dimension is typically what you want to do.
# Possible deprecation path: `normalise(x; dims=1)` -> `normalise(x; dims)` -> `normalise(x; dims=size(x)[end])`
"""
    normalise(x; dims=1)

@ -177,120 +246,18 @@ julia> Flux.normalise(a, dims=2)
-1.22474  0.0  1.22474
```
"""
function normalise(x::AbstractArray; dims=1)
  μ′ = mean(x, dims = dims)
  σ′ = std(x, dims = dims, mean = μ′, corrected=false)
  return (x .- μ′) ./ σ′
function normalise(x::AbstractArray; dims=1, ϵ=ofeltype(x, 1e-6))
  μ′ = mean(x, dims=dims)
  # σ′ = std(x, dims=dims, mean=μ′, corrected=false) # use this when #478 gets merged
  σ′ = std(x, dims=dims, corrected=false)
  return (x .- μ′) ./ (σ′ .+ ϵ)
end
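
# Editor's sketch: each slice along `dims` should come out with mean ≈ 0 and
# (uncorrected) standard deviation ≈ 1, up to the ϵ guard in the denominator.
let x = rand(3, 4), n = normalise(x; dims = 2)
  maximum(abs.(mean(n, dims = 2))) < 1e-8                         # true
  maximum(abs.(std(n, dims = 2, corrected = false) .- 1)) < 1e-3  # true
end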

"""
    kldivergence(ŷ, y)

Return the
[Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence)
between the given probability distributions.

KL divergence is a measure of how much one probability distribution is different
from the other.
It is always non-negative and zero only when both the distributions are equal
everywhere.
"""
function kldivergence(ŷ, y)
  entropy = sum(xlogx.(y)) * 1 // size(y, 2)
  cross_entropy = crossentropy(ŷ, y)
  return entropy + cross_entropy
end

"""
    poisson(ŷ, y)

Return how much the predicted distribution `ŷ` diverges from the expected Poisson
distribution `y`; calculated as `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`.

[More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
"""
poisson(ŷ, y) = sum(ŷ .- xlogy.(y, ŷ)) * 1 // size(y, 2)

"""
    hinge(ŷ, y)

Return the [hinge loss](https://en.wikipedia.org/wiki/Hinge_loss) given the
prediction `ŷ` and true labels `y` (containing 1 or -1); calculated as
`sum(max.(0, 1 .- ŷ .* y)) / size(y, 2)`.

See also: [`squared_hinge`](@ref)
"""
hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) * 1 // size(y, 2)

"""
    squared_hinge(ŷ, y)

Return the squared hinge loss given the prediction `ŷ` and true labels `y`
(containing 1 or -1); calculated as `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)`.

See also: [`hinge`](@ref)
"""
squared_hinge(ŷ, y) = sum((max.(0, 1 .- ŷ .* y)).^2) * 1 // size(y, 2)

"""
    dice_coeff_loss(ŷ, y; smooth=1)

Return a loss based on the Dice coefficient.
Used in the [V-Net](https://arxiv.org/pdf/1606.04797v1.pdf) image segmentation
architecture.
Similar to the F1 score. Calculated as:

    1 - 2*sum(|ŷ .* y| + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth)
"""
dice_coeff_loss(ŷ, y; smooth=eltype(ŷ)(1.0)) = 1 - (2*sum(y .* ŷ) + smooth) / (sum(y.^2) + sum(ŷ.^2) + smooth)

"""
    tversky_loss(ŷ, y; β=0.7)

Return the [Tversky loss](https://arxiv.org/pdf/1706.05721.pdf).
Used with imbalanced data to give more weight to false negatives.
A larger `β` weighs recall higher than precision (by placing more emphasis on false negatives).
Calculated as:

    1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)
"""
tversky_loss(ŷ, y; β=eltype(ŷ)(0.7)) = 1 - (sum(y .* ŷ) + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)

"""
    flatten(x::AbstractArray)

Transform (w, h, c, b)-shaped input into (w × h × c, b)-shaped output
by linearizing all values for each element in the batch.
Reshape arbitrarily-shaped input into a matrix-shaped output,
preserving the size of the last dimension.
Equivalent to `reshape(x, :, size(x)[end])`.
"""
function flatten(x::AbstractArray)
  return reshape(x, :, size(x)[end])
end
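
# Editor's sketch of the documented behaviour on a WHCB-shaped batch:
let x = rand(Float32, 28, 28, 3, 16)
  size(flatten(x)) == (28 * 28 * 3, 16)  # true: (2352, 16)
end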

"""
    xlogx(x)
Return `x * log(x)` for `x ≥ 0`, handling `x = 0` by taking the downward limit.
"""
function xlogx(x)
  result = x * log(x)
  ifelse(iszero(x), zero(result), result)
end
CuArrays.@cufunc function xlogx(x)
  result = x * log(x)
  ifelse(iszero(x), zero(result), result)
end

"""
    xlogy(x, y)
Return `x * log(y)` for `y > 0` with correct limit at `x = 0`.
"""
function xlogy(x, y)
  result = x * log(y)
  ifelse(iszero(x), zero(result), result)
end
CuArrays.@cufunc function xlogy(x, y)
  result = x * log(y)
  ifelse(iszero(x), zero(result), result)
end

@adjoint function broadcasted(::typeof(xlogy), x::Zygote.Numeric, y::Zygote.Numeric)
  res = xlogy.(x, y)
  res, Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y))
end
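
# Editor's sketch: the zero-limit convention of the helpers above in action.
xlogx(0.0)       # 0.0 rather than NaN
xlogy(0.0, 0.0)  # 0.0, the x → 0 limit
xlogy(2.0, 3.0)  # 2.0 * log(3.0)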

flatten(x::AbstractArray) = reshape(x, :, size(x)[end])

@ -27,8 +27,7 @@ Base.getindex(xs::OneHotMatrix, ::Colon, ::Colon) = OneHotMatrix(xs.height, copy
Base.getindex(xs::OneHotMatrix, i::Integer, ::Colon) = map(x -> x[i], xs.data)

# remove workaround when https://github.com/JuliaGPU/CuArrays.jl/issues/676 is fixed
A::AbstractMatrix * B::OneHotMatrix = A[:, cpu(map(x->x.ix, B.data))]
A::AbstractMatrix * B::OneHotMatrix = A[:, map(x->x.ix, B.data)]

Base.hcat(x::OneHotVector, xs::OneHotVector...) = OneHotMatrix(length(x), [x, xs...])

@ -49,7 +48,7 @@ cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.d
Create a `OneHotVector` with its `l`-th element `true` based on the
possible set of `labels`.
If `unk` is given, return `onehot(unk, labels)` if the input label `l` is not found
in `labels`; otherwise, it will raise an error.
in `labels`; otherwise it will error.

# Examples
```jldoctest
@ -1,12 +1,9 @@
module Optimise

using LinearAlgebra

export train!, update!,
  Descent, ADAM, Momentum, Nesterov, RMSProp,
  ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW, RADAM,
  InvDecay, ExpDecay, WeightDecay, stop, Optimiser,
  ClipValue, ClipNorm
  InvDecay, ExpDecay, WeightDecay, stop, Optimiser

include("optimisers.jl")
include("train.jl")

@ -509,7 +509,7 @@ function apply!(o::ExpDecay, x, Δ)
  η, s, decay = o.eta, o.step, o.decay
  n = o.current[x] = get(o.current, x, 0) + 1
  if o.current[x]%s == 0 && count(x -> x%s == 0, values(o.current)) == 1
    η = max(η * decay, o.clip)
    η = max(η * decay^(s / n), o.clip)
    o.eta = η
  end
  @. Δ *= η

@ -533,31 +533,3 @@ function apply!(o::WeightDecay, x, Δ)
  wd = o.wd
  @. Δ += wd * x
end

"""
    ClipValue(thresh)

Clip gradients when their absolute value exceeds `thresh`.
"""
mutable struct ClipValue{T}
  thresh::T
end

apply!(o::ClipValue, x, Δ) = clamp!(Δ, -o.thresh, o.thresh)

"""
    ClipNorm(thresh)

Clip gradients when their L2 norm exceeds `thresh`.
"""
mutable struct ClipNorm{T}
  thresh::T
end

function apply!(o::ClipNorm, x, Δ)
  Δnrm = norm(Δ)
  if Δnrm > o.thresh
    rmul!(Δ, o.thresh / Δnrm)
  end
  return Δ
end
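
# Editor's note (hedged; these are the rules removed in this comparison): when
# present, clipping composes with an update rule via `Optimiser`, so each
# gradient is clipped before the optimiser step, e.g.
#   opt = Optimiser(ClipNorm(1.0), Descent(0.1))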

@ -68,7 +68,8 @@ and compute the gradient of `loss(d)`.
A callback is given with the keyword argument `cb`. For example, this will print
"training" every 10 seconds (using [`Flux.throttle`](@ref)):

    train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10))
    train!(loss, params, data, opt,
           cb = throttle(() -> println("training"), 10))

The callback can call [`Flux.stop`](@ref) to interrupt the training loop.
@ -4,6 +4,9 @@ nfan(n) = 1, n # A vector is treated as a n×1 matrix
nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices
nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) # In case of convolution kernels

ofeltype(x, y) = convert(float(eltype(x)), y)
epseltype(x) = eps(float(eltype(x)))
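
# Editor's sketch of the helpers above: a Dense weight of size (100, 10) has
# fan-in 10 and fan-out 100; a 3×3 kernel mapping 1 => 8 channels has fan-in
# 3*3*1 = 9 and fan-out 3*3*8 = 72.
nfan(100, 10)     # (10, 100)
nfan(3, 3, 1, 8)  # (9, 72)
ofeltype(rand(Float32, 2), 1)  # 1.0f0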

"""
    glorot_uniform(dims...)

@ -246,10 +249,6 @@ function _restructure(m, xs)
  end
end

@adjoint function _restructure(m, xs)
  _restructure(m, xs), dm -> (nothing, destructure(dm)[1])
end

"""
    destructure(m)

src/zeros.jl
@ -1,106 +0,0 @@
import Base: +, -, *, reshape, size
import Base.Broadcast: broadcasted, Broadcasted, BroadcastStyle

"""
    Zeros()
    Zeros(size...)
    Zeros(Type, size...)

Acts as a stand-in for an array of zeros that can be
used during training, and is ignored by the optimisers.

Useful for turning bias off in a layer's forward pass.

## Examples

```julia
julia> Flux.Zeros(3,3)
3×3 Flux.Zeros{Bool,2}:
 false  false  false
 false  false  false
 false  false  false

julia> Flux.Zeros(Float32, 3,3)
3×3 Flux.Zeros{Float32,2}:
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0

julia> rand(3,3) .+ Flux.Zeros()
3×3 Array{Float64,2}:
 0.198739  0.490459  0.785386
 0.779074  0.39986   0.66383
 0.854981  0.447292  0.314497

julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros())
Conv((2, 2), 1=>3)
```
"""
struct Zeros{T,N} <: AbstractArray{T,N}
  size::Tuple
end

Zeros(::Type{T}, sz...) where T = Zeros{T,length(sz)}(sz)
Zeros(sz::Integer...) = Zeros(Bool, sz...)

Base.size(xs::Zeros) = xs.size
Base.axes(xs::Zeros) = Base.OneTo.(size(xs))

Base.IndexStyle(::Type{<:Zeros}) = IndexLinear()

Base.getindex(xs::Zeros{T,N}, I::Int) where {T,N} = zero(T)
Base.getindex(xs::Zeros{T,N}, inds::Union{Base.OneTo, Base.UnitRange}) where {T,N} =
  Zeros(T, length(inds))

Base.collect(xs::Zeros{T,N}) where {T,N} = fill(zero(T), size(xs))

@adjoint reshape(xs::Zeros{T}, dims...) where T =
  reshape(xs, dims...), _ -> nothing

# Define basic ops
for f in (:+, :-)
  @eval @inline function $f(a::Union{AbstractArray{<:Number}, Zeros}, b::Zeros)
    @assert size(a) == size(b) throw(DimensionMismatch("dimensions must match"))
    a
  end
end

+(a::Zeros, b::AbstractArray) = b + a
-(a::Zeros, b::AbstractArray) = -b + a

Base.copy(xs::Zeros{T,N}) where {T,N} = xs

# Define broadcasting behaviour
for op in (:+, :-)
  @eval function broadcasted(::typeof($op), a::AbstractArray, b::Zeros)
    bs = Broadcast.broadcast_shape(size(a), size(b))
    size(a) == bs && return a
    sz = similar(a, bs)
    sz .= a
  end
end

broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = broadcasted(+, b, a)
broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = broadcasted(+, -b, a)

function broadcasted(::typeof(*), a::AbstractArray, b::Zeros)
  Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end

broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = broadcasted(*, b, a)

for op in (:+, :-, :*)
  @eval broadcasted(::typeof($op), a::Zeros, b::Zeros) = Zeros(Broadcast.broadcast_shape(size(a), size(b))...)
end

# Some opportunities to avoid scalar indexing, intermediaries
# Since it replicates a little of what we expect Base to do,
# it should be possible to remove in the future, but for now,
# these help with performance.
broadcasted(::typeof(+), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(+), a::Zeros{T,0}, b::AbstractArray) where T = b
broadcasted(::typeof(-), a::AbstractArray, b::Zeros{T,0}) where T = a
broadcasted(::typeof(-), a::Zeros{T,0}, b::AbstractArray) where T = -b
broadcasted(::typeof(*), a::AbstractArray, b::Zeros{T,0}) where T = zero(a)
broadcasted(::typeof(*), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)
broadcasted(::typeof(/), a::Zeros{T,0}, b::AbstractArray) where T = zero(b)

@ -33,8 +33,8 @@ cx = gpu(x)

x = [-1.1491, 0.8619, 0.3127]
y = [1, 1, 0.]
@test Flux.binarycrossentropy.(σ.(x),y) ≈ Array(Flux.binarycrossentropy.(cu(σ.(x)),cu(y)))
@test Flux.logitbinarycrossentropy.(x,y) ≈ Array(Flux.logitbinarycrossentropy.(cu(x),cu(y)))
@test Flux.binarycrossentropy(σ.(x), y) ≈ Flux.binarycrossentropy(cu(σ.(x)), cu(y))
@test Flux.logitbinarycrossentropy(x, y) ≈ Flux.logitbinarycrossentropy(cu(x), cu(y))

xs = rand(5, 5)
ys = Flux.onehotbatch(1:5,1:5)

@ -69,7 +69,6 @@ if CuArrays.has_cudnn()
  @info "Testing Flux/CUDNN"
  include("cudnn.jl")
  include("curnn.jl")
  include("layers.jl")
else
  @warn "CUDNN unavailable, not testing GPU DNN support"
end

@ -1,98 +0,0 @@
# Test layers and data/model movements on and off the GPU
# Add tests for layers and their gradients on the GPU
# Most of the forward passes should be fine being applied
# to bitstype objects, but this gives higher coverage for our use-cases
# Check that getting the gradients does not throw

# generic movement tests
@testset "Basic GPU Movement" begin
  @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple
  @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple
end

# TODO: These layers get into scalar indexing
# `AlphaDropout` throws a compilation error on GPUs,
# whereas the rest are scalar indexing issues.
const BROKEN_LAYERS = [DepthwiseConv,
                       AlphaDropout,
                       InstanceNorm,
                       GroupNorm]

function gradtest(name::String, layers::Vector, xs = nothing, args...)
  isnothing(xs) && error("Missing input to test the layers against.")
  @testset "$name GPU grad tests" begin
    for layer in layers
      @testset "$layer GPU grad test" begin
        l = gpu(layer(args...))
        xs = gpu(xs)
        if any(x -> isa(l, x), BROKEN_LAYERS)
          ps = Flux.params(l)
          @test_broken gradient(() -> sum(l(xs)), ps) isa Flux.Zygote.Grads
        else
          ps = Flux.params(l)
          @test gradient(() -> sum(l(xs)), ps) isa Flux.Zygote.Grads
          gs = gradient(() -> sum(l(xs)), ps)

          # Handle pooling layers
          if !isempty(ps)
            @test gs[first(ps)] isa Flux.CuArrays.CuArray
          end
        end
      end
    end
  end
end

# Repeats from Conv, CrossCor

r = rand(Float32, 28, 28, 1, 1)
conv_layers = [Conv, ConvTranspose, CrossCor, DepthwiseConv]
gradtest("Conv", conv_layers, r, (2,2), 1=>3)

pooling_layers = [MaxPool, MeanPool]
gradtest("Pooling", pooling_layers, r, (2,2))

dropout_layers = [Dropout, AlphaDropout]
gradtest("Dropout", dropout_layers, r, 0.5f0)

norm_layers = [LayerNorm, BatchNorm]
gradtest("Normalising", norm_layers, rand(Float32, 28,28,3,1), 1)

instancenorm = [InstanceNorm]
gradtest("InstanceNorm", instancenorm, r, 1)

groupnorm = [GroupNorm]
gradtest("GroupNorm", groupnorm, rand(Float32, 28,28,3,1), 3, 1)

const stateless_layers = [Flux.mse,
                          Flux.crossentropy,
                          Flux.logitcrossentropy,
                          Flux.normalise]

const stateless_layers_broadcasted = [Flux.binarycrossentropy,
                                      Flux.logitbinarycrossentropy]

function stateless_gradtest(f, args...)
  @test gradient((args...) -> sum(f(args...)), args...)[1] isa CuArray
end

function stateless_gradtest_broadcasted(f, args...)
  @test gradient((args...) -> sum(f.(args...)), args...)[1] isa CuArray
end

@testset "Stateless GPU grad tests" begin
  x = gpu(rand(3,3))
  y = gpu(rand(3,3))

  for layer in stateless_layers
    if layer == Flux.normalise
      stateless_gradtest(layer, x)
    else
      stateless_gradtest(layer, x, y)
    end
  end

  for layer in stateless_layers_broadcasted
    stateless_gradtest_broadcasted(layer, x, y)
  end
end
test/data.jl
@ -3,34 +3,20 @@
Y = [1:5;]

d = DataLoader(X, batchsize=2)
@inferred first(d)
batches = collect(d)
@test eltype(batches) == eltype(d) == typeof(X)
@test length(batches) == 3
@test batches[1] == X[:,1:2]
@test batches[2] == X[:,3:4]
@test batches[3] == X[:,5:5]

d = DataLoader(X, batchsize=2, partial=false)
@inferred first(d)
batches = collect(d)
@test eltype(batches) == eltype(d) == typeof(X)
@test length(batches) == 2
@test batches[1] == X[:,1:2]
@test batches[2] == X[:,3:4]

d = DataLoader((X,), batchsize=2, partial=false)
@inferred first(d)
d = DataLoader(X, Y, batchsize=2)
batches = collect(d)
@test eltype(batches) == eltype(d) == Tuple{typeof(X)}
@test length(batches) == 2
@test batches[1] == (X[:,1:2],)
@test batches[2] == (X[:,3:4],)

d = DataLoader((X, Y), batchsize=2)
@inferred first(d)
batches = collect(d)
@test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)}
@test length(batches) == 3
@test length(batches[1]) == 2
@test length(batches[2]) == 2

@ -42,22 +28,6 @@
@test batches[3][1] == X[:,5:5]
@test batches[3][2] == Y[5:5]

# test with NamedTuple
d = DataLoader((x=X, y=Y), batchsize=2)
@inferred first(d)
batches = collect(d)
@test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}}
@test length(batches) == 3
@test length(batches[1]) == 2
@test length(batches[2]) == 2
@test length(batches[3]) == 2
@test batches[1][1] == batches[1].x == X[:,1:2]
@test batches[1][2] == batches[1].y == Y[1:2]
@test batches[2][1] == batches[2].x == X[:,3:4]
@test batches[2][2] == batches[2].y == Y[3:4]
@test batches[3][1] == batches[3].x == X[:,5:5]
@test batches[3][2] == batches[3].y == Y[5:5]

# test interaction with `train!`
θ = ones(2)
X = zeros(2, 10)

@ -71,7 +41,7 @@
X = ones(2, 10)
Y = fill(2, 10)
loss(x, y) = sum((y - x'*θ).^2)
d = DataLoader((X, Y))
d = DataLoader(X, Y)
Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1))
@test norm(θ .- 1) < 1e-10
end
@ -28,14 +28,6 @@ import Flux: activations
end

@testset "Dense" begin
  @testset "constructors" begin
    @test size(Dense(10, 100).W) == (100, 10)
    @test Dense(rand(100,10), rand(10)).σ == identity

    @test_throws MethodError Dense(10, 10.5)
    @test_throws MethodError Dense(10, 10.5, tanh)
  end

  @test length(Dense(10, 5)(randn(10))) == 5
  @test_throws DimensionMismatch Dense(10, 5)(randn(1))
  @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting

@ -45,6 +37,7 @@ import Flux: activations
  @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2)
  @test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1)
  @test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]

end

@testset "Diagonal" begin
@ -25,35 +25,6 @@ end
  Dense(288, 10), softmax)

  @test size(m(r)) == (10, 5)

  # Test bias switch
  bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3))
  ip = zeros(Float32, 28,28,1,1)

  op = bias(ip)
  @test sum(op) == prod(size(op))

  bias = Conv((2,2), 1=>3, bias = Flux.Zeros())
  op = bias(ip)
  @test sum(op) === 0.f0
  gs = gradient(() -> sum(bias(ip)), Flux.params(bias))
  @test gs[bias.bias] == nothing

  # Train w/o bias and make sure no convergence happens
  # when only the bias is free to change
  bias = Conv((2, 2), 1=>3, bias = Flux.Zeros());
  ip = zeros(Float32, 28,28,1,1)
  op = zeros(Float32, 27,27,3,1) .+ 2.f0
  opt = Descent()

  for _ = 1:10^3
    gs = gradient(params(bias)) do
      Flux.mse(bias(ip), op)
    end
    Flux.Optimise.update!(opt, params(bias), gs)
  end

  @test Flux.mse(bias(ip), op) ≈ 4.f0
end

@testset "asymmetric padding" begin
@ -1,26 +1,9 @@
using Test
using Flux: onehotbatch, mse, crossentropy, logitcrossentropy,
  σ, binarycrossentropy, logitbinarycrossentropy, flatten,
  xlogx, xlogy
  σ, binarycrossentropy, logitbinarycrossentropy, flatten

const ϵ = 1e-7

@testset "xlogx & xlogy" begin
  @test iszero(xlogx(0))
  @test isnan(xlogx(NaN))
  @test xlogx(2) ≈ 2.0 * log(2.0)
  @inferred xlogx(2)
  @inferred xlogx(0)

  @test iszero(xlogy(0, 1))
  @test isnan(xlogy(NaN, 1))
  @test isnan(xlogy(1, NaN))
  @test isnan(xlogy(NaN, NaN))
  @test xlogy(2, 3) ≈ 2.0 * log(3.0)
  @inferred xlogy(2, 3)
  @inferred xlogy(0, 1)
end

@testset "losses" begin
  # First, regression-style y's
  y = [1, 1, 0, 0]

@ -29,15 +12,15 @@ end
  @testset "mse" begin
    @test mse(ŷ, y) ≈ (.1^2 + .9^2)/2
  end

  @testset "mae" begin
    @test Flux.mae(ŷ, y) ≈ 1/2
  end

  @testset "huber_loss" begin
    @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002
  end

end

  y = [123.0,456.0,789.0]
  ŷ = [345.0,332.0,789.0]
  @testset "msle" begin

@ -52,7 +35,6 @@
  lossvalue = 1.203972804325936

  @testset "crossentropy" begin
    @test crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ crossentropy([0.1,0.9], [0.1,0.9])
    @test crossentropy(ŷ, y) ≈ lossvalue
  end
@ -74,59 +56,58 @@

  logŷ, y = randn(3), rand(3)
  @testset "binarycrossentropy" begin
    @test binarycrossentropy.(σ.(logŷ), y; ϵ=0) ≈ -y.*log.(σ.(logŷ)) - (1 .- y).*log.(1 .- σ.(logŷ))
    @test binarycrossentropy.(σ.(logŷ), y) ≈ -y.*log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - (1 .- y).*log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ)))
    @test binarycrossentropy(σ.(logŷ), y; ϵ=0) ≈ mean(-y.*log.(σ.(logŷ)) - (1 .- y).*log.(1 .- σ.(logŷ)))
    @test binarycrossentropy(σ.(logŷ), y) ≈ mean(-y.*log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - (1 .- y).*log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ))))
  end

  @testset "logitbinarycrossentropy" begin
    @test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0)
    @test logitbinarycrossentropy(logŷ, y) ≈ binarycrossentropy(σ.(logŷ), y; ϵ=0)
  end

  y = [1 2 3]
  ŷ = [4.0 5.0 6.0]
  @testset "kldivergence" begin
    @test Flux.kldivergence([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ Flux.kldivergence([0.1,0.9], [0.1,0.9])
    @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457
    @test Flux.kldivergence(y, y) ≈ 0
  end

  y = [1 2 3 4]
  ŷ = [5.0 6.0 7.0 8.0]
  @testset "hinge" begin
    @test Flux.hinge(ŷ, y) ≈ 0
    @test Flux.hinge(y, 0.5 .* y) ≈ 0.125
  end

  @testset "squared_hinge" begin
    @test Flux.squared_hinge(ŷ, y) ≈ 0
    @test Flux.squared_hinge(y, 0.5 .* y) ≈ 0.0625
  end

  y = [0.1 0.2 0.3]
  ŷ = [0.4 0.5 0.6]
  @testset "poisson" begin
    @test Flux.poisson(ŷ, y) ≈ 0.6278353988097339
    @test Flux.poisson(y, y) ≈ 0.5044459776946685
    @test Flux.poisson_loss(ŷ, y) ≈ 0.6278353988097339
    @test Flux.poisson_loss(y, y) ≈ 0.5044459776946685
  end

  y = [1.0 0.5 0.3 2.4]
  ŷ = [0 1.4 0.5 1.2]
  @testset "dice_coeff_loss" begin
    @test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999
    @test Flux.dice_coeff_loss(y, y) ≈ 0.0
    @test Flux.dice_coeff_loss(ŷ, y, dims=(1,2)) ≈ 0.2799999999999999
    @test Flux.dice_coeff_loss(y, y, dims=(1,2)) ≈ 0.0
  end

  @testset "tversky_loss" begin
    @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383
    @test Flux.tversky_loss(ŷ, y, β = 0.8) ≈ -0.09490740740740744
    @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075
    @test Flux.tversky_loss(ŷ, y, dims=(1,2)) ≈ 0.036175710594315236
    @test Flux.tversky_loss(ŷ, y, dims=(1,2), β = 0.8) ≈ 0.06281407035175879
    @test Flux.tversky_loss(y, y, dims=(1,2)) ≈ -0.6904761904761902
  end

  @testset "no spurious promotions" begin
    for T in (Float32, Float64)
      y = rand(T, 2)
      ŷ = rand(T, 2)
      for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson,
      for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson_loss,
                Flux.mae, Flux.huber_loss, Flux.msle, Flux.squared_hinge, Flux.dice_coeff_loss, Flux.tversky_loss)
        fwd, back = Flux.pullback(f, ŷ, y)
        @test fwd isa T

@ -57,57 +57,35 @@ end
end

@testset "ExpDecay" begin

  @testset "Sanity Check" begin
    o = ExpDecay(0.2, 0.5, 1, 1e-3)
    p = [0.0]
    steps = 1:8
    eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip)
    eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps]
    @test eta_actual == eta_expected
  end

  w = randn(10, 10)
  o = ExpDecay(0.1, 0.1, 1000, 1e-4)
  w1 = randn(10,10)
  loss(x) = Flux.mse(w*x, w1*x)
  flag = 1
  decay_steps = []
  for t = 1:10^5
    prev_eta = o.eta
    θ = Params([w1])
    x = rand(10)
    θ̄ = gradient(() -> loss(x), θ)
    prev_grad = collect(θ̄[w1])
    delta = Optimise.apply!(o, w1, θ̄[w1])
    w1 .-= delta
    new_eta = o.eta
    if new_eta != prev_eta
      push!(decay_steps, t)
    end
    array = fill(o.eta, size(prev_grad))
    if array .* prev_grad != delta
      flag = 0
    end
  end
  @test flag == 1
  # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1).
  ground_truth = []
  for i in 1:4
    push!(ground_truth, 1000*i) # Expected decay steps for this example.
  end
  @test decay_steps == ground_truth
  @test o.eta == o.clip
end

@testset "Clipping" begin
  w = randn(10, 10)
  loss(x) = sum(w * x)
  θ = Params([w])
  x = 1000 * randn(10)
  w̄ = gradient(() -> loss(x), θ)[w]
  w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄))
  @test all(w̄_value .<= 1)
  w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄))
  @test norm(w̄_norm) <= 1
end
o = ExpDecay(0.1, 0.1, 1000, 1e-4)
w1 = randn(10,10)
loss(x) = Flux.mse(w*x, w1*x)
flag = 1
decay_steps = []
for t = 1:10^5
  prev_eta = o.eta
  θ = Params([w1])
  x = rand(10)
  θ̄ = gradient(() -> loss(x), θ)
  prev_grad = collect(θ̄[w1])
  delta = Optimise.apply!(o, w1, θ̄[w1])
  w1 .-= delta
  new_eta = o.eta
  if new_eta != prev_eta
    push!(decay_steps, t)
  end
  array = fill(o.eta, size(prev_grad))
  if array .* prev_grad != delta
    flag = 0
  end
end
@test flag == 1
# Test to check if decay happens at decay steps. Eta reaches clip value eventually.
ground_truth = []
for i in 1:11
  push!(ground_truth, 1000*i) # Expected decay steps for this example.
end
@test decay_steps == ground_truth
@test o.eta == o.clip
end

@ -2,45 +2,49 @@ using Flux
using Flux.Data
using Test
using Random, Statistics, LinearAlgebra
using Documenter
using IterTools: ncycle

Random.seed!(0)

@testset "Utils" begin
  include("utils.jl")
end
@testset "Flux" begin

@testset "Onehot" begin
  include("onehot.jl")
end

@testset "Optimise" begin
  include("optimise.jl")
end

@testset "Data" begin
  include("data.jl")
end

@testset "Layers" begin
  include("layers/basic.jl")
  include("layers/normalisation.jl")
  include("layers/stateless.jl")
  include("layers/conv.jl")
end

@testset "CUDA" begin
  if Flux.use_cuda[]
    include("cuda/cuda.jl")
  else
    @warn "CUDA unavailable, not testing GPU support"
  @testset "Utils" begin
    include("utils.jl")
  end

  @testset "Onehot" begin
    include("onehot.jl")
  end

  @testset "Optimise" begin
    include("optimise.jl")
  end

  @testset "Data" begin
    include("data.jl")
  end

  @testset "Layers" begin
    include("layers/basic.jl")
    include("layers/normalisation.jl")
    include("layers/stateless.jl")
    include("layers/conv.jl")
  end

  @testset "CUDA" begin
    if Flux.use_cuda[]
      include("cuda/cuda.jl")
    else
      @warn "CUDA unavailable, not testing GPU support"
    end
  end
end

@static if VERSION >= v"1.4"
using Documenter
@testset "Docs" begin
  DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
  doctest(Flux)
  if VERSION >= v"1.4"
    DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
    doctest(Flux)
  end
end
end

end # testset Flux