more docs and constructors

Dhairya Gandhi 2020-02-26 22:29:14 +05:30
commit cd931793ef
34 changed files with 967 additions and 880 deletions

.github/workflows/CompatHelper.yml

@ -0,0 +1,24 @@
name: CompatHelper
on:
schedule:
- cron: '00 00 * * *'
jobs:
CompatHelper:
runs-on: ${{ matrix.os }}
strategy:
matrix:
julia-version: [1.3]
julia-arch: [x64]
os: [ubuntu-latest]
steps:
- uses: julia-actions/setup-julia@latest
with:
version: ${{ matrix.julia-version }}
- name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main()
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: julia -e 'using CompatHelper; CompatHelper.main()'

.github/workflows/TagBot.yml

@ -0,0 +1,11 @@
name: TagBot
on:
schedule:
- cron: 0 * * * *
jobs:
TagBot:
runs-on: ubuntu-latest
steps:
- uses: JuliaRegistries/TagBot@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}


@ -1,51 +1,41 @@
before_script:
- export CI_DISABLE_CURNN_TEST=true
variables:
CI_IMAGE_TAG: 'cuda'
include:
- 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/common.yml'
- 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v6.yml'
.flux:
extends: .test
script:
- julia -e 'using InteractiveUtils;
versioninfo()'
- mkdir $JULIA_DEPOT_PATH # Pkg3.jl#325
- julia --project -e 'using Pkg;
Pkg.instantiate();
Pkg.build();
Pkg.test(; coverage=true);'
image: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
test:v1.0:
extends: .flux
variables:
CI_VERSION_TAG: 'v1.0'
test:v1.1:
extends: .flux
variables:
CI_VERSION_TAG: 'v1.1'
# julia:1.0:
# extends:
# - .julia:1.0
# - .test
# tags:
# - nvidia
#
# julia:1.1:
# extends:
# - .julia:1.1
# - .test
# tags:
# - nvidia
#
# julia:1.2:
# extends:
# - .julia:1.2
# - .test
# tags:
# - nvidia
test:v1.2:
extends: .flux
variables:
CI_VERSION_TAG: 'v1.2'
test:v1.3:
extends: .flux
variables:
CI_VERSION_TAG: 'v1.3'
test:v1.0:
extends: .flux
variables:
CI_VERSION_TAG: 'v1.0'
test:dev:
extends: .flux
variables:
CI_VERSION_TAG: 'dev'
julia:1.3:
extends:
- .julia:1.3
- .test
tags:
- nvidia
julia:nightly:
extends:
- .julia:nightly
- .test
tags:
- nvidia
allow_failure: true


@ -6,7 +6,7 @@ os:
# - osx
julia:
- 1.1
- 1.3
- nightly
matrix:
@ -16,7 +16,7 @@ matrix:
jobs:
include:
- stage: "Documentation"
julia: 1.0
julia: 1.3
os: linux
script:
- julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd()));


@ -2,15 +2,15 @@
[[AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "380e36c66edfa099cd90116b24c1ce8cafccac40"
git-tree-sha1 = "051c95d6836228d120f5f4b984dd5aba1624f716"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "0.4.1"
version = "0.5.0"
[[AbstractTrees]]
deps = ["Markdown", "Test"]
git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b"
deps = ["Markdown"]
git-tree-sha1 = "8201f932428d25a2e2903300764515754847d87d"
uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
version = "0.2.1"
version = "0.3.0"
[[Adapt]]
deps = ["LinearAlgebra"]
@ -21,46 +21,34 @@ version = "1.0.0"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinDeps]]
deps = ["Compat", "Libdl", "SHA", "URIParser"]
git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9"
uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
version = "0.8.10"
[[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"]
git-tree-sha1 = "c7361ce8a2129f20b0e05a89f7070820cfed6648"
deps = ["Libdl", "SHA"]
git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.6"
version = "0.5.8"
[[CEnum]]
git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.2.0"
[[CSTParser]]
deps = ["Tokenize"]
git-tree-sha1 = "c69698c3d4a7255bc1b4bc2afc09f59db910243b"
uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f"
version = "0.6.2"
[[CUDAapi]]
deps = ["Libdl", "Logging"]
git-tree-sha1 = "e063efb91cfefd7e6afd92c435d01398107a500b"
git-tree-sha1 = "56a813440ac98a1aa64672ab460a1512552211a7"
uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
version = "1.2.0"
version = "2.1.0"
[[CUDAdrv]]
deps = ["CUDAapi", "Libdl", "Printf"]
git-tree-sha1 = "9ce99b5732c70e06ed97c042187baed876fb1698"
deps = ["CEnum", "CUDAapi", "Printf"]
git-tree-sha1 = "1fce616fa0806c67c133eb1d2f68f0f1a7504665"
uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
version = "3.1.0"
version = "5.0.1"
[[CUDAnative]]
deps = ["Adapt", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Printf", "TimerOutputs"]
git-tree-sha1 = "52ae1ce10ebfa686e227655c47b19add89308623"
deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"]
git-tree-sha1 = "6e11d5c2c91fc623952e94c4fb73f9c4db74795a"
uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
version = "2.3.1"
version = "2.7.0"
[[CodecZlib]]
deps = ["BinaryProvider", "Libdl", "TranscodingStreams"]
@ -70,9 +58,9 @@ version = "0.6.0"
[[ColorTypes]]
deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "10050a24b09e8e41b951e9976b109871ce98d965"
git-tree-sha1 = "7b62b728a5f3dd6ee3b23910303ccf27e82fad5e"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.8.0"
version = "0.8.1"
[[Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"]
@ -86,40 +74,22 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0"
uuid = "bbf7d656-a473-5ed7-a52c-81e309532950"
version = "0.2.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "84aa74986c5b9b898b0d1acaf3258741ee64754f"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "2.1.0"
[[Conda]]
deps = ["JSON", "VersionParsing"]
git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032"
uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d"
version = "1.3.0"
[[Crayons]]
deps = ["Test"]
git-tree-sha1 = "f621b8ef51fd2004c7cf157ea47f027fdeac5523"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "4.0.0"
[[CuArrays]]
deps = ["AbstractFFTs", "Adapt", "CUDAapi", "CUDAdrv", "CUDAnative", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"]
git-tree-sha1 = "46b48742a84bb839e74215b7e468a4a1c6ba30f9"
deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"]
git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b"
uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
version = "1.2.1"
version = "1.6.0"
[[DataAPI]]
git-tree-sha1 = "8903f0219d3472543fc4b2f5ebaf675a07f817c0"
git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.0.1"
version = "1.1.0"
[[DataStructures]]
deps = ["InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "0809951a1774dc724da22d26e4289bbaab77809a"
git-tree-sha1 = "f784254f428fb8fd7ac15982e5862a38a44523d3"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.17.0"
version = "0.17.7"
[[Dates]]
deps = ["Printf"]
@ -130,32 +100,38 @@ deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[DiffResults]]
deps = ["Compat", "StaticArrays"]
git-tree-sha1 = "34a4a1e8be7bc99bc9c611b895b5baf37a80584c"
deps = ["StaticArrays"]
git-tree-sha1 = "da24935df8e0c6cf28de340b958f6aac88eaa0cc"
uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
version = "0.0.4"
version = "1.0.2"
[[DiffRules]]
deps = ["Random", "Test"]
git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7"
deps = ["NaNMath", "Random", "SpecialFunctions"]
git-tree-sha1 = "10dca52cf6d4a62d82528262921daf63b99704a2"
uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
version = "0.0.10"
version = "1.0.0"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[FFTW]]
deps = ["AbstractFFTs", "BinaryProvider", "Conda", "Libdl", "LinearAlgebra", "Reexport", "Test"]
git-tree-sha1 = "6c5b420da0b8c12098048561b8d58f81adea506f"
deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"]
git-tree-sha1 = "109d82fa4b00429f9afcce873e9f746f11f018d3"
uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
version = "1.0.1"
version = "1.2.0"
[[FFTW_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "05674f209a6e3387dd103a945b0113eeb64b1a58"
uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a"
version = "3.3.9+3"
[[FillArrays]]
deps = ["LinearAlgebra", "Random", "SparseArrays"]
git-tree-sha1 = "8fba6ddaf66b45dec830233cea0aae43eb1261ad"
git-tree-sha1 = "fec413d4fc547992eb62a5c544cedb6d7853c1f5"
uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
version = "0.6.4"
version = "0.8.4"
[[FixedPointNumbers]]
git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b"
@ -163,33 +139,33 @@ uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
version = "0.6.1"
[[ForwardDiff]]
deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"]
git-tree-sha1 = "4c4d727f1b7e0092134fabfab6396b8945c1ea5b"
deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "840700059391d36e2498d89c2e82c08f261f2a2a"
uuid = "f6369f11-7733-5829-9624-2563aa707210"
version = "0.10.3"
version = "0.10.8"
[[GPUArrays]]
deps = ["Adapt", "FFTW", "FillArrays", "LinearAlgebra", "Printf", "Random", "Serialization", "StaticArrays", "Test"]
git-tree-sha1 = "77e27264276fe97a7e7fb928bf8999a145abc018"
deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
git-tree-sha1 = "e756da6cee76a5f1436a05827fa8fdf3badc577f"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "1.0.3"
version = "2.0.1"
[[IRTools]]
deps = ["InteractiveUtils", "MacroTools", "Test"]
git-tree-sha1 = "e23faa71b8f54c3fdc99b230b9c2906cafdddca5"
git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306"
uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
version = "0.2.3"
version = "0.3.0"
[[IntelOpenMP_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "fb8e1c7a5594ba56f9011310790e03b5384998d6"
uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0"
version = "2018.0.3+0"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[JSON]]
deps = ["Dates", "Mmap", "Parsers", "Unicode"]
git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e"
uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
version = "0.21.0"
[[Juno]]
deps = ["Base64", "Logging", "Media", "Profile", "Test"]
git-tree-sha1 = "30d94657a422d09cb97b6f86f04f750fa9c50df8"
@ -198,9 +174,9 @@ version = "0.7.2"
[[LLVM]]
deps = ["CEnum", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "4a05f742837779a00bd8c9a18da6817367c4245d"
git-tree-sha1 = "1d08d7e4250f452f6cb20e4574daaebfdbee0ff7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "1.3.0"
version = "1.3.3"
[[LibGit2]]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
@ -215,11 +191,17 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MKL_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "61069ae718b8ab1e325bbfb4e5268902e7ea08e3"
uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
version = "2019.0.117+0"
[[MacroTools]]
deps = ["CSTParser", "Compat", "DataStructures", "Test", "Tokenize"]
git-tree-sha1 = "d6e9dedb8c92c3465575442da456aec15a89ff76"
deps = ["DataStructures", "Markdown", "Random"]
git-tree-sha1 = "e2fc7a55bb2224e203bbd8b59f72b91323233458"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.1"
version = "0.5.3"
[[Markdown]]
deps = ["Base64"]
@ -232,24 +214,30 @@ uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27"
version = "0.5.0"
[[Missings]]
git-tree-sha1 = "29858ce6c8ae629cf2d733bffa329619a1c843d0"
deps = ["DataAPI"]
git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5"
uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
version = "0.4.2"
version = "0.4.3"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[NNlib]]
deps = ["Libdl", "LinearAlgebra", "Requires", "Statistics", "TimerOutputs"]
git-tree-sha1 = "0c667371391fc6bb31f7f12f96a56a17098b3de8"
deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"]
git-tree-sha1 = "135c0de4794d5e214b06f1fb4787af4a72896e61"
uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
version = "0.6.0"
version = "0.6.2"
[[NaNMath]]
deps = ["Compat"]
git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2"
git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f"
uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
version = "0.3.2"
version = "0.3.3"
[[OpenSpecFun_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.3+1"
[[OrderedCollections]]
deps = ["Random", "Serialization", "Test"]
@ -257,14 +245,8 @@ git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.1.0"
[[Parsers]]
deps = ["Dates", "Test"]
git-tree-sha1 = "ef0af6c8601db18c282d092ccbd2f01f3f0cd70b"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
version = "0.3.7"
[[Pkg]]
deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Printf]]
@ -290,10 +272,10 @@ uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "0.2.0"
[[Requires]]
deps = ["Test"]
git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1"
deps = ["UUIDs"]
git-tree-sha1 = "999513b7dea8ac17359ed50ae8ea089e4464e35e"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "0.5.2"
version = "1.0.0"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
@ -301,10 +283,6 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
@ -319,16 +297,16 @@ deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]]
deps = ["BinDeps", "BinaryProvider", "Libdl"]
git-tree-sha1 = "3bdd374b6fd78faf0119b8c5d538788dbf910c6e"
deps = ["OpenSpecFun_jll"]
git-tree-sha1 = "268052ee908b2c086cc0011f528694f02f3e2408"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "0.8.0"
version = "0.9.0"
[[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "db23bbf50064c582b6f2b9b043c8e7e98ea8c0c6"
git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "0.11.0"
version = "0.12.1"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
@ -345,15 +323,10 @@ deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TimerOutputs]]
deps = ["Crayons", "Printf", "Test", "Unicode"]
git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c"
deps = ["Printf"]
git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.0"
[[Tokenize]]
git-tree-sha1 = "dfcdbbfb2d0370716c815cbd6f8a364efb6f42cf"
uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624"
version = "0.5.6"
version = "0.5.3"
[[TranscodingStreams]]
deps = ["Random", "Test"]
@ -361,12 +334,6 @@ git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c"
uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
version = "0.9.5"
[[URIParser]]
deps = ["Test", "Unicode"]
git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69"
uuid = "30578b45-9adc-5946-b283-645ec420af67"
version = "0.4.0"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
@ -374,30 +341,26 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[VersionParsing]]
deps = ["Compat"]
git-tree-sha1 = "c9d5aa108588b978bd859554660c8a5c4f2f7669"
uuid = "81def892-9a0e-5fdd-b105-ffc91e053289"
version = "1.1.3"
[[ZipFile]]
deps = ["BinaryProvider", "Libdl", "Printf"]
git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d"
deps = ["Libdl", "Printf", "Zlib_jll"]
git-tree-sha1 = "5de8320a46812da1a8ca98b16a8a4546d44efa62"
uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
version = "0.8.3"
version = "0.9.0"
[[Zlib_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "5618a43055eb09377edca21d19d0e99bce24a9c3"
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.11+7"
[[Zygote]]
deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
git-tree-sha1 = "38241b40ebd8748bcacad5e6c7ba3ab3cc7a15c9"
repo-rev = "master"
repo-url = "https://github.com/FluxML/Zygote.jl.git"
git-tree-sha1 = "74382bcc4c1e8075e14554da67d75565f8fb7827"
uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
version = "0.3.4"
version = "0.4.5"
[[ZygoteRules]]
deps = ["MacroTools"]
git-tree-sha1 = "c4c29b30b8ff3be13d4244e78be7df2a42bc54d0"
repo-rev = "master"
repo-url = "https://github.com/FluxML/ZygoteRules.jl.git"
git-tree-sha1 = "b3b4882cc9accf6731a08cc39543fbc6b669dca8"
uuid = "700de1a5-db45-46bc-99cf-38207098b444"
version = "0.2.0"

NEWS.md

@ -1,3 +1,16 @@
# v0.10.0
* The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669)
- The dependency on Tracker.jl has been removed.
- This means Flux no longer depends on a specialised `TrackedArray` type and can be used directly with normal `Array` implementations.
- Tracker compatibility is maintained in most common cases, but Zygote will be the preferred AD backend for Flux from now on.
* The CUDNN wrappers have been [moved from Flux into CuArrays](https://github.com/FluxML/Flux.jl/pull/874) to allow better support of the CUDA backend, improve the user experience, and make Flux leaner.
* `*crossentropy` functions now [work as expected with CuArrays](https://github.com/FluxML/Flux.jl/pull/926) (see also the [PR for binarycrossentropy](https://github.com/FluxML/Flux.jl/pull/940)).
* Added [clearer docs](https://github.com/FluxML/Flux.jl/pull/904) around training and the Optimiser interface.
* [Layer initialisations](https://github.com/FluxML/Flux.jl/pull/937) have been improved, with a clearer API for extending them for other purposes.
* [Better messaging around CUDA availability](https://github.com/FluxML/Flux.jl/pull/924), with hooks to initialize the GPU as default where possible.
* `@treelike` has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), effectively deprecating `@treelike`.
* `testmode!` is deprecated in favour of [istraining](https://github.com/FluxML/Flux.jl/pull/669).
# v0.9.0
* [Depthwise convolutional layer API changes](https://github.com/FluxML/Flux.jl/pull/756) from `in => mult` channel specification to `in => out` channel specification, and deprecates the implicit `out` constructor.
* New [SkipConnection](https://github.com/FluxML/Flux.jl/pull/446), which can be used to train residual neural network architectures.


@ -1,17 +1,15 @@
name = "Flux"
uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
version = "0.9.0"
version = "0.10.2"
[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@ -24,13 +22,20 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
ZygoteRules = "700de1a5-db45-46bc-99cf-38207098b444"
[compat]
CUDAapi = "1.1"
CuArrays = "1.2"
AbstractTrees = "0.2, 0.3"
Adapt = "1"
CodecZlib = "0.5, 0.6"
Colors = "0.8, 0.9, 0.10, 0.11"
CuArrays = "1.6"
Juno = "0.5, 0.6, 0.7, 0.8"
MacroTools = "0.3, 0.4, 0.5"
NNlib = "0.6"
Zygote = "0.3"
Reexport = "0.2"
StatsBase = "0"
ZipFile = "0.7, 0.8, 0.9"
Zygote = "0.4"
julia = "1"
[extras]


@ -7,93 +7,9 @@
Flux is an elegant approach to machine learning. It's a 100% pure-Julia stack, and provides lightweight abstractions on top of Julia's native GPU and AD support. Flux makes the easy things easy while remaining fully hackable.
```julia
julia> Pkg.add("Flux")
] add Flux
```
See the [documentation](https://fluxml.github.io/Flux.jl/) or the [model zoo](https://github.com/FluxML/model-zoo/) for examples.
If you use Flux in research, please cite the following paper:
```
@article{innes:2018,
author = {Mike Innes},
title = {Flux: Elegant Machine Learning with Julia},
journal = {Journal of Open Source Software},
year = {2018},
doi = {10.21105/joss.00602},
}
```
## Features
Flux has powerful high-level features, and common architectures can be defined in a few lines.
```julia
model = Chain(
Dense(768, 128, σ),
LSTM(128, 256),
LSTM(256, 128),
Dense(128, 10),
softmax)
loss(x, y) = crossentropy(model(x), y)
Flux.train!(loss, data, ADAM(...))
```
Yet you can easily strip away the layers, and directly write the mathematics for your problem. Flux will seamlessly take gradients of any Julia code, so your model looks just like the paper.
```julia
W = param(randn(2, 10))
b = param(randn(2))
y(x) = σ.(W * x .+ b)
```
If that's *still* not enough, you can go as deep as you want, even writing your own CUDA kernels with [CUDAnative](https://github.com/JuliaGPU/CUDAnative.jl)! All this can be freely mixed-and-matched in a single model or script, and it all runs interactively via Jupyter or Juno.
```julia
function gpu_add(a, b, c)
i = (blockIdx().x-1) * blockDim().x + threadIdx().x
c[i] = a[i] + b[i]
return nothing
end
```
Unusual architectures are no problem in Flux, as you can use all the loops, control flow and even macros that you're used to. Here's a Tree RNN in 4 lines.
```julia
tree() = rand() < 0.5 ? rand(10) : (tree(), tree()) # dummy data
shrink = Dense(20, 10)
combine(a, b) = shrink([a; b])
model(x) = x
model(x::Tuple) = combine(model(x[1]), model(x[2]))
model(tree()) # Sample output
```
Despite this flexibility, Julia's advanced compiler lets us do some powerful optimisations. For example, this definition of `sigmoid` automatically gets fused into a *single* GPU kernel so it's really fast.
```julia
sigmoid(xs) = 1 ./ (1 .+ exp.(.-xs))
```
Similarly, Flux is the first dynamic framework to support [compiling to the browser](https://fluxml.github.io/experiments/) and model import via [formats like ONNX](https://github.com/FluxML/ONNX.jl/), both of which are thinly-veiled compiler problems.
For more on our philosophy on machine learning, check out our article [On Machine Learning & Programming Languages](https://julialang.org/blog/2017/12/ml&pl).
## Contributing & Help
For general questions and help, check out Julia's [community forum](https://discourse.julialang.org/c/domain/ML).
Flux development is carried out via our [GitHub issues](https://github.com/FluxML/Flux.jl/issues), so feel free to open feature requests or PRs here.
For more informal discussions we'd love to have you on the [Julia slack](https://slackinvite.julialang.org/), where we hang out on the #machine-learning channel.
## Related Packages
Check out [Metalhead.jl](https://github.com/FluxML/Metalhead.jl) for common computer vision datasets and trained models.
[MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) provides further common datasets.
If you use Flux in your research, please [cite](CITATION.bib) our work.


@ -1,4 +1,4 @@
status = [
"ci/gitlab/%"
"ci/gitlab%"
]
timeout-sec = 14400
timeout-sec = 7200


@ -219,3 +219,24 @@ Flux.@functor Affine
```
This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md).
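For instance, a minimal sketch assuming the `Affine` struct defined earlier in this section (with fields `W` and `b`):
```julia
a = Affine(randn(5, 10), randn(5))

Flux.params(a)  # collects W and b as trainable parameters
gpu(a)          # moves the fields to the GPU when CUDA is available
```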
## Utility functions
Flux provides some utility functions to help you generate models in an automated fashion.
`outdims` enables you to calculate the spatial output dimensions of layers like `Conv` when applied to input images of a given size.
Currently limited to the following layers:
- `Chain`
- `Dense`
- `Conv`
- `Diagonal`
- `Maxout`
- `ConvTranspose`
- `DepthwiseConv`
- `CrossCor`
- `MaxPool`
- `MeanPool`
```@docs
outdims
```
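For example, a quick sketch matching the docstrings added in this commit (written as `Flux.outdims` in case the function is not exported in your version):
```julia
using Flux

m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
Flux.outdims(m, (10, 10))           # (6, 6): each 3×3 convolution shrinks the image by 2

Flux.outdims(Dense(10, 5), (10,))   # (5,)
```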


@ -65,3 +65,15 @@ AlphaDropout
LayerNorm
GroupNorm
```
## Cost Functions
```@docs
mse
crossentropy
logitcrossentropy
binarycrossentropy
logitbinarycrossentropy
kldivergence
poisson
hinge
```
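A small usage sketch (argument conventions are given in the docstrings above; the data here is made up):
```julia
using Flux

ŷ = softmax(randn(3, 5))                    # predicted class probabilities, 3 classes × 5 samples
y = Flux.onehotbatch([1, 2, 3, 1, 2], 1:3)  # target labels as one-hot columns
Flux.crossentropy(ŷ, y)

Flux.mse(randn(5), randn(5))                # mean squared error on two made-up vectors
```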


@ -113,6 +113,6 @@ You can even store optimiser state alongside the model, to resume training
exactly where you left off.
```julia
opt = ADAM(params(model))
opt = ADAM()
@save "model-$(now()).bson" model opt
```
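To resume later, the checkpoint can be loaded back the same way; a minimal sketch, where the file name is a hypothetical placeholder for the timestamped file saved above and `loss` and `data` are assumed to be defined:
```julia
using BSON: @load

@load "model-checkpoint.bson" model opt  # hypothetical file name
Flux.train!(loss, Flux.params(model), data, opt)
```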


@ -58,3 +58,83 @@ AMSGrad
NADAM
ADAMW
```
## Optimiser Interface
Flux's optimisers are built around a `struct` that holds all the optimiser parameters along with a definition of how to apply the associated update rule. We do this via the `apply!` function, which takes the optimiser as the first argument, followed by the parameter and its corresponding gradient.
In this manner Flux also allows one to create custom optimisers that can be used seamlessly. Let's work through a simple example.
```julia
mutable struct Momentum
  eta
  rho
  velocity
end

Momentum(eta::Real, rho::Real) = Momentum(eta, rho, IdDict())
```
The `Momentum` type will act as our optimiser in this case. Notice that we have added all the hyperparameters as fields, along with the velocity which we will use as our state dictionary; each parameter in our model gets an entry there. We can now define the rule applied when this optimiser is invoked.
```julia
function apply!(o::Momentum, x, Δ)
  η, ρ = o.eta, o.rho
  v = get!(o.velocity, x, zero(x))::typeof(x)
  @. v = ρ * v - η * Δ
  @. Δ = -v
end
```
This is the basic definition of a Momentum update rule given by:
```math
v = ρ * v - η * Δ
w = w - v
```
`apply!` defines the update rule for an optimiser `opt`, given a parameter and its corresponding gradient, and returns the modified gradient that will be subtracted from the parameter. Here, the running state `v` for each parameter `x` is looked up (or initialised) in `o.velocity`, and updating it also updates the state of the optimiser.
Flux internally calls this function via `update!`, which shares the same API as `apply!` but ensures that multiple parameters are handled gracefully.
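Roughly, and only as an illustrative sketch rather than Flux's exact source, `update!` loops over the parameters and subtracts the modified gradient returned by `apply!`:
```julia
# Illustrative sketch; the real implementation lives in Flux.Optimise.
function sketch_update!(opt, xs, gs)
  for x in xs
    gs[x] === nothing && continue   # skip parameters without a gradient
    x .-= apply!(opt, x, gs[x])     # apply! returns the step to subtract
  end
end
```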
## Composing Optimisers
Flux defines a special kind of optimiser simply called `Optimiser`, which takes arbitrary optimisers as input. Its behaviour is similar to the usual optimisers, but it acts by calling the optimisers listed in it sequentially. Each optimiser produces a modified gradient
that is fed into the next, and the resulting update is applied to the parameter as usual. A classic use case is adding decays: Flux defines some basic decays, including `ExpDecay`, `InvDecay`, etc.
```julia
opt = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), Descent())
```
Here we apply exponential decay to the `Descent` optimiser. By default, `ExpDecay` decays the learning rate every 1000 steps.
The composed optimiser is then used like any other optimiser.
```julia
w = randn(10, 10)
w1 = randn(10,10)
ps = Params([w, w1])
loss(x) = Flux.mse(w * x, w1 * x)
loss(rand(10)) # around 9
for t = 1:10^5
  θ = Params([w, w1])
  θ̄ = gradient(() -> loss(rand(10)), θ)
  Flux.Optimise.update!(opt, θ, θ̄)
end
loss(rand(10)) # around 0.9
```
In this manner it is possible to compose optimisers for some added flexibility.
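Decays are not the only use case; for example, weight decay can be combined with `ADAM` in the same way:
```julia
opt = Optimiser(WeightDecay(1e-4), ADAM(0.001))
```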
## Decays
Similar to optimisers, Flux also defines some simple decays that can be used in conjunction with other optimisers, or standalone.
```@docs
ExpDecay
InvDecay
WeightDecay
```


@ -1,8 +1,9 @@
# Training
To actually train a model we need three things:
To actually train a model we need four things:
* An *objective function* that evaluates how well a model is doing given some input data.
* The trainable parameters of the model.
* A collection of data points that will be provided to the objective function.
* An [optimiser](optimisers.md) that will update the model parameters appropriately.
@ -32,6 +33,14 @@ Flux.train!(loss, ps, data, opt)
The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want.
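For example, a minimal sketch of an objective built from a model and a built-in cost function:
```julia
m = Chain(Dense(784, 32, σ), Dense(32, 10), softmax)

loss(x, y) = Flux.crossentropy(m(x), y)
```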
At first glance it may seem strange that the model that we want to train is not part of the input arguments of `Flux.train!` too. However the target of the optimizer is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimizing the calculations.
## Model parameters
The model to be trained must have a set of tracked parameters that are used to calculate the gradients of the objective function. In the [basics](../models/basics.md) section it is explained how to create models with such parameters. The second argument of the function `Flux.train!` must be an object containing those parameters, which can be obtained from a model `m` as `params(m)`.
Such an object holds a reference to the model's parameters, not a copy, so that after training the model behaves according to the updated values.
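A minimal sketch:
```julia
m = Dense(10, 5)
ps = Flux.params(m)  # references m.W and m.b, not copies
```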
## Datasets
The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point:
@ -101,3 +110,30 @@ cb = function ()
accuracy() > 0.9 && Flux.stop()
end
```
## Custom Training loops
The `Flux.train!` function can be very convenient, especially for simple problems.
It's also very flexible with the use of callbacks.
But for some problems it's much cleaner to write your own custom training loop.
An example follows that works similarly to the default `Flux.train!` but with no callbacks.
You don't need callbacks if you just code the calls to your functions directly into the loop,
e.g. in the places marked with comments.
```julia
function my_custom_train!(loss, ps, data, opt)
  ps = Params(ps)
  for d in data
    gs = gradient(ps) do
      training_loss = loss(d...)
      # Insert whatever code you want here that needs the training loss, e.g. logging
      return training_loss
    end
    # Insert whatever code you want here that needs the gradient,
    # e.g. logging it with TensorBoardLogger.jl as a histogram so you can see if it is becoming huge
    update!(opt, ps, gs)
    # Here you might like to check validation set accuracy, and break out to do early stopping
  end
end
```
You could simplify this further, for example by hard-coding in the loss function.
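A usage sketch of the loop above, assuming `model` and `data` are already defined and `update!` is in scope (e.g. via `using Flux.Optimise: update!`):
```julia
loss(x, y) = Flux.mse(model(x), y)
my_custom_train!(loss, Flux.params(model), data, ADAM())
```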


@ -6,7 +6,7 @@ using Base: tail
using Zygote, MacroTools, Juno, Reexport, Statistics, Random
using MacroTools: @forward
@reexport using NNlib
using Zygote: Params, @adjoint, gradient, pullback
using Zygote: Params, @adjoint, gradient, pullback, @nograd
export gradient
export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool,
@ -20,18 +20,9 @@ export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay
using CUDAapi
if has_cuda()
try
using CuArrays
@eval has_cuarrays() = true
catch ex
@warn "CUDA is installed, but CuArrays.jl fails to load" exception=(ex,catch_backtrace())
@eval has_cuarrays() = false
end
else
has_cuarrays() = false
end
using CuArrays
const use_cuda = Ref(false)
include("utils.jl")
include("onehot.jl")
@ -47,8 +38,26 @@ include("data/Data.jl")
include("deprecations.jl")
if has_cuarrays()
include("cuda/cuda.jl")
function __init__()
precompiling = ccall(:jl_generating_output, Cint, ()) != 0
# we don't want to include the CUDA module when precompiling,
# or we could end up replacing it at run time (triggering a warning)
precompiling && return
if !CuArrays.functional()
# nothing to do here, and either CuArrays or one of its dependencies will have warned
else
use_cuda[] = true
# FIXME: this functionality should be conditional at run time by checking `use_cuda`
# (or even better, get moved to CuArrays.jl as much as possible)
if CuArrays.has_cudnn()
include(joinpath(@__DIR__, "cuda/cuda.jl"))
else
@warn "CuArrays.jl did not find libcudnn. Some functionality will not be available."
end
end
end
end # module


@ -2,11 +2,8 @@ module CUDA
using ..CuArrays
if CuArrays.libcudnn !== nothing # TODO: use CuArrays.has_cudnn()
include("curnn.jl")
include("cudnn.jl")
else
@warn "CUDNN is not installed, some functionality will not be available."
end
using CuArrays: CUDNN
include("curnn.jl")
include("cudnn.jl")
end


@ -1,199 +1,5 @@
using CuArrays: libcudnn
using CuArrays.CUDNN: @check, handle, cudnnStatus_t, cudnnTensorDescriptor_t,
cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc
import CuArrays.CUDAdrv: CuPtr, CU_NULL
using LinearAlgebra
mutable struct DropoutDesc
ptr::Ptr{Nothing}
states::CuVector{UInt8}
end
Base.unsafe_convert(::Type{Ptr{Nothing}}, dd::DropoutDesc) = dd.ptr
function DropoutDesc(ρ::Real; seed::Integer=0)
d = [C_NULL]
s = Csize_t[0]
@check ccall((:cudnnCreateDropoutDescriptor,libcudnn), cudnnStatus_t, (Ptr{Ptr{Nothing}},), d)
@check ccall((:cudnnDropoutGetStatesSize,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Csize_t}),handle(),s)
states = CuArray{UInt8}(undef, s[]) # TODO: can we drop this when ρ=0?
desc = DropoutDesc(d[], states)
@check ccall((:cudnnSetDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Nothing},Cfloat,CuPtr{Nothing},Csize_t,Culonglong),
desc,handle(),ρ,states,length(states),seed)
finalizer(desc) do x
@check ccall((:cudnnDestroyDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x)
end
return desc
end
const BATCHNORM_SPATIAL = 1
const BATCHNORM_ACTIVATION = 0
const BATCHNORM_MIN_EPS = 1e-5
@inline _wsize(y) = (map(_ -> 1, size(y)[1:end-2])..., size(y)[end-1], 1)
@inline _reddims(y) = (collect(1:ndims(y)-2)..., ndims(y))
mutable struct BNCache
mean
ivar
end
BNCache() = BNCache(nothing, nothing)
# NOTE: CuDNN supports only 4D and 5D Tensors for BatchNorm Operations
# so reshape a 2D Tensor into 4D
batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2},
running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
cache = nothing, alpha = T(1), beta = T(0),
eps = T(1e-5), training = true) where T<:Union{Float32, Float64} =
dropdims(batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), running_mean, running_var, momentum,
cache = cache, alpha = alpha, beta = beta, eps = eps, training = training), dims = (1, 2))
function batchnorm(g::CuArray{T}, b::CuArray{T}, x::Union{CuArray{T, 4},CuArray{T,5}},
running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
cache = nothing, alpha = T(1), beta = T(0),
eps = T(1e-5), training = true) where T<:Union{Float32, Float64}
y = similar(x)
cudnnBNForward!(y, g, b, x, running_mean, running_var, momentum, cache = cache,
alpha = alpha, beta = beta, eps = eps, training = training)
y
end
function cudnnBNForward!(y::CuArray{T}, g::CuArray{T}, b::CuArray{T}, x::CuArray{T},
running_mean::CuArray{T}, running_var::CuArray{T},
momentum; cache = nothing,
alpha = T(1), beta = T(0),
eps = T(1e-5), training = true) where T<:Union{Float32, Float64}
dims = _wsize(x)
if eps < BATCHNORM_MIN_EPS
# warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", BATCHNORM_MIN_EPS)
eps = BATCHNORM_MIN_EPS
end
xd = TensorDesc(x)
yd = TensorDesc(y)
gd = TensorDesc(T, dims)
if training
if cache !== nothing
mean = zeros(CuArray{T}, dims...)
ivar = ones(CuArray{T}, dims...)
else
mean = CU_NULL
ivar = CU_NULL
end
@check ccall((:cudnnBatchNormalizationForwardTraining, libcudnn), cudnnStatus_t,
(cudnnHandle_t,cudnnBatchNormMode_t,
Ptr{T}, Ptr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, CuPtr{T},
Cdouble, CuPtr{T}, CuPtr{T},
Cdouble, CuPtr{T}, CuPtr{T}),
handle(), BATCHNORM_SPATIAL,
Ref(T(alpha)), Ref(T(beta)),
xd, x,
yd, y,
gd, g, b,
momentum, running_mean, running_var,
eps, mean, ivar)
if cache !== nothing
cache.mean = mean
cache.ivar = ivar
end
else
@check ccall((:cudnnBatchNormalizationForwardInference, libcudnn), cudnnStatus_t,
(Ptr{cudnnHandle_t},cudnnBatchNormMode_t,
Ptr{T}, Ptr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, CuPtr{T},
CuPtr{T}, CuPtr{T},
Cdouble),
handle(), BATCHNORM_SPATIAL,
Ref(T(alpha)), Ref(T(beta)),
xd, x,
yd, y,
gd, g, b,
running_mean, running_var,
eps)
end
end
function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2}, dy::CuArray{T, 2},
running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
cache = nothing, eps = T(1e-5), alpha = T(1),
beta = T(0), training = true) where T<:Union{Float32, Float64}
dg, db, dx = ∇batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), reshape(dy, 1, 1, size(dy, 1),
size(dy, 2)), running_mean, running_var, momentum, cache = cache, eps = eps,
alpha = alpha, beta = beta, training = training)
(dg, db, dropdims(dx, dims = (1, 2)))
end
function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T}, dy::CuArray{T},
running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
cache = nothing, eps = T(1e-5), alpha = T(1),
beta = T(0), training = true) where T<:Union{Float32, Float64}
dg = similar(g)
db = similar(b)
dx = similar(x)
cudnnBNBackward!(dg, g, db, dx, x, dy, running_mean, running_var, T(momentum),
training = training, cache = cache, eps = eps, alpha = alpha, beta = beta)
(dg, db, dx)
end
function cudnnBNBackward!(dg::CuArray{T}, g::CuArray{T}, db::CuArray{T},
dx::CuArray{T}, x::CuArray{T}, dy::CuArray{T},
running_mean::CuArray{T}, running_var::CuArray{T},
momentum; cache = nothing, eps = T(1e-5),
alpha = T(1), beta = T(0),
dalpha = T(1), dbeta = T(0), training = true) where T<:Union{Float32, Float64}
if training
xd = TensorDesc(x)
dyd = TensorDesc(dy)
dxd = TensorDesc(dx)
gd = TensorDesc(T, _wsize(x))
if cache !== nothing
mean, ivar = cache.mean, cache.ivar
info("mean and ivar are fetched from the cache")
else
mean, ivar = CU_NULL, CU_NULL
end
if eps < BATCHNORM_MIN_EPS
eps = BATCHNORM_MIN_EPS
end
@check ccall((:cudnnBatchNormalizationBackward, libcudnn), cudnnStatus_t,
(cudnnHandle_t,cudnnBatchNormMode_t,
Ptr{T}, Ptr{T},
Ptr{T}, Ptr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, CuPtr{T}, CuPtr{T},
Cdouble, CuPtr{T}, CuPtr{T}),
handle(), BATCHNORM_SPATIAL,
Ref(T(alpha)), Ref(T(beta)),
Ref(T(dalpha)), Ref(T(dbeta)),
xd, x,
dyd, dy,
dxd, dx,
gd, g, dg, db,
eps, mean, ivar)
else
ivar = 1 ./ sqrt.(reshape(running_var, _wsize(x)) .+ eps)
dx .= dy .* reshape(g, _wsize(x)) .* ivar
dg .= squeeze(sum(dy .* (x .- reshape(running_mean, _wsize(x))) .* ivar, _reddims(dy)), dims = (1,2,4))
db .= squeeze(sum(dy, _reddims(dy)), dims = (1,2,4))
end
end
# Flux Interface
import ..Flux: data
import CuArrays.CUDNN: batchnorm, ∇batchnorm
(BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, cache = nothing) where T<:Union{Float32, Float64} =
BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, training = Flux.istraining()))


@ -1,273 +1,25 @@
using CuArrays: libcudnn
using CuArrays.CUDNN: @check, cudnnStatus_t, cudnnTensorDescriptor_t,
cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc
import CuArrays.CUDAdrv: CuPtr, CU_NULL
using LinearAlgebra
const RNN_RELU = 0 # Stock RNN with ReLu activation
const RNN_TANH = 1 # Stock RNN with tanh activation
const LSTM = 2 # LSTM with no peephole connections
const GRU = 3 # Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1)
const LINEAR_INPUT = 0
const SKIP_INPUT = 1
const UNIDIRECTIONAL = 0
const BIDIRECTIONAL = 1
const RNN_ALGO_STANDARD = 0
const RNN_ALGO_PERSIST_STATIC = 1
const RNN_ALGO_PERSIST_DYNAMIC = 2
# param layout:
# RNN: [weight, bias] × [input, hidden]
# GRU: [weight, bias] × [input, hidden] × [reset, update, newmem]
# LSTM: [weight, bias] × [input, hidden] × [input, forget, newmem, output]
function params(w::CuVector, input, hidden, n = 1)
slice(offset, shape) = reshape(view(w, offset.+(1:prod(shape))), shape)
wx = slice(0, (input, hidden*n))
wh = slice(length(wx), (hidden, hidden*n))
bias = view(w, length(wx)+length(wh) .+ (1:hidden*n))
(wx, wh), bias
end
mutable struct RNNDesc{T}
mode::Int
input::Int
hidden::Int
params::CuVector{T}
weights::NTuple{2,CuMatrix{T}}
bias::CuVector{T}
ptr::Ptr{Nothing}
end
Base.unsafe_convert(::Type{Ptr{Nothing}}, d::RNNDesc) = d.ptr
function rnnParamSize(T, r, input)
size = Csize_t[0]
@check ccall((:cudnnGetRNNParamsSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Ptr{Nothing},Ptr{Csize_t},Cint),
handle(), r, TensorDesc(T, (1,input,1)), size, cudnnDataType(T))
return Int(size[])÷sizeof(T)
end
ngates(mode) = [1, 1, 4, 3][mode+1]
ngates(r::RNNDesc) = ngates(r.mode)
function RNNDesc{T}(mode::Int, input::Int, hidden::Int; layers = 1) where T
d = [C_NULL]
@check ccall((:cudnnCreateRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Ptr{Nothing}},),d)
dropoutDesc = DropoutDesc(0)
inputMode = LINEAR_INPUT
direction = UNIDIRECTIONAL
algo = RNN_ALGO_STANDARD
@check ccall((:cudnnSetRNNDescriptor_v6,libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Cint,Ptr{Nothing},Cint,Cint,Cint,Cint,Cint),
handle(),d[],hidden,layers,dropoutDesc,inputMode,direction,mode,algo,cudnnDataType(T))
w = CuArrays.zeros(T, rnnParamSize(T, d[], input))
# TODO: avoid reserve allocation here
rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[])
finalizer(rd) do x
@check ccall((:cudnnDestroyRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x)
end
return rd
end
function rnnWorkspaceSize(r::RNNDesc, seqlen, xdesc)
size = Csize_t[0]
@check ccall((:cudnnGetRNNWorkspaceSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Ptr{Ptr{Nothing}},Ptr{Csize_t}),
handle(), r, seqlen, xdesc, size)
return Int(size[])
end
const workspace = Ref{Union{Nothing,CuVector{UInt8}}}(nothing)
function getworkspace(bytes)
if workspace[] === nothing || length(workspace[]) < bytes
workspace[] = CuVector{UInt8}(undef, bytes)
end
workspace[]
end
getworkspace(r::RNNDesc, seqlen, xdesc) =
getworkspace(rnnWorkspaceSize(r, seqlen, xdesc))
function rnnTrainingReserveSize(r::RNNDesc, seqlen, xdesc)
size = Csize_t[0]
@check ccall((:cudnnGetRNNTrainingReserveSize,libcudnn), cudnnStatus_t, (Ptr{Nothing}, Ptr{Nothing}, Cint, Ptr{Ptr{Nothing}}, Ptr{Csize_t}),
handle(), r, seqlen, xdesc, size)
return Int(size[])
end
function cudnnRNNForward(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co,
workspace, reserve=nothing) where T
if reserve == nothing
@check ccall((:cudnnRNNForwardInference, libcudnn), cudnnStatus_t,
(Ptr{Nothing}, Ptr{Nothing}, Cint,
Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
CuPtr{Nothing}, Csize_t),
handle(), rnn, seqlen,
xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co,
workspace, length(workspace))
else
@check ccall((:cudnnRNNForwardTraining, libcudnn), cudnnStatus_t,
(Ptr{Nothing}, Ptr{Nothing}, Cint,
Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
CuPtr{Nothing}, Csize_t, CuPtr{Nothing}, Csize_t),
handle(), rnn, seqlen,
xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co,
workspace, length(workspace), reserve, length(reserve))
end
end
xDesc(x) = [TensorDesc(eltype(x), (1, size(x, 1), size(x, 2)))]
hDesc(h::Nothing) = C_NULL, CU_NULL
hDesc(x::Integer) = (@assert x == 0; hDesc(nothing))
function hDesc(h::CuArray)
TensorDesc(eltype(h), (size(h, 1), size(h, 2), 1)), h
end
# TODO: can we just manipulate strides here?
# TODO: should use repmat, but this isn't implemented.
hBatch(x::AbstractVector, h::CuVector) = h
hBatch(x::AbstractMatrix, h::CuVector) = h .* CuArrays.ones(1, size(x, 2))
hBatch(x::AbstractMatrix, h::CuMatrix) = h .* CuArrays.ones(1, size(h,2) == 1 ? size(x,2) : 1)
function forward(rnn::RNNDesc{T}, x::CuArray{T}, h_::CuArray{T}, c_ = nothing, train = Val{false}) where T
h = hBatch(x, h_)
c = c_ == nothing ? nothing : hBatch(x, c_)
@assert size(x, 1) == rnn.input
@assert size(h, 1) == rnn.hidden
@assert size(x, 2) == size(h, 2)
seqLength = 1
xdesc = xDesc(x)
y = x isa AbstractVector ? similar(x, rnn.hidden) : similar(x, rnn.hidden, size(x, 2))
ho = similar(h)
ydesc = xDesc(y)
workspace = getworkspace(rnn, seqLength, xdesc)
reserve = train == Val{true} ?
CuVector{UInt8}(undef, rnnTrainingReserveSize(rnn, seqLength, xdesc)) :
nothing
co = c == nothing ? c : similar(c)
cudnnRNNForward(rnn, seqLength,
xdesc, x,
hDesc(h)...,
hDesc(c)...,
FilterDesc(T, (1, 1, length(rnn.params))), rnn.params,
ydesc, y,
hDesc(ho)...,
hDesc(co)...,
workspace, reserve)
result = c == nothing ? (y, ho) : (y, ho, co)
return train == Val{true} ? (reserve, result) : result
end
forwardTrain(rnn::RNNDesc{T}, x::CuArray{T}, h::CuArray{T}, c = nothing) where T =
forward(rnn, x, h, c, Val{true})
function cudnnRNNBackwardData(rnn::RNNDesc{T}, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco,
wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, rs) where T
@check ccall((:cudnnRNNBackwardData,libcudnn),cudnnStatus_t,
(Ptr{Nothing}, Ptr{Nothing}, Cint,
Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing},
CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
CuPtr{Nothing}, Csize_t, CuPtr{Nothing}, Csize_t),
handle(), rnn, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco,
wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, length(ws), rs, length(rs))
end
function backwardData(rnn::RNNDesc{T}, y, dy_, dho, dco, h, c, reserve) where T
# Same as above, any more efficient way?
dy = dy_ isa Integer ? zero(y) : dy_
yd = xDesc(y)
dx = y isa AbstractVector ? similar(dy, rnn.input) : similar(dy, rnn.input, size(dy, 2))
dh = similar(h)
dc = c == nothing ? nothing : similar(c)
cudnnRNNBackwardData(rnn, 1,
yd, y, yd, dy, hDesc(dho)..., hDesc(dco)...,
FilterDesc(T, (1, 1, length(rnn.params))), rnn.params,
hDesc(h)..., hDesc(c)..., xDesc(dx), dx, hDesc(dh)..., hDesc(dc)...,
workspace[], reserve)
return c == nothing ? (dx, dh) : (dx, dh, dc)
end
backwardData(rnn, y, dy, dho, hx, reserve) =
backwardData(rnn, y, dy, dho, nothing, hx, nothing, reserve)
function cudnnRNNBackwardWeights(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, yd, y, dwd, dw,
workspace, reserve) where T
@check ccall((:cudnnRNNBackwardWeights,libcudnn), cudnnStatus_t,
(Ptr{Nothing}, Ptr{Nothing}, Cint, # handle, rnnDesc, seqLength
Ptr{Ptr{Nothing}}, CuPtr{T}, #x
Ptr{Nothing}, CuPtr{T}, #hx
Ptr{Ptr{Nothing}}, CuPtr{T}, #y
CuPtr{Nothing}, Csize_t, #ws
Ptr{Nothing}, CuPtr{T}, #dw
CuPtr{Nothing}, Csize_t), #rs
handle(), rnn, seqlen, xd, x, hd, h, yd, y,
workspace, length(workspace), dwd, dw, reserve, length(reserve))
end
function backwardWeights(rnn::RNNDesc{T}, x, h, y, reserve) where T
dw = zero(rnn.params)
cudnnRNNBackwardWeights(rnn, 1,
xDesc(x), x, hDesc(h)..., xDesc(y), y,
FilterDesc(T, (1, 1, length(dw))), dw,
workspace[], reserve)
return params(dw, rnn.input, rnn.hidden, ngates(rnn))
end
# Interface
import ..Flux: Flux, relu
using CuArrays.CUDAnative
using CuArrays: @cuindex, cudims
function LinearAlgebra.copy_transpose!(dst::CuArray, src::CuArray)
function kernel(dst, src)
I = @cuindex dst
dst[I...] = src[reverse(I)...]
return
end
blk, thr = cudims(dst)
@cuda blocks=blk threads=thr kernel(dst, src)
return dst
end
CuRNN{T} = Flux.RNNCell{<:Union{typeof(tanh),typeof(relu)},<:CuArray{T,2},<:CuArray{T,1}}
CuGRU{T} = Flux.GRUCell{<:CuArray{T,2},<:CuArray{T,1}}
CuLSTM{T} = Flux.LSTMCell{<:CuArray{T,2},<:CuArray{T,1}}
CuRNNs{T} = Union{CuRNN{T},CuGRU{T},CuLSTM{T}}
function copyparams!(m::CuRNNs, d::RNNDesc)
Wi, Wh = d.weights
copy_transpose!(Wi, m.Wi)
copy_transpose!(Wh, m.Wh)
copy_transpose!(d.bias, m.b)
return
end
function RNNDesc(m::CuRNNs{T}) where T
function CUDNN.RNNDesc(m::CuRNNs{T}) where T
h, i = length(m.h), size(m.Wi, 2)
mode = m isa CuRNN ?
(m.σ == tanh ? RNN_TANH : RNN_RELU) :
m isa CuGRU ? GRU : LSTM
r = RNNDesc{T}(mode, i, h)
(m.σ == tanh ? CUDNN.CUDNN_RNN_TANH : CUDNN.CUDNN_RNN_RELU) :
m isa CuGRU ? CUDNN.CUDNN_GRU : CUDNN.CUDNN_LSTM
r = CUDNN.RNNDesc{T}(mode, i, h)
return r
end
const descs = WeakKeyDict()
function desc(rnn)
d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = RNNDesc(rnn))
copyparams!(rnn, d)
d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = CUDNN.RNNDesc(rnn))
CUDNN.setweights!(d, rnn.Wi, rnn.Wh, rnn.b)
return d
end
@ -275,17 +27,17 @@ import Zygote
using Zygote: @adjoint
function (m::CuRNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
y, h = forward(desc(m), x, h)
y, h = CUDNN.forward(desc(m), x, h)
return h, y
end
function (m::CuGRU{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
y, h = forward(desc(m), x, h)
y, h = CUDNN.forward(desc(m), x, h)
return h, y
end
function (m::CuLSTM{T})(h::NTuple{2,CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64}
y, h, c = forward(desc(m), x, h[1], h[2])
y, h, c = CUDNN.forward(desc(m), x, h[1], h[2])
return (h, c), y
end
@ -303,7 +55,7 @@ unbroadcast(x::AbstractArray, Δ) =
coerce_cuda(x::Union{CuArray,Nothing}) = x
coerce_cuda(x::Tuple) = coerce_cuda.(x)
coerce_cuda(x) = x .+ CuArrays.fill(0)
coerce_cuda(x::AbstractArray) = x .+ CuArrays.fill(0)
function struct_grad!(cx::Zygote.Context, x, x̄)
for f in fieldnames(typeof(x))
@ -316,28 +68,23 @@ end
for RNN in (CuRNN, CuGRU)
@eval @adjoint function (m::$RNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
reserve, (y, ho) = forwardTrain(desc(m), x, h)
(y, ho), back = CUDNN.pullback(desc(m), x, h)
(ho, y), function (Δ)
dho, dy = coerce_cuda(Δ)
h_ = hBatch(x, h)
dx, dh = backwardData(descs[m], y, dy, dho, h_, reserve)
(dWi, dWh), db = backwardWeights(descs[m], x, h_, y, reserve)
dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing))
(dm, unbroadcast(h, dh), dx)
dho, dy = coerce_cuda(Δ) # Support FillArrays etc.
m̄ = back(dy, dho)
dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(m̄.Wi),Wh=transpose(m̄.Wh),b=m̄.b,h=nothing))
(dm, unbroadcast(h, m̄.h), m̄.x)
end
end
end
@adjoint function (m::CuLSTM)((h, c)::Tuple{CuArray{T},CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64}
reserve, (y, ho, co) = forwardTrain(desc(m), x, h, c)
(y, ho, co), back = CUDNN.pullback(desc(m), x, h, c)
((ho, co), y), function (Δ)
dhc, dy = coerce_cuda(Δ)
dhc, dy = coerce_cuda(Δ) # Support FillArrays etc.
dho, dco = dhc === nothing ? (nothing, nothing) : dhc
h_ = hBatch(x, h)
c_ = hBatch(x, c)
dx, dh, dc = backwardData(descs[m], y, dy, dho, dco, h_, c_, reserve)
(dWi, dWh), db = backwardWeights(descs[m], x, h_, y, reserve)
dm = struct_grad!(__context__, m, (Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing,c=nothing))
(dm, (unbroadcast(h, dh), unbroadcast(c, dc)), dx)
m̄ = back(dy, dho, dco)
dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(m̄.Wi),Wh=transpose(m̄.Wh),b=m̄.b,h=nothing,c=nothing))
(dm, (unbroadcast(h, m̄.h), unbroadcast(c, m̄.c)), m̄.x)
end
end


@ -39,7 +39,7 @@ end
trainable(m) = functor(m)[1]
params!(p::Params, x::AbstractArray{<:Real}, seen = IdSet()) = push!(p, x)
params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x)
function params!(p::Params, x, seen = IdSet())
x in seen && return
@ -73,13 +73,7 @@ end
cpu(m) = fmap(x -> adapt(Array, x), m)
const gpu_adaptor = if has_cuarrays()
CuArrays.cu
else
identity
end
gpu(x) = fmap(gpu_adaptor, x)
gpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) : x
# Precision


@ -39,24 +39,39 @@ function Base.show(io::IO, c::Chain)
print(io, ")")
end
"""
outdims(c::Chain, isize)
Calculate the output dimensions given the input dimensions, `isize`.
```julia
m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
outdims(m, (10, 10)) == (6, 6)
```
"""
outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers))(isize)
# This is a temporary and naive implementation
# it might be replaced in the future for better performance
# see issue https://github.com/FluxML/Flux.jl/issues/702
# Johnny Chen -- @johnnychen94
# only slightly changed to better handle interaction with Zygote @dsweber2
"""
activations(c::Chain, input)
Calculate the forward results of each layer in Chain `c` with `input` as model input.
"""
function activations(c::Chain, input)
rst = []
for l in c
x = get(rst, length(rst), input)
push!(rst, l(x))
end
return rst
extraChain(c.layers, input)
end
function extraChain(fs::Tuple, x)
res = first(fs)(x)
return (res, extraChain(Base.tail(fs), res)...)
end
extraChain(::Tuple{}, x) = ()
"""
Dense(in::Integer, out::Integer, σ = identity)
@ -112,6 +127,19 @@ end
(a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
a(T.(x))
"""
outdims(l::Dense, isize)
Calculate the output dimensions given the input dimensions, `isize`.
```julia
m = Dense(10, 5)
outdims(m, (5, 2)) == (5,)
outdims(m, (10,)) == (5,)
```
"""
outdims(l::Dense, isize) = (size(l.W)[1],)
"""
Diagonal(in::Integer)
@ -141,6 +169,7 @@ function Base.show(io::IO, l::Diagonal)
print(io, "Diagonal(", length(l.α), ")")
end
outdims(l::Diagonal, isize) = (length(l.α),)
"""
Maxout(over)
@ -189,6 +218,8 @@ function (mo::Maxout)(input::AbstractArray)
mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over)
end
outdims(l::Maxout, isize) = outdims(first(l.over), isize)
"""
SkipConnection(layers, connection)


@ -1,4 +1,9 @@
using NNlib: conv, ∇conv_data, depthwiseconv
using NNlib: conv, ∇conv_data, depthwiseconv, output_size
# pad dims of x with dims of y until ndims(x) == ndims(y)
_paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...)
_convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+ (ksize .- 1).*dsize .- (pad[1:2:end] .+ pad[2:2:end])
expand(N, i::Tuple) = i
expand(N, i::Integer) = ntuple(_ -> i, N)
@ -17,7 +22,7 @@ Example: Applying Conv layer to a 1-channel input using a 2x2 window size,
out = 16
Conv((2, 2), 1=>16, relu)
Data should be stored in WHCN order (width, height, # channels, # batches).
Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.
@ -106,8 +111,23 @@ end
a(T.(x))
"""
ConvTranspose(filter::Tuple, in=>out)
ConvTranspose(filter::Tuple, in=>out, activation)
outdims(l::Conv, isize::Tuple)
Calculate the output dimensions given the input dimensions, `isize`.
Batch size and channel size are ignored as per `NNlib.jl`.
```julia
m = Conv((3, 3), 3 => 16)
outdims(m, (10, 10)) == (8, 8)
outdims(m, (10, 10, 1, 3)) == (8, 8)
```
"""
outdims(l::Conv, isize) =
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
"""
ConvTranspose(size, in=>out)
ConvTranspose(size, in=>out, relu)
Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively.
@ -178,6 +198,9 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
)
end
# TODO: Find proper fix for https://github.com/FluxML/Flux.jl/issues/900
@nograd conv_transpose_dims
function (c::ConvTranspose)(x::AbstractArray)
# ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
@ -198,6 +221,8 @@ end
(a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
a(T.(x))
outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad)
"""
DepthwiseConv(filter::Tuple, in=>out)
DepthwiseConv(filter::Tuple, in=>out, activation)
@ -298,9 +323,12 @@ end
(a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
a(T.(x))
outdims(l::DepthwiseConv, isize) =
output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
"""
CrossCor(size, in=>out)
CrossCor(size, in=>out, relu)
CrossCor(size, in=>out, activation)
Standard cross-correlation layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively.
@ -351,6 +379,11 @@ function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ
return CrossCor(σ, w, b, stride, pad, dilation)
end
function CrossCor(; weight::AbstractArray, bias::Union{Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where T
CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
@ -387,6 +420,9 @@ end
(a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
a(T.(x))
outdims(l::CrossCor, isize) =
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
"""
MaxPool(k)
@ -416,6 +452,8 @@ function Base.show(io::IO, m::MaxPool)
print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")")
end
outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))
"""
MeanPool(k)
@ -443,3 +481,5 @@ end
function Base.show(io::IO, m::MeanPool)
print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")")
end
outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))

View File

@ -1,5 +1,5 @@
gate(h, n) = (1:h) .+ h*(n-1)
gate(x::AbstractVector, h, n) = x[gate(h,n)]
gate(x::AbstractVector, h, n) = @view x[gate(h,n)]
gate(x::AbstractMatrix, h, n) = x[gate(h,n),:]
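# For instance (illustrative): with hidden size 3, the second gate spans indices 4:6.
gate(3, 2)                     # == 4:6
gate(collect(1.0:6.0), 3, 2)   # == [4.0, 5.0, 6.0], returned as a view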
# Stateful recurrence
@ -45,8 +45,7 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")")
"""
reset!(rnn)
Reset the hidden state of a recurrent layer back to its original value. See also
`truncate!`.
Reset the hidden state of a recurrent layer back to its original value.
Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to

View File

@ -1,13 +1,24 @@
using CuArrays
using NNlib: logsoftmax, logσ
# Cost functions
mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)
function crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1)
-sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2)
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
return -sum(y .* log.(ŷ)) * 1 // size(y, 2)
end
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number)
return -sum(y .* log.(ŷ)) .* weight * 1 // size(y, 2)
end
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector)
return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2)
end
crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight)
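# Dispatch sketch (hypothetical values): the `weight` keyword picks one of the
# `_crossentropy` methods above: unweighted, scalar weight, or per-class weights.
ŷ_demo = [0.9 0.1; 0.1 0.9]; y_demo = [1.0 0.0; 0.0 1.0]
crossentropy(ŷ_demo, y_demo)                         # weight === nothing
crossentropy(ŷ_demo, y_demo, weight = 2)             # scalar weight
crossentropy(ŷ_demo, y_demo, weight = [1.0, 2.0])    # per-class weight vector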
function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1)
return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2)
end
@ -25,6 +36,9 @@ Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerica
"""
binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
# Re-definition to fix interaction with CuArrays.
CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
"""
logitbinarycrossentropy(logŷ, y)
@ -39,13 +53,60 @@ but it is more numerically stable.
"""
logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ)
# Re-definition to fix interaction with CuArrays.
CuArrays.@cufunc logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ)
"""
normalise(x::AbstractArray; dims=1)
Normalises x to mean 0 and standard deviation 1, across the dimensions given by dims. Defaults to normalising over columns.
Normalises `x` to mean 0 and standard deviation 1, across the dimensions given by `dims`. Defaults to normalising over columns.
julia> a = reshape(collect(1:9), 3, 3)
3×3 Array{Int64,2}:
1 4 7
2 5 8
3 6 9
julia> normalise(a)
3×3 Array{Float64,2}:
-1.22474 -1.22474 -1.22474
0.0 0.0 0.0
1.22474 1.22474 1.22474
julia> normalise(a, dims=2)
3×3 Array{Float64,2}:
-1.22474 0.0 1.22474
-1.22474 0.0 1.22474
-1.22474 0.0 1.22474
"""
function normalise(x::AbstractArray; dims=1)
μ′ = mean(x, dims = dims)
σ = std(x, dims = dims, mean = μ′, corrected=false)
return (x .- μ′) ./ σ
end
"""
kldivergence(ŷ, y)
KL divergence is a measure of how much one probability distribution differs from another.
It is always non-negative, and zero only when both distributions are equal everywhere.
[KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).
"""
function kldivergence(ŷ, y)
entropy = sum(y .* log.(y)) * 1 // size(y, 2)
cross_entropy = crossentropy(ŷ, y)
return entropy + cross_entropy
end
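# Illustrative check: the divergence of a distribution from itself is zero, matching
# the property noted in the docstring.
p_demo = [0.1 0.9; 0.9 0.1]
kldivergence(p_demo, p_demo)   # ≈ 0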
"""
poisson(ŷ, y)
The Poisson loss function measures how much the predicted distribution diverges from the expected distribution.
[Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
"""
poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) * 1 // size(y, 2)
"""
hinge(ŷ, y)
Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1).
[Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss).
"""
hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) * 1 // size(y, 2)

View File

@ -37,12 +37,10 @@ import Adapt: adapt, adapt_structure
adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data))
if has_cuarrays()
import .CuArrays: CuArray, cudaconvert
import Base.Broadcast: BroadcastStyle, ArrayStyle
BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}()
cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
end
import .CuArrays: CuArray, cudaconvert
import Base.Broadcast: BroadcastStyle, ArrayStyle
BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}()
cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
"""
onehot(l, labels[, unk])
@ -127,6 +125,4 @@ onecold(y::AbstractMatrix, labels...) =
onecold(y::OneHotMatrix, labels...) =
mapreduce(x -> Flux.onecold(x, labels...), |, y.data, dims = 2, init = 0)
# TODO probably still want this as a custom adjoint Zygote
# onecold(x::TrackedVector, l...) = onecold(data(x), l...)
# onecold(x::TrackedMatrix, l...) = onecold(data(x), l...)
@nograd onecold, onehot, onehotbatch

View File

@ -1,5 +1,4 @@
using Flux
using Base: @get!
using MacroTools: @forward
const ϵ = 1e-8
@ -7,10 +6,28 @@ const ϵ = 1e-8
# TODO: should use weak refs
"""
Descent(η)
Descent(η)
Classic gradient descent optimiser with learning rate `η`.
For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`.
For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`
## Parameters
- Learning Rate (η): The amount by which the gradients are discounted before updating the weights. Defaults to `0.1`.
## Example
```julia-repl
opt = Descent() # uses default η (0.1)
opt = Descent(0.3) # use provided η
ps = params(model)
gs = gradient(ps) do
loss(x, y)
end
Flux.Optimise.update!(opt, ps, gs)
```
"""
mutable struct Descent
eta::Float64
@ -23,9 +40,20 @@ function apply!(o::Descent, x, Δ)
end
"""
Momentum(η = 0.01; ρ = 0.9)
Momentum(η, ρ)
Gradient descent with learning rate `η` and momentum `ρ`.
## Parameters
- Learning Rate (`η`): Amount by which gradients are discounted before updating the weights. Defaults to `0.01`.
- Momentum (`ρ`): Parameter that accelerates descent in the relevant direction and dampens oscillations. Defaults to `0.9`.
## Examples
```julia
opt = Momentum() # uses defaults of η = 0.01 and ρ = 0.9
opt = Momentum(0.01, 0.99)
```
"""
mutable struct Momentum
eta::Float64
@ -43,9 +71,20 @@ function apply!(o::Momentum, x, Δ)
end
"""
Nesterov(eta, ρ = 0.9)
Nesterov(η, ρ)
Gradient descent with learning rate `η` and Nesterov momentum `ρ`.
## Parameters
- Learning Rate (η): Amount by which the gradients are discounted before updating the weights. Defaults to `0.001`.
- Nesterov Momentum (ρ): Parameter controlling the amount of Nesterov momentum to be applied. Defaults to `0.9`.
## Examples
```julia
opt = Nesterov() # uses defaults η = 0.001 and ρ = 0.9
opt = Nesterov(0.003, 0.95)
```
"""
mutable struct Nesterov
eta::Float64
@ -64,11 +103,23 @@ function apply!(o::Nesterov, x, Δ)
end
"""
RMSProp(η = 0.001, ρ = 0.9)
RMSProp(η, ρ)
Implements the RMSProp algorithm. Often a good choice for recurrent networks. Parameters other than learning rate generally don't need tuning.
## Parameters
- Learning Rate (η): Defaults to `0.001`.
- Rho (ρ): Defaults to `0.9`.
## Examples
```julia
opt = RMSProp() # uses default η = 0.001 and ρ = 0.9
opt = RMSProp(0.002, 0.95)
```
## References
[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
optimiser. Parameters other than learning rate don't need tuning. Often a good
choice for recurrent networks.
"""
mutable struct RMSProp
eta::Float64
@ -86,8 +137,22 @@ function apply!(o::RMSProp, x, Δ)
end
"""
ADAM(η = 0.001, β = (0.9, 0.999))
ADAM(η, β::Tuple)
Implements the ADAM optimiser.
## Parameters
- Learning Rate (`η`): Defaults to `0.001`.
- Beta (`β::Tuple`): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = ADAM() # uses the default η = 0.001 and β = (0.9, 0.999)
opt = ADAM(0.001, (0.9, 0.8))
```
## References
[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.
"""
mutable struct ADAM
@ -109,8 +174,23 @@ function apply!(o::ADAM, x, Δ)
end
"""
RADAM(η = 0.001, β = (0.9, 0.999))
RADAM(η, β::Tuple)
Implements the rectified ADAM optimiser.
## Parameters
- Learning Rate (η): Defaults to `0.001`
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = RADAM() # uses the default η = 0.001 and β = (0.9, 0.999)
opt = RADAM(0.001, (0.9, 0.8))
```
## References
[RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM).
"""
mutable struct RADAM
@ -139,10 +219,22 @@ function apply!(o::RADAM, x, Δ)
end
"""
AdaMax(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08)
AdaMax(η, β::Tuple)
[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on
the ∞-norm.
Variant of ADAM based on the ∞-norm.
## Parameters
- Learning Rate (η): Defaults to `0.001`
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = AdaMax() # uses default η and β
opt = AdaMax(0.001, (0.9, 0.995))
```
## References
[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser.
"""
mutable struct AdaMax
eta::Float64
@ -163,8 +255,21 @@ function apply!(o::AdaMax, x, Δ)
end
"""
ADAGrad(η = 0.1; ϵ = 1e-8)
ADAGrad(η)
Implements AdaGrad. It has parameter-specific learning rates based on how frequently each parameter is updated.
## Parameters
- Learning Rate (η): Defaults to `0.1`
## Examples
```julia
opt = ADAGrad() # uses default η = 0.1
opt = ADAGrad(0.001)
```
## References
[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser.
Parameters don't need tuning.
"""
@ -177,16 +282,27 @@ ADAGrad(η = 0.1) = ADAGrad(η, IdDict())
function apply!(o::ADAGrad, x, Δ)
η = o.eta
acc = get!(o.acc, x, fill(ϵ, size(x)))::typeof(x)
acc = get!(o.acc, x, fill!(zero(x), ϵ))::typeof(x)
@. acc += Δ^2
@. Δ *= η / (√acc + ϵ)
end
"""
ADADelta(ρ = 0.9, ϵ = 1e-8)
ADADelta(ρ)
[ADADelta](https://arxiv.org/abs/1212.5701) optimiser. Parameters don't need
tuning.
Version of ADAGrad that adapts learning rate based on a window of past gradient updates. Parameters don't need tuning.
## Parameters
- Rho (ρ): Factor by which gradient is decayed at each time step. Defaults to `0.9`.
## Examples
```julia
opt = ADADelta() # uses default ρ = 0.9
opt = ADADelta(0.89)
```
## References
[ADADelta](https://arxiv.org/abs/1212.5701) optimiser.
"""
mutable struct ADADelta
rho::Float64
@ -205,10 +321,22 @@ function apply!(o::ADADelta, x, Δ)
end
"""
AMSGrad(η = 0.001, β = (0.9, 0.999))
AMSGrad(η, β::Tuple)
[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need
tuning.
Implements the AMSGrad version of the ADAM optimiser. Parameters don't need tuning.
## Parameters
- Learning Rate (η): Defaults to `0.001`.
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = AMSGrad() # uses default η and β
opt = AMSGrad(0.001, (0.89, 0.995))
```
## References
[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser.
"""
mutable struct AMSGrad
eta::Float64
@ -220,18 +348,30 @@ AMSGrad(η = 0.001, β = (0.9, 0.999)) = AMSGrad(η, β, IdDict())
function apply!(o::AMSGrad, x, Δ)
η, β = o.eta, o.beta
mt, vt, v̂t = get!(o.state, x, (fill(ϵ, size(x)), fill(ϵ, size(x)), fill(ϵ, size(x))))
mt, vt, v̂t = get!(o.state, x, (fill!(zero(x), ϵ), fill!(zero(x), ϵ), fill!(zero(x), ϵ)))
@. mt = β[1] * mt + (1 - β[1]) * Δ
@. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2
@. v̂t = max.(v̂t, vt)
@. v̂t = max(v̂t, vt)
@. Δ = η * mt / (√v̂t + ϵ)
end
"""
NADAM(η = 0.001, β = (0.9, 0.999))
NADAM(η, β::Tuple)
[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. Parameters don't need
tuning.
Nesterov variant of ADAM. Parameters don't need tuning.
## Parameters
- Learning Rate (η): Defaults to `0.001`.
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = NADAM() # uses default η and β
opt = NADAM(0.002, (0.89, 0.995))
```
## References
[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser.
"""
mutable struct NADAM
eta::Float64
@ -252,9 +392,23 @@ function apply!(o::NADAM, x, Δ)
end
"""
ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0)
ADAMW(η, β::Tuple, decay)
[ADAMW](https://arxiv.org/abs/1711.05101) fixing weight decay regularization in Adam.
Variant of ADAM defined by fixing weight decay regularization.
## Parameters
- Learning Rate (η): Defaults to `0.001`.
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to (0.9, 0.999).
- decay: Decay applied to weights during optimisation. Defaults to 0.
## Examples
```julia
opt = ADAMW() # uses default η, β and decay
opt = ADAMW(0.001, (0.89, 0.995), 0.1)
```
## References
[ADAMW](https://arxiv.org/abs/1711.05101)
"""
ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
Optimiser(ADAM(η, β), WeightDecay(decay))
@ -287,9 +441,15 @@ function apply!(o::Optimiser, x, Δ)
end
"""
`InvDecay(γ)`
InvDecay(γ)
Apply inverse time decay to an optimiser
Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified.
```
## Parameters
- gamma (γ): Defaults to `0.001`
## Example
```julia
Optimiser(InvDecay(..), Opt(..))
```
@ -310,13 +470,22 @@ function apply!(o::InvDecay, x, Δ)
end
"""
`ExpDecay(eta, decay, decay_step, clip)`
ExpDecay(eta, decay, decay_step, clip)
Schedule the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`.
Discount the learning rate `eta` by the multiplicative factor `decay` every `decay_step` steps, until a minimum of `clip` is reached.
## Parameters
- Learning Rate (eta): Defaults to `0.001`.
- decay: Factor by which the learning rate is discounted. Defaults to `0.1`.
- decay_step: Schedules decay operations by setting the number of steps between two decay operations. Defaults to `1000`.
- clip: Minimum value of learning rate. Defaults to `1e-4`.
## Example
To apply exponential decay to an optimiser:
```julia
Optimiser(ExpDecay(..), Opt(..))
opt = Optimiser(ExpDecay(), ADAM())
```
"""
mutable struct ExpDecay
@ -340,9 +509,12 @@ function apply!(o::ExpDecay, x, Δ)
end
"""
`WeightDecay(wd)`
WeightDecay(wd)
Decay the weight parameter by `wd`
Decays the weights by `wd`: `wd` times each parameter is added to its gradient before the update.
## Parameters
- Weight Decay (wd): Defaults to `0`.
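## Example
A composition sketch, mirroring how `ADAMW` above pairs `ADAM` with `WeightDecay`:
```julia
opt = Optimiser(ADAM(1e-3), WeightDecay(1e-4))
```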
"""
mutable struct WeightDecay
wd::Real

View File

@ -1,6 +1,11 @@
# Arrays
glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0/sum(dims))
glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0/sum(dims))
nfan() = 1, 1 #fan_in, fan_out
nfan(n) = 1, n #A vector is treated as an n×1 matrix
nfan(n_out, n_in) = n_in, n_out #In case of Dense kernels: arranged as matrices
nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of convolution kernels
glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...)))
glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...)))
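# Illustrative check (values only): for a 3×3 kernel mapping 16 => 32 channels,
# fan-in and fan-out are 3*3*16 and 3*3*32, so glorot_uniform draws from roughly
# ±sqrt(6 / (144 + 288)).
nfan(3, 3, 16, 32)                  # == (144, 288)
size(glorot_uniform(3, 3, 16, 32))  # == (3, 3, 16, 32)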
ones(T::Type, dims...) = Base.ones(T, dims...)
zeros(T::Type, dims...) = Base.zeros(T, dims...)
@ -98,6 +103,48 @@ function batchseq(xs, pad = nothing, n = maximum(length(x) for x in xs))
[batch([xs_[j][i] for j = 1:length(xs_)]) for i = 1:n]
end
# Flattening models to weight vectors, and back
function _restructure(m, xs)
i = 0
fmap(m) do x
x isa AbstractArray || return x
x = reshape(xs[i.+(1:length(x))], size(x))
i += length(x)
return x
end
end
"""
destructure(m)
Flatten a model's parameters into a single weight vector.
julia> m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
julia> θ, re = destructure(m);
julia> θ
67-element Array{Float32,1}:
-0.1407104
...
The second return value `re` allows you to reconstruct the original network after making
modifications to the weight vector (for example, with a hypernetwork).
julia> re(θ .* 2)
Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
"""
function destructure(m)
xs = Zygote.Buffer([])
fmap(m) do x
x isa AbstractArray && push!(xs, x)
return x
end
return vcat(vec.(copy(xs))...), p -> _restructure(m, p)
end
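# Round-trip sketch (kept as a comment since `Dense`/`Chain` may not be defined yet
# at this point in the include order): `destructure` flattens the parameters and `re`
# rebuilds a structurally identical model from any vector of the same length.
#   m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
#   θ, re = destructure(m)    # θ is a 67-element vector, as in the docstring above
#   re(zero(θ))               # same architecture with every weight set to zero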
# Other
"""

View File

@ -25,9 +25,16 @@ cm = gpu(m)
@test all(p isa CuArray for p in params(cm))
@test cm(gpu(rand(10, 10))) isa CuArray{Float32,2}
x = [1,2,3]
x = [1.,2.,3.]
cx = gpu(x)
@test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx)
@test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0)
@test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0]))
x = [-1.1491, 0.8619, 0.3127]
y = [1, 1, 0.]
@test Flux.binarycrossentropy.(σ.(x),y) ≈ Array(Flux.binarycrossentropy.(cu(σ.(x)),cu(y)))
@test Flux.logitbinarycrossentropy.(x,y) ≈ Array(Flux.logitbinarycrossentropy.(cu(x),cu(y)))
xs = rand(5, 5)
ys = Flux.onehotbatch(1:5,1:5)
@ -51,10 +58,10 @@ end
@test y[3,:] isa CuArray
end
if CuArrays.libcudnn != nothing
@info "Testing Flux/CUDNN"
include("cudnn.jl")
if !haskey(ENV, "CI_DISABLE_CURNN_TEST")
include("curnn.jl")
end
if CuArrays.has_cudnn()
@info "Testing Flux/CUDNN"
include("cudnn.jl")
include("curnn.jl")
else
@warn "CUDNN unavailable, not testing GPU DNN support"
end

View File

@ -22,8 +22,8 @@ end
rand(10, batch_size)
cux = gpu(x)
y, back = pullback((r, x) -> (r(x)), rnn, x)
cuy, cuback = pullback((r, x) -> (r(x)), curnn, cux)
y, back = pullback((r, x) -> r(x), rnn, x)
cuy, cuback = pullback((r, x) -> r(x), curnn, cux)
@test y ≈ collect(cuy)
@test haskey(Flux.CUDA.descs, curnn.cell)

View File

@ -4,11 +4,13 @@ import Flux: activations
@testset "basic" begin
@testset "helpers" begin
@testset "activations" begin
dummy_model = Chain(Dense(10,5,σ),Dense(5,2),softmax)
x = rand(10)
@test activations(Chain(), x) == []
@test activations(dummy_model, x)[1] == dummy_model[1](x)
@test activations(dummy_model, x)[2] == x |> dummy_model[1] |> dummy_model[2]
dummy_model = Chain(x->x.^2, x->x .- 3, x -> tan.(x))
x = randn(10)
@test activations(dummy_model, x)[1] == x.^2
@test activations(dummy_model, x)[2] == (x.^2 .- 3)
@test activations(dummy_model, x)[3] == tan.(x.^2 .- 3)
@test activations(Chain(), x) == ()
@test activations(Chain(identity, x->:foo), x)[2] == :foo # results include `Any` type
end
end
@ -19,6 +21,12 @@ import Flux: activations
# numeric test should be put into testset of corresponding layer
end
@testset "Activations" begin
c = Chain(Dense(3,5,relu), Dense(5,1,relu))
X = Float32.([1.0; 1.0; 1.0])
@test_nowarn gradient(()->Flux.activations(c, X)[2][1], params(c))
end
@testset "Dense" begin
@test length(Dense(10, 5)(randn(10))) == 5
@test_throws DimensionMismatch Dense(10, 5)(randn(1))
@ -84,4 +92,19 @@ import Flux: activations
@test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4)
end
end
@testset "output dimensions" begin
m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
@test Flux.outdims(m, (10, 10)) == (6, 6)
m = Dense(10, 5)
@test Flux.outdims(m, (5, 2)) == (5,)
@test Flux.outdims(m, (10,)) == (5,)
m = Flux.Diagonal(10)
@test Flux.outdims(m, (10,)) == (10,)
m = Maxout(() -> Conv((3, 3), 3 => 16), 2)
@test Flux.outdims(m, (10, 10)) == (8, 8)
end
end

View File

@ -1,5 +1,6 @@
using Flux, Test
using Flux: maxpool, meanpool
using Flux: gradient
@testset "Pooling" begin
x = randn(Float32, 10, 10, 3, 2)
@ -83,6 +84,10 @@ end
y = Conv((3,3), 1 => 1)(x)
x_hat = ConvTranspose((3, 3), 1 => 1)(y)
@test size(x_hat) == size(x)
m = ConvTranspose((3,3), 1=>1)
# Test that the gradient call does not throw: #900
@test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads
end
@testset "CrossCor" begin
@ -90,7 +95,7 @@ end
w = rand(2,2,1,1)
y = CrossCor(w, [0.0])
@test sum(w .* x[1:2, 1:2, :, :]) == y(x)[1, 1, 1, 1]
@test isapprox(sum(w .* x[1:2, 1:2, :, :]), y(x)[1, 1, 1, 1], rtol=1e-7)
r = zeros(Float32, 28, 28, 1, 5)
m = Chain(
@ -113,17 +118,17 @@ end
l = Conv((3,3), 1=>1)
expected = zeros(eltype(l.weight),5,5,1,1)
expected[2:end-1,2:end-1,1,1] = l.weight
@test expected == l(data)
@test expected ≈ l(data)
l = Conv((3,1), 1=>1)
expected = zeros(eltype(l.weight),5,7,1,1)
expected[2:end-1,4,1,1] = l.weight
@test expected == l(data)
@test expected ≈ l(data)
l = Conv((1,3), 1=>1)
expected = zeros(eltype(l.weight),7,5,1,1)
expected[4,2:end-1,1,1] = l.weight
@test expected == l(data)
@test expected ≈ l(data)
@test begin
# we test that the next expression does not throw
@ -131,3 +136,55 @@ end
true
end
end
@testset "conv output dimensions" begin
m = Conv((3, 3), 3 => 16)
@test Flux.outdims(m, (10, 10)) == (8, 8)
m = Conv((3, 3), 3 => 16; stride = 2)
@test Flux.outdims(m, (5, 5)) == (2, 2)
m = Conv((3, 3), 3 => 16; stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = Conv((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = ConvTranspose((3, 3), 3 => 16)
@test Flux.outdims(m, (8, 8)) == (10, 10)
m = ConvTranspose((3, 3), 3 => 16; stride = 2)
@test Flux.outdims(m, (2, 2)) == (5, 5)
m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2)
@test Flux.outdims(m, (4, 4)) == (5, 5)
m = DepthwiseConv((3, 3), 3 => 6)
@test Flux.outdims(m, (10, 10)) == (8, 8)
m = DepthwiseConv((3, 3), 3 => 6; stride = 2)
@test Flux.outdims(m, (5, 5)) == (2, 2)
m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3, dilation = 2)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = CrossCor((3, 3), 3 => 16)
@test Flux.outdims(m, (10, 10)) == (8, 8)
m = CrossCor((3, 3), 3 => 16; stride = 2)
@test Flux.outdims(m, (5, 5)) == (2, 2)
m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = MaxPool((2, 2))
@test Flux.outdims(m, (10, 10)) == (5, 5)
m = MaxPool((2, 2); stride = 1)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = MaxPool((2, 2); stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = MeanPool((2, 2))
@test Flux.outdims(m, (10, 10)) == (5, 5)
m = MeanPool((2, 2); stride = 1)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = MeanPool((2, 2); stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
end

View File

@ -191,6 +191,7 @@ end
end
if VERSION >= v"1.1"
@testset "GroupNorm" begin
# begin tests
squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions
@ -289,5 +290,5 @@ end
x = Float32.(reshape(collect(1:prod(sizes)), sizes))
@test BN(x) ≈ GN(x)
end
end
end

View File

@ -49,12 +49,33 @@ const ϵ = 1e-7
@testset "logitbinarycrossentropy" begin
@test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0)
end
y = [1 2 3]
y1 = [4.0 5.0 6.0]
@testset "kldivergence" begin
@test Flux.kldivergence(y, y1) ≈ 4.761838062403337
@test Flux.kldivergence(y, y) ≈ 0
end
y = [1 2 3 4]
y1 = [5.0 6.0 7.0 8.0]
@testset "hinge" begin
@test Flux.hinge(y, y1) ≈ 0
@test Flux.hinge(y, 0.5 .* y) ≈ 0.125
end
y = [0.1 0.2 0.3]
y1 = [0.4 0.5 0.6]
@testset "poisson" begin
@test Flux.poisson(y, y1) ≈ 1.0160455586700767
@test Flux.poisson(y, y) ≈ 0.5044459776946685
end
@testset "no spurious promotions" begin
for T in (Float32, Float64)
y = rand(T, 2)
ŷ = rand(T, 2)
for f in (mse, crossentropy, logitcrossentropy)
for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson)
fwd, back = Flux.pullback(f, ŷ, y)
@test fwd isa T
@test eltype(back(one(T))[1]) == T

View File

@ -19,7 +19,7 @@ include("layers/normalisation.jl")
include("layers/stateless.jl")
include("layers/conv.jl")
if isdefined(Flux, :CUDA)
if Flux.use_cuda[]
include("cuda/cuda.jl")
else
@warn "CUDA unavailable, not testing GPU support"

View File

@ -1,6 +1,6 @@
using Flux
using Flux: throttle, glorot_uniform, glorot_normal, stack, unstack
using StatsBase: std
using Flux: throttle, nfan, glorot_uniform, glorot_normal, stack, unstack
using StatsBase: var
using Random
using Test
@ -56,18 +56,26 @@ end
# Set random seed so that these tests don't fail randomly
Random.seed!(0)
# glorot_uniform should yield a kernel with stddev ~= sqrt(6/(n_in + n_out)),
# and glorot_normal should yield a kernel with stddev ~= sqrt(2/(n_in + n_out))
for (n_in, n_out) in [(100, 100), (100, 400)]
v = glorot_uniform(n_in, n_out)
@test minimum(v) > -1.1*sqrt(6/(n_in + n_out))
@test minimum(v) < -0.9*sqrt(6/(n_in + n_out))
@test maximum(v) > 0.9*sqrt(6/(n_in + n_out))
@test maximum(v) < 1.1*sqrt(6/(n_in + n_out))
@testset "Fan in/out" begin
@test nfan() == (1, 1) #For a constant
@test nfan(100) == (1, 100) #For vector
@test nfan(100, 200) == (200, 100) #For Dense layer
@test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer
@test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer
@test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D Conv layer
end
v = glorot_normal(n_in, n_out)
@test std(v) > 0.9*sqrt(2/(n_in + n_out))
@test std(v) < 1.1*sqrt(2/(n_in + n_out))
@testset "glorot" begin
# glorot_uniform and glorot_normal should both yield a kernel with
# variance ≈ 2/(fan_in + fan_out)
for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)]
for init ∈ [glorot_uniform, glorot_normal]
v = init(dims...)
fan_in, fan_out = nfan(dims...)
σ2 = 2 / (fan_in + fan_out)
@test 0.9σ2 < var(v) < 1.1σ2
end
end
end
end