more docs and constructors

Author: Dhairya Gandhi
Date: 2020-02-26 22:29:14 +05:30
Commit: cd931793ef
34 changed files with 967 additions and 880 deletions

.github/workflows/CompatHelper.yml (new file)

@@ -0,0 +1,24 @@
name: CompatHelper
on:
  schedule:
    - cron: '00 00 * * *'
jobs:
  CompatHelper:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        julia-version: [1.3]
        julia-arch: [x64]
        os: [ubuntu-latest]
    steps:
      - uses: julia-actions/setup-julia@latest
        with:
          version: ${{ matrix.julia-version }}
      - name: Pkg.add("CompatHelper")
        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
      - name: CompatHelper.main()
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: julia -e 'using CompatHelper; CompatHelper.main()'

.github/workflows/TagBot.yml (new file)

@@ -0,0 +1,11 @@
name: TagBot
on:
  schedule:
    - cron: 0 * * * *
jobs:
  TagBot:
    runs-on: ubuntu-latest
    steps:
      - uses: JuliaRegistries/TagBot@v1
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

.gitlab-ci.yml

@@ -1,51 +1,41 @@
before_script:
- export CI_DISABLE_CURNN_TEST=true
variables:
CI_IMAGE_TAG: 'cuda'
include:
-  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/common.yml'
+  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v6.yml'
+image: nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
-.flux:
extends: .test
script:
- julia -e 'using InteractiveUtils;
versioninfo()'
- mkdir $JULIA_DEPOT_PATH # Pkg3.jl#325
- julia --project -e 'using Pkg;
Pkg.instantiate();
Pkg.build();
Pkg.test(; coverage=true);'
test:v1.0:
extends: .flux
variables:
CI_VERSION_TAG: 'v1.0'
-test:v1.1:
-  extends: .flux
-  variables:
-    CI_VERSION_TAG: 'v1.1'
+# julia:1.0:
+#   extends:
+#     - .julia:1.0
+#     - .test
# tags:
# - nvidia
#
# julia:1.1:
# extends:
# - .julia:1.1
# - .test
# tags:
# - nvidia
#
# julia:1.2:
# extends:
# - .julia:1.2
# - .test
# tags:
# - nvidia
-test:v1.2:
-  extends: .flux
-  variables:
-    CI_VERSION_TAG: 'v1.2'
+julia:1.3:
+  extends:
+    - .julia:1.3
+    - .test
+  tags:
+    - nvidia
-test:v1.3:
extends: .flux
variables:
CI_VERSION_TAG: 'v1.3'
test:v1.0:
extends: .flux
variables:
CI_VERSION_TAG: 'v1.0'
test:dev:
extends: .flux
variables:
CI_VERSION_TAG: 'dev'
julia:nightly:
extends:
- .julia:nightly
- .test
tags:
- nvidia
  allow_failure: true

.travis.yml

@@ -6,7 +6,7 @@ os:
  # - osx
julia:
-  - 1.1
+  - 1.3
  - nightly

matrix:
@@ -16,7 +16,7 @@ matrix:
jobs:
  include:
    - stage: "Documentation"
-      julia: 1.0
+      julia: 1.3
      os: linux
      script:
        - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd()));

Manifest.toml

@@ -2,15 +2,15 @@
[[AbstractFFTs]] [[AbstractFFTs]]
deps = ["LinearAlgebra"] deps = ["LinearAlgebra"]
git-tree-sha1 = "380e36c66edfa099cd90116b24c1ce8cafccac40" git-tree-sha1 = "051c95d6836228d120f5f4b984dd5aba1624f716"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "0.4.1" version = "0.5.0"
[[AbstractTrees]] [[AbstractTrees]]
deps = ["Markdown", "Test"] deps = ["Markdown"]
git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b" git-tree-sha1 = "8201f932428d25a2e2903300764515754847d87d"
uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
version = "0.2.1" version = "0.3.0"
[[Adapt]] [[Adapt]]
deps = ["LinearAlgebra"] deps = ["LinearAlgebra"]
@ -21,46 +21,34 @@ version = "1.0.0"
[[Base64]] [[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinDeps]]
deps = ["Compat", "Libdl", "SHA", "URIParser"]
git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9"
uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
version = "0.8.10"
[[BinaryProvider]] [[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"] deps = ["Libdl", "SHA"]
git-tree-sha1 = "c7361ce8a2129f20b0e05a89f7070820cfed6648" git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.6" version = "0.5.8"
[[CEnum]] [[CEnum]]
git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb" git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.2.0" version = "0.2.0"
[[CSTParser]]
deps = ["Tokenize"]
git-tree-sha1 = "c69698c3d4a7255bc1b4bc2afc09f59db910243b"
uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f"
version = "0.6.2"
[[CUDAapi]] [[CUDAapi]]
deps = ["Libdl", "Logging"] deps = ["Libdl", "Logging"]
git-tree-sha1 = "e063efb91cfefd7e6afd92c435d01398107a500b" git-tree-sha1 = "56a813440ac98a1aa64672ab460a1512552211a7"
uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
version = "1.2.0" version = "2.1.0"
[[CUDAdrv]] [[CUDAdrv]]
deps = ["CUDAapi", "Libdl", "Printf"] deps = ["CEnum", "CUDAapi", "Printf"]
git-tree-sha1 = "9ce99b5732c70e06ed97c042187baed876fb1698" git-tree-sha1 = "1fce616fa0806c67c133eb1d2f68f0f1a7504665"
uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
version = "3.1.0" version = "5.0.1"
[[CUDAnative]] [[CUDAnative]]
deps = ["Adapt", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Printf", "TimerOutputs"] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"]
git-tree-sha1 = "52ae1ce10ebfa686e227655c47b19add89308623" git-tree-sha1 = "6e11d5c2c91fc623952e94c4fb73f9c4db74795a"
uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
version = "2.3.1" version = "2.7.0"
[[CodecZlib]] [[CodecZlib]]
deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] deps = ["BinaryProvider", "Libdl", "TranscodingStreams"]
@ -70,9 +58,9 @@ version = "0.6.0"
[[ColorTypes]] [[ColorTypes]]
deps = ["FixedPointNumbers", "Random"] deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "10050a24b09e8e41b951e9976b109871ce98d965" git-tree-sha1 = "7b62b728a5f3dd6ee3b23910303ccf27e82fad5e"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.8.0" version = "0.8.1"
[[Colors]] [[Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"] deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"]
@ -86,40 +74,22 @@ git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0"
uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" uuid = "bbf7d656-a473-5ed7-a52c-81e309532950"
version = "0.2.0" version = "0.2.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "84aa74986c5b9b898b0d1acaf3258741ee64754f"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "2.1.0"
[[Conda]]
deps = ["JSON", "VersionParsing"]
git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032"
uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d"
version = "1.3.0"
[[Crayons]]
deps = ["Test"]
git-tree-sha1 = "f621b8ef51fd2004c7cf157ea47f027fdeac5523"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "4.0.0"
[[CuArrays]] [[CuArrays]]
deps = ["AbstractFFTs", "Adapt", "CUDAapi", "CUDAdrv", "CUDAnative", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"] deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"]
git-tree-sha1 = "46b48742a84bb839e74215b7e468a4a1c6ba30f9" git-tree-sha1 = "51fbe053dea29ed2513e02d38380007310cf4c4b"
uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
version = "1.2.1" version = "1.6.0"
[[DataAPI]] [[DataAPI]]
git-tree-sha1 = "8903f0219d3472543fc4b2f5ebaf675a07f817c0" git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.0.1" version = "1.1.0"
[[DataStructures]] [[DataStructures]]
deps = ["InteractiveUtils", "OrderedCollections"] deps = ["InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "0809951a1774dc724da22d26e4289bbaab77809a" git-tree-sha1 = "f784254f428fb8fd7ac15982e5862a38a44523d3"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.17.0" version = "0.17.7"
[[Dates]] [[Dates]]
deps = ["Printf"] deps = ["Printf"]
@ -130,32 +100,38 @@ deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[DiffResults]] [[DiffResults]]
deps = ["Compat", "StaticArrays"] deps = ["StaticArrays"]
git-tree-sha1 = "34a4a1e8be7bc99bc9c611b895b5baf37a80584c" git-tree-sha1 = "da24935df8e0c6cf28de340b958f6aac88eaa0cc"
uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
version = "0.0.4" version = "1.0.2"
[[DiffRules]] [[DiffRules]]
deps = ["Random", "Test"] deps = ["NaNMath", "Random", "SpecialFunctions"]
git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7" git-tree-sha1 = "10dca52cf6d4a62d82528262921daf63b99704a2"
uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
version = "0.0.10" version = "1.0.0"
[[Distributed]] [[Distributed]]
deps = ["Random", "Serialization", "Sockets"] deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[FFTW]] [[FFTW]]
deps = ["AbstractFFTs", "BinaryProvider", "Conda", "Libdl", "LinearAlgebra", "Reexport", "Test"] deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"]
git-tree-sha1 = "6c5b420da0b8c12098048561b8d58f81adea506f" git-tree-sha1 = "109d82fa4b00429f9afcce873e9f746f11f018d3"
uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
version = "1.0.1" version = "1.2.0"
[[FFTW_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "05674f209a6e3387dd103a945b0113eeb64b1a58"
uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a"
version = "3.3.9+3"
[[FillArrays]] [[FillArrays]]
deps = ["LinearAlgebra", "Random", "SparseArrays"] deps = ["LinearAlgebra", "Random", "SparseArrays"]
git-tree-sha1 = "8fba6ddaf66b45dec830233cea0aae43eb1261ad" git-tree-sha1 = "fec413d4fc547992eb62a5c544cedb6d7853c1f5"
uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
version = "0.6.4" version = "0.8.4"
[[FixedPointNumbers]] [[FixedPointNumbers]]
git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b"
@ -163,33 +139,33 @@ uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
version = "0.6.1" version = "0.6.1"
[[ForwardDiff]] [[ForwardDiff]]
deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "4c4d727f1b7e0092134fabfab6396b8945c1ea5b" git-tree-sha1 = "840700059391d36e2498d89c2e82c08f261f2a2a"
uuid = "f6369f11-7733-5829-9624-2563aa707210" uuid = "f6369f11-7733-5829-9624-2563aa707210"
version = "0.10.3" version = "0.10.8"
[[GPUArrays]] [[GPUArrays]]
deps = ["Adapt", "FFTW", "FillArrays", "LinearAlgebra", "Printf", "Random", "Serialization", "StaticArrays", "Test"] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
git-tree-sha1 = "77e27264276fe97a7e7fb928bf8999a145abc018" git-tree-sha1 = "e756da6cee76a5f1436a05827fa8fdf3badc577f"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "1.0.3" version = "2.0.1"
[[IRTools]] [[IRTools]]
deps = ["InteractiveUtils", "MacroTools", "Test"] deps = ["InteractiveUtils", "MacroTools", "Test"]
git-tree-sha1 = "e23faa71b8f54c3fdc99b230b9c2906cafdddca5" git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306"
uuid = "7869d1d1-7146-5819-86e3-90919afe41df" uuid = "7869d1d1-7146-5819-86e3-90919afe41df"
version = "0.2.3" version = "0.3.0"
[[IntelOpenMP_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "fb8e1c7a5594ba56f9011310790e03b5384998d6"
uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0"
version = "2018.0.3+0"
[[InteractiveUtils]] [[InteractiveUtils]]
deps = ["Markdown"] deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[JSON]]
deps = ["Dates", "Mmap", "Parsers", "Unicode"]
git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e"
uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
version = "0.21.0"
[[Juno]] [[Juno]]
deps = ["Base64", "Logging", "Media", "Profile", "Test"] deps = ["Base64", "Logging", "Media", "Profile", "Test"]
git-tree-sha1 = "30d94657a422d09cb97b6f86f04f750fa9c50df8" git-tree-sha1 = "30d94657a422d09cb97b6f86f04f750fa9c50df8"
@ -198,9 +174,9 @@ version = "0.7.2"
[[LLVM]] [[LLVM]]
deps = ["CEnum", "Libdl", "Printf", "Unicode"] deps = ["CEnum", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "4a05f742837779a00bd8c9a18da6817367c4245d" git-tree-sha1 = "1d08d7e4250f452f6cb20e4574daaebfdbee0ff7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0" uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "1.3.0" version = "1.3.3"
[[LibGit2]] [[LibGit2]]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
@ -215,11 +191,17 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[Logging]] [[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MKL_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "61069ae718b8ab1e325bbfb4e5268902e7ea08e3"
uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
version = "2019.0.117+0"
[[MacroTools]] [[MacroTools]]
deps = ["CSTParser", "Compat", "DataStructures", "Test", "Tokenize"] deps = ["DataStructures", "Markdown", "Random"]
git-tree-sha1 = "d6e9dedb8c92c3465575442da456aec15a89ff76" git-tree-sha1 = "e2fc7a55bb2224e203bbd8b59f72b91323233458"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.1" version = "0.5.3"
[[Markdown]] [[Markdown]]
deps = ["Base64"] deps = ["Base64"]
@ -232,24 +214,30 @@ uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27"
version = "0.5.0" version = "0.5.0"
[[Missings]] [[Missings]]
git-tree-sha1 = "29858ce6c8ae629cf2d733bffa329619a1c843d0" deps = ["DataAPI"]
git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5"
uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
version = "0.4.2" version = "0.4.3"
[[Mmap]] [[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804" uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[NNlib]] [[NNlib]]
deps = ["Libdl", "LinearAlgebra", "Requires", "Statistics", "TimerOutputs"] deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"]
git-tree-sha1 = "0c667371391fc6bb31f7f12f96a56a17098b3de8" git-tree-sha1 = "135c0de4794d5e214b06f1fb4787af4a72896e61"
uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
version = "0.6.0" version = "0.6.2"
[[NaNMath]] [[NaNMath]]
deps = ["Compat"] git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f"
git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2"
uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
version = "0.3.2" version = "0.3.3"
[[OpenSpecFun_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.3+1"
[[OrderedCollections]] [[OrderedCollections]]
deps = ["Random", "Serialization", "Test"] deps = ["Random", "Serialization", "Test"]
@ -257,14 +245,8 @@ git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.1.0" version = "1.1.0"
[[Parsers]]
deps = ["Dates", "Test"]
git-tree-sha1 = "ef0af6c8601db18c282d092ccbd2f01f3f0cd70b"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
version = "0.3.7"
[[Pkg]] [[Pkg]]
deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Printf]] [[Printf]]
@ -290,10 +272,10 @@ uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "0.2.0" version = "0.2.0"
[[Requires]] [[Requires]]
deps = ["Test"] deps = ["UUIDs"]
git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1" git-tree-sha1 = "999513b7dea8ac17359ed50ae8ea089e4464e35e"
uuid = "ae029012-a4dd-5104-9daa-d747884805df" uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "0.5.2" version = "1.0.0"
[[SHA]] [[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
@ -301,10 +283,6 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]] [[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]] [[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc" uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
@ -319,16 +297,16 @@ deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[SpecialFunctions]] [[SpecialFunctions]]
deps = ["BinDeps", "BinaryProvider", "Libdl"] deps = ["OpenSpecFun_jll"]
git-tree-sha1 = "3bdd374b6fd78faf0119b8c5d538788dbf910c6e" git-tree-sha1 = "268052ee908b2c086cc0011f528694f02f3e2408"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b" uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "0.8.0" version = "0.9.0"
[[StaticArrays]] [[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"] deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "db23bbf50064c582b6f2b9b043c8e7e98ea8c0c6" git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c"
uuid = "90137ffa-7385-5640-81b9-e52037218182" uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "0.11.0" version = "0.12.1"
[[Statistics]] [[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"] deps = ["LinearAlgebra", "SparseArrays"]
@ -345,15 +323,10 @@ deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TimerOutputs]] [[TimerOutputs]]
deps = ["Crayons", "Printf", "Test", "Unicode"] deps = ["Printf"]
git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c" git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.0" version = "0.5.3"
[[Tokenize]]
git-tree-sha1 = "dfcdbbfb2d0370716c815cbd6f8a364efb6f42cf"
uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624"
version = "0.5.6"
[[TranscodingStreams]] [[TranscodingStreams]]
deps = ["Random", "Test"] deps = ["Random", "Test"]
@ -361,12 +334,6 @@ git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c"
uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
version = "0.9.5" version = "0.9.5"
[[URIParser]]
deps = ["Test", "Unicode"]
git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69"
uuid = "30578b45-9adc-5946-b283-645ec420af67"
version = "0.4.0"
[[UUIDs]] [[UUIDs]]
deps = ["Random", "SHA"] deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
@ -374,30 +341,26 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[Unicode]] [[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[VersionParsing]]
deps = ["Compat"]
git-tree-sha1 = "c9d5aa108588b978bd859554660c8a5c4f2f7669"
uuid = "81def892-9a0e-5fdd-b105-ffc91e053289"
version = "1.1.3"
[[ZipFile]] [[ZipFile]]
deps = ["BinaryProvider", "Libdl", "Printf"] deps = ["Libdl", "Printf", "Zlib_jll"]
git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" git-tree-sha1 = "5de8320a46812da1a8ca98b16a8a4546d44efa62"
uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
version = "0.8.3" version = "0.9.0"
[[Zlib_jll]]
deps = ["Libdl", "Pkg"]
git-tree-sha1 = "5618a43055eb09377edca21d19d0e99bce24a9c3"
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.11+7"
[[Zygote]] [[Zygote]]
deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] deps = ["DiffRules", "FFTW", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"]
git-tree-sha1 = "38241b40ebd8748bcacad5e6c7ba3ab3cc7a15c9" git-tree-sha1 = "74382bcc4c1e8075e14554da67d75565f8fb7827"
repo-rev = "master"
repo-url = "https://github.com/FluxML/Zygote.jl.git"
uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" uuid = "e88e6eb3-aa80-5325-afca-941959d7151f"
version = "0.3.4" version = "0.4.5"
[[ZygoteRules]] [[ZygoteRules]]
deps = ["MacroTools"] deps = ["MacroTools"]
git-tree-sha1 = "c4c29b30b8ff3be13d4244e78be7df2a42bc54d0" git-tree-sha1 = "b3b4882cc9accf6731a08cc39543fbc6b669dca8"
repo-rev = "master"
repo-url = "https://github.com/FluxML/ZygoteRules.jl.git"
uuid = "700de1a5-db45-46bc-99cf-38207098b444" uuid = "700de1a5-db45-46bc-99cf-38207098b444"
version = "0.2.0" version = "0.2.0"

NEWS.md

@@ -1,3 +1,16 @@
# v0.10.0
* The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669)
- The dependency on Tracker.jl has been removed.
- This means Flux no longer depends on a specialised `TrackedArray` type and can be used directly with ordinary `Array`s.
- Tracker compatibility is maintained in most common cases, but Zygote will be the preferred AD backend for Flux from now on.
* The CUDNN wrappers have been [moved from Flux into CuArrays](https://github.com/FluxML/Flux.jl/pull/874), to allow for better support of the CUDA backend, improve the user experience, and make Flux leaner.
* `*crossentropy` functions now [work as expected with CuArrays](https://github.com/FluxML/Flux.jl/pull/926). [PR for binarycrossentropy](https://github.com/FluxML/Flux.jl/pull/940).
* Added [clearer docs](https://github.com/FluxML/Flux.jl/pull/904) around training and the Optimiser interface.
* [Layer initialisations](https://github.com/FluxML/Flux.jl/pull/937) have been improved with a clearer API on how to extend it for other purposes.
* [Better messaging around CUDA availability](https://github.com/FluxML/Flux.jl/pull/924), with hooks to initialize the GPU as default where possible.
* `@treelike` has been formalised as a [functor](https://github.com/FluxML/Flux.jl/pull/865), with an effective deprecation.
* `testmode!` is deprecated in favour of [`istraining`](https://github.com/FluxML/Flux.jl/pull/669).

# v0.9.0
* [Depthwise convolutional layer API changes](https://github.com/FluxML/Flux.jl/pull/756) from `in => mult` channel specification to `in => out` channel specification, and deprecates implicit `out` constructor.
* New [SkipConnection](https://github.com/FluxML/Flux.jl/pull/446), which can be used to train residual neural network architectures.

Project.toml

@@ -1,17 +1,15 @@
name = "Flux"
uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.9.0"
+version = "0.10.2"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@@ -24,13 +22,20 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
ZygoteRules = "700de1a5-db45-46bc-99cf-38207098b444"

[compat]
-CUDAapi = "1.1"
-CuArrays = "1.2"
+AbstractTrees = "0.2, 0.3"
+Adapt = "1"
+CodecZlib = "0.5, 0.6"
+Colors = "0.8, 0.9, 0.10, 0.11"
+CuArrays = "1.6"
+Juno = "0.5, 0.6, 0.7, 0.8"
+MacroTools = "0.3, 0.4, 0.5"
NNlib = "0.6"
-Zygote = "0.3"
+Reexport = "0.2"
+StatsBase = "0"
+ZipFile = "0.7, 0.8, 0.9"
+Zygote = "0.4"
julia = "1"

[extras]

README.md

@@ -7,93 +7,9 @@
Flux is an elegant approach to machine learning. It's a 100% pure-Julia stack, and provides lightweight abstractions on top of Julia's native GPU and AD support. Flux makes the easy things easy while remaining fully hackable.

```julia
-julia> Pkg.add("Flux")
+] add Flux
```

See the [documentation](https://fluxml.github.io/Flux.jl/) or the [model zoo](https://github.com/FluxML/model-zoo/) for examples.

-If you use Flux in research, please cite the following paper:
+If you use Flux in your research, please [cite](CITATION.bib) our work.
```
@article{innes:2018,
author = {Mike Innes},
title = {Flux: Elegant Machine Learning with Julia},
journal = {Journal of Open Source Software},
year = {2018},
doi = {10.21105/joss.00602},
}
```
## Features
Flux has powerful high-level features, and common architectures can be defined in a few lines.
```julia
model = Chain(
Dense(768, 128, σ),
LSTM(128, 256),
LSTM(256, 128),
Dense(128, 10),
softmax)
loss(x, y) = crossentropy(model(x), y)
Flux.train!(loss, data, ADAM(...))
```
Yet you can easily strip away the layers, and directly write the mathematics for your problem. Flux will seamlessly take gradients of any Julia code, so your model looks just like the paper.
```julia
W = param(randn(2, 10))
b = param(randn(2))
y(x) = σ.(W * x .+ b)
```
If that's *still* not enough, you can go as deep as you want, even writing your own CUDA kernels with [CUDAnative](https://github.com/JuliaGPU/CUDAnative.jl)! All this can be freely mixed-and-matched in a single model or script, and it all runs interactively via Jupyter or Juno.
```julia
function gpu_add(a, b, c)
i = (blockIdx().x-1) * blockDim().x + threadIdx().x
c[i] = a[i] + b[i]
return nothing
end
```
Unusual architectures are no problem in Flux, as you can use all the loops, control flow and even macros that you're used to. Here's a Tree RNN in 4 lines.
```julia
tree() = rand() < 0.5 ? rand(10) : (tree(), tree()) # dummy data
shrink = Dense(20, 10)
combine(a, b) = shrink([a; b])
model(x) = x
model(x::Tuple) = combine(model(x[1]), model(x[2]))
model(tree()) # Sample output
```
Despite this flexibility, Julia's advanced compiler lets us do some powerful optimisations. For example, this definition of `sigmoid` automatically gets fused into a *single* GPU kernel so it's really fast.
```julia
sigmoid(xs) = 1 ./ (1 .+ exp.(.-xs))
```
Similarly, Flux is the first dynamic framework to support [compiling to the browser](https://fluxml.github.io/experiments/) and model import via [formats like ONNX](https://github.com/FluxML/ONNX.jl/), both of which are thinly-veiled compiler problems.
For more on our philosophy on machine learning, check out our article [On Machine Learning & Programming Languages](https://julialang.org/blog/2017/12/ml&pl).
## Contributing & Help
For general questions and help, check out Julia's [community forum](https://discourse.julialang.org/c/domain/ML).
Flux development is carried out via our [GitHub issues](https://github.com/FluxML/Flux.jl/issues), so feel free to open feature requests or PRs here.
For more informal discussions we'd love to have you on the [Julia slack](https://slackinvite.julialang.org/), where we hang out on the #machine-learning channel.
## Related Packages
Check out [Metalhead.jl](https://github.com/FluxML/Metalhead.jl) for common computer vision datasets and trained models.
[MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) provides further common datasets.

bors.toml

@@ -1,4 +1,4 @@
status = [
-  "ci/gitlab/%"
+  "ci/gitlab%"
]
-timeout-sec = 14400
+timeout-sec = 7200

docs/src/models/basics.md

@@ -219,3 +219,24 @@ Flux.@functor Affine
```

This enables a useful extra set of functionality for our `Affine` layer, such as [collecting its parameters](../training/optimisers.md) or [moving it to the GPU](../gpu.md).
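For instance (a small sketch, assuming the `Affine(in, out)` constructor defined earlier on this page):

```julia
a = Affine(10, 5)       # the example layer from this page (hypothetical sizes)

Flux.params(a)          # now collects a.W and a.b
# a_gpu = gpu(a)        # and the whole layer can be moved to the GPU when CuArrays is available
```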
## Utility functions
Flux provides some utility functions to help you generate models in an automated fashion.
`outdims` enables you to calculate the spatial output dimensions of layers like `Conv` when applied to input images of a given size.
Currently limited to the following layers:
- `Chain`
- `Dense`
- `Conv`
- `Diagonal`
- `Maxout`
- `ConvTranspose`
- `DepthwiseConv`
- `CrossCor`
- `MaxPool`
- `MeanPool`
```@docs
outdims
```
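For example (a small sketch; the sizes below assume the default stride of 1 and no padding):

```julia
using Flux

m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
Flux.outdims(m, (10, 10))          # (6, 6): each 3×3 Conv trims two pixels per dimension

Flux.outdims(Dense(10, 5), (10,))  # (5,)
```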

docs/src/models/layers.md

@@ -65,3 +65,15 @@ AlphaDropout
LayerNorm
GroupNorm
```
## Cost Functions
```@docs
mse
crossentropy
logitcrossentropy
binarycrossentropy
logitbinarycrossentropy
kldivergence
poisson
hinge
```
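For instance, a couple of these in action (a small sketch; arrays are laid out as features × batch):

```julia
using Flux

ŷ = softmax(rand(Float32, 10, 4))         # predicted probabilities for a batch of 4
y = Flux.onehotbatch([1, 3, 5, 7], 1:10)  # one-hot targets

Flux.mse(ŷ, y)                   # mean squared error
Flux.crossentropy(ŷ, y)          # expects probabilities (e.g. after softmax)
Flux.logitcrossentropy(rand(Float32, 10, 4), y)  # expects raw logits instead
```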

docs/src/saving.md

@@ -113,6 +113,6 @@ You can even store optimiser state alongside the model, to resume training
exactly where you left off.

```julia
-opt = ADAM(params(model))
+opt = ADAM()
@save "model-$(now()).bson" model opt
```
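To pick training back up, the same file can be loaded again; a minimal sketch (the filename and the `loss`/`data` objects are placeholders for whatever was used above):

```julia
using Flux, BSON

BSON.@load "model-checkpoint.bson" model opt   # hypothetical filename written earlier with @save

# Resume training with the restored model and optimiser state
# (assumes `loss` and `data` are defined as in the training docs).
Flux.train!(loss, Flux.params(model), data, opt)
```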

docs/src/training/optimisers.md

@@ -58,3 +58,83 @@ AMSGrad
NADAM
ADAMW
```
## Optimiser Interface
Flux's optimisers are built around a `struct` that holds all the optimiser parameters along with a definition of how to apply the update rule associated with it. We do this via the `apply!` function, which takes the optimiser as the first argument, followed by the parameter and its corresponding gradient.
In this manner Flux also allows one to create custom optimisers that can be used seamlessly. Let's work through a simple example.
```julia
mutable struct Momentum
eta
rho
velocity
end
Momentum(eta::Real, rho::Real) = Momentum(eta, rho, IdDict())
```
The `Momentum` type will act as our optimiser in this case. Notice that we have added all the parameters as fields, along with the velocity which we will use as our state dictionary. Each parameter in our models will get an entry in there. We can now define the rule applied when this optimiser is invoked.
```julia
function apply!(o::Momentum, x, Δ)
η, ρ = o.eta, o.rho
v = get!(o.velocity, x, zero(x))::typeof(x)
@. v = ρ * v - η * Δ
@. Δ = -v
end
```
This is the basic definition of a Momentum update rule given by:
```math
v = ρ * v - η * Δ
w = w - v
```
`apply!` defines the update rule for an optimiser `opt`, given the parameters and gradients, and returns the updated gradients. Here, the velocity associated with each parameter `x` is looked up in the optimiser's running state and updated in place.
Flux internally calls this function via `update!`, which shares the API with `apply!` but ensures that multiple parameters are handled gracefully.
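For instance, the custom `Momentum` above can be exercised directly; a minimal sketch (the gradient here is made up rather than computed):

```julia
opt = Momentum(0.01, 0.9)   # the custom optimiser defined above

W  = rand(3, 3)             # a parameter array
ΔW = ones(3, 3)             # a made-up "gradient" for W

# apply! rescales the gradient in place using the stored velocity;
# subtracting the result from the parameter is what update! does internally.
W .-= apply!(opt, W, ΔW)
```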
## Composing Optimisers
Flux defines a special kind of optimiser, simply called `Optimiser`, which takes arbitrary optimisers as input. Its behaviour is similar to the usual optimisers, but it acts by calling the optimisers listed in it sequentially. Each optimiser produces a modified gradient
that is fed into the next one, and the resulting update is applied to the parameter as usual. A classic use case is adding decays: Flux defines some basic decays, including `ExpDecay`, `InvDecay` etc.
```julia
opt = Optimiser(ExpDecay(0.001, 0.1, 1000, 1e-4), Descent())
```
Here we apply exponential decay to the `Descent` optimiser. With these arguments, `ExpDecay` decays the learning rate by a factor of `0.1` every 1000 steps, down to a minimum of `1e-4`.
It is then used like any other optimiser.
```julia
w = randn(10, 10)
w1 = randn(10,10)
ps = Params([w, w1])
loss(x) = Flux.mse(w * x, w1 * x)
loss(rand(10)) # around 9
for t = 1:10^5
θ = Params([w, w1])
θ̄ = gradient(() -> loss(rand(10)), θ)
Flux.Optimise.update!(opt, θ, θ̄)
end
loss(rand(10)) # around 0.9
```
In this manner it is possible to compose optimisers for some added flexibility.
## Decays
Similar to optimisers, Flux also defines some simple decays that can be used in conjunction with other optimisers, or standalone.
```@docs
ExpDecay
InvDecay
WeightDecay
```
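For example, a decay is usually just composed with another optimiser via `Optimiser` (a small sketch; the constants are arbitrary):

```julia
# Apply a weight-decay penalty to the gradients, then an exponentially
# decaying learning rate (factor 0.1 every 1000 steps, floor 1e-4).
opt = Optimiser(WeightDecay(1e-4), ExpDecay(0.001, 0.1, 1000, 1e-4), Descent())
```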

docs/src/training/training.md

@@ -1,8 +1,9 @@
# Training

-To actually train a model we need three things:
+To actually train a model we need four things:

* An *objective function* that evaluates how well a model is doing given some input data.
* The trainable parameters of the model.
* A collection of data points that will be provided to the objective function.
* An [optimiser](optimisers.md) that will update the model parameters appropriately.
@@ -32,6 +33,14 @@ Flux.train!(loss, ps, data, opt)
The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want.

At first glance it may seem strange that the model we want to train is not among the input arguments of `Flux.train!`. However, the target of the optimiser is not the model itself, but the objective function that represents the departure between modelled and observed data. In other words, the model is implicitly defined in the objective function, and there is no need to give it explicitly. Passing the objective function instead of the model and a cost function separately provides more flexibility, and the possibility of optimising the calculations.
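Concretely, the model only enters through the objective closure; a minimal sketch (the model, data and loss here are made up):

```julia
m = Dense(10, 2)
loss(x, y) = Flux.logitcrossentropy(m(x), y)   # the model m is captured by the closure

x, y = rand(Float32, 10, 4), Flux.onehotbatch([1, 2, 1, 2], 1:2)
Flux.train!(loss, Flux.params(m), [(x, y)], ADAM())   # train! never sees m directly
```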
## Model parameters
The model to be trained must have a set of tracked parameters that are used to calculate the gradients of the objective function. In the [basics](../models/basics.md) section it is explained how to create models with such parameters. The second argument of the function `Flux.train!` must be an object containing those parameters, which can be obtained from a model `m` as `params(m)`.
Such an object contains a reference to the model's parameters, not a copy, so that after training the model behaves according to the updated values of those parameters.
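For example (a small sketch):

```julia
m  = Chain(Dense(10, 5, relu), Dense(5, 2))
ps = Flux.params(m)   # references (not copies) of each layer's weights and biases

length(ps)            # 4 parameter arrays: two weight matrices and two bias vectors
```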
## Datasets

The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point:
@@ -101,3 +110,30 @@ cb = function ()
  accuracy() > 0.9 && Flux.stop()
end
```
## Custom Training loops
The `Flux.train!` function can be very convenient, especially for simple problems.
It is also very flexible through the use of callbacks.
But for some problems it is much cleaner to write your own custom training loop.
The example below works much like the default `Flux.train!`, but without callbacks.
You don't need callbacks if you simply code the calls to your functions directly into the loop,
e.g. in the places marked with comments.
```julia
using Flux

function my_custom_train!(loss, ps, data, opt)
  ps = Flux.Params(ps)
  for d in data
    gs = gradient(ps) do
      training_loss = loss(d...)
      # Insert whatever code needs the training loss here, e.g. logging.
      return training_loss
    end
    # Insert whatever code needs the gradients here, e.g. logging them as a
    # histogram with TensorBoardLogger.jl to check that they are not blowing up.
    Flux.Optimise.update!(opt, ps, gs)
    # Here you might like to check validation set accuracy, and break out to do early stopping.
  end
end
```
You could simplify this further, for example by hard-coding in the loss function.
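For instance, the loop above could be driven like this (a minimal sketch; the model, loss and data are made up):

```julia
using Flux

m    = Chain(Dense(2, 1))
loss(x, y) = Flux.mse(m(x), y)
data = [(rand(Float32, 2, 10), rand(Float32, 1, 10)) for _ in 1:100]

my_custom_train!(loss, Flux.params(m), data, Descent(0.1))
```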

src/Flux.jl

@@ -6,7 +6,7 @@ using Base: tail
using Zygote, MacroTools, Juno, Reexport, Statistics, Random
using MacroTools: @forward
@reexport using NNlib
-using Zygote: Params, @adjoint, gradient, pullback
+using Zygote: Params, @adjoint, gradient, pullback, @nograd
export gradient

export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool,
@@ -20,18 +20,9 @@ export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,
  ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM,
  ADAMW, RADAM, InvDecay, ExpDecay, WeightDecay
using CUDAapi
+using CuArrays
+const use_cuda = Ref(false)
-if has_cuda()
-  try
using CuArrays
@eval has_cuarrays() = true
catch ex
@warn "CUDA is installed, but CuArrays.jl fails to load" exception=(ex,catch_backtrace())
@eval has_cuarrays() = false
end
else
has_cuarrays() = false
end
include("utils.jl")
include("onehot.jl")
@@ -47,8 +38,26 @@ include("data/Data.jl")
include("deprecations.jl")

-if has_cuarrays()
-  include("cuda/cuda.jl")
+function __init__()
+  precompiling = ccall(:jl_generating_output, Cint, ()) != 0
# we don't want to include the CUDA module when precompiling,
# or we could end up replacing it at run time (triggering a warning)
precompiling && return
if !CuArrays.functional()
# nothing to do here, and either CuArrays or one of its dependencies will have warned
else
use_cuda[] = true
# FIXME: this functionality should be conditional at run time by checking `use_cuda`
# (or even better, get moved to CuArrays.jl as much as possible)
if CuArrays.has_cudnn()
include(joinpath(@__DIR__, "cuda/cuda.jl"))
else
@warn "CuArrays.jl did not find libcudnn. Some functionality will not be available."
end
end
end

end # module

src/cuda/cuda.jl

@@ -2,11 +2,8 @@ module CUDA
using ..CuArrays

-if CuArrays.libcudnn !== nothing # TODO: use CuArrays.has_cudnn()
+using CuArrays: CUDNN

include("curnn.jl")
include("cudnn.jl")
-else
-  @warn "CUDNN is not installed, some functionality will not be available."
-end

end

src/cuda/cudnn.jl

@@ -1,199 +1,5 @@
+import ..Flux: data
+import CuArrays.CUDNN: batchnorm, ∇batchnorm
-using CuArrays: libcudnn
-using CuArrays.CUDNN: @check, handle, cudnnStatus_t, cudnnTensorDescriptor_t,
-  cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc
import CuArrays.CUDAdrv: CuPtr, CU_NULL
using LinearAlgebra
mutable struct DropoutDesc
ptr::Ptr{Nothing}
states::CuVector{UInt8}
end
Base.unsafe_convert(::Type{Ptr{Nothing}}, dd::DropoutDesc) = dd.ptr
function DropoutDesc(ρ::Real; seed::Integer=0)
d = [C_NULL]
s = Csize_t[0]
@check ccall((:cudnnCreateDropoutDescriptor,libcudnn), cudnnStatus_t, (Ptr{Ptr{Nothing}},), d)
@check ccall((:cudnnDropoutGetStatesSize,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Csize_t}),handle(),s)
states = CuArray{UInt8}(undef, s[]) # TODO: can we drop this when ρ=0?
desc = DropoutDesc(d[], states)
@check ccall((:cudnnSetDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},Ptr{Nothing},Cfloat,CuPtr{Nothing},Csize_t,Culonglong),
desc,handle(),ρ,states,length(states),seed)
finalizer(desc) do x
@check ccall((:cudnnDestroyDropoutDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x)
end
return desc
end
const BATCHNORM_SPATIAL = 1
const BATCHNORM_ACTIVATION = 0
const BATCHNORM_MIN_EPS = 1e-5
@inline _wsize(y) = (map(_ -> 1, size(y)[1:end-2])..., size(y)[end-1], 1)
@inline _reddims(y) = (collect(1:ndims(y)-2)..., ndims(y))
mutable struct BNCache
mean
ivar
end
BNCache() = BNCache(nothing, nothing)
# NOTE: CuDNN supports only 4D and 5D Tensors for BatchNorm Operations
# so reshape a 2D Tensor into 4D
batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2},
running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
cache = nothing, alpha = T(1), beta = T(0),
eps = T(1e-5), training = true) where T<:Union{Float32, Float64} =
dropdims(batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), running_mean, running_var, momentum,
cache = cache, alpha = alpha, beta = beta, eps = eps, training = training), dims = (1, 2))
function batchnorm(g::CuArray{T}, b::CuArray{T}, x::Union{CuArray{T, 4},CuArray{T,5}},
running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
cache = nothing, alpha = T(1), beta = T(0),
eps = T(1e-5), training = true) where T<:Union{Float32, Float64}
y = similar(x)
cudnnBNForward!(y, g, b, x, running_mean, running_var, momentum, cache = cache,
alpha = alpha, beta = beta, eps = eps, training = training)
y
end
function cudnnBNForward!(y::CuArray{T}, g::CuArray{T}, b::CuArray{T}, x::CuArray{T},
running_mean::CuArray{T}, running_var::CuArray{T},
momentum; cache = nothing,
alpha = T(1), beta = T(0),
eps = T(1e-5), training = true) where T<:Union{Float32, Float64}
dims = _wsize(x)
if eps < BATCHNORM_MIN_EPS
# warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", BATCHNORM_MIN_EPS)
eps = BATCHNORM_MIN_EPS
end
xd = TensorDesc(x)
yd = TensorDesc(y)
gd = TensorDesc(T, dims)
if training
if cache !== nothing
mean = zeros(CuArray{T}, dims...)
ivar = ones(CuArray{T}, dims...)
else
mean = CU_NULL
ivar = CU_NULL
end
@check ccall((:cudnnBatchNormalizationForwardTraining, libcudnn), cudnnStatus_t,
(cudnnHandle_t,cudnnBatchNormMode_t,
Ptr{T}, Ptr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, CuPtr{T},
Cdouble, CuPtr{T}, CuPtr{T},
Cdouble, CuPtr{T}, CuPtr{T}),
handle(), BATCHNORM_SPATIAL,
Ref(T(alpha)), Ref(T(beta)),
xd, x,
yd, y,
gd, g, b,
momentum, running_mean, running_var,
eps, mean, ivar)
if cache !== nothing
cache.mean = mean
cache.ivar = ivar
end
else
@check ccall((:cudnnBatchNormalizationForwardInference, libcudnn), cudnnStatus_t,
(Ptr{cudnnHandle_t},cudnnBatchNormMode_t,
Ptr{T}, Ptr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, CuPtr{T},
CuPtr{T}, CuPtr{T},
Cdouble),
handle(), BATCHNORM_SPATIAL,
Ref(T(alpha)), Ref(T(beta)),
xd, x,
yd, y,
gd, g, b,
running_mean, running_var,
eps)
end
end
function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2}, dy::CuArray{T, 2},
running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
cache = nothing, eps = T(1e-5), alpha = T(1),
beta = T(0), training = true) where T<:Union{Float32, Float64}
dg, db, dx = ∇batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), reshape(dy, 1, 1, size(dy, 1),
size(dy, 2)), running_mean, running_var, momentum, cache = cache, eps = eps,
alpha = alpha, beta = beta, training = training)
(dg, db, dropdims(dx, dims = (1, 2)))
end
function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T}, dy::CuArray{T},
running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
cache = nothing, eps = T(1e-5), alpha = T(1),
beta = T(0), training = true) where T<:Union{Float32, Float64}
dg = similar(g)
db = similar(b)
dx = similar(x)
cudnnBNBackward!(dg, g, db, dx, x, dy, running_mean, running_var, T(momentum),
training = training, cache = cache, eps = eps, alpha = alpha, beta = beta)
(dg, db, dx)
end
function cudnnBNBackward!(dg::CuArray{T}, g::CuArray{T}, db::CuArray{T},
dx::CuArray{T}, x::CuArray{T}, dy::CuArray{T},
running_mean::CuArray{T}, running_var::CuArray{T},
momentum; cache = nothing, eps = T(1e-5),
alpha = T(1), beta = T(0),
dalpha = T(1), dbeta = T(0), training = true) where T<:Union{Float32, Float64}
if training
xd = TensorDesc(x)
dyd = TensorDesc(dy)
dxd = TensorDesc(dx)
gd = TensorDesc(T, _wsize(x))
if cache !== nothing
mean, ivar = cache.mean, cache.ivar
info("mean and ivar are fetched from the cache")
else
mean, ivar = CU_NULL, CU_NULL
end
if eps < BATCHNORM_MIN_EPS
eps = BATCHNORM_MIN_EPS
end
@check ccall((:cudnnBatchNormalizationBackward, libcudnn), cudnnStatus_t,
(cudnnHandle_t,cudnnBatchNormMode_t,
Ptr{T}, Ptr{T},
Ptr{T}, Ptr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, CuPtr{T}, CuPtr{T},
Cdouble, CuPtr{T}, CuPtr{T}),
handle(), BATCHNORM_SPATIAL,
Ref(T(alpha)), Ref(T(beta)),
Ref(T(dalpha)), Ref(T(dbeta)),
xd, x,
dyd, dy,
dxd, dx,
gd, g, dg, db,
eps, mean, ivar)
else
ivar = 1 ./ sqrt.(reshape(running_var, _wsize(x)) .+ eps)
dx .= dy .* reshape(g, _wsize(x)) .* ivar
dg .= squeeze(sum(dy .* (x .- reshape(running_mean, _wsize(x))) .* ivar, _reddims(dy)), dims = (1,2,4))
db .= squeeze(sum(dy, _reddims(dy)), dims = (1,2,4))
end
end
# Flux Interface
(BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, cache = nothing) where T<:Union{Float32, Float64} =
  BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, training = Flux.istraining()))

src/cuda/curnn.jl

@@ -1,273 +1,25 @@
using CuArrays: libcudnn
using CuArrays.CUDNN: @check, cudnnStatus_t, cudnnTensorDescriptor_t,
cudnnBatchNormMode_t, cudnnHandle_t, cudnnDataType, TensorDesc, FilterDesc
import CuArrays.CUDAdrv: CuPtr, CU_NULL
using LinearAlgebra
const RNN_RELU = 0 # Stock RNN with ReLu activation
const RNN_TANH = 1 # Stock RNN with tanh activation
const LSTM = 2 # LSTM with no peephole connections
const GRU = 3 # Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1)
const LINEAR_INPUT = 0
const SKIP_INPUT = 1
const UNIDIRECTIONAL = 0
const BIDIRECTIONAL = 1
const RNN_ALGO_STANDARD = 0
const RNN_ALGO_PERSIST_STATIC = 1
const RNN_ALGO_PERSIST_DYNAMIC = 2
# param layout:
# RNN: [weight, bias] × [input, hidden]
# GRU: [weight, bias] × [input, hidden] × [reset, update, newmem]
# LSTM: [weight, bias] × [input, hidden] × [input, forget, newmem, output]
function params(w::CuVector, input, hidden, n = 1)
slice(offset, shape) = reshape(view(w, offset.+(1:prod(shape))), shape)
wx = slice(0, (input, hidden*n))
wh = slice(length(wx), (hidden, hidden*n))
bias = view(w, length(wx)+length(wh) .+ (1:hidden*n))
(wx, wh), bias
end
mutable struct RNNDesc{T}
mode::Int
input::Int
hidden::Int
params::CuVector{T}
weights::NTuple{2,CuMatrix{T}}
bias::CuVector{T}
ptr::Ptr{Nothing}
end
Base.unsafe_convert(::Type{Ptr{Nothing}}, d::RNNDesc) = d.ptr
function rnnParamSize(T, r, input)
size = Csize_t[0]
@check ccall((:cudnnGetRNNParamsSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Ptr{Nothing},Ptr{Csize_t},Cint),
handle(), r, TensorDesc(T, (1,input,1)), size, cudnnDataType(T))
return Int(size[])÷sizeof(T)
end
ngates(mode) = [1, 1, 4, 3][mode+1]
ngates(r::RNNDesc) = ngates(r.mode)
function RNNDesc{T}(mode::Int, input::Int, hidden::Int; layers = 1) where T
d = [C_NULL]
@check ccall((:cudnnCreateRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Ptr{Nothing}},),d)
dropoutDesc = DropoutDesc(0)
inputMode = LINEAR_INPUT
direction = UNIDIRECTIONAL
algo = RNN_ALGO_STANDARD
@check ccall((:cudnnSetRNNDescriptor_v6,libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Cint,Ptr{Nothing},Cint,Cint,Cint,Cint,Cint),
handle(),d[],hidden,layers,dropoutDesc,inputMode,direction,mode,algo,cudnnDataType(T))
w = CuArrays.zeros(T, rnnParamSize(T, d[], input))
# TODO: avoid reserve allocation here
rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[])
finalizer(rd) do x
@check ccall((:cudnnDestroyRNNDescriptor,libcudnn),cudnnStatus_t,(Ptr{Nothing},),x)
end
return rd
end
function rnnWorkspaceSize(r::RNNDesc, seqlen, xdesc)
size = Csize_t[0]
@check ccall((:cudnnGetRNNWorkspaceSize, libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Ptr{Ptr{Nothing}},Ptr{Csize_t}),
handle(), r, seqlen, xdesc, size)
return Int(size[])
end
const workspace = Ref{Union{Nothing,CuVector{UInt8}}}(nothing)
function getworkspace(bytes)
if workspace[] === nothing || length(workspace[]) < bytes
workspace[] = CuVector{UInt8}(undef, bytes)
end
workspace[]
end
getworkspace(r::RNNDesc, seqlen, xdesc) =
getworkspace(rnnWorkspaceSize(r, seqlen, xdesc))
function rnnTrainingReserveSize(r::RNNDesc, seqlen, xdesc)
size = Csize_t[0]
@check ccall((:cudnnGetRNNTrainingReserveSize,libcudnn), cudnnStatus_t, (Ptr{Nothing}, Ptr{Nothing}, Cint, Ptr{Ptr{Nothing}}, Ptr{Csize_t}),
handle(), r, seqlen, xdesc, size)
return Int(size[])
end
function cudnnRNNForward(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co,
workspace, reserve=nothing) where T
if reserve == nothing
@check ccall((:cudnnRNNForwardInference, libcudnn), cudnnStatus_t,
(Ptr{Nothing}, Ptr{Nothing}, Cint,
Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T},
CuPtr{Nothing}, Csize_t),
handle(), rnn, seqlen,
xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co,
workspace, length(workspace))
else
@check ccall((:cudnnRNNForwardTraining, libcudnn), cudnnStatus_t,
(Ptr{Nothing}, Ptr{Nothing}, Cint,
Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
CuPtr{Nothing}, Csize_t, CuPtr{Nothing}, Csize_t),
handle(), rnn, seqlen,
xd, x, hd, h, cd, c, wd, w, yd, y, hod, ho, cod, co,
workspace, length(workspace), reserve, length(reserve))
end
end
xDesc(x) = [TensorDesc(eltype(x), (1, size(x, 1), size(x, 2)))]
hDesc(h::Nothing) = C_NULL, CU_NULL
hDesc(x::Integer) = (@assert x == 0; hDesc(nothing))
function hDesc(h::CuArray)
TensorDesc(eltype(h), (size(h, 1), size(h, 2), 1)), h
end
# TODO: can we just manipulate strides here?
# TODO: should use repmat, but this isn't implemented.
hBatch(x::AbstractVector, h::CuVector) = h
hBatch(x::AbstractMatrix, h::CuVector) = h .* CuArrays.ones(1, size(x, 2))
hBatch(x::AbstractMatrix, h::CuMatrix) = h .* CuArrays.ones(1, size(h,2) == 1 ? size(x,2) : 1)
function forward(rnn::RNNDesc{T}, x::CuArray{T}, h_::CuArray{T}, c_ = nothing, train = Val{false}) where T
h = hBatch(x, h_)
c = c_ == nothing ? nothing : hBatch(x, c_)
@assert size(x, 1) == rnn.input
@assert size(h, 1) == rnn.hidden
@assert size(x, 2) == size(h, 2)
seqLength = 1
xdesc = xDesc(x)
y = x isa AbstractVector ? similar(x, rnn.hidden) : similar(x, rnn.hidden, size(x, 2))
ho = similar(h)
ydesc = xDesc(y)
workspace = getworkspace(rnn, seqLength, xdesc)
reserve = train == Val{true} ?
CuVector{UInt8}(undef, rnnTrainingReserveSize(rnn, seqLength, xdesc)) :
nothing
co = c == nothing ? c : similar(c)
cudnnRNNForward(rnn, seqLength,
xdesc, x,
hDesc(h)...,
hDesc(c)...,
FilterDesc(T, (1, 1, length(rnn.params))), rnn.params,
ydesc, y,
hDesc(ho)...,
hDesc(co)...,
workspace, reserve)
result = c == nothing ? (y, ho) : (y, ho, co)
return train == Val{true} ? (reserve, result) : result
end
forwardTrain(rnn::RNNDesc{T}, x::CuArray{T}, h::CuArray{T}, c = nothing) where T =
forward(rnn, x, h, c, Val{true})
function cudnnRNNBackwardData(rnn::RNNDesc{T}, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco,
wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, rs) where T
@check ccall((:cudnnRNNBackwardData,libcudnn),cudnnStatus_t,
(Ptr{Nothing}, Ptr{Nothing}, Cint,
Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing},
CuPtr{T}, Ptr{Ptr{Nothing}}, CuPtr{T}, Ptr{Nothing}, CuPtr{T}, Ptr{Nothing}, CuPtr{T},
CuPtr{Nothing}, Csize_t, CuPtr{Nothing}, Csize_t),
handle(), rnn, seqlen, yd, y, dyd, dy, dhod, dho, dcod, dco,
wd, w, hd, h, cd, c, dxd, dx, dhd, dh, dcd, dc, ws, length(ws), rs, length(rs))
end
function backwardData(rnn::RNNDesc{T}, y, dy_, dho, dco, h, c, reserve) where T
# Same as above, any more efficient way?
dy = dy_ isa Integer ? zero(y) : dy_
yd = xDesc(y)
dx = y isa AbstractVector ? similar(dy, rnn.input) : similar(dy, rnn.input, size(dy, 2))
dh = similar(h)
dc = c == nothing ? nothing : similar(c)
cudnnRNNBackwardData(rnn, 1,
yd, y, yd, dy, hDesc(dho)..., hDesc(dco)...,
FilterDesc(T, (1, 1, length(rnn.params))), rnn.params,
hDesc(h)..., hDesc(c)..., xDesc(dx), dx, hDesc(dh)..., hDesc(dc)...,
workspace[], reserve)
return c == nothing ? (dx, dh) : (dx, dh, dc)
end
backwardData(rnn, y, dy, dho, hx, reserve) =
backwardData(rnn, y, dy, dho, nothing, hx, nothing, reserve)
function cudnnRNNBackwardWeights(rnn::RNNDesc{T}, seqlen, xd, x, hd, h, yd, y, dwd, dw,
workspace, reserve) where T
@check ccall((:cudnnRNNBackwardWeights,libcudnn), cudnnStatus_t,
(Ptr{Nothing}, Ptr{Nothing}, Cint, # handle, rnnDesc, seqLength
Ptr{Ptr{Nothing}}, CuPtr{T}, #x
Ptr{Nothing}, CuPtr{T}, #hx
Ptr{Ptr{Nothing}}, CuPtr{T}, #y
CuPtr{Nothing}, Csize_t, #ws
Ptr{Nothing}, CuPtr{T}, #dw
CuPtr{Nothing}, Csize_t), #rs
handle(), rnn, seqlen, xd, x, hd, h, yd, y,
workspace, length(workspace), dwd, dw, reserve, length(reserve))
end
function backwardWeights(rnn::RNNDesc{T}, x, h, y, reserve) where T
dw = zero(rnn.params)
cudnnRNNBackwardWeights(rnn, 1,
xDesc(x), x, hDesc(h)..., xDesc(y), y,
FilterDesc(T, (1, 1, length(dw))), dw,
workspace[], reserve)
return params(dw, rnn.input, rnn.hidden, ngates(rnn))
end
# Interface
import ..Flux: Flux, relu
using CuArrays.CUDAnative
using CuArrays: @cuindex, cudims
function LinearAlgebra.copy_transpose!(dst::CuArray, src::CuArray)
function kernel(dst, src)
I = @cuindex dst
dst[I...] = src[reverse(I)...]
return
end
blk, thr = cudims(dst)
@cuda blocks=blk threads=thr kernel(dst, src)
return dst
end
CuRNN{T} = Flux.RNNCell{<:Union{typeof(tanh),typeof(relu)},<:CuArray{T,2},<:CuArray{T,1}}
CuGRU{T} = Flux.GRUCell{<:CuArray{T,2},<:CuArray{T,1}}
CuLSTM{T} = Flux.LSTMCell{<:CuArray{T,2},<:CuArray{T,1}}
CuRNNs{T} = Union{CuRNN{T},CuGRU{T},CuLSTM{T}}

+function CUDNN.RNNDesc(m::CuRNNs{T}) where T
-function copyparams!(m::CuRNNs, d::RNNDesc)
Wi, Wh = d.weights
copy_transpose!(Wi, m.Wi)
copy_transpose!(Wh, m.Wh)
copy_transpose!(d.bias, m.b)
return
end
function RNNDesc(m::CuRNNs{T}) where T
h, i = length(m.h), size(m.Wi, 2) h, i = length(m.h), size(m.Wi, 2)
mode = m isa CuRNN ? mode = m isa CuRNN ?
(m.σ == tanh ? RNN_TANH : RNN_RELU) : (m.σ == tanh ? CUDNN.CUDNN_RNN_TANH : CUDNN.CUDNN_RNN_RELU) :
m isa CuGRU ? GRU : LSTM m isa CuGRU ? CUDNN.CUDNN_GRU : CUDNN.CUDNN_LSTM
r = RNNDesc{T}(mode, i, h) r = CUDNN.RNNDesc{T}(mode, i, h)
return r return r
end end
const descs = WeakKeyDict() const descs = WeakKeyDict()
function desc(rnn) function desc(rnn)
d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = RNNDesc(rnn)) d = haskey(descs, rnn) ? descs[rnn] : (descs[rnn] = CUDNN.RNNDesc(rnn))
copyparams!(rnn, d) CUDNN.setweights!(d, rnn.Wi, rnn.Wh, rnn.b)
return d return d
end end
@ -275,17 +27,17 @@ import Zygote
using Zygote: @adjoint using Zygote: @adjoint
function (m::CuRNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} function (m::CuRNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
y, h = forward(desc(m), x, h) y, h = CUDNN.forward(desc(m), x, h)
return h, y return h, y
end end
function (m::CuGRU{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} function (m::CuGRU{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
y, h = forward(desc(m), x, h) y, h = CUDNN.forward(desc(m), x, h)
return h, y return h, y
end end
function (m::CuLSTM{T})(h::NTuple{2,CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64} function (m::CuLSTM{T})(h::NTuple{2,CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64}
y, h, c = forward(desc(m), x, h[1], h[2]) y, h, c = CUDNN.forward(desc(m), x, h[1], h[2])
return (h, c), y return (h, c), y
end end
@ -303,7 +55,7 @@ unbroadcast(x::AbstractArray, Δ) =
coerce_cuda(x::Union{CuArray,Nothing}) = x coerce_cuda(x::Union{CuArray,Nothing}) = x
coerce_cuda(x::Tuple) = coerce_cuda.(x) coerce_cuda(x::Tuple) = coerce_cuda.(x)
coerce_cuda(x) = x .+ CuArrays.fill(0) coerce_cuda(x::AbstractArray) = x .+ CuArrays.fill(0)
function struct_grad!(cx::Zygote.Context, x, x̄) function struct_grad!(cx::Zygote.Context, x, x̄)
for f in fieldnames(typeof(x)) for f in fieldnames(typeof(x))
@ -316,28 +68,23 @@ end
for RNN in (CuRNN, CuGRU) for RNN in (CuRNN, CuGRU)
@eval @adjoint function (m::$RNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64} @eval @adjoint function (m::$RNN{T})(h::CuArray{T}, x::CuArray{T}) where T <: Union{Float32,Float64}
reserve, (y, ho) = forwardTrain(desc(m), x, h) (y, ho), back = CUDNN.pullback(desc(m), x, h)
(ho, y), function (Δ) (ho, y), function (Δ)
dho, dy = coerce_cuda(Δ) dho, dy = coerce_cuda(Δ) # Support FillArrays etc.
h_ = hBatch(x, h) m̄ = back(dy, dho)
dx, dh = backwardData(descs[m], y, dy, dho, h_, reserve) dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(m̄.Wi),Wh=transpose(m̄.Wh),b=m̄.b,h=nothing))
(dWi, dWh), db = backwardWeights(descs[m], x, h_, y, reserve) (dm, unbroadcast(h, m̄.h), m̄.x)
dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing))
(dm, unbroadcast(h, dh), dx)
end end
end end
end end
@adjoint function (m::CuLSTM)((h, c)::Tuple{CuArray{T},CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64} @adjoint function (m::CuLSTM)((h, c)::Tuple{CuArray{T},CuArray{T}}, x::CuArray{T}) where T <: Union{Float32,Float64}
reserve, (y, ho, co) = forwardTrain(desc(m), x, h, c) (y, ho, co), back = CUDNN.pullback(desc(m), x, h, c)
((ho, co), y), function (Δ) ((ho, co), y), function (Δ)
dhc, dy = coerce_cuda(Δ) dhc, dy = coerce_cuda(Δ) # Support FillArrays etc.
dho, dco = dhc === nothing ? (nothing, nothing) : dhc dho, dco = dhc === nothing ? (nothing, nothing) : dhc
h_ = hBatch(x, h) m̄ = back(dy, dho, dco)
c_ = hBatch(x, c) dm = struct_grad!(__context__, m, (σ=nothing,Wi=transpose(m̄.Wi),Wh=transpose(m̄.Wh),b=m̄.b,h=nothing,c=nothing))
dx, dh, dc = backwardData(descs[m], y, dy, dho, dco, h_, c_, reserve) (dm, (unbroadcast(h, m̄.h), unbroadcast(c, m̄.c)), m̄.x)
(dWi, dWh), db = backwardWeights(descs[m], x, h_, y, reserve)
dm = struct_grad!(__context__, m, (Wi=transpose(dWi),Wh=transpose(dWh),b=db,h=nothing,c=nothing))
(dm, (unbroadcast(h, dh), unbroadcast(c, dc)), dx)
end end
end end


@ -39,7 +39,7 @@ end
trainable(m) = functor(m)[1] trainable(m) = functor(m)[1]
params!(p::Params, x::AbstractArray{<:Real}, seen = IdSet()) = push!(p, x) params!(p::Params, x::AbstractArray{<:Number}, seen = IdSet()) = push!(p, x)
function params!(p::Params, x, seen = IdSet()) function params!(p::Params, x, seen = IdSet())
x in seen && return x in seen && return
@ -73,13 +73,7 @@ end
cpu(m) = fmap(x -> adapt(Array, x), m) cpu(m) = fmap(x -> adapt(Array, x), m)
const gpu_adaptor = if has_cuarrays() gpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) : x
CuArrays.cu
else
identity
end
gpu(x) = fmap(gpu_adaptor, x)
# Precision # Precision


@ -39,24 +39,39 @@ function Base.show(io::IO, c::Chain)
print(io, ")") print(io, ")")
end end
"""
outdims(c::Chain, isize)
Calculate the output dimensions given the input dimensions, `isize`.
```julia
m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
outdims(m, (10, 10)) == (6, 6)
```
"""
outdims(c::Chain, isize) = foldl(∘, map(l -> (x -> outdims(l, x)), c.layers))(isize)
# This is a temporary and naive implementation # This is a temporary and naive implementation
# it might be replaced in the future for better performance # it might be replaced in the future for better performance
# see issue https://github.com/FluxML/Flux.jl/issues/702 # see issue https://github.com/FluxML/Flux.jl/issues/702
# Johnny Chen -- @johnnychen94 # Johnny Chen -- @johnnychen94
# only slightly changed to better handle interaction with Zygote @dsweber2
""" """
activations(c::Chain, input) activations(c::Chain, input)
Calculate the forward results of each layer in Chain `c` with `input` as model input. Calculate the forward results of each layer in Chain `c` with `input` as model input.
""" """
function activations(c::Chain, input) function activations(c::Chain, input)
rst = [] extraChain(c.layers, input)
for l in c
x = get(rst, length(rst), input)
push!(rst, l(x))
end
return rst
end end
function extraChain(fs::Tuple, x)
res = first(fs)(x)
return (res, extraChain(Base.tail(fs), res)...)
end
extraChain(::Tuple{}, x) = ()
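The tuple recursion in `extraChain` returns one entry per layer. A minimal usage sketch (model and input invented for illustration):

```julia
using Flux

m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
x = rand(Float32, 10)

acts = Flux.activations(m, x)  # one entry per layer, returned as a tuple
length(acts)                   # 3
size(acts[1])                  # (5,) -- output of the first Dense layer
acts[end] == m(x)              # the last entry equals the model output
```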
""" """
Dense(in::Integer, out::Integer, σ = identity) Dense(in::Integer, out::Integer, σ = identity)
@ -112,6 +127,19 @@ end
(a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = (a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
a(T.(x)) a(T.(x))
"""
outdims(l::Dense, isize)
Calculate the output dimensions given the input dimensions, `isize`.
```julia
m = Dense(10, 5)
outdims(m, (5, 2)) == (5,)
outdims(m, (10,)) == (5,)
```
"""
outdims(l::Dense, isize) = (size(l.W)[1],)
""" """
Diagonal(in::Integer) Diagonal(in::Integer)
@ -141,6 +169,7 @@ function Base.show(io::IO, l::Diagonal)
print(io, "Diagonal(", length(l.α), ")") print(io, "Diagonal(", length(l.α), ")")
end end
outdims(l::Diagonal, isize) = (length(l.α),)
""" """
Maxout(over) Maxout(over)
@ -189,6 +218,8 @@ function (mo::Maxout)(input::AbstractArray)
mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over) mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over)
end end
outdims(l::Maxout, isize) = outdims(first(l.over), isize)
""" """
SkipConnection(layers, connection) SkipConnection(layers, connection)


@ -1,4 +1,9 @@
using NNlib: conv, ∇conv_data, depthwiseconv using NNlib: conv, ∇conv_data, depthwiseconv, output_size
# pad dims of x with dims of y until ndims(x) == ndims(y)
_paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...)
_convtransoutdims(isize, ksize, ssize, dsize, pad) = (isize .- 1).*ssize .+ 1 .+ (ksize .- 1).*dsize .- (pad[1:2:end] .+ pad[2:2:end])
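As a quick sanity check of `_convtransoutdims` (values invented): a 2×2 input through a 3×3 transposed convolution with stride 2, dilation 1 and no padding should give 5×5.

```julia
# Hypothetical check of the formula above; pad stores both edges of each
# spatial dimension, hence the 1:2:end / 2:2:end split.
isize, k, stride, dilation, pad = (2, 2), (3, 3), (2, 2), (1, 1), (0, 0, 0, 0)
(isize .- 1).*stride .+ 1 .+ (k .- 1).*dilation .- (pad[1:2:end] .+ pad[2:2:end])  # (5, 5)
```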
expand(N, i::Tuple) = i expand(N, i::Tuple) = i
expand(N, i::Integer) = ntuple(_ -> i, N) expand(N, i::Integer) = ntuple(_ -> i, N)
@ -17,7 +22,7 @@ Example: Applying Conv layer to a 1-channel input using a 2x2 window size,
out = 16 out = 16
Conv((2, 2), 1=>16, relu) Conv((2, 2), 1=>16, relu)
Data should be stored in WHCN order (width, height, # channels, # batches). Data should be stored in WHCN order (width, height, # channels, batch size).
In other words, a 100×100 RGB image would be a `100×100×3×1` array, In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array. and a batch of 50 would be a `100×100×3×50` array.
@ -106,8 +111,23 @@ end
a(T.(x)) a(T.(x))
""" """
ConvTranspose(filter::Tuple, in=>out) outdims(l::Conv, isize::Tuple)
ConvTranspose(filter::Tuple, in=>out, activation)
Calculate the output dimensions given the input dimensions, `isize`.
Batch size and channel size are ignored as per `NNlib.jl`.
```julia
m = Conv((3, 3), 3 => 16)
outdims(m, (10, 10)) == (8, 8)
outdims(m, (10, 10, 1, 3)) == (8, 8)
```
"""
outdims(l::Conv, isize) =
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
"""
ConvTranspose(size, in=>out)
ConvTranspose(size, in=>out, relu)
Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`. Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
@ -178,6 +198,9 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
) )
end end
# TODO: Find proper fix for https://github.com/FluxML/Flux.jl/issues/900
@nograd conv_transpose_dims
function (c::ConvTranspose)(x::AbstractArray) function (c::ConvTranspose)(x::AbstractArray)
# ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1))) # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1) σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
@ -198,6 +221,8 @@ end
(a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = (a::ConvTranspose{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
a(T.(x)) a(T.(x))
outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad)
""" """
DepthwiseConv(filter::Tuple, in=>out) DepthwiseConv(filter::Tuple, in=>out)
DepthwiseConv(filter::Tuple, in=>out, activation) DepthwiseConv(filter::Tuple, in=>out, activation)
@ -298,9 +323,12 @@ end
(a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = (a::DepthwiseConv{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
a(T.(x)) a(T.(x))
outdims(l::DepthwiseConv, isize) =
output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
""" """
CrossCor(size, in=>out) CrossCor(size, in=>out)
CrossCor(size, in=>out, relu) CrossCor(size, in=>out, activation)
Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`. Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively. `in` and `out` specify the number of input and output channels respectively.
@ -351,6 +379,11 @@ function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ
return CrossCor(σ, w, b, stride, pad, dilation) return CrossCor(σ, w, b, stride, pad, dilation)
end end
function CrossCor(;weight::AbstractArray, bias::Union{Zeros, AbstractVector{T}},
activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
end
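For illustration, one way to build a `CrossCor` from explicit weight and bias arrays; the keyword method added here simply forwards to the positional constructor shown above (shapes invented):

```julia
using Flux

w = randn(Float32, 2, 2, 1, 4)        # hypothetical 2×2 kernel, 1 => 4 channels
b = zeros(Float32, 4)
c = CrossCor(w, b, relu)              # positional constructor from the context above
size(c(rand(Float32, 10, 10, 1, 1)))  # (9, 9, 4, 1)
```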
function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1, init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
@ -387,6 +420,9 @@ end
(a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} = (a::CrossCor{<:Any,<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
a(T.(x)) a(T.(x))
outdims(l::CrossCor, isize) =
output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
""" """
MaxPool(k) MaxPool(k)
@ -416,6 +452,8 @@ function Base.show(io::IO, m::MaxPool)
print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") print(io, "MaxPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")")
end end
outdims(l::MaxPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))
""" """
MeanPool(k) MeanPool(k)
@ -443,3 +481,5 @@ end
function Base.show(io::IO, m::MeanPool) function Base.show(io::IO, m::MeanPool)
print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")") print(io, "MeanPool(", m.k, ", pad = ", m.pad, ", stride = ", m.stride, ")")
end end
outdims(l::MeanPool{N}, isize) where N = output_size(PoolDims(_paddims(isize, (l.k..., 1, 1)), l.k; stride = l.stride, padding = l.pad))


@ -1,5 +1,5 @@
gate(h, n) = (1:h) .+ h*(n-1) gate(h, n) = (1:h) .+ h*(n-1)
gate(x::AbstractVector, h, n) = x[gate(h,n)] gate(x::AbstractVector, h, n) = @view x[gate(h,n)]
gate(x::AbstractMatrix, h, n) = x[gate(h,n),:] gate(x::AbstractMatrix, h, n) = x[gate(h,n),:]
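For reference, `gate(h, n)` selects the row block of the `n`-th gate, and the vector method now returns a view instead of a copy. An illustrative check (not part of the diff):

```julia
using Flux

Flux.gate(2, 3)                    # 5:6 -- row indices of the third gate block
x = collect(1.0:8.0)
Flux.gate(x, 2, 3) == [5.0, 6.0]   # true; the result is now a view into x
```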
# Stateful recurrence # Stateful recurrence
@ -45,8 +45,7 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")")
""" """
reset!(rnn) reset!(rnn)
Reset the hidden state of a recurrent layer back to its original value. See also Reset the hidden state of a recurrent layer back to its original value.
`truncate!`.
Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to


@ -1,13 +1,24 @@
using CuArrays
using NNlib: logsoftmax, logσ using NNlib: logsoftmax, logσ
# Cost functions # Cost functions
mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y) mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)
function crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
-sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2) return -sum(y .* log.(ŷ)) * 1 // size(y, 2)
end end
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number)
return -sum(y .* log.(ŷ)) .* weight * 1 // size(y, 2)
end
function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector)
return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2)
end
crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight)
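A brief illustration of the three `weight` cases dispatched above (values invented):

```julia
using Flux

ŷ = softmax(rand(Float32, 3, 5))          # predicted class probabilities
y = Flux.onehotbatch(rand(1:3, 5), 1:3)   # one-hot targets
Flux.crossentropy(ŷ, y)                             # weight == nothing
Flux.crossentropy(ŷ, y, weight = 2f0)               # scalar weight
Flux.crossentropy(ŷ, y, weight = [1f0, 2f0, 3f0])   # per-class weight vector
```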
function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1)
return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2) return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2)
end end
@ -25,6 +36,9 @@ Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerica
""" """
binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ) binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
# Re-definition to fix interaction with CuArrays.
CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
""" """
logitbinarycrossentropy(logŷ, y) logitbinarycrossentropy(logŷ, y)
@ -39,13 +53,60 @@ but it is more numerically stable.
""" """
logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ) logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ)
# Re-definition to fix interaction with CuArrays.
CuArrays.@cufunc logitbinarycrossentropy(logŷ, y) = (1 - y)*logŷ - logσ(logŷ)
""" """
normalise(x::AbstractArray; dims=1) normalise(x::AbstractArray; dims=1)
Normalises x to mean 0 and standard deviation 1, across the dimensions given by dims. Defaults to normalising over columns. Normalises `x` to mean 0 and standard deviation 1, across the dimensions given by `dims`. Defaults to normalising over columns.
julia> a = reshape(collect(1:9), 3, 3)
3×3 Array{Int64,2}:
1 4 7
2 5 8
3 6 9
julia> normalise(a)
3×3 Array{Float64,2}:
-1.22474 -1.22474 -1.22474
0.0 0.0 0.0
1.22474 1.22474 1.22474
julia> normalise(a, dims=2)
3×3 Array{Float64,2}:
-1.22474 0.0 1.22474
-1.22474 0.0 1.22474
-1.22474 0.0 1.22474
""" """
function normalise(x::AbstractArray; dims=1) function normalise(x::AbstractArray; dims=1)
μ′ = mean(x, dims = dims) μ′ = mean(x, dims = dims)
σ = std(x, dims = dims, mean = μ′, corrected=false) σ = std(x, dims = dims, mean = μ′, corrected=false)
return (x .- μ′) ./ σ return (x .- μ′) ./ σ
end end
"""
kldivergence(ŷ, y)
KL divergence is a measure of how much one probability distribution differs from another.
It is always non-negative, and zero only when both distributions are equal everywhere.
[KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).
"""
function kldivergence(ŷ, y)
entropy = sum(y .* log.(y)) *1 //size(y,2)
cross_entropy = crossentropy(ŷ, y)
return entropy + cross_entropy
end
"""
poisson(ŷ, y)
The Poisson loss measures how much the predicted distribution diverges from the expected distribution.
[Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
"""
poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2)
"""
hinge(ŷ, y)
Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1).
[Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss).
"""
hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2)
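Quick, hand-checkable calls to the new losses (numbers invented):

```julia
using Flux

p = [0.1, 0.2, 0.7]           # target distribution
q = [0.2, 0.3, 0.5]           # predicted distribution
Flux.kldivergence(q, p)       # ≥ 0, and 0 only when q == p
Flux.poisson(q, p)
Flux.hinge([0.8, -0.4], [1.0, -1.0])  # max(0, 1 - 0.8) + max(0, 1 - 0.4) = 0.8
```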


@ -37,12 +37,10 @@ import Adapt: adapt, adapt_structure
adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data)) adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data))
if has_cuarrays() import .CuArrays: CuArray, cudaconvert
import .CuArrays: CuArray, cudaconvert import Base.Broadcast: BroadcastStyle, ArrayStyle
import Base.Broadcast: BroadcastStyle, ArrayStyle BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}()
BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}() cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
end
""" """
onehot(l, labels[, unk]) onehot(l, labels[, unk])
@ -127,6 +125,4 @@ onecold(y::AbstractMatrix, labels...) =
onecold(y::OneHotMatrix, labels...) = onecold(y::OneHotMatrix, labels...) =
mapreduce(x -> Flux.onecold(x, labels...), |, y.data, dims = 2, init = 0) mapreduce(x -> Flux.onecold(x, labels...), |, y.data, dims = 2, init = 0)
# TODO probably still want this as a custom adjoint Zygote @nograd onecold, onehot, onehotbatch
# onecold(x::TrackedVector, l...) = onecold(data(x), l...)
# onecold(x::TrackedMatrix, l...) = onecold(data(x), l...)


@ -1,5 +1,4 @@
using Flux using Flux
using Base: @get!
using MacroTools: @forward using MacroTools: @forward
const ϵ = 1e-8 const ϵ = 1e-8
@ -7,10 +6,28 @@ const ϵ = 1e-8
# TODO: should use weak refs # TODO: should use weak refs
""" """
Descent(η) Descent(η)
Classic gradient descent optimiser with learning rate `η`. Classic gradient descent optimiser with learning rate `η`.
For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`
## Parameters
- Learning Rate (η): The amount by which the gradients are discounted before updating the weights. Defaults to `0.1`.
## Example
```julia-repl
opt = Descent() # uses default η (0.1)
opt = Descent(0.3) # use provided η
ps = params(model)
gs = gradient(ps) do
loss(x, y)
end
Flux.Optimise.update!(opt, ps, gs)
```
""" """
mutable struct Descent mutable struct Descent
eta::Float64 eta::Float64
@ -23,9 +40,20 @@ function apply!(o::Descent, x, Δ)
end end
""" """
Momentum(η = 0.01; ρ = 0.9) Momentum(η, ρ)
Gradient descent with learning rate `η` and momentum `ρ`. Gradient descent with learning rate `η` and momentum `ρ`.
## Parameters
- Learning Rate (`η`): Amount by which gradients are discounted before updating the weights. Defaults to `0.01`.
- Momentum (`ρ`): Parameter that accelerates descent in the relevant direction and dampens oscillations. Defaults to `0.9`.
## Examples
```julia
opt = Momentum() # uses defaults of η = 0.01 and ρ = 0.9
opt = Momentum(0.01, 0.99)
```
""" """
mutable struct Momentum mutable struct Momentum
eta::Float64 eta::Float64
@ -43,9 +71,20 @@ function apply!(o::Momentum, x, Δ)
end end
""" """
Nesterov(eta, ρ = 0.9) Nesterov(η, ρ)
Gradient descent with learning rate `η` and Nesterov momentum `ρ`. Gradient descent with learning rate `η` and Nesterov momentum `ρ`.
## Parameters
- Learning Rate (η): Amount by which the gradients are discounted before updating the weights. Defaults to `0.001`.
- Nesterov Momentum (ρ): Parameter controlling the amount of Nesterov momentum to be applied. Defaults to `0.9`.
## Examples
```julia
opt = Nesterov() # uses defaults η = 0.001 and ρ = 0.9
opt = Nesterov(0.003, 0.95)
```
""" """
mutable struct Nesterov mutable struct Nesterov
eta::Float64 eta::Float64
@ -64,11 +103,23 @@ function apply!(o::Nesterov, x, Δ)
end end
""" """
RMSProp(η = 0.001, ρ = 0.9) RMSProp(η, ρ)
Implements the RMSProp algorithm. Often a good choice for recurrent networks. Parameters other than the learning rate generally don't need tuning.
## Parameters
- Learning Rate (η): Defaults to `0.001`.
- Rho (ρ): Defaults to `0.9`.
## Examples
```julia
opt = RMSProp() # uses default η = 0.001 and ρ = 0.9
opt = RMSProp(0.002, 0.95)
```
## References
[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) [RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
optimiser. Parameters other than learning rate don't need tuning. Often a good
choice for recurrent networks.
""" """
mutable struct RMSProp mutable struct RMSProp
eta::Float64 eta::Float64
@ -86,8 +137,22 @@ function apply!(o::RMSProp, x, Δ)
end end
""" """
ADAM(η = 0.001, β = (0.9, 0.999)) ADAM(η, β::Tuple)
Implements the ADAM optimiser.
## Parameters
- Learning Rate (`η`): Defaults to `0.001`.
- Beta (`β::Tuple`): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = ADAM() # uses the default η = 0.001 and β = (0.9, 0.999)
opt = ADAM(0.001, (0.9, 0.8))
```
## References
[ADAM](https://arxiv.org/abs/1412.6980v8) optimiser. [ADAM](https://arxiv.org/abs/1412.6980v8) optimiser.
""" """
mutable struct ADAM mutable struct ADAM
@ -109,8 +174,23 @@ function apply!(o::ADAM, x, Δ)
end end
""" """
RADAM(η = 0.001, β = (0.9, 0.999)) RADAM(η, β::Tuple)
Implements the rectified ADAM optimizer.
## Parameters
- Learning Rate (η): Defaults to `0.001`
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = RADAM() # uses the default η = 0.001 and β = (0.9, 0.999)
opt = RADAM(0.001, (0.9, 0.8))
```
## References
[RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM). [RADAM](https://arxiv.org/pdf/1908.03265v1.pdf) optimiser (Rectified ADAM).
""" """
mutable struct RADAM mutable struct RADAM
@ -139,10 +219,22 @@ function apply!(o::RADAM, x, Δ)
end end
""" """
AdaMax(params, η = 0.001; β1 = 0.9, β2 = 0.999, ϵ = 1e-08) AdaMax(η, β::Tuple)
[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser. Variant of ADAM based on Variant of ADAM based on the ∞-norm.
the ∞-norm.
## Parameters
- Learning Rate (η): Defaults to `0.001`
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = AdaMax() # uses default η and β
opt = AdaMax(0.001, (0.9, 0.995))
```
## References
[AdaMax](https://arxiv.org/abs/1412.6980v9) optimiser.
""" """
mutable struct AdaMax mutable struct AdaMax
eta::Float64 eta::Float64
@ -163,8 +255,21 @@ function apply!(o::AdaMax, x, Δ)
end end
""" """
ADAGrad(η = 0.1; ϵ = 1e-8) ADAGrad(η)
Implements AdaGrad. It has parameter-specific learning rates based on how frequently each parameter is updated.
## Parameters
- Learning Rate (η): Defaults to `0.1`
## Examples
```julia
opt = ADAGrad() # uses default η = 0.1
opt = ADAGrad(0.001)
```
## References
[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser.
Parameters don't need tuning. Parameters don't need tuning.
""" """
@ -177,16 +282,27 @@ ADAGrad(η = 0.1) = ADAGrad(η, IdDict())
function apply!(o::ADAGrad, x, Δ) function apply!(o::ADAGrad, x, Δ)
η = o.eta η = o.eta
acc = get!(o.acc, x, fill(ϵ, size(x)))::typeof(x) acc = get!(o.acc, x, fill!(zero(x), ϵ))::typeof(x)
@. acc += Δ^2 @. acc += Δ^2
@. Δ *= η / (√acc + ϵ) @. Δ *= η / (√acc + ϵ)
end end
""" """
ADADelta(ρ = 0.9, ϵ = 1e-8) ADADelta(ρ)
[ADADelta](https://arxiv.org/abs/1212.5701) optimiser. Parameters don't need Version of ADAGrad that adapts learning rate based on a window of past gradient updates. Parameters don't need tuning.
tuning.
## Parameters
- Rho (ρ): Factor by which gradient is decayed at each time step. Defaults to `0.9`.
## Examples
```julia
opt = ADADelta() # uses default ρ = 0.9
opt = ADADelta(0.89)
```
## References
[ADADelta](https://arxiv.org/abs/1212.5701) optimiser.
""" """
mutable struct ADADelta mutable struct ADADelta
rho::Float64 rho::Float64
@ -205,10 +321,22 @@ function apply!(o::ADADelta, x, Δ)
end end
""" """
AMSGrad(η = 0.001, β = (0.9, 0.999)) AMSGrad(η, β::Tuple)
[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser. Parameters don't need Implements the AMSGrad version of the ADAM optimiser. Parameters don't need tuning.
tuning.
## Parameters
- Learning Rate (η): Defaults to `0.001`.
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = AMSGrad() # uses default η and β
opt = AMSGrad(0.001, (0.89, 0.995))
```
## References
[AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) optimiser.
""" """
mutable struct AMSGrad mutable struct AMSGrad
eta::Float64 eta::Float64
@ -220,18 +348,30 @@ AMSGrad(η = 0.001, β = (0.9, 0.999)) = AMSGrad(η, β, IdDict())
function apply!(o::AMSGrad, x, Δ) function apply!(o::AMSGrad, x, Δ)
η, β = o.eta, o.beta η, β = o.eta, o.beta
mt, vt, v̂t = get!(o.state, x, (fill(ϵ, size(x)), fill(ϵ, size(x)), fill(ϵ, size(x)))) mt, vt, v̂t = get!(o.state, x, (fill!(zero(x), ϵ), fill!(zero(x), ϵ), fill!(zero(x), ϵ)))
@. mt = β[1] * mt + (1 - β[1]) * Δ @. mt = β[1] * mt + (1 - β[1]) * Δ
@. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2 @. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2
@. v̂t = max.(v̂t, vt) @. v̂t = max(v̂t, vt)
@. Δ = η * mt / (√v̂t + ϵ) @. Δ = η * mt / (√v̂t + ϵ)
end end
""" """
NADAM(η = 0.001, β = (0.9, 0.999)) NADAM(η, β::Tuple)
[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser. Parameters don't need Nesterov variant of ADAM. Parameters don't need tuning.
tuning.
## Parameters
- Learning Rate (η): Defaults to `0.001`.
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to `(0.9, 0.999)`.
## Examples
```julia
opt = NADAM() # uses default η and β
opt = NADAM(0.002, (0.89, 0.995))
```
## References
[NADAM](http://cs229.stanford.edu/proj2015/054_report.pdf) optimiser.
""" """
mutable struct NADAM mutable struct NADAM
eta::Float64 eta::Float64
@ -252,9 +392,23 @@ function apply!(o::NADAM, x, Δ)
end end
""" """
ADAMW((η = 0.001, β = (0.9, 0.999), decay = 0) ADAMW(η, β::Tuple, decay)
[ADAMW](https://arxiv.org/abs/1711.05101) fixing weight decay regularization in Adam. Variant of ADAM defined by fixing weight decay regularization.
## Parameters
- Learning Rate (η): Defaults to `0.001`.
- Beta (β::Tuple): The first element refers to β1 and the second to β2. Defaults to (0.9, 0.999).
- decay: Decay applied to weights during optimisation. Defaults to 0.
## Examples
```julia
opt = ADAMW() # uses default η, β and decay
opt = ADAMW(0.001, (0.89, 0.995), 0.1)
```
## References
[ADAMW](https://arxiv.org/abs/1711.05101)
""" """
ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) = ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
Optimiser(ADAM(η, β), WeightDecay(decay)) Optimiser(ADAM(η, β), WeightDecay(decay))
@ -287,9 +441,15 @@ function apply!(o::Optimiser, x, Δ)
end end
""" """
`InvDecay(γ)` InvDecay(γ)
Apply inverse time decay to an optimiser Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified.
```
## Parameters
- gamma (γ): Defaults to `0.001`
## Example
```julia ```julia
Optimiser(InvDecay(..), Opt(..)) Optimiser(InvDecay(..), Opt(..))
``` ```
@ -310,13 +470,22 @@ function apply!(o::InvDecay, x, Δ)
end end
""" """
`ExpDecay(eta, decay, decay_step, clip)` ExpDecay(eta, decay, decay_step, clip)
Schedule the learning rate `eta` by `decay` every `decay_step` till a minimum of `clip`. Discount the learning rate `eta` by the multiplicative factor `decay` every `decay_step` steps until a minimum of `clip` is reached.
## Parameters
- Learning Rate (eta): Defaults to `0.001`.
- decay: Factor by which the learning rate is discounted. Defaults to `0.1`.
- decay_step: Schedules decay operations by setting the number of steps between two decay operations. Defaults to `1000`.
- clip: Minimum value of learning rate. Defaults to `1e-4`.
## Example
To apply exponential decay to an optimiser: To apply exponential decay to an optimiser:
```julia ```julia
Optimiser(ExpDecay(..), Opt(..)) Optimiser(ExpDecay(..), Opt(..))
opt = Optimiser(ExpDecay(), ADAM())
``` ```
""" """
mutable struct ExpDecay mutable struct ExpDecay
@ -340,9 +509,12 @@ function apply!(o::ExpDecay, x, Δ)
end end
""" """
`WeightDecay(wd)` WeightDecay(wd)
Decay the weight parameter by `wd` Decays the weights by `wd`.
## Parameters
- Weight decay (wd): Defaults to `0`.
""" """
mutable struct WeightDecay mutable struct WeightDecay
wd::Real wd::Real
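As with the decay schedules above, `WeightDecay` is meant to be composed with another optimiser; a minimal sketch (settings invented):

```julia
using Flux

# Hypothetical composition: weight decay of 1e-4 applied alongside ADAM.
opt = Flux.Optimiser(WeightDecay(1e-4), ADAM(0.001))
```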


@ -1,6 +1,11 @@
# Arrays # Arrays
glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0/sum(dims)) nfan() = 1, 1 #fan_in, fan_out
glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0/sum(dims)) nfan(n) = 1, n #A vector is treated as a n×1 matrix
nfan(n_out, n_in) = n_in, n_out #In case of Dense kernels: arranged as matrices
nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) #In case of convolution kernels
glorot_uniform(dims...) = (rand(Float32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...)))
glorot_normal(dims...) = randn(Float32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...)))
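A rough check that the reworked initialisers hit the intended variance 2/(fan_in + fan_out); `nfan` is internal, so it is qualified, and the tolerance is invented:

```julia
using Flux, Statistics

w = Flux.glorot_uniform(3, 3, 16, 32)      # a 3×3 conv kernel, 16 => 32 channels
fan_in, fan_out = Flux.nfan(3, 3, 16, 32)  # (3*3*16, 3*3*32)
isapprox(var(w), 2 / (fan_in + fan_out); rtol = 0.1)  # true for typical draws
```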
ones(T::Type, dims...) = Base.ones(T, dims...) ones(T::Type, dims...) = Base.ones(T, dims...)
zeros(T::Type, dims...) = Base.zeros(T, dims...) zeros(T::Type, dims...) = Base.zeros(T, dims...)
@ -98,6 +103,48 @@ function batchseq(xs, pad = nothing, n = maximum(length(x) for x in xs))
[batch([xs_[j][i] for j = 1:length(xs_)]) for i = 1:n] [batch([xs_[j][i] for j = 1:length(xs_)]) for i = 1:n]
end end
# Flattening models to weight vectors, and back
function _restructure(m, xs)
i = 0
fmap(m) do x
x isa AbstractArray || return x
x = reshape(xs[i.+(1:length(x))], size(x))
i += length(x)
return x
end
end
"""
destructure(m)
Flatten a model's parameters into a single weight vector.
julia> m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
julia> θ, re = destructure(m);
julia> θ
67-element Array{Float32,1}:
-0.1407104
...
The second return value `re` allows you to reconstruct the original network after making
modifications to the weight vector (for example, with a hypernetwork).
julia> re(θ .* 2)
Chain(Dense(10, 5, σ), Dense(5, 2), softmax)
"""
function destructure(m)
xs = Zygote.Buffer([])
fmap(m) do x
x isa AbstractArray && push!(xs, x)
return x
end
return vcat(vec.(copy(xs))...), p -> _restructure(m, p)
end
# Other # Other
""" """


@ -25,9 +25,16 @@ cm = gpu(m)
@test all(p isa CuArray for p in params(cm)) @test all(p isa CuArray for p in params(cm))
@test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2}
x = [1,2,3] x = [1.,2.,3.]
cx = gpu(x) cx = gpu(x)
@test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx) @test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx)
@test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0)
@test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0]))
x = [-1.1491, 0.8619, 0.3127]
y = [1, 1, 0.]
@test Flux.binarycrossentropy.(σ.(x),y) ≈ Array(Flux.binarycrossentropy.(cu(σ.(x)),cu(y)))
@test Flux.logitbinarycrossentropy.(x,y) ≈ Array(Flux.logitbinarycrossentropy.(cu(x),cu(y)))
xs = rand(5, 5) xs = rand(5, 5)
ys = Flux.onehotbatch(1:5,1:5) ys = Flux.onehotbatch(1:5,1:5)
@ -51,10 +58,10 @@ end
@test y[3,:] isa CuArray @test y[3,:] isa CuArray
end end
if CuArrays.libcudnn != nothing if CuArrays.has_cudnn()
@info "Testing Flux/CUDNN" @info "Testing Flux/CUDNN"
include("cudnn.jl") include("cudnn.jl")
if !haskey(ENV, "CI_DISABLE_CURNN_TEST") include("curnn.jl")
include("curnn.jl") else
end @warn "CUDNN unavailable, not testing GPU DNN support"
end end


@ -22,8 +22,8 @@ end
rand(10, batch_size) rand(10, batch_size)
cux = gpu(x) cux = gpu(x)
y, back = pullback((r, x) -> (r(x)), rnn, x) y, back = pullback((r, x) -> r(x), rnn, x)
cuy, cuback = pullback((r, x) -> (r(x)), curnn, cux) cuy, cuback = pullback((r, x) -> r(x), curnn, cux)
@test y ≈ collect(cuy) @test y ≈ collect(cuy)
@test haskey(Flux.CUDA.descs, curnn.cell) @test haskey(Flux.CUDA.descs, curnn.cell)


@ -4,11 +4,13 @@ import Flux: activations
@testset "basic" begin @testset "basic" begin
@testset "helpers" begin @testset "helpers" begin
@testset "activations" begin @testset "activations" begin
dummy_model = Chain(Dense(10,5,σ),Dense(5,2),softmax) dummy_model = Chain(x->x.^2, x->x .- 3, x -> tan.(x))
x = rand(10) x = randn(10)
@test activations(Chain(), x) == [] @test activations(dummy_model, x)[1] == x.^2
@test activations(dummy_model, x)[1] == dummy_model[1](x) @test activations(dummy_model, x)[2] == (x.^2 .- 3)
@test activations(dummy_model, x)[2] == x |> dummy_model[1] |> dummy_model[2] @test activations(dummy_model, x)[3] == tan.(x.^2 .- 3)
@test activations(Chain(), x) == ()
@test activations(Chain(identity, x->:foo), x)[2] == :foo # results include `Any` type @test activations(Chain(identity, x->:foo), x)[2] == :foo # results include `Any` type
end end
end end
@ -19,6 +21,12 @@ import Flux: activations
# numeric test should be put into testset of corresponding layer # numeric test should be put into testset of corresponding layer
end end
@testset "Activations" begin
c = Chain(Dense(3,5,relu), Dense(5,1,relu))
X = Float32.([1.0; 1.0; 1.0])
@test_nowarn gradient(()->Flux.activations(c, X)[2][1], params(c))
end
@testset "Dense" begin @testset "Dense" begin
@test length(Dense(10, 5)(randn(10))) == 5 @test length(Dense(10, 5)(randn(10))) == 5
@test_throws DimensionMismatch Dense(10, 5)(randn(1)) @test_throws DimensionMismatch Dense(10, 5)(randn(1))
@ -84,4 +92,19 @@ import Flux: activations
@test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4) @test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4)
end end
end end
@testset "output dimensions" begin
m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32))
@test Flux.outdims(m, (10, 10)) == (6, 6)
m = Dense(10, 5)
@test Flux.outdims(m, (5, 2)) == (5,)
@test Flux.outdims(m, (10,)) == (5,)
m = Flux.Diagonal(10)
@test Flux.outdims(m, (10,)) == (10,)
m = Maxout(() -> Conv((3, 3), 3 => 16), 2)
@test Flux.outdims(m, (10, 10)) == (8, 8)
end
end end


@ -1,5 +1,6 @@
using Flux, Test using Flux, Test
using Flux: maxpool, meanpool using Flux: maxpool, meanpool
using Flux: gradient
@testset "Pooling" begin @testset "Pooling" begin
x = randn(Float32, 10, 10, 3, 2) x = randn(Float32, 10, 10, 3, 2)
@ -83,6 +84,10 @@ end
y = Conv((3,3), 1 => 1)(x) y = Conv((3,3), 1 => 1)(x)
x_hat = ConvTranspose((3, 3), 1 => 1)(y) x_hat = ConvTranspose((3, 3), 1 => 1)(y)
@test size(x_hat) == size(x) @test size(x_hat) == size(x)
m = ConvTranspose((3,3), 1=>1)
# Test that the gradient call does not throw: #900
@test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads
end end
@testset "CrossCor" begin @testset "CrossCor" begin
@ -90,7 +95,7 @@ end
w = rand(2,2,1,1) w = rand(2,2,1,1)
y = CrossCor(w, [0.0]) y = CrossCor(w, [0.0])
@test sum(w .* x[1:2, 1:2, :, :]) == y(x)[1, 1, 1, 1] @test isapprox(sum(w .* x[1:2, 1:2, :, :]), y(x)[1, 1, 1, 1], rtol=1e-7)
r = zeros(Float32, 28, 28, 1, 5) r = zeros(Float32, 28, 28, 1, 5)
m = Chain( m = Chain(
@ -113,17 +118,17 @@ end
l = Conv((3,3), 1=>1) l = Conv((3,3), 1=>1)
expected = zeros(eltype(l.weight),5,5,1,1) expected = zeros(eltype(l.weight),5,5,1,1)
expected[2:end-1,2:end-1,1,1] = l.weight expected[2:end-1,2:end-1,1,1] = l.weight
@test expected == l(data) @test expected ≈ l(data)
l = Conv((3,1), 1=>1) l = Conv((3,1), 1=>1)
expected = zeros(eltype(l.weight),5,7,1,1) expected = zeros(eltype(l.weight),5,7,1,1)
expected[2:end-1,4,1,1] = l.weight expected[2:end-1,4,1,1] = l.weight
@test expected == l(data) @test expected ≈ l(data)
l = Conv((1,3), 1=>1) l = Conv((1,3), 1=>1)
expected = zeros(eltype(l.weight),7,5,1,1) expected = zeros(eltype(l.weight),7,5,1,1)
expected[4,2:end-1,1,1] = l.weight expected[4,2:end-1,1,1] = l.weight
@test expected == l(data) @test expected ≈ l(data)
@test begin @test begin
# we test that the next expression does not throw # we test that the next expression does not throw
@ -131,3 +136,55 @@ end
true true
end end
end end
@testset "conv output dimensions" begin
m = Conv((3, 3), 3 => 16)
@test Flux.outdims(m, (10, 10)) == (8, 8)
m = Conv((3, 3), 3 => 16; stride = 2)
@test Flux.outdims(m, (5, 5)) == (2, 2)
m = Conv((3, 3), 3 => 16; stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = Conv((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = ConvTranspose((3, 3), 3 => 16)
@test Flux.outdims(m, (8, 8)) == (10, 10)
m = ConvTranspose((3, 3), 3 => 16; stride = 2)
@test Flux.outdims(m, (2, 2)) == (5, 5)
m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2)
@test Flux.outdims(m, (4, 4)) == (5, 5)
m = DepthwiseConv((3, 3), 3 => 6)
@test Flux.outdims(m, (10, 10)) == (8, 8)
m = DepthwiseConv((3, 3), 3 => 6; stride = 2)
@test Flux.outdims(m, (5, 5)) == (2, 2)
m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3, dilation = 2)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = CrossCor((3, 3), 3 => 16)
@test Flux.outdims(m, (10, 10)) == (8, 8)
m = CrossCor((3, 3), 3 => 16; stride = 2)
@test Flux.outdims(m, (5, 5)) == (2, 2)
m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = MaxPool((2, 2))
@test Flux.outdims(m, (10, 10)) == (5, 5)
m = MaxPool((2, 2); stride = 1)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = MaxPool((2, 2); stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
m = MeanPool((2, 2))
@test Flux.outdims(m, (10, 10)) == (5, 5)
m = MeanPool((2, 2); stride = 1)
@test Flux.outdims(m, (5, 5)) == (4, 4)
m = MeanPool((2, 2); stride = 2, pad = 3)
@test Flux.outdims(m, (5, 5)) == (5, 5)
end


@ -191,6 +191,7 @@ end
end end
if VERSION >= v"1.1"
@testset "GroupNorm" begin @testset "GroupNorm" begin
# begin tests # begin tests
squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions
@ -289,5 +290,5 @@ end
x = Float32.(reshape(collect(1:prod(sizes)), sizes)) x = Float32.(reshape(collect(1:prod(sizes)), sizes))
@test BN(x) ≈ GN(x) @test BN(x) ≈ GN(x)
end end
end
end end


@ -49,12 +49,33 @@ const ϵ = 1e-7
@testset "logitbinarycrossentropy" begin @testset "logitbinarycrossentropy" begin
@test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0) @test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0)
end end
y = [1 2 3]
y1 = [4.0 5.0 6.0]
@testset "kldivergence" begin
@test Flux.kldivergence(y, y1) ≈ 4.761838062403337
@test Flux.kldivergence(y, y) ≈ 0
end
y = [1 2 3 4]
y1 = [5.0 6.0 7.0 8.0]
@testset "hinge" begin
@test Flux.hinge(y, y1) ≈ 0
@test Flux.hinge(y, 0.5 .* y) ≈ 0.125
end
y = [0.1 0.2 0.3]
y1 = [0.4 0.5 0.6]
@testset "poisson" begin
@test Flux.poisson(y, y1) ≈ 1.0160455586700767
@test Flux.poisson(y, y) ≈ 0.5044459776946685
end
@testset "no spurious promotions" begin @testset "no spurious promotions" begin
for T in (Float32, Float64) for T in (Float32, Float64)
y = rand(T, 2) y = rand(T, 2)
ŷ = rand(T, 2) ŷ = rand(T, 2)
for f in (mse, crossentropy, logitcrossentropy) for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson)
fwd, back = Flux.pullback(f, ŷ, y) fwd, back = Flux.pullback(f, ŷ, y)
@test fwd isa T @test fwd isa T
@test eltype(back(one(T))[1]) == T @test eltype(back(one(T))[1]) == T


@ -19,7 +19,7 @@ include("layers/normalisation.jl")
include("layers/stateless.jl") include("layers/stateless.jl")
include("layers/conv.jl") include("layers/conv.jl")
if isdefined(Flux, :CUDA) if Flux.use_cuda[]
include("cuda/cuda.jl") include("cuda/cuda.jl")
else else
@warn "CUDA unavailable, not testing GPU support" @warn "CUDA unavailable, not testing GPU support"


@ -1,6 +1,6 @@
using Flux using Flux
using Flux: throttle, glorot_uniform, glorot_normal, stack, unstack using Flux: throttle, nfan, glorot_uniform, glorot_normal, stack, unstack
using StatsBase: std using StatsBase: var
using Random using Random
using Test using Test
@ -56,18 +56,26 @@ end
# Set random seed so that these tests don't fail randomly # Set random seed so that these tests don't fail randomly
Random.seed!(0) Random.seed!(0)
# glorot_uniform should yield a kernel with stddev ~= sqrt(6/(n_in + n_out)), @testset "Fan in/out" begin
# and glorot_normal should yield a kernel with stddev != 2/(n_in _ n_out) @test nfan() == (1, 1) #For a constant
for (n_in, n_out) in [(100, 100), (100, 400)] @test nfan(100) == (1, 100) #For vector
v = glorot_uniform(n_in, n_out) @test nfan(100, 200) == (200, 100) #For Dense layer
@test minimum(v) > -1.1*sqrt(6/(n_in + n_out)) @test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer
@test minimum(v) < -0.9*sqrt(6/(n_in + n_out)) @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer
@test maximum(v) > 0.9*sqrt(6/(n_in + n_out)) @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D Conv layer
@test maximum(v) < 1.1*sqrt(6/(n_in + n_out)) end
v = glorot_normal(n_in, n_out) @testset "glorot" begin
@test std(v) > 0.9*sqrt(2/(n_in + n_out)) # glorot_uniform and glorot_normal should both yield a kernel with
@test std(v) < 1.1*sqrt(2/(n_in + n_out)) # variance ≈ 2/(fan_in + fan_out)
for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)]
for init ∈ [glorot_uniform, glorot_normal]
v = init(dims...)
fan_in, fan_out = nfan(dims...)
σ2 = 2 / (fan_in + fan_out)
@test 0.9σ2 < var(v) < 1.1σ2
end
end
end end
end end