diff --git a/docs/src/performance.md b/docs/src/performance.md
index 682b7231..06a4f690 100644
--- a/docs/src/performance.md
+++ b/docs/src/performance.md
@@ -14,8 +14,8 @@ Which means allocations occur much faster.
 And you use less memory.
 
 
-## Make sure your custom activation functions preserve the type of their inputs
-Not only should your activation functions be [type-stable](https://docs.julialang.org/en/v1/manual/performance-tips/#Write-%22type-stable%22-functions-1),
+## Make sure your activation and loss functions preserve the type of their inputs
+Not only should your activation and loss functions be [type-stable](https://docs.julialang.org/en/v1/manual/performance-tips/#Write-%22type-stable%22-functions-1),
 they should also preserve the type of their inputs.
 
 A very artificial example using an activation function like
@@ -26,6 +26,7 @@ A very artificial example using an activation function like
 
 will result in performance on `Float32` input orders of magnitude slower than the normal `tanh` would,
 because it results in having to use slow mixed type multiplication in the dense layers.
+Similar situations can occur in the loss function during backpropagation.
 Which means if you change your data say from `Float64` to `Float32`
 (which should give a speedup: see above),
 you will see a large slow-down
@@ -41,7 +42,7 @@ While one could change your activation function (e.g. to use `0.01f0x`) to avoid
 the idiomatic (and safe way) is to use `oftype`.
 
 ```
-    leaky_tanh(x) = oftype(x/1, 0.01) + tanh(x)
+    leaky_tanh(x) = oftype(x/1, 0.01)x + tanh(x)
 ```
 
 
@@ -60,7 +61,7 @@ end
 
 It is much faster to concatenate them into a matrix,
 as this will hit BLAS matrix-matrix multiplication, which is much faster than the equivalent sequence of matrix-vector multiplications.
-Even though this means allocating new memory to store them contiguously.
+The improvement is enough that it is worthwhile allocating new memory to store them contiguously.
 
 ```julia
 x_batch = reduce(hcat, xs)
diff --git a/src/cuda/curnn.jl b/src/cuda/curnn.jl
index fbe73e45..daacb0e8 100644
--- a/src/cuda/curnn.jl
+++ b/src/cuda/curnn.jl
@@ -64,7 +64,7 @@ function RNNDesc{T}(mode::Int, input::Int, hidden::Int; layers = 1) where T
   @check ccall((:cudnnSetRNNDescriptor_v6,libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Cint,Ptr{Nothing},Cint,Cint,Cint,Cint,Cint),
     handle(),d[],hidden,layers,dropoutDesc,inputMode,direction,mode,algo,cudnnDataType(T))
 
-  w = cuzeros(T, rnnParamSize(T, d[], input))
+  w = CuArrays.zeros(T, rnnParamSize(T, d[], input))
   # TODO: avoid reserve allocation here
   rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[])
   finalizer(rd) do x
@@ -131,8 +131,8 @@ end
 # TODO: can we just manipulate strides here?
 # TODO: should use repmat, but this isn't implemented.
 hBatch(x::AbstractVector, h::CuVector) = h
-hBatch(x::AbstractMatrix, h::CuVector) = h .* cuones(1, size(x, 2))
-hBatch(x::AbstractMatrix, h::CuMatrix) = h .* cuones(1, size(h,2) == 1 ? size(x,2) : 1)
+hBatch(x::AbstractMatrix, h::CuVector) = h .* CuArrays.ones(1, size(x, 2))
+hBatch(x::AbstractMatrix, h::CuMatrix) = h .* CuArrays.ones(1, size(h,2) == 1 ? size(x,2) : 1)
 
 function forward(rnn::RNNDesc{T}, x::CuArray{T}, h_::CuArray{T}, c_ = nothing, train = Val{false}) where T
   h = hBatch(x, h_)
diff --git a/src/layers/basic.jl b/src/layers/basic.jl
index 12d4e2e3..83eeee21 100644
--- a/src/layers/basic.jl
+++ b/src/layers/basic.jl
@@ -110,7 +110,7 @@ end
 (a::Dense{<:Any,W})(x::AbstractArray{T}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
   invoke(a, Tuple{AbstractArray}, x)
 
-(a::Dense{<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
+(a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
   a(T.(x))
 
 """
diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index 2319cfdb..939a4678 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -23,7 +23,7 @@ function apply!(o::Descent, x, Δ)
 end
 
 """
-    Momentum(params, η = 0.01; ρ = 0.9)
+    Momentum(η = 0.01; ρ = 0.9)
 
 Gradient descent with learning rate `η` and momentum `ρ`.
 """
diff --git a/src/treelike.jl b/src/treelike.jl
index 443a91e2..ccb0fe81 100644
--- a/src/treelike.jl
+++ b/src/treelike.jl
@@ -31,6 +31,7 @@ end
 
 function prefor(f, x; seen = IdSet())
   x ∈ seen && return
+  push!(seen, x)
   f(x)
   foreach(x -> prefor(f, x, seen = seen), children(x))
   return
diff --git a/test/utils.jl b/test/utils.jl
index 7bcf72c3..366f02b0 100644
--- a/test/utils.jl
+++ b/test/utils.jl
@@ -85,6 +85,16 @@ end
   @test size.(params(m)) == [(5, 10), (5,)]
   m = RNN(10, 5)
   @test size.(params(m)) == [(5, 10), (5, 5), (5,), (5,)]
+
+  # Layer duplicated in same chain, params just once pls.
+  c = Chain(m, m)
+  @test size.(params(c)) == [(5, 10), (5, 5), (5,), (5,)]
+
+  # Self-referential array. Just want params, no stack overflow pls.
+  r = Any[nothing,m]
+  Flux.children(a::Vector{Any}) = Tuple(a)
+  r[1] = r
+  @test size.(params(r)) == [(5, 10), (5, 5), (5,), (5,)]
 end
 
 @testset "Basic Stacking" begin
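
A note on the `oftype` change in the `docs/src/performance.md` hunk: the sketch below is not part of the patch; it writes the multiplication with an explicit `*` and uses made-up sample inputs, purely to show the type-preservation property the docs describe.

```julia
# Sketch only (not from the patch): `oftype(x/1, 0.01)` converts the literal
# 0.01 to the floating-point type of `x`, so Float32 inputs are not silently
# promoted to Float64 by the constant.
leaky_tanh(x)       = oftype(x/1, 0.01) * x + tanh(x)   # type-preserving
naive_leaky_tanh(x) = 0.01 * x + tanh(x)                # 0.01 is a Float64 literal

@assert leaky_tanh(0.5f0) isa Float32         # stays Float32
@assert naive_leaky_tanh(0.5f0) isa Float64   # promoted: mixed-type maths downstream
@assert leaky_tanh(0.5) isa Float64           # Float64 input is unaffected
```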
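
On the batching advice in the same file: a self-contained illustration (arbitrary sizes, plain `randn` data, no Flux layers) of why `reduce(hcat, xs)` pays off; the per-sample loop issues many matrix-vector products, while the batched form is a single matrix-matrix product.

```julia
# Illustrative only: compare per-sample matrix-vector products with one
# batched matrix-matrix product over the same data.
W  = randn(Float32, 5, 10)
xs = [randn(Float32, 10) for _ in 1:64]

ys_loop = [W * x for x in xs]        # 64 separate mat-vec (BLAS-2) calls

x_batch = reduce(hcat, xs)           # 10×64 matrix, stored contiguously
y_batch = W * x_batch                # one mat-mat (BLAS-3) call

@assert all(ys_loop[i] ≈ y_batch[:, i] for i in 1:64)
```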
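
On the `src/treelike.jl` hunk and the new `test/utils.jl` cases: the snippet below is a standalone sketch, not Flux's code; it uses a plain `IdDict` in place of Flux's `IdSet` and a hypothetical `walk` helper that only descends into tuples and vectors. The point is the one the patch makes: mark a node as seen before recursing, so shared or self-referential children are visited once instead of overflowing the stack.

```julia
# Standalone sketch of the "mark before descending" guard added to `prefor`.
function walk(f, x, seen = IdDict{Any,Nothing}())
  haskey(seen, x) && return        # already visited: stop here (breaks cycles)
  seen[x] = nothing                # record the node *before* recursing
  f(x)
  if x isa Union{Tuple, AbstractVector}
    foreach(c -> walk(f, c, seen), x)
  end
  return
end

# A self-referential container, as in the new test: each object is seen once.
r = Any[nothing, 1.0]
r[1] = r                           # first slot now refers back to r itself
visits = Ref(0)
walk(_ -> visits[] += 1, r)
@assert visits[] == 2              # r and 1.0; the self-reference is skipped
```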