Merge remote-tracking branch 'origin/master' into HEAD
commit dec1b37e8e
@@ -14,8 +14,8 @@ Which means allocations occur much faster.
 And you use less memory.
 
 
-## Make sure your custom activation functions preserve the type of their inputs
-Not only should your activation functions be [type-stable](https://docs.julialang.org/en/v1/manual/performance-tips/#Write-%22type-stable%22-functions-1),
+## Make sure your activation and loss functions preserve the type of their inputs
+Not only should your activation and loss functions be [type-stable](https://docs.julialang.org/en/v1/manual/performance-tips/#Write-%22type-stable%22-functions-1),
 they should also preserve the type of their inputs.
 
 A very artificial example using an activation function like
@@ -26,6 +26,7 @@ A very artificial example using an activation function like
 
 will result in performance on `Float32` input orders of magnitude slower than the normal `tanh` would,
 because it results in having to use slow mixed type multiplication in the dense layers.
+Similar situations can occur in the loss function during backpropagation.
 
 Which means if you change your data say from `Float64` to `Float32` (which should give a speedup: see above),
 you will see a large slow-down
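
To make the mixed-type cost described above concrete, here is a small plain-Julia sketch (editorial illustration, not part of the diff; the `bad_tanh` helper and the layer sizes are made up): an activation that returns `Float64` silently promotes everything downstream, so the next layer's `Float32` weights end up in a slow mixed-type multiply.

```julia
# Hypothetical activation that does not preserve the type of its input.
bad_tanh(x) = Float64(tanh(x))

W1 = rand(Float32, 100, 100); b1 = rand(Float32, 100)
W2 = rand(Float32, 10, 100);  b2 = rand(Float32, 10)
x  = rand(Float32, 100)

h = bad_tanh.(W1 * x .+ b1)   # eltype(h) == Float64: the activation promoted it
y = W2 * h .+ b2              # Float32 matrix * Float64 vector: slow mixed-type path
eltype(y)                     # Float64, even though data and weights started as Float32
```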
@@ -41,7 +42,7 @@ While one could change your activation function (e.g. to use `0.01f0x`) to avoid
 the idiomatic (and safe way) is to use `oftype`.
 
 ```
-leaky_tanh(x) = oftype(x/1, 0.01) + tanh(x)
+leaky_tanh(x) = oftype(x/1, 0.01)x + tanh(x)
 ```
 
 
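
A quick check of the corrected definition (editorial note, not part of the diff; written with an explicit `*` for clarity): `oftype(x/1, 0.01)` builds the constant in the floating point type of `x`, so `Float32` inputs stay `Float32`.

```julia
leaky_tanh(x) = oftype(x/1, 0.01) * x + tanh(x)   # x/1 makes integer inputs float

typeof(leaky_tanh(0.5f0))  # Float32
typeof(leaky_tanh(0.5))    # Float64
```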
@@ -60,7 +61,7 @@ end
 
 It is much faster to concatenate them into a matrix,
 as this will hit BLAS matrix-matrix multiplication, which is much faster than the equivalent sequence of matrix-vector multiplications.
-Even though this means allocating new memory to store them contiguously.
+The improvement is enough that it is worthwhile allocating new memory to store them contiguously.
 
 ```julia
 x_batch = reduce(hcat, xs)
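
A sketch of the batching advice above (editorial illustration, not part of the diff; the vector of samples `xs` and the `Dense(10, 5)` layer are placeholders, using the constructor style seen elsewhere in this diff):

```julia
using Flux

m  = Dense(10, 5)
xs = [rand(Float32, 10) for _ in 1:64]   # 64 individual samples

# Slower: one matrix-vector multiplication per sample.
ys = [m(x) for x in xs]

# Faster: a single BLAS matrix-matrix multiplication over the whole batch,
# at the cost of allocating the contiguous batch matrix.
x_batch = reduce(hcat, xs)               # 10×64 matrix
y_batch = m(x_batch)                     # 5×64 matrix
```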
@@ -64,7 +64,7 @@ function RNNDesc{T}(mode::Int, input::Int, hidden::Int; layers = 1) where T
   @check ccall((:cudnnSetRNNDescriptor_v6,libcudnn), cudnnStatus_t, (Ptr{Nothing},Ptr{Nothing},Cint,Cint,Ptr{Nothing},Cint,Cint,Cint,Cint,Cint),
     handle(),d[],hidden,layers,dropoutDesc,inputMode,direction,mode,algo,cudnnDataType(T))
 
-  w = cuzeros(T, rnnParamSize(T, d[], input))
+  w = CuArrays.zeros(T, rnnParamSize(T, d[], input))
   # TODO: avoid reserve allocation here
   rd = RNNDesc{T}(mode, input, hidden, w, params(w, input, hidden, ngates(mode))..., d[])
   finalizer(rd) do x
@@ -131,8 +131,8 @@ end
 # TODO: can we just manipulate strides here?
 # TODO: should use repmat, but this isn't implemented.
 hBatch(x::AbstractVector, h::CuVector) = h
-hBatch(x::AbstractMatrix, h::CuVector) = h .* cuones(1, size(x, 2))
-hBatch(x::AbstractMatrix, h::CuMatrix) = h .* cuones(1, size(h,2) == 1 ? size(x,2) : 1)
+hBatch(x::AbstractMatrix, h::CuVector) = h .* CuArrays.ones(1, size(x, 2))
+hBatch(x::AbstractMatrix, h::CuMatrix) = h .* CuArrays.ones(1, size(h,2) == 1 ? size(x,2) : 1)
 
 function forward(rnn::RNNDesc{T}, x::CuArray{T}, h_::CuArray{T}, c_ = nothing, train = Val{false}) where T
   h = hBatch(x, h_)
@@ -110,7 +110,7 @@ end
 (a::Dense{<:Any,W})(x::AbstractArray{T}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
   invoke(a, Tuple{AbstractArray}, x)
 
-(a::Dense{<:Any,W})(x::AbstractArray{<:Real}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
+(a::Dense{<:Any,W})(x::AbstractArray{<:AbstractFloat}) where {T <: Union{Float32,Float64}, W <: AbstractArray{T}} =
   a(T.(x))
 
 """
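
The two `Dense` methods above use an invoke-plus-convert dispatch pattern: input whose eltype already matches the weights goes straight to the generic method, other floating point input is converted to the weight eltype first, and (after this change) non-float input such as integers is left alone. A self-contained sketch of the same idea, with a hypothetical `Layer` type standing in for `Dense` (editorial illustration, not the library's code):

```julia
struct Layer{W<:AbstractArray}
  weight::W
end

# Generic fallback: use the input as-is.
(l::Layer)(x::AbstractArray) = l.weight * x

# Matching eltypes: jump straight to the generic method via `invoke`,
# so the conversion method below is never re-entered.
(l::Layer{W})(x::AbstractArray{T}) where {T<:Union{Float32,Float64}, W<:AbstractArray{T}} =
  invoke(l, Tuple{AbstractArray}, x)

# Other floating point input: convert to the weight eltype first.
# Restricting to `AbstractFloat` (as the diff does) leaves integer input untouched.
(l::Layer{W})(x::AbstractArray{<:AbstractFloat}) where {T<:Union{Float32,Float64}, W<:AbstractArray{T}} =
  l(T.(x))

l = Layer(rand(Float32, 5, 10))
eltype(l(rand(Float64, 10)))  # Float32: the Float64 input was converted first
eltype(l(rand(Float32, 10)))  # Float32: matching eltype, straight to the generic method
```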
@@ -23,7 +23,7 @@ function apply!(o::Descent, x, Δ)
 end
 
 """
-    Momentum(params, η = 0.01; ρ = 0.9)
+    Momentum(η = 0.01; ρ = 0.9)
 
 Gradient descent with learning rate `η` and momentum `ρ`.
 """
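
The docstring fix above reflects that optimisers in this version no longer take the parameters in their constructor; the parameters are passed to the training loop instead. A usage sketch (editorial, not part of the diff; the model, loss, and data are placeholders, and it assumes the `Flux.train!` API of this era):

```julia
using Flux

m = Dense(10, 5)
loss(x, y) = Flux.mse(m(x), y)
data = [(rand(Float32, 10), rand(Float32, 5))]

opt = Momentum(0.01)                         # learning rate only; ρ keeps its default of 0.9
Flux.train!(loss, Flux.params(m), data, opt)
```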
@@ -31,6 +31,7 @@ end
 
 function prefor(f, x; seen = IdSet())
   x ∈ seen && return
+  push!(seen, x)
   f(x)
   foreach(x -> prefor(f, x, seen = seen), children(x))
   return
@@ -85,6 +85,16 @@ end
   @test size.(params(m)) == [(5, 10), (5,)]
   m = RNN(10, 5)
   @test size.(params(m)) == [(5, 10), (5, 5), (5,), (5,)]
+
+  # Layer duplicated in same chain, params just once pls.
+  c = Chain(m, m)
+  @test size.(params(c)) == [(5, 10), (5, 5), (5,), (5,)]
+
+  # Self-referential array. Just want params, no stack overflow pls.
+  r = Any[nothing,m]
+  Flux.children(a::Vector{Any}) = Tuple(a)
+  r[1] = r
+  @test size.(params(r)) == [(5, 10), (5, 5), (5,), (5,)]
 end
 
 @testset "Basic Stacking" begin