1166: Fix crossentropy when some probabilities are zero r=dhairyagandhi96 a=cossio

Use a function `xlogy(x,y) = x * log(y)` that has the correct limit at `x=0`.

Before this PR:

```julia
julia> Flux.crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9])
NaN
```

After this PR:

```julia
julia> Flux.crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9])
0.3250829733914482
```
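
The fix routes each `y * log(ŷ)` term through this helper, which short-circuits to zero when the coefficient is zero instead of evaluating `0 * log(0) = 0 * -Inf = NaN`. A condensed sketch of the idea (the full definitions, including the `CuArrays.@cufunc` variants, are in the diff below):

```julia
# x * log(y), with the x = 0 limit taken as 0 (same pattern as the xlogy added below)
function xlogy(x, y)
    result = x * log(y)
    ifelse(iszero(x), zero(result), result)
end

xlogy(0.0, 0.0)  # 0.0 instead of NaN
xlogy(0.9, 0.9)  # 0.9 * log(0.9) ≈ -0.0948
```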

Co-authored-by: cossio <j.cossio.diaz@gmail.com>
bors[bot] 2020-05-08 11:14:31 +00:00 committed by GitHub
commit 0287abbf66
3 changed files with 65 additions and 20 deletions

View File

@@ -1,6 +1,6 @@
 name = "Flux"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.10.4"
+version = "0.10.5"

 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"

View File

@@ -54,15 +54,15 @@ function huber_loss(ŷ, y; δ=eltype(ŷ)(1))
 end

 function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
-  return -sum(y .* log.(ŷ)) * 1 // size(y, 2)
+  return -sum(xlogy.(y, ŷ)) * 1 // size(y, 2)
 end

 function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number)
-  return -sum(y .* log.(ŷ)) .* weight * 1 // size(y, 2)
+  return -sum(xlogy.(y, ŷ)) .* weight * 1 // size(y, 2)
 end

 function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector)
-  return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2)
+  return -sum(xlogy.(y, ŷ) .* weight) * 1 // size(y, 2)
 end

 """
@@ -123,7 +123,7 @@ julia> Flux.binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
 0.8616703662235441
 ```
 """
-binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
+binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ)

 # Re-definition to fix interaction with CuArrays.
 CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
@@ -195,7 +195,7 @@ It is always non-negative and zero only when both the distributions are equal
 everywhere.
 """
 function kldivergence(ŷ, y)
-  entropy = sum(y .* log.(y)) * 1 //size(y,2)
+  entropy = sum(xlogx.(y)) * 1 //size(y,2)
   cross_entropy = crossentropy(ŷ, y)
   return entropy + cross_entropy
 end
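
The entropy term above is why a second helper, `xlogx`, is needed: with a zero entry in `y`, `y .* log.(y)` used to contribute `0 * -Inf = NaN`. A quick check of the new behaviour, assuming the `xlogx` definition added at the bottom of this file:

```julia
y = [0.1, 0.0, 0.9]

sum(y .* log.(y))  # NaN: the zero entry multiplies log(0) == -Inf
sum(xlogx.(y))     # ≈ -0.325083, the finite entropy term kldivergence now uses
```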
@@ -208,7 +208,7 @@ distribution `y`; calculated as `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`.
 [More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
 """
-poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) * 1 // size(y,2)
+poisson(ŷ, y) = sum(ŷ .- xlogy.(y, ŷ)) * 1 // size(y,2)

 """
     hinge(ŷ, y)
@@ -262,3 +262,29 @@ by linearizing all values for each element in the batch.
 function flatten(x::AbstractArray)
   return reshape(x, :, size(x)[end])
 end
+
+"""
+    xlogx(x)
+Return `x * log(x)` for `x ≥ 0`, handling `x = 0` by taking the downward limit.
+"""
+function xlogx(x)
+  result = x * log(x)
+  ifelse(iszero(x), zero(result), result)
+end
+CuArrays.@cufunc function xlogx(x)
+  result = x * log(x)
+  ifelse(iszero(x), zero(result), result)
+end
+
+"""
+    xlogy(x, y)
+Return `x * log(y)` for `y > 0` with correct limit at `x = 0`.
+"""
+function xlogy(x, y)
+  result = x * log(y)
+  ifelse(iszero(x), zero(result), result)
+end
+CuArrays.@cufunc function xlogy(x, y)
+  result = x * log(y)
+  ifelse(iszero(x), zero(result), result)
+end

View File

@@ -1,9 +1,26 @@
 using Test
 using Flux: onehotbatch, mse, crossentropy, logitcrossentropy,
-  σ, binarycrossentropy, logitbinarycrossentropy, flatten
+  σ, binarycrossentropy, logitbinarycrossentropy, flatten,
+  xlogx, xlogy

 const ϵ = 1e-7

+@testset "xlogx & xlogy" begin
+  @test iszero(xlogx(0))
+  @test isnan(xlogx(NaN))
+  @test xlogx(2) ≈ 2.0 * log(2.0)
+  @inferred xlogx(2)
+  @inferred xlogx(0)
+
+  @test iszero(xlogy(0, 1))
+  @test isnan(xlogy(NaN, 1))
+  @test isnan(xlogy(1, NaN))
+  @test isnan(xlogy(NaN, NaN))
+  @test xlogy(2, 3) ≈ 2.0 * log(3.0)
+  @inferred xlogy(2, 3)
+  @inferred xlogy(0, 1)
+end
+
 @testset "losses" begin
   # First, regression-style y's
   y = [1, 1, 0, 0]
@@ -12,15 +29,15 @@ const ϵ = 1e-7
   @testset "mse" begin
     @test mse(ŷ, y) ≈ (.1^2 + .9^2)/2
   end

   @testset "mae" begin
     @test Flux.mae(ŷ, y) ≈ 1/2
   end

   @testset "huber_loss" begin
     @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002
   end

   y = [123.0,456.0,789.0]
   ŷ = [345.0,332.0,789.0]
   @testset "msle" begin
@@ -35,6 +52,7 @@ const ϵ = 1e-7
   lossvalue = 1.203972804325936

   @testset "crossentropy" begin
+    @test crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ crossentropy([0.1,0.9], [0.1,0.9])
     @test crossentropy(ŷ, y) ≈ lossvalue
   end
@@ -63,46 +81,47 @@ const ϵ = 1e-7
   @testset "logitbinarycrossentropy" begin
     @test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0)
   end

   y = [1 2 3]
   ŷ = [4.0 5.0 6.0]
   @testset "kldivergence" begin
+    @test Flux.kldivergence([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ Flux.kldivergence([0.1,0.9], [0.1,0.9])
     @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457
     @test Flux.kldivergence(y, y) ≈ 0
   end

   y = [1 2 3 4]
   ŷ = [5.0 6.0 7.0 8.0]
   @testset "hinge" begin
     @test Flux.hinge(ŷ, y) ≈ 0
     @test Flux.hinge(y, 0.5 .* y) ≈ 0.125
   end

   @testset "squared_hinge" begin
     @test Flux.squared_hinge(ŷ, y) ≈ 0
     @test Flux.squared_hinge(y, 0.5 .* y) ≈ 0.0625
   end

   y = [0.1 0.2 0.3]
   ŷ = [0.4 0.5 0.6]
   @testset "poisson" begin
     @test Flux.poisson(ŷ, y) ≈ 0.6278353988097339
     @test Flux.poisson(y, y) ≈ 0.5044459776946685
   end

   y = [1.0 0.5 0.3 2.4]
   ŷ = [0 1.4 0.5 1.2]
   @testset "dice_coeff_loss" begin
     @test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999
     @test Flux.dice_coeff_loss(y, y) ≈ 0.0
   end

   @testset "tversky_loss" begin
     @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383
     @test Flux.tversky_loss(ŷ, y, β = 0.8) ≈ -0.09490740740740744
     @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075
   end

   @testset "no spurious promotions" begin
     for T in (Float32, Float64)
       y = rand(T, 2)