Merge #1166

1166: Fix crossentropy when some probabilities are zero r=dhairyagandhi96 a=cossio

Use a function `xlogy(x,y) = x * log(y)` that has the correct limit at `x=0`.

Before this PR:

```julia
julia> Flux.crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9])
NaN
```

After this PR:

```julia
julia> Flux.crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9])
0.3250829733914482
```

Co-authored-by: cossio <j.cossio.diaz@gmail.com>
2020-05-08 11:14:31 +00:00 · 2020-05-08 11:14:31 +00:00 · 0287abbf66
commit 0287abbf66
parent c444226db5 17f54e4c6f
3 changed files with 65 additions and 20 deletions
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "Flux"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.10.4"
+version = "0.10.5"

 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -54,15 +54,15 @@ function huber_loss(ŷ, y;  δ=eltype(ŷ)(1))
 end

 function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
-  return -sum(y .* log.(ŷ)) * 1 // size(y, 2)
+  return -sum(xlogy.(y, ŷ)) * 1 // size(y, 2)
 end

 function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Number)
-  return -sum(y .* log.(ŷ)) .* weight * 1 // size(y, 2)
+  return -sum(xlogy.(y, ŷ)) .* weight * 1 // size(y, 2)
 end

 function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::AbstractVector)
-  return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2)
+  return -sum(xlogy.(y, ŷ) .* weight) * 1 // size(y, 2)
 end

 """
@@ -123,7 +123,7 @@ julia> Flux.binarycrossentropy.(σ.([-1.1491, 0.8619, 0.3127]), [1, 1, 0])
 0.8616703662235441
 ```
 """
-binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
+binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ)

 # Re-definition to fix interaction with CuArrays.
 CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
@@ -195,7 +195,7 @@ It is always non-negative and zero only when both the distributions are equal
 everywhere.
 """
 function kldivergence(ŷ, y)
-  entropy = sum(y .* log.(y)) * 1 //size(y,2)
+  entropy = sum(xlogx.(y)) * 1 //size(y,2)
  cross_entropy = crossentropy(ŷ, y)
  return entropy + cross_entropy
 end
@@ -208,7 +208,7 @@ distribution `y`; calculated as `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`.

 [More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
 """
-poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) * 1 // size(y,2)
+poisson(ŷ, y) = sum(ŷ .- xlogy.(y, ŷ)) * 1 // size(y,2)

 """
    hinge(ŷ, y)
@@ -262,3 +262,29 @@ by linearizing all values for each element in the batch.
 function flatten(x::AbstractArray)
  return reshape(x, :, size(x)[end])
 end
+
+"""
+    xlogx(x)
+Return `x * log(x)` for `x ≥ 0`, handling `x = 0` by taking the downward limit.
+"""
+function xlogx(x)
+  result = x * log(x)
+  ifelse(iszero(x), zero(result), result)
+end
+CuArrays.@cufunc function xlogx(x)
+  result = x * log(x)
+  ifelse(iszero(x), zero(result), result)
+end
+
+"""
+    xlogy(x, y)
+Return `x * log(y)` for `y > 0` with correct limit at `x = 0`.
+"""
+function xlogy(x, y)
+  result = x * log(y)
+  ifelse(iszero(x), zero(result), result)
+end
+CuArrays.@cufunc function xlogy(x, y)
+  result = x * log(y)
+  ifelse(iszero(x), zero(result), result)
+end
--- a/test/layers/stateless.jl
+++ b/test/layers/stateless.jl
@@ -1,9 +1,26 @@
 using Test
 using Flux: onehotbatch, mse, crossentropy, logitcrossentropy,
-            σ, binarycrossentropy, logitbinarycrossentropy, flatten
+            σ, binarycrossentropy, logitbinarycrossentropy, flatten,
+            xlogx, xlogy

 const ϵ = 1e-7

+@testset "xlogx & xlogy" begin
+  @test iszero(xlogx(0))
+  @test isnan(xlogx(NaN))
+  @test xlogx(2) ≈ 2.0 * log(2.0)
+  @inferred xlogx(2)
+  @inferred xlogx(0)
+
+  @test iszero(xlogy(0, 1))
+  @test isnan(xlogy(NaN, 1))
+  @test isnan(xlogy(1, NaN))
+  @test isnan(xlogy(NaN, NaN))
+  @test xlogy(2, 3) ≈ 2.0 * log(3.0)
+  @inferred xlogy(2, 3)
+  @inferred xlogy(0, 1)
+end
+
@testset "losses" begin
  # First, regression-style y's
  y = [1, 1, 0, 0]
@@ -12,15 +29,15 @@ const ϵ = 1e-7
  @testset "mse" begin
    @test mse(ŷ, y) ≈ (.1^2 + .9^2)/2
  end
-  
+
  @testset "mae" begin
    @test Flux.mae(ŷ, y) ≈ 1/2
  end
-  
+
  @testset "huber_loss" begin
    @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002
-  end       
-            
+  end
+
  y = [123.0,456.0,789.0]
  ŷ = [345.0,332.0,789.0]
  @testset "msle" begin
@@ -35,6 +52,7 @@ const ϵ = 1e-7
  lossvalue = 1.203972804325936

  @testset "crossentropy" begin
+    @test crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ crossentropy([0.1,0.9], [0.1,0.9])
    @test crossentropy(ŷ, y) ≈ lossvalue
  end

@@ -63,46 +81,47 @@ const ϵ = 1e-7
  @testset "logitbinarycrossentropy" begin
    @test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0)
  end
-  
+
  y = [1 2 3]
  ŷ = [4.0 5.0 6.0]
  @testset "kldivergence" begin
+    @test Flux.kldivergence([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ Flux.kldivergence([0.1,0.9], [0.1,0.9])
    @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457
-    @test Flux.kldivergence(y, y) ≈ 0 
+    @test Flux.kldivergence(y, y) ≈ 0
  end
-  
+
  y = [1 2 3 4]
  ŷ = [5.0 6.0 7.0 8.0]
  @testset "hinge" begin
    @test Flux.hinge(ŷ, y) ≈ 0
    @test Flux.hinge(y, 0.5 .* y) ≈ 0.125
  end
-  
+
  @testset "squared_hinge" begin
    @test Flux.squared_hinge(ŷ, y) ≈ 0
    @test Flux.squared_hinge(y, 0.5 .* y) ≈ 0.0625
  end
-  
+
  y = [0.1 0.2 0.3]
  ŷ = [0.4 0.5 0.6]
  @testset "poisson" begin
    @test Flux.poisson(ŷ, y) ≈ 0.6278353988097339
    @test Flux.poisson(y, y) ≈ 0.5044459776946685
  end
-  
+
  y = [1.0 0.5 0.3 2.4]
  ŷ = [0 1.4 0.5 1.2]
  @testset "dice_coeff_loss" begin
    @test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999
    @test Flux.dice_coeff_loss(y, y) ≈ 0.0
  end
-            
+
  @testset "tversky_loss" begin
    @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383
    @test Flux.tversky_loss(ŷ, y, β = 0.8) ≈ -0.09490740740740744
    @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075
  end
-            
+
  @testset "no spurious promotions" begin
    for T in (Float32, Float64)
      y = rand(T, 2)