From 7ac647a7ac83ad688863082c6f37a72279200e36 Mon Sep 17 00:00:00 2001
From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Wed, 5 Feb 2020 22:29:15 +0530
Subject: [PATCH 01/17] Added loss functions

---
 src/layers/stateless.jl | 117 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 116 insertions(+), 1 deletion(-)

diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 159a8385..1324f62c 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -2,9 +2,91 @@ using CuArrays
 using NNlib: logsoftmax, logσ
 
 # Cost functions
+"""
+    mae(ŷ, y)
+L1 loss function. Computes the mean of absolute error between prediction and true values
+"""
+mae(ŷ, y) = sum(abs.(ŷ, y)) * 1 // length(y)
+
+"""
+    mse(ŷ, y)
+L2 loss function. Computes the mean of the squared errors between prediction and true values
+"""
 mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)
+
+"""
+    mean_squared_logarithmic_error(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y)))
+
+L2 loss function. Returns the mean of the squared logarithmic errors between the predictions ŷ and the true values y. The ϵ1 and ϵ2 terms provide numerical stability.
+(Computes the mean of (log(ŷ) - log(y))^2.) This error penalizes an under-predicted estimate more heavily than an over-predicted one.
+
+  ```julia
+  julia> y_=[14726,327378,74734]
+  3-element Array{Int64,1}:
+   14726
+   327378
+   74734
+
+  julia> y = [12466.1,16353.95,16367.98]
+  3-element Array{Float64,1}:
+   12466.1
+   16353.95
+   16367.98
+
+  julia> mean_squared_logarithmic_error(y,y_)
+  3.771271382334686
+  ```
+Alias:
+    msle(ŷ,y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y)))
+
+"""
+mean_squared_logarithmic_error(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y)
+#Alias
+msle(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y)
+
+
+
+"""
+    huber_loss(ŷ, y,delta=1.0)
+
+Computes the mean of the Huber loss between prediction ŷ and true values y. By default, delta is set to 1.0.
+[Huber Loss](https://en.wikipedia.org/wiki/Huber_loss).
+
+  ```julia
+  julia> y = [1.2636,1.25,1.73]
+  3-element Array{Float64,1}:
+   1.2636
+   1.25
+   1.73
+
+  julia> y_= [-1.376,0,3.37]
+  3-element Array{Float64,1}:
+   -1.376
+   0.0
+   3.37
+
+  julia> huber_loss(y,y_)
+  0.7131999999999998
+  ```
+
+"""
+function huber_loss(ŷ, y,delta=1.0)
+  abs_error = abs.(ŷ.-y)
+  hub_loss =0
+  for i in 1:length(y)
+    if (abs_error[i]<=delta)
+      hub_loss+=abs_error[i]^2*0.5
+    else
+      hub_loss+=delta*(abs_error[i]-0.5*delta)
+    end
+
+    return hub_loss*1//length(y)
+  end
+end
+
+
 function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Nothing)
   return -sum(y .* log.(ŷ)) * 1 // size(y, 2)
 end
@@ -17,8 +99,32 @@ function _crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat, weight::Abstr
   return -sum(y .* log.(ŷ) .* weight) * 1 // size(y, 2)
 end
 
+"""
+    crossentropy(ŷ, y, weight)
+
+Computes the crossentropy loss over the predictions ŷ and the true labels y (expected to be `onehot`-encoded). The `weight` parameter sets the class weights used while computing the loss; it can be a number or a vector of class weights. By default, `weight` is set to `nothing`.
+
+  ```julia
+  julia> ŷ = [0.33 .11 .98;0.11 0.34 0.11]
+  2×3 Array{Float64,2}:
+   0.33  0.11  0.98
+   0.11  0.34  0.11
+
+  julia> y = [1 0 0;0 1 0]
+  2×3 Array{Int64,2}:
+   1  0  0
+   0  1  0
+
+  julia> crossentropy(ŷ,y)
+  0.7291574286311803
+  ```
+
+Note: If there are only two classes, prefer the `binarycrossentropy(ŷ, y)` function.
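+
+As an editorial sketch of a weighted call (the weight vector below is invented
+for illustration; it broadcasts over the class rows of y):
+
+  ```julia
+  julia> crossentropy(ŷ, y, weight=[2.0, 1.0])  # ≈ 1.0987; weighs class 1 twice as heavily
+  ```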
+""" crossentropy(ŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight=nothing) = _crossentropy(ŷ, y, weight) + function logitcrossentropy(logŷ::AbstractVecOrMat, y::AbstractVecOrMat; weight = 1) return -sum(y .* logsoftmax(logŷ) .* weight) * 1 // size(y, 2) end @@ -106,7 +212,16 @@ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ hinge(ŷ, y) -Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). + +L1 loss function. Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss). """ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) + +""" + squared_hinge(ŷ, y) + +L2 loss function. Computes squared hinge loss over the prediction ŷ and true labels y(conatining 1 or -1) +""" +squared_hinge(ŷ, y) = sum((max.(0,1.-ŷ.*y)).^2) *1//size(y,2) + From 643086c8db3220cd08a8a4c12a760121f45c5a46 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Wed, 5 Feb 2020 22:40:07 +0530 Subject: [PATCH 02/17] Updated squared_hinge --- src/layers/stateless.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 1324f62c..8670a0b6 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -223,5 +223,5 @@ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) L2 loss function. Computes squared hinge loss over the prediction ŷ and true labels y(conatining 1 or -1) """ -squared_hinge(ŷ, y) = sum((max.(0,1.-ŷ.*y)).^2) *1//size(y,2) +squared_hinge(ŷ, y) = sum((max.(0,1 .-ŷ .* y)).^2) *1//size(y,2) From 44a977b7a4e8aaa1e84446d946b5ca95d43a09b3 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Wed, 5 Feb 2020 23:20:06 +0530 Subject: [PATCH 03/17] Added tests for new loss functions --- test/layers/stateless.jl | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 7cb8ed2e..fe553db0 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -12,6 +12,20 @@ const ϵ = 1e-7 @testset "mse" begin @test mse(ŷ, y) ≈ (.1^2 + .9^2)/2 end + + @testset "mae" begin + @test Flux.mae(ŷ, y) ≈ 1/2 + end + + @testset "huber_loss" begin + @test Flux.huber_loss(ŷ, y) ≈ 0.0012499999999999994 + end + + y = [123,456,789] + y1 = [345,332,789] + @testset "msle" begin + @test Flux.msle(y1, y) ≈ 0.38813985859136585 + end # Now onehot y's y = onehotbatch([1, 1, 0, 0], 0:1) @@ -64,18 +78,23 @@ const ϵ = 1e-7 @test Flux.hinge(y, 0.5 .* y) ≈ 0.125 end + @testset "squared_hinge" begin + @test Flux.squared_hinge(y, y1) ≈ 0 + @test Flux.squared_hinge(y, 0.5 .* y) ≈ 0.0625 + end + y = [0.1 0.2 0.3] y1 = [0.4 0.5 0.6] @testset "poisson" begin @test Flux.poisson(y, y1) ≈ 1.0160455586700767 @test Flux.poisson(y, y) ≈ 0.5044459776946685 end - + @testset "no spurious promotions" begin for T in (Float32, Float64) y = rand(T, 2) ŷ = rand(T, 2) - for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson) + for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson,Flux.mae,Flux.huber_loss,Flux.msle,Flux.squared_hinge) fwd, back = Flux.pullback(f, ŷ, y) @test fwd isa T @test eltype(back(one(T))[1]) == T From b5184553d44619e3fb4a32d1d35e42eea9699346 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Wed, 5 Feb 2020 23:32:55 +0530 Subject: [PATCH 04/17] Error 
correction in mae --- src/layers/stateless.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 8670a0b6..c0ac6ecb 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -6,7 +6,7 @@ using NNlib: logsoftmax, logσ mae(ŷ, y) L1 loss function. Computes the mean of absolute error between prediction and true values """ -mae(ŷ, y) = sum(abs.(ŷ, y)) * 1 // length(y) +mae(ŷ, y) = sum(abs.(ŷ.- y)) * 1 // length(y) """ From 7710bb0b4bbe90693dbc6110de9a1e7112ed2c79 Mon Sep 17 00:00:00 2001 From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com> Date: Thu, 6 Feb 2020 01:06:41 +0530 Subject: [PATCH 05/17] Removed spurious promotions --- src/layers/stateless.jl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index c0ac6ecb..e3bdfe00 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -6,7 +6,7 @@ using NNlib: logsoftmax, logσ mae(ŷ, y) L1 loss function. Computes the mean of absolute error between prediction and true values """ -mae(ŷ, y) = sum(abs.(ŷ.- y)) * 1 // length(y) +mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y) """ @@ -42,9 +42,9 @@ Alias: msle(ŷ,y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) """ -mean_squared_logarithmic_error(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) +mean_squared_logarithmic_error(ŷ, y;ϵ1=eps.(ŷ),ϵ2=eps.(eltype(ŷ).(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) #Alias -msle(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) +msle(ŷ, y;ϵ1=eps.(ŷ),ϵ2=eps.(eltype(ŷ).(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y) @@ -74,12 +74,14 @@ Computes the mean of the Huber loss between prediction ŷ and true values y. 
By
 """
 function huber_loss(ŷ, y,delta=1.0)
   abs_error = abs.(ŷ.-y)
-  hub_loss =0
+  type_ = eltype(ŷ)
+  delta = type_(delta)
+  hub_loss =type_(0)
   for i in 1:length(y)
     if (abs_error[i]<=delta)
-      hub_loss+=abs_error[i]^2*0.5
+      hub_loss+=abs_error[i]^2*type_(0.5)
     else
-      hub_loss+=delta*(abs_error[i]-0.5*delta)
+      hub_loss+=delta*(abs_error[i]-type_(0.5*delta))
     end
 
     return hub_loss*1//length(y)
   end
 end

From 659ba074d1e83075b23b4a3a9d5b09cb17551e4a Mon Sep 17 00:00:00 2001
From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Thu, 6 Feb 2020 01:21:51 +0530
Subject: [PATCH 06/17] Updated test for msle

---
 test/layers/stateless.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl
index fe553db0..d038bcda 100644
--- a/test/layers/stateless.jl
+++ b/test/layers/stateless.jl
@@ -21,8 +21,8 @@ const ϵ = 1e-7
     @test Flux.huber_loss(ŷ, y) ≈ 0.0012499999999999994
   end
 
-  y = [123,456,789]
-  y1 = [345,332,789]
+  y = [123.0,456.0,789.0]
+  y1 = [345.0,332.0,789.0]
   @testset "msle" begin
     @test Flux.msle(y1, y) ≈ 0.38813985859136585
   end

From 980ce72914abb21224a7b21e9f8c60bfbcbcfa48 Mon Sep 17 00:00:00 2001
From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Thu, 27 Feb 2020 02:00:28 +0530
Subject: [PATCH 07/17] Added tversky and dice loss

---
 src/layers/stateless.jl | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index e3bdfe00..74236700 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -74,14 +74,14 @@ Computes the mean of the Huber loss between prediction ŷ and true values y. By
 """
 function huber_loss(ŷ, y,delta=1.0)
   abs_error = abs.(ŷ.-y)
-  type_ = eltype(ŷ)
-  delta = type_(delta)
-  hub_loss =type_(0)
+  dtype= eltype(ŷ)
+  delta = dtype(delta)
+  hub_loss = dtype(0)
   for i in 1:length(y)
     if (abs_error[i]<=delta)
-      hub_loss+=abs_error[i]^2*type_(0.5)
+      hub_loss+=abs_error[i]^2*dtype(0.5)
     else
-      hub_loss+=delta*(abs_error[i]-type_(0.5*delta))
+      hub_loss+=delta*(abs_error[i]- dtype(0.5*delta))
     end
 
     return hub_loss*1//length(y)
@@ -226,4 +226,29 @@ hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2)
 
 L2 loss function. Computes the squared hinge loss over the prediction ŷ and true labels y (containing 1 or -1).
 """
 squared_hinge(ŷ, y) = sum((max.(0,1 .-ŷ .* y)).^2) *1//size(y,2)
-
+
+"""
+    dice_coeff_loss(y_pred,y_true,smooth = 1)
+
+Loss function used in image segmentation. Calculates the loss based on the Dice coefficient, which is similar to the F1 score.
+    Dice_Coefficient(A,B) = 2*sum(|A*B| + smooth) / (sum(A^2) + sum(B^2) + smooth)
+    Dice_loss = 1 - Dice_Coefficient
+
+Ref: [V-Net: Fully Convolutional Neural Networks for Volumetric Medical Image Segmentation](https://arxiv.org/pdf/1606.04797v1.pdf)
+"""
+function dice_coeff_loss(y_pred,y_true,smooth=eltype(y_pred)(1.0))
+  intersection = sum(y_true.*y_pred)
+  return 1 - (2*intersection + smooth)/(sum(y_true.^2) + sum(y_pred.^2)+smooth)
+end
+
+"""
+    tversky_loss(y_pred,y_true,beta = 0.7)
+
+Used with imbalanced data to give more weight to false negatives.
+A larger β weighs recall higher than precision (by placing more emphasis on false negatives).
+    tversky_loss(ŷ,y,beta) = 1 - sum(|y.*ŷ| + 1) / (sum(y.*ŷ + beta*(1 .- y).*ŷ + (1 .- beta)*y.*(1 .- ŷ))+ 1)
+Ref: [Tversky loss function for image segmentation using 3D fully convolutional deep networks](https://arxiv.org/pdf/1706.05721.pdf)
+"""
+function tversky_loss(y_pred,y_true,beta = eltype(y_pred)(0.7))
+  intersection = sum(y_true.*y_pred)
+  return 1 - (intersection+1)/(sum(y_true.*y_pred + beta*(1 .- y_true).* y_pred + (1-beta).*y_true.*(1 .- y_pred))+1)
+end

From 3d8965230fc45f687d943f614dacd154f6212f11 Mon Sep 17 00:00:00 2001
From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Thu, 27 Feb 2020 02:29:39 +0530
Subject: [PATCH 08/17] Added tests for dice and Tversky loss

---
 test/layers/stateless.jl | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl
index d038bcda..b7d15634 100644
--- a/test/layers/stateless.jl
+++ b/test/layers/stateless.jl
@@ -89,12 +89,25 @@ const ϵ = 1e-7
     @test Flux.poisson(y, y1) ≈ 1.0160455586700767
     @test Flux.poisson(y, y) ≈ 0.5044459776946685
   end
+
+  y = [1.0 0.5 0.3 2.4]
+  y1 = [0 1.4 0.5 1.2]
+  @testset "dice_coeff_loss" begin
+    @test Flux.dice_coeff_loss(y, y1) ≈ 0.2799999999999999
+    @test Flux.dice_coeff_loss(y,y) ≈ 0.0
+  end
+
+  @testset "tversky_loss" begin
+    @test Flux.tversky_loss(y,y1) ≈ 0.028747433264887046
+    @test Flux.tversky_loss(y,y1,0.8) ≈ 0.050200803212851364
+    @test Flux.tversky_loss(y,y) ≈ -0.5576923076923075
+  end
 
   @testset "no spurious promotions" begin
     for T in (Float32, Float64)
       y = rand(T, 2)
       ŷ = rand(T, 2)
-      for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson,Flux.mae,Flux.huber_loss,Flux.msle,Flux.squared_hinge)
+      for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson,Flux.mae,Flux.huber_loss,Flux.msle,Flux.squared_hinge,Flux.dice_coeff_loss,Flux.tversky_loss)
        fwd, back = Flux.pullback(f, ŷ, y)
        @test fwd isa T
        @test eltype(back(one(T))[1]) == T

From 9dce6232143cda235fe235016c332f8fe1fd939a Mon Sep 17 00:00:00 2001
From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Thu, 27 Feb 2020 16:26:17 +0530
Subject: [PATCH 09/17] Updated Msle loss

---
 src/layers/stateless.jl | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 74236700..b4e97660 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -17,33 +17,28 @@ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)
 
 """
-    mean_squared_logarithmic_error(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y)))
+    msle(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y)))
 
-L2 loss function. Returns the mean of the squared logarithmic errors between the predictions ŷ and the true values y. The ϵ1 and ϵ2 terms provide numerical stability.
+Mean Squared Logarithmic Error, an L2 loss function. Returns the mean of the squared logarithmic errors between the predictions ŷ and the true values y. The ϵ1 and ϵ2 terms provide numerical stability.
 (Computes the mean of (log(ŷ) - log(y))^2.) This error penalizes an under-predicted estimate more heavily than an over-predicted one.
  ```julia
-    julia> y_=[14726,327378,74734]
+    julia> y=[14726,327378,74734]
     3-element Array{Int64,1}:
      14726
      327378
      74734
 
-    julia> y = [12466.1,16353.95,16367.98]
+    julia> ŷ = [12466.1,16353.95,16367.98]
     3-element Array{Float64,1}:
      12466.1
      16353.95
      16367.98
 
-    julia> mean_squared_logarithmic_error(y,y_)
+    julia> msle(ŷ,y)
     3.771271382334686
     ```
-Alias:
-    msle(ŷ,y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y)))
-
 """
-mean_squared_logarithmic_error(ŷ, y;ϵ1=eps.(ŷ),ϵ2=eps.(eltype(ŷ).(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y)
-#Alias
 msle(ŷ, y;ϵ1=eps.(ŷ),ϵ2=eps.(eltype(ŷ).(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y)

From 8afed013458cf3f064f6be8f0e9427f49e1bade3 Mon Sep 17 00:00:00 2001
From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Thu, 27 Feb 2020 23:23:53 +0530
Subject: [PATCH 10/17] Apply suggestions from code review

Co-Authored-By: David Lung
---
 src/layers/stateless.jl  | 2 +-
 test/layers/stateless.jl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index b4e97660..f05f19fc 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -79,8 +79,8 @@ function huber_loss(ŷ, y,delta=1.0)
       hub_loss+=delta*(abs_error[i]- dtype(0.5*delta))
     end
 
-    return hub_loss*1//length(y)
   end
+  hub_loss*1//length(y)
 end
 
diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl
index b7d15634..c09d1aae 100644
--- a/test/layers/stateless.jl
+++ b/test/layers/stateless.jl
@@ -18,7 +18,7 @@ const ϵ = 1e-7
   end
 
   @testset "huber_loss" begin
-    @test Flux.huber_loss(ŷ, y) ≈ 0.0012499999999999994
+    @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002
   end
 
   y = [123.0,456.0,789.0]

From 08dabce57e41a23f060ed019f84b32b962afeac6 Mon Sep 17 00:00:00 2001
From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Sun, 1 Mar 2020 12:00:11 +0530
Subject: [PATCH 11/17] Updated loss function docs

---
 src/layers/stateless.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 9b6db037..592e2fa1 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -29,7 +29,7 @@ msle(ŷ, y;ϵ1=eps.(ŷ),ϵ2=eps.(eltype(ŷ).(y))) = sum((log.(ŷ+ϵ1).-log.(
    huber_loss(ŷ, y,delta=1.0)
 
 Computes the mean of the Huber loss. By default, delta is set to 1.0.
-    | 0.5*|(ŷ-y)|, for |ŷ-y|

From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Mon, 2 Mar 2020 13:25:23 +0530
Subject: [PATCH 12/17] Updated huber_loss with other minute changes

---
 src/layers/stateless.jl | 47 +++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 592e2fa1..01b26a8a 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -16,38 +16,31 @@ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)
 
 """
-    msle(ŷ, y;ϵ1=eps.(Float64.(ŷ)),ϵ2=eps.(Float64.(y)))
+    msle(ŷ, y; ϵ1=eps.(Float64.(ŷ)))
 
-Mean Squared Logarithmic Error. Returns the mean of the squared logarithmic errors `sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 / length(y)`.
-The ϵ1 and ϵ2 terms provide numerical stability. This error penalizes an under-predicted estimate more heavily than an over-predicted one.
+Mean Squared Logarithmic Error. Returns the mean of the squared logarithmic errors `sum((log.(ŷ+ϵ1) .- log.(y+ϵ2)).^2) * 1 / length(y)`.
+The `ϵ` term provides numerical stability. This error penalizes an under-predicted estimate more heavily than an over-predicted one.
 """
-msle(ŷ, y;ϵ1=eps.(ŷ),ϵ2=eps.(eltype(ŷ).(y))) = sum((log.(ŷ+ϵ1).-log.(y+ϵ2)).^2) * 1 // length(y)
+msle(ŷ, y; ϵ=eps.(ŷ)) = sum((log.(ŷ+ϵ).-log.(y+ϵ)).^2) * 1 // length(y)
 
 """
-    huber_loss(ŷ, y,delta=1.0)
+    huber_loss(ŷ, y; delta=1.0)
 
-Computes the mean of the Huber loss. By default, delta is set to 1.0.
+Computes the mean of the Huber loss given the prediction `ŷ` and true values `y`. By default, delta is set to 1.0.
 
     | 0.5*|ŷ - y|, for |ŷ - y| <= delta
     Huber loss = |
     | delta*(|ŷ - y| - 0.5*delta), otherwise
 
 [`Huber Loss`](https://en.wikipedia.org/wiki/Huber_loss).
 """
-function huber_loss(ŷ, y,delta=1.0)
-  abs_error = abs.(ŷ.-y)
-  dtype= eltype(ŷ)
-  delta = dtype(delta)
-  hub_loss = dtype(0)
-  for i in 1:length(y)
-    if (abs_error[i]<=delta)
-      hub_loss+=abs_error[i]^2*dtype(0.5)
-    else
-      hub_loss+=delta*(abs_error[i]- dtype(0.5*delta))
-    end
-  end
-  hub_loss*1//length(y)
-end
+function huber_loss(ŷ, y; delta = eltype(ŷ)(1))
+  abs_error = abs.(ŷ.-y)
+  temp = abs_error.

From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Mon, 2 Mar 2020 13:33:44 +0530
Subject: [PATCH 13/17] Added Loss functions to docs

---
 docs/src/models/layers.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md
index 41e98f32..5522fe73 100644
--- a/docs/src/models/layers.md
+++ b/docs/src/models/layers.md
@@ -56,7 +56,10 @@ GroupNorm
 ## Cost Functions
 
 ```@docs
+Flux.mae
 Flux.mse
+Flux.msle
+Flux.huber_loss
 Flux.crossentropy
 Flux.logitcrossentropy
 Flux.binarycrossentropy
@@ -64,4 +67,7 @@ Flux.logitbinarycrossentropy
 Flux.kldivergence
 Flux.poisson
 Flux.hinge
+Flux.squared_hinge
+Flux.dice_coeff_loss
+Flux.tversky_loss
 ```

From 5565250c28d87eefd694e42ff67f68c2ffec8a35 Mon Sep 17 00:00:00 2001
From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Mon, 2 Mar 2020 13:46:33 +0530
Subject: [PATCH 14/17] Updated test for tversky

---
 test/layers/stateless.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl
index c09d1aae..702288b6 100644
--- a/test/layers/stateless.jl
+++ b/test/layers/stateless.jl
@@ -99,7 +99,7 @@ const ϵ = 1e-7
 
   @testset "tversky_loss" begin
     @test Flux.tversky_loss(y,y1) ≈ 0.028747433264887046
-    @test Flux.tversky_loss(y,y1,0.8) ≈ 0.050200803212851364
+    @test Flux.tversky_loss(y,y1,beta = 0.8) ≈ 0.050200803212851364
     @test Flux.tversky_loss(y,y) ≈ -0.5576923076923075
   end

From 2f05094068067ee2738adbe2e6e455909adfff0d Mon Sep 17 00:00:00 2001
From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Mon, 2 Mar 2020 20:00:47 +0530
Subject: [PATCH 15/17] =?UTF-8?q?Added=20consistency=20with=20y=CC=82=20an?=
 =?UTF-8?q?d=20unicode=20chars?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/layers/stateless.jl | 54 ++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 01b26a8a..5f457057 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -2,7 +2,7 @@
 """
     mae(ŷ, y)
 
-Return the mean of absolute error `sum(abs.(ŷ .- y)) * 1 / length(y)`
+Return the mean of absolute error `sum(abs.(ŷ .- y)) / length(y)`
 """
 mae(ŷ, y) = sum(abs.(ŷ .- y)) * 1 // length(y)
 
@@ -16,23 +16,25 @@ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)
 
 """
-    msle(ŷ, y; ϵ1=eps.(Float64.(ŷ)))
+    msle(ŷ, y; ϵ = eps.(Float64.(ŷ)))
 
-Mean Squared Logarithmic Error. Returns the mean of the squared logarithmic errors `sum((log.(ŷ+ϵ1) .- log.(y+ϵ2)).^2) * 1 / length(y)`.
-The `ϵ` term provides numerical stability. This error penalizes an under-predicted estimate more heavily than an over-predicted one.
+Returns the mean of the squared logarithmic errors `sum((log.(ŷ + ϵ) .- log.(y + ϵ)).^2) / length(y)`.
+The `ϵ` term provides numerical stability.
+
+This error penalizes an under-predicted estimate more heavily than an over-predicted one.
 """
-msle(ŷ, y; ϵ=eps.(ŷ)) = sum((log.(ŷ+ϵ).-log.(y+ϵ)).^2) * 1 // length(y)
+msle(ŷ, y; ϵ = eps.(ŷ)) = sum((log.(ŷ + ϵ).-log.(y + ϵ)).^2) * 1 // length(y)
 
 """
-    huber_loss(ŷ, y; delta=1.0)
+    huber_loss(ŷ, y; δ=1.0)
 
-Computes the mean of the Huber loss given the prediction `ŷ` and true values `y`. By default, delta is set to 1.0.
+Computes the mean of the Huber loss given the prediction `ŷ` and true values `y`. By default, δ is set to 1.0.
 
-    | 0.5*|ŷ - y|, for |ŷ - y| <= delta
+    | 0.5*|ŷ - y|, for |ŷ - y| <= δ
     Huber loss = |
-    | delta*(|ŷ - y| - 0.5*delta), otherwise
+    | δ*(|ŷ - y| - 0.5*δ), otherwise
 
 [`Huber Loss`](https://en.wikipedia.org/wiki/Huber_loss).
 """
 
 """
     poisson(ŷ, y)
 
 Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution.
+Returns `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`
 
 [Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
 """
 poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2)
 
 """
     hinge(ŷ, y)
 
 Measures the loss given the prediction `ŷ` and true labels `y` (containing 1 or -1).
-Returns `sum((max.(0,1 .-ŷ .* y))) *1 // size(y, 2)`
+Returns `sum((max.(0, 1 .- ŷ .* y))) / size(y, 2)`
 
 [Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss)
 See also [`squared_hinge`](@ref).
 """
-hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2)
+hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y, 2)
 
 """
     squared_hinge(ŷ, y)
 
 Computes squared hinge loss given the prediction `ŷ` and true labels `y` (containing 1 or -1).
-Returns `sum((max.(0,1 .-ŷ .* y)).^2) *1 // size(y, 2)`
+Returns `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)`
 
 See also [`hinge`](@ref).
 """
-squared_hinge(ŷ, y) = sum((max.(0,1 .-ŷ .* y)).^2) *1//size(y,2)
+squared_hinge(ŷ, y) = sum((max.(0, 1 .- ŷ .* y)).^2) *1 // size(y, 2)
 
 """
-    dice_coeff_loss(y_pred, y_true, smooth = 1)
+    dice_coeff_loss(ŷ, y, smooth = 1)
 
 Loss function used in image segmentation. Calculates the loss based on the Dice coefficient, which is similar to the F1 score.
 
-    Dice_Coefficient(A,B) = 2*sum(|A*B| + smooth) / (sum(A^2) + sum(B^2) + smooth)
+    Dice_Coefficient(ŷ, y) = 2 * sum( |ŷ.* y| + smooth) / (sum( ŷ.^2 ) + sum( y.^2 ) + smooth)
     Dice_loss = 1 - Dice_Coefficient
 
 [V-Net: Fully Convolutional Neural Networks for Volumetric Medical Image Segmentation](https://arxiv.org/pdf/1606.04797v1.pdf)
 """
-function dice_coeff_loss(y_pred, y_true; smooth=eltype(y_pred)(1.0))
-  intersection = sum(y_true.*y_pred)
-  return 1 - (2*intersection + smooth)/(sum(y_true.^2) + sum(y_pred.^2)+smooth)
+function dice_coeff_loss(ŷ, y; smooth = eltype(ŷ)(1.0))
+  intersection = sum(y.*ŷ)
+  return 1 - (2*intersection + smooth) / (sum(y.^2) + sum(ŷ.^2) + smooth)
 end
 
 """
-    tversky_loss(y_pred, y_true, beta = 0.7)
+    tversky_loss(ŷ, y, β = 0.7)
 
-Used with imbalanced data to give more weight to false negatives. A larger β weighs recall higher than precision (by placing more emphasis on false negatives).
+Used with imbalanced data to give more weight to false negatives.
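+
+A hedged usage sketch (the arrays and β value are invented for illustration and
+are not taken from the test suite):
+
+    ```julia
+    ŷ, y = [0.8, 0.6, 0.3], [1.0, 1.0, 0.0]
+    tversky_loss(ŷ, y)           # default β = 0.7
+    tversky_loss(ŷ, y, β = 0.9)  # penalizes false negatives more strongly
+    ```
+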
+A larger β weighs recall higher than precision (by placing more emphasis on false negatives).
+
+    tversky_loss(ŷ, y, β) = 1 - sum(|y.*ŷ| + 1) / (sum(y.*ŷ + β *(1 .- y).*ŷ + (1 - β).*y.*(1 .- ŷ))+ 1)
+
 [Tversky loss function for image segmentation using 3D fully convolutional deep networks](https://arxiv.org/pdf/1706.05721.pdf)
 """
-function tversky_loss(y_pred, y_true; beta = eltype(y_pred)(0.7))
-  intersection = sum(y_true.*y_pred)
-  return 1 - (intersection+1)/(sum(y_true.*y_pred + beta*(1 .- y_true).* y_pred + (1-beta).*y_true.*(1 .- y_pred))+1)
+function tversky_loss(ŷ, y; β = eltype(ŷ)(0.7))
+  intersection = sum(y.*ŷ)
+  return 1 - (intersection + 1) / (sum(y.* ŷ + β *(1 .- y).* ŷ + (1 - β).*y.*(1 .- ŷ)) + 1)
 end

From 92e09e204d0684258f76aac92e509aa89935b6ec Mon Sep 17 00:00:00 2001
From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Mon, 2 Mar 2020 20:33:12 +0530
Subject: [PATCH 16/17] =?UTF-8?q?Test=20argument=20consistency=20with=20?=
 =?UTF-8?q?=C5=B7=20and=20y?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test/layers/stateless.jl | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl
index 702288b6..ce940bf9 100644
--- a/test/layers/stateless.jl
+++ b/test/layers/stateless.jl
@@ -22,9 +22,9 @@ const ϵ = 1e-7
   end
 
   y = [123.0,456.0,789.0]
-  y1 = [345.0,332.0,789.0]
+  ŷ = [345.0,332.0,789.0]
   @testset "msle" begin
-    @test Flux.msle(y1, y) ≈ 0.38813985859136585
+    @test Flux.msle(ŷ, y) ≈ 0.38813985859136585
   end
 
   # Now onehot y's
@@ -65,49 +65,50 @@ const ϵ = 1e-7
   end
 
   y = [1 2 3]
-  y1 = [4.0 5.0 6.0]
+  ŷ = [4.0 5.0 6.0]
   @testset "kldivergence" begin
-    @test Flux.kldivergence(y, y1) ≈ 4.761838062403337
+    @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457
     @test Flux.kldivergence(y, y) ≈ 0
   end
 
   y = [1 2 3 4]
-  y1 = [5.0 6.0 7.0 8.0]
+  ŷ = [5.0 6.0 7.0 8.0]
   @testset "hinge" begin
-    @test Flux.hinge(y, y1) ≈ 0
+    @test Flux.hinge(ŷ, y) ≈ 0
     @test Flux.hinge(y, 0.5 .* y) ≈ 0.125
   end
 
   @testset "squared_hinge" begin
-    @test Flux.squared_hinge(y, y1) ≈ 0
+    @test Flux.squared_hinge(ŷ, y) ≈ 0
     @test Flux.squared_hinge(y, 0.5 .* y) ≈ 0.0625
   end
 
   y = [0.1 0.2 0.3]
-  y1 = [0.4 0.5 0.6]
+  ŷ = [0.4 0.5 0.6]
   @testset "poisson" begin
-    @test Flux.poisson(y, y1) ≈ 1.0160455586700767
+    @test Flux.poisson(ŷ, y) ≈ 0.6278353988097339
     @test Flux.poisson(y, y) ≈ 0.5044459776946685
   end
 
   y = [1.0 0.5 0.3 2.4]
-  y1 = [0 1.4 0.5 1.2]
+  ŷ = [0 1.4 0.5 1.2]
   @testset "dice_coeff_loss" begin
-    @test Flux.dice_coeff_loss(y, y1) ≈ 0.2799999999999999
-    @test Flux.dice_coeff_loss(y,y) ≈ 0.0
+    @test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999
+    @test Flux.dice_coeff_loss(y, y) ≈ 0.0
   end
 
   @testset "tversky_loss" begin
-    @test Flux.tversky_loss(y,y1) ≈ 0.028747433264887046
-    @test Flux.tversky_loss(y,y1,beta = 0.8) ≈ 0.050200803212851364
-    @test Flux.tversky_loss(y,y) ≈ -0.5576923076923075
+    @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383
+    @test Flux.tversky_loss(ŷ, y, β = 0.8) ≈ -0.09490740740740744
+    @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075
   end
 
   @testset "no spurious promotions" begin
     for T in (Float32, Float64)
       y = rand(T, 2)
       ŷ = rand(T, 2)
-      for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson,Flux.mae,Flux.huber_loss,Flux.msle,Flux.squared_hinge,Flux.dice_coeff_loss,Flux.tversky_loss)
+      for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson,
+                Flux.mae, Flux.huber_loss, Flux.msle, Flux.squared_hinge, Flux.dice_coeff_loss, Flux.tversky_loss)
        fwd, back = Flux.pullback(f, ŷ, y)
        @test fwd isa T
        @test eltype(back(one(T))[1]) == T

From 6e5c18bddffd447d7d6f84eb07f04724ab16a099 Mon Sep 17 00:00:00 2001
From: Adarsh Kumar <45385384+AdarshKumar712@users.noreply.github.com>
Date: Tue, 3 Mar 2020 16:02:57 +0530
Subject: [PATCH 17/17] Updated loss functions

---
 src/layers/stateless.jl | 51 +++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 30 deletions(-)

diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 5f457057..2fd98815 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -16,33 +16,31 @@ mse(ŷ, y) = sum((ŷ .- y).^2) * 1 // length(y)
 
 """
-    msle(ŷ, y; ϵ = eps.(Float64.(ŷ)))
+    msle(ŷ, y; ϵ=eps(eltype(ŷ)))
 
-Returns the mean of the squared logarithmic errors `sum((log.(ŷ + ϵ) .- log.(y + ϵ)).^2) / length(y)`.
+Returns the mean of the squared logarithmic errors `sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) / length(y)`.
 The `ϵ` term provides numerical stability.
 
 This error penalizes an under-predicted estimate more heavily than an over-predicted one.
 """
-msle(ŷ, y; ϵ = eps.(ŷ)) = sum((log.(ŷ + ϵ).-log.(y + ϵ)).^2) * 1 // length(y)
+msle(ŷ, y; ϵ=eps(eltype(ŷ))) = sum((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)).^2) * 1 // length(y)
 
 """
     huber_loss(ŷ, y; δ=1.0)
 
 Computes the mean of the Huber loss given the prediction `ŷ` and true values `y`. By default, δ is set to 1.0.
 
     | 0.5*|ŷ - y|, for |ŷ - y| <= δ
     Huber loss = |
     | δ*(|ŷ - y| - 0.5*δ), otherwise
 
 [`Huber Loss`](https://en.wikipedia.org/wiki/Huber_loss).
 """
-function huber_loss(ŷ, y; delta = eltype(ŷ)(1))
-  abs_error = abs.(ŷ.-y)
-  temp = abs_error.<
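
A minimal usage sketch of the losses this series adds. It assumes the signatures
shown in PATCH 15; the arrays are invented for illustration and none of these
calls appear in the patches themselves:

```julia
using Flux

ŷ, y = [0.9, 0.1, 0.4], [1.0, 0.0, 0.0]

Flux.mae(ŷ, y)                            # mean absolute error
Flux.msle(ŷ .+ 1, y .+ 1)                 # shifted positive so the logs are well-behaved
Flux.huber_loss(ŷ, y)                     # quadratic near zero, linear in the tails
Flux.squared_hinge([0.9, -0.8], [1, -1])  # labels are ±1
Flux.dice_coeff_loss(ŷ, y)
Flux.tversky_loss(ŷ, y)                   # β = 0.7 by default
```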