From 633f0df01fc0e200e4a03cb7f3e93f868a7d1b72 Mon Sep 17 00:00:00 2001 From: Manjunath Bhat Date: Tue, 12 Mar 2019 02:31:42 +0530 Subject: [PATCH 01/15] Added new loss functions. --- src/layers/stateless.jl | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 23fd1651..3bb48f1f 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -54,3 +54,31 @@ function normalise(x::AbstractArray, dims) Base.depwarn("`normalise(x::AbstractArray, dims)` is deprecated, use `normalise(a, dims=dims)` instead.", :normalise) normalise(x, dims = dims) end + +""" + Kullback Leibler Divergence(KL Divergence) +KLDivergence is a measure of how much one probability distribution is different from the other. +It is always non-negative and zero only when both the distributions are equal everywhere. + +""" +function KLDivergence(ŷ, y) + entropy = sum(y .* log.(y)) *1 //size(y,2) + cross_entropy = crossentropy(ŷ, y) + return entropy + cross_entropy +end + +""" + Poisson Loss function +Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. + +""" +Poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) + +""" + Logcosh Loss function +""" + +logcosh(ŷ, y) = sum(log.(cosh.(ŷ .- y))) + +Hinge(ŷ, y) = sum(max.(0.0, 1 .- ŷ .* y)) *1 // size(y,2) + From 61386c04f8ac8a6badcf8ca889169eb623b5327b Mon Sep 17 00:00:00 2001 From: Manjunath Bhat Date: Tue, 12 Mar 2019 02:36:37 +0530 Subject: [PATCH 02/15] Tests added. --- test/layers/stateless.jl | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 34abb8cb..336adc12 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -49,7 +49,31 @@ const ϵ = 1e-7 @testset "logitbinarycrossentropy" begin @test logitbinarycrossentropy.(logŷ, y) ≈ binarycrossentropy.(σ.(logŷ), y; ϵ=0) end + + y = [1 2 3] + y1 = [4.0 5.0 6.0] + @testset "KLDivergence" begin + @test Flux.KLDivergence(y, y1) ≈ 4.761838062403337 + @test Flux.KLDivergence(y, y) ≈ 0 + end + @testset "Hinge" begin + @test Flux.Hinge(y, y1) ≈ 0 + @test Flux.Hinge(y, 0.2 .* y) ≈ 0.33333 + end + + y = [0.1 0.2 0.3] + y1 = [0.4 0.5 0.6] + @testset "Poisson" begin + @test Flux.Poisson(y, y1) ≈ 1.0160455586700767 + @test Flux.Poisson(y, y) ≈ 0.5044459776946685 + end + + @testset "logcosh" begin + @test Flux.logcosh(y, y1) ≈ 0.13302230977782092 + @test Flux.logcosh(y, y) ≈ 0 + end + @testset "no spurious promotions" begin for T in (Float16, Float32, Float64) y = rand(T, 2) From 57a52e33750c9f8afcf7a8937abbbee766419121 Mon Sep 17 00:00:00 2001 From: Manjunath Bhat Date: Tue, 12 Mar 2019 02:58:32 +0530 Subject: [PATCH 03/15] Error of recurrent decimals fixed. 
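The values asserted in the tests can be reproduced outside of Flux with the same formulas the first patch introduces. The sketch below uses standalone re-implementations for illustration only (they mirror the added definitions but are not the library source; `crossentropy` here is a simplified stand-in computing `-sum(y .* log.(ŷ))` averaged over the second dimension):

```julia
# Illustrative, standalone re-implementations of the losses added in PATCH 01.
crossentropy(ŷ, y) = -sum(y .* log.(ŷ)) * 1 // size(y, 2)
kldivergence(ŷ, y) = sum(y .* log.(y)) * 1 // size(y, 2) + crossentropy(ŷ, y)
poisson(ŷ, y)      = sum(ŷ .- y .* log.(ŷ)) * 1 // size(y, 2)
hinge(ŷ, y)        = sum(max.(0, 1 .- ŷ .* y)) * 1 // size(y, 2)

ŷ, y = [1 2 3], [4.0 5.0 6.0]
kldivergence(ŷ, y)   # ≈ 4.761838062403337, the value asserted in the tests
kldivergence(y, y)   # ≈ 0: the divergence vanishes when the two distributions coincide

ŷ = [1 2 3 4]
hinge(ŷ, 0.5 .* ŷ)   # sum of margins is 0.5, averaged over 4 columns: 0.125
```

Switching the hinge test to a four-element target makes the averaged loss an exact binary fraction (0.125) instead of the recurring decimal 1/3, which `≈` cannot match against the truncated literal `0.33333`.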
--- test/layers/stateless.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 336adc12..f961ed2f 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -56,10 +56,12 @@ const ϵ = 1e-7 @test Flux.KLDivergence(y, y1) ≈ 4.761838062403337 @test Flux.KLDivergence(y, y) ≈ 0 end - + + y = [1 2 3 4] + y1 = [5.0 6.0 7.0 8.0] @testset "Hinge" begin @test Flux.Hinge(y, y1) ≈ 0 - @test Flux.Hinge(y, 0.2 .* y) ≈ 0.33333 + @test Flux.Hinge(y, 0.5 .* y) ≈ 0.125 end y = [0.1 0.2 0.3] From c4d12e57fe6a3ea0473e5fa6145d1d55789c9358 Mon Sep 17 00:00:00 2001 From: thebhatman Date: Tue, 26 Mar 2019 03:09:48 +0530 Subject: [PATCH 04/15] Loss function names in lowercase --- src/layers/stateless.jl | 9 +++------ test/layers/stateless.jl | 18 +++++++++--------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 3bb48f1f..424db1df 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -59,9 +59,8 @@ end Kullback Leibler Divergence(KL Divergence) KLDivergence is a measure of how much one probability distribution is different from the other. It is always non-negative and zero only when both the distributions are equal everywhere. - """ -function KLDivergence(ŷ, y) +function kldivergence(ŷ, y) entropy = sum(y .* log.(y)) *1 //size(y,2) cross_entropy = crossentropy(ŷ, y) return entropy + cross_entropy @@ -70,15 +69,13 @@ end """ Poisson Loss function Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. - """ -Poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) +poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ Logcosh Loss function """ - logcosh(ŷ, y) = sum(log.(cosh.(ŷ .- y))) -Hinge(ŷ, y) = sum(max.(0.0, 1 .- ŷ .* y)) *1 // size(y,2) +hinge(ŷ, y) = sum(max.(0.0, 1 .- ŷ .* y)) *1 // size(y,2) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index f961ed2f..97bfea10 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -52,23 +52,23 @@ const ϵ = 1e-7 y = [1 2 3] y1 = [4.0 5.0 6.0] - @testset "KLDivergence" begin - @test Flux.KLDivergence(y, y1) ≈ 4.761838062403337 - @test Flux.KLDivergence(y, y) ≈ 0 + @testset "kldivergence" begin + @test Flux.kldivergence(y, y1) ≈ 4.761838062403337 + @test Flux.kldivergence(y, y) ≈ 0 end y = [1 2 3 4] y1 = [5.0 6.0 7.0 8.0] - @testset "Hinge" begin - @test Flux.Hinge(y, y1) ≈ 0 - @test Flux.Hinge(y, 0.5 .* y) ≈ 0.125 + @testset "hinge" begin + @test Flux.hinge(y, y1) ≈ 0 + @test Flux.hinge(y, 0.5 .* y) ≈ 0.125 end y = [0.1 0.2 0.3] y1 = [0.4 0.5 0.6] - @testset "Poisson" begin - @test Flux.Poisson(y, y1) ≈ 1.0160455586700767 - @test Flux.Poisson(y, y) ≈ 0.5044459776946685 + @testset "poisson" begin + @test Flux.poisson(y, y1) ≈ 1.0160455586700767 + @test Flux.poisson(y, y) ≈ 0.5044459776946685 end @testset "logcosh" begin From 6f078857beda49e7f1d565cc7e4dded6c55db3d0 Mon Sep 17 00:00:00 2001 From: thebhatman Date: Tue, 26 Mar 2019 03:15:28 +0530 Subject: [PATCH 05/15] Added reference links to loss functions --- src/layers/stateless.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 424db1df..aaefcee9 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -59,6 +59,7 @@ end Kullback Leibler Divergence(KL Divergence) KLDivergence is a measure of how much one probability distribution is different from the other. 
It is always non-negative and zero only when both the distributions are equal everywhere. +https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence """ function kldivergence(ŷ, y) entropy = sum(y .* log.(y)) *1 //size(y,2) @@ -69,6 +70,7 @@ end """ Poisson Loss function Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. +https://isaacchanghau.github.io/post/loss_functions/ """ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) From 930adb122dc5443f205ced401b5275ddbeeb67ca Mon Sep 17 00:00:00 2001 From: Manjunath Bhat Date: Mon, 25 Mar 2019 23:43:06 +0530 Subject: [PATCH 06/15] Avoided promotion to Float64 in hinge. --- src/layers/stateless.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index aaefcee9..3221ddff 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -79,5 +79,5 @@ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ logcosh(ŷ, y) = sum(log.(cosh.(ŷ .- y))) -hinge(ŷ, y) = sum(max.(0.0, 1 .- ŷ .* y)) *1 // size(y,2) +hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) From 4efcc69ba5de4f68f5e0e0dc474b44ddf9388615 Mon Sep 17 00:00:00 2001 From: thebhatman Date: Tue, 26 Mar 2019 23:23:02 +0530 Subject: [PATCH 07/15] logcosh averaged --- src/layers/stateless.jl | 2 +- test/layers/stateless.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 3221ddff..6b6abb5e 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -77,7 +77,7 @@ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ Logcosh Loss function """ -logcosh(ŷ, y) = sum(log.(cosh.(ŷ .- y))) +logcosh(ŷ, y) = sum(log.(cosh.(ŷ .- y))) *1 // size(y,2) hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 97bfea10..e8d881fb 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -72,7 +72,7 @@ const ϵ = 1e-7 end @testset "logcosh" begin - @test Flux.logcosh(y, y1) ≈ 0.13302230977782092 + @test Flux.logcosh(y, y1) ≈ 0.044340769925940306 @test Flux.logcosh(y, y) ≈ 0 end From b84ab7ac95aa1eca3ec302bc7f997518b3e71612 Mon Sep 17 00:00:00 2001 From: thebhatman Date: Fri, 5 Apr 2019 03:16:54 +0530 Subject: [PATCH 08/15] Removed logcosh --- src/layers/stateless.jl | 5 ----- test/layers/stateless.jl | 5 ----- 2 files changed, 10 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 6b6abb5e..3444f0f4 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -74,10 +74,5 @@ https://isaacchanghau.github.io/post/loss_functions/ """ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) -""" - Logcosh Loss function -""" -logcosh(ŷ, y) = sum(log.(cosh.(ŷ .- y))) *1 // size(y,2) - hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index e8d881fb..d912a5fe 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -70,11 +70,6 @@ const ϵ = 1e-7 @test Flux.poisson(y, y1) ≈ 1.0160455586700767 @test Flux.poisson(y, y) ≈ 0.5044459776946685 end - - @testset "logcosh" begin - @test Flux.logcosh(y, y1) ≈ 0.044340769925940306 - @test Flux.logcosh(y, y) ≈ 0 - end @testset "no spurious promotions" begin for T in (Float16, Float32, Float64) From 710084ffbfca78805d8c0fe41be8e9dbb58b3c4f Mon Sep 17 00:00:00 2001 From: thebhatman Date: Fri, 5 Apr 2019 23:50:16 +0530 Subject: [PATCH 09/15] 
Loss functions added to docs --- docs/src/training/training.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/src/training/training.md b/docs/src/training/training.md index ae483783..76c099eb 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -32,6 +32,18 @@ Flux.train!(loss, ps, data, opt) The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want. +In-built loss functions: +```@docs +mse +crossentropy +logitcrossentropy +binarycrossentropy +logitbinarycrossentropy +kldivergence +poisson +hinge +``` + ## Datasets The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point: From ec35e9cbaa31bcdb37857c5bb39bbbfc22379e4e Mon Sep 17 00:00:00 2001 From: thebhatman Date: Mon, 30 Sep 2019 21:02:13 +0530 Subject: [PATCH 10/15] Loss functions docs added in layers.md --- docs/src/models/layers.md | 12 ++++++++++++ docs/src/training/training.md | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index f2bd8046..c439581c 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -66,3 +66,15 @@ AlphaDropout LayerNorm GroupNorm ``` + +## In-built loss functions: +```@docs +mse +crossentropy +logitcrossentropy +binarycrossentropy +logitbinarycrossentropy +kldivergence +poisson +hinge +``` \ No newline at end of file diff --git a/docs/src/training/training.md b/docs/src/training/training.md index cba1422c..679bbd0b 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -32,18 +32,6 @@ Flux.train!(loss, ps, data, opt) The objective will almost always be defined in terms of some *cost function* that measures the distance of the prediction `m(x)` from the target `y`. Flux has several of these built in, like `mse` for mean squared error or `crossentropy` for cross entropy loss, but you can calculate it however you want. -In-built loss functions: -```@docs -mse -crossentropy -logitcrossentropy -binarycrossentropy -logitbinarycrossentropy -kldivergence -poisson -hinge -``` - ## Datasets The `data` argument provides a collection of data to train with (usually a set of inputs `x` and target outputs `y`). For example, here's a dummy data set with only one data point: From ec886c8ce864721b4144cb749c458b3410c67946 Mon Sep 17 00:00:00 2001 From: thebhatman Date: Thu, 3 Oct 2019 21:13:09 +0530 Subject: [PATCH 11/15] Added docstring for hinge loss --- src/layers/stateless.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index c3dd22b0..8cdac33d 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -50,11 +50,6 @@ function normalise(x::AbstractArray; dims=1) return (x .- μ′) ./ σ′ end -function normalise(x::AbstractArray, dims) - Base.depwarn("`normalise(x::AbstractArray, dims)` is deprecated, use `normalise(a, dims=dims)` instead.", :normalise) - normalise(x, dims = dims) -end - """ Kullback Leibler Divergence(KL Divergence) KLDivergence is a measure of how much one probability distribution is different from the other. 
@@ -74,4 +69,8 @@ https://isaacchanghau.github.io/post/loss_functions/ """ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) +""" + Hinge Loss function +Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). This is usually used for measuring whether two inputs are similar or dissimilar +""" hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) From 96a23c295c88454770dd5d5a961fec4d1898dcb0 Mon Sep 17 00:00:00 2001 From: thebhatman Date: Wed, 9 Oct 2019 14:53:03 +0530 Subject: [PATCH 12/15] Changes to docs --- docs/src/models/layers.md | 2 +- src/layers/stateless.jl | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index c75c77b7..0007853a 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -66,7 +66,7 @@ LayerNorm GroupNorm ``` -## In-built loss functions: +## Loss functions: ```@docs mse crossentropy diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 8cdac33d..4e142f07 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -51,10 +51,10 @@ function normalise(x::AbstractArray; dims=1) end """ - Kullback Leibler Divergence(KL Divergence) + kldivergence(ŷ, y) KLDivergence is a measure of how much one probability distribution is different from the other. It is always non-negative and zero only when both the distributions are equal everywhere. -https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence +[KL Divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence). """ function kldivergence(ŷ, y) entropy = sum(y .* log.(y)) *1 //size(y,2) @@ -63,14 +63,15 @@ function kldivergence(ŷ, y) end """ - Poisson Loss function + poisson(ŷ, y) Poisson loss function is a measure of how the predicted distribution diverges from the expected distribution. -https://isaacchanghau.github.io/post/loss_functions/ +[Poisson Loss](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). """ poisson(ŷ, y) = sum(ŷ .- y .* log.(ŷ)) *1 // size(y,2) """ - Hinge Loss function -Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). This is usually used for measuring whether two inputs are similar or dissimilar + hinge(ŷ, y) +Measures the loss given the prediction ŷ and true labels y(containing 1 or -1). +[Hinge Loss](https://en.wikipedia.org/wiki/Hinge_loss). 
""" hinge(ŷ, y) = sum(max.(0, 1 .- ŷ .* y)) *1 // size(y,2) From d591b2b59eba2ec360a0836184632d9da8f8dc8f Mon Sep 17 00:00:00 2001 From: thebhatman Date: Wed, 9 Oct 2019 21:36:40 +0530 Subject: [PATCH 13/15] Removed colon and capitalised --- docs/src/models/layers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 0007853a..227abe31 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -66,7 +66,7 @@ LayerNorm GroupNorm ``` -## Loss functions: +## Loss Functions ```@docs mse crossentropy From 8a93be8c6c9d4686c63284153d9cf8cf07f376a1 Mon Sep 17 00:00:00 2001 From: Manjunath Bhat Date: Mon, 9 Dec 2019 20:39:46 +0530 Subject: [PATCH 14/15] Change loss to cost --- docs/src/models/layers.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 227abe31..5f2ab3ce 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -66,7 +66,7 @@ LayerNorm GroupNorm ``` -## Loss Functions +## Cost Functions ```@docs mse crossentropy @@ -76,4 +76,4 @@ logitbinarycrossentropy kldivergence poisson hinge -``` \ No newline at end of file +``` From 747e01ea024134b09fdf64fe83c38fb71fe98536 Mon Sep 17 00:00:00 2001 From: Manjunath Bhat Date: Mon, 13 Jan 2020 18:33:30 +0530 Subject: [PATCH 15/15] Test to check for spurious promotions --- test/layers/stateless.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 87c495f1..7cb8ed2e 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -75,7 +75,7 @@ const ϵ = 1e-7 for T in (Float32, Float64) y = rand(T, 2) ŷ = rand(T, 2) - for f in (mse, crossentropy, logitcrossentropy) + for f in (mse, crossentropy, logitcrossentropy, Flux.kldivergence, Flux.hinge, Flux.poisson) fwd, back = Flux.pullback(f, ŷ, y) @test fwd isa T @test eltype(back(one(T))[1]) == T