diff --git a/src/data/dataloader.jl b/src/data/dataloader.jl
index baf32a83..8868a9b0 100644
--- a/src/data/dataloader.jl
+++ b/src/data/dataloader.jl
@@ -11,7 +11,7 @@ struct DataLoader
 end
 
 """
-  DataLoader(data...; batchsize=1, shuffle=false, partial=true)
+    DataLoader(data...; batchsize=1, shuffle=false, partial=true)
 
 An object that iterates over mini-batches of `data`, each mini-batch containing `batchsize` observations
 (except possibly the last one).
diff --git a/src/data/iris.jl b/src/data/iris.jl
index d78606d8..f74e0709 100644
--- a/src/data/iris.jl
+++ b/src/data/iris.jl
@@ -28,7 +28,6 @@ function load()
 end
 
 """
-
     labels()
 
 Get the labels of the iris dataset, a 150 element array of strings listing the
@@ -53,7 +52,6 @@ function labels()
 end
 
 """
-
     features()
 
 Get the features of the iris dataset. This is a 4x150 matrix of Float64
diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index cf4496f4..f853ac23 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -6,7 +6,7 @@ const ϵ = 1e-8
 # TODO: should use weak refs
 
 """
-  Descent(η)
+    Descent(η)
 
 Classic gradient descent optimiser with learning rate `η`.
 For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`
@@ -441,17 +441,16 @@ function apply!(o::Optimiser, x, Δ)
 end
 
 """
-  InvDecay(γ)
+    InvDecay(γ)
 
 Applies inverse time decay to an optimiser, i.e., the effective step size at iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. The wrapped optimiser's step size is not modified.
-```
 
 ## Parameters
   - gamma (γ): Defaults to `0.001`
 
 ## Example
 ```julia
-  Optimiser(InvDecay(..), Opt(..))
+Optimiser(InvDecay(..), Opt(..))
 ```
 """
 mutable struct InvDecay
@@ -470,7 +469,7 @@ function apply!(o::InvDecay, x, Δ)
 end
 
 """
-  ExpDecay(eta, decay, decay_step, clip)
+    ExpDecay(eta, decay, decay_step, clip)
 
 Discount the learning rate `eta` by a multiplicative factor `decay` every `decay_step` till
 a minimum of `clip`.
@@ -483,9 +482,8 @@ Discount the learning rate `eta` by a multiplicative factor `decay` every `decay
 ## Example
 To apply exponential decay to an optimiser:
 ```julia
-  Optimiser(ExpDecay(..), Opt(..))
-
-  opt = Optimiser(ExpDecay(), ADAM())
+Optimiser(ExpDecay(..), Opt(..))
+opt = Optimiser(ExpDecay(), ADAM())
 ```
 """
 mutable struct ExpDecay
@@ -509,7 +507,7 @@ function apply!(o::ExpDecay, x, Δ)
 end
 
 """
-  WeightDecay(wd)
+    WeightDecay(wd)
 
 Decays the weight by `wd`
 
diff --git a/src/optimise/train.jl b/src/optimise/train.jl
index 79ebcc06..e12ab27b 100644
--- a/src/optimise/train.jl
+++ b/src/optimise/train.jl
@@ -3,8 +3,8 @@
 import Zygote: Params, gradient
 
 """
-  update!(opt, p, g)
-  update!(opt, ps::Params, gs)
+    update!(opt, p, g)
+    update!(opt, ps::Params, gs)
 
 Perform an update step of the parameters `ps` (or the single parameter `p`)
 according to optimizer `opt` and the gradients `gs` (the gradient `g`).
diff --git a/src/utils.jl b/src/utils.jl
index 2dba21c7..f483c5d9 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -60,7 +60,7 @@ head(x::Tuple) = reverse(Base.tail(reverse(x)))
 squeezebatch(x) = reshape(x, head(size(x)))
 
 """
-  batch(xs)
+    batch(xs)
 
 Batch the arrays in `xs` into a single array.
 
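Not part of the diff itself, but for context on how the touched docstrings (`DataLoader`, `Optimiser`/`ExpDecay`, `update!`) fit together: a minimal usage sketch, assuming Flux around 0.10/0.11; `model`, `loss`, and the random arrays below are illustrative placeholders, not code from this PR.

```julia
# Sketch only: illustrative model, loss, and data; the real names come from the user.
using Flux
using Flux.Optimise: Optimiser, ExpDecay, ADAM, update!
using Flux.Data: DataLoader

X = rand(Float32, 4, 150)    # features, one observation per column
Y = rand(Float32, 1, 150)    # targets
train_data = DataLoader(X, Y; batchsize = 10, shuffle = true)

model = Dense(4, 1)
loss(x, y) = Flux.mse(model(x), y)
ps = Flux.params(model)

# ADAM whose effective step size follows ExpDecay's schedule, as in the docstring example
opt = Optimiser(ExpDecay(), ADAM())

for (x, y) in train_data
    gs = gradient(() -> loss(x, y), ps)
    update!(opt, ps, gs)     # one optimiser step per mini-batch
end
```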