From 9b7fc12310157948b3a87540bee2d34a93bda2ea Mon Sep 17 00:00:00 2001
From: autodocs
Date: Thu, 2 Feb 2017 08:19:53 +0000
Subject: [PATCH] build based on ba22a5c

---
 latest/contributing.html     |   2 +-
 latest/examples/logreg.html  |   2 +-
 latest/index.html            |   2 +-
 latest/internals.html        |   2 +-
 latest/models/basics.html    |   2 +-
 latest/models/debugging.html |   2 +-
 latest/models/recurrent.html | 148 ++++++++++++++++++++++++++++++++++-
 latest/models/templates.html |   2 +-
 latest/search_index.js       |   2 +-
 9 files changed, 154 insertions(+), 10 deletions(-)

diff --git a/latest/models/recurrent.html b/latest/models/recurrent.html
index 3dd83c8f..fefeef68 100644
--- a/latest/models/recurrent.html
+++ b/latest/models/recurrent.html
@@ -127,7 +127,151 @@ Recurrent Models

Recurrence

Recurrence is a first-class feature in Flux, and recurrent models are very easy to build and use. Recurrences are often illustrated as cycles or self-dependencies in the graph; they can also be thought of as a hidden output from / input to the network. For example, for a sequence of inputs x1, x2, x3 ... we produce predictions as follows:

y1 = f(W, x1) # `f` is the model, `W` represents the parameters
y2 = f(W, x2)
y3 = f(W, x3)
...

Each evaluation is independent, and the prediction made for a given input will always be the same. That makes a lot of sense for, say, MNIST images, but less sense when predicting a sequence. For that case we introduce the hidden state:

y1, s = f(W, x1, s)
y2, s = f(W, x2, s)
y3, s = f(W, x3, s)
...

The state s allows the prediction to depend not only on the current input x but also on the history of past inputs.
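
In symbols, each step computes (y_t, s_t) = f(W, x_t, s_{t-1}); the state produced at one step is fed back in at the next, so information can propagate along the whole sequence.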

The simplest recurrent network looks as follows in Flux, and it should be familiar if you've seen the equations defining an RNN before:

@net type Recurrent
  Wxy; Wyy; by
  y
  function (x)
    y = tanh( x * Wxy + y{-1} * Wyy + by )
  end
end

The only difference from a regular feed-forward layer is that we create a variable y which is defined as depending on itself. The y{-1} syntax means "take the value of y from the previous run of the network".
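
Written as an equation, each run of the layer computes the standard RNN update

  y_t = tanh(x_t Wxy + y_{t-1} Wyy + by)

with y_{t-1} playing the role of y{-1}.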

Using recurrent layers is straightforward, and no different from using feed-forward ones in terms of Chain and friends. For example:

model = Chain(
    Affine(784, 20), σ,
    Recurrent(20, 30),
    Recurrent(30, 15))
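
Recurrent(20, 30) above assumes a convenience constructor for the layer, in the style of the Encoder and Decoder constructors further down the page. A minimal sketch of what such a constructor could look like; the initialisation scheme here is illustrative, not necessarily Flux's:

Recurrent(in::Integer, out::Integer) =
  Recurrent(param(randn(in, out)), param(randn(out, out)),  # Wxy, Wyy
            param(zeros(1, out)),                           # by
            param(zeros(1, out)))                           # initial value of y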

Before using the model we need to unroll it. This happens with the unroll function:

unroll(model, 20)

This call creates an unrolled, feed-forward version of the model which accepts N (= 20) inputs and generates N predictions at a time. Essentially, the model is replicated N times and Flux ties the hidden outputs y of each copy to the hidden inputs of the next.
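
Conceptually, the unrolled network behaves like the following sketch, written in terms of the f(W, x, s) notation from above (unrolled is a hypothetical helper for illustration, not Flux's implementation):

function unrolled(f, W, xs, s)
  ys = []
  for x in xs
    y, s = f(W, x, s)  # the state from each step is fed into the next
    push!(ys, y)
  end
  return ys
end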

Here's a more complex recurrent layer, an LSTM, and again it should be familiar if you've seen the equations:

@net type LSTM
  Wxf; Wyf; bf
  Wxi; Wyi; bi
  Wxo; Wyo; bo
  Wxc; Wyc; bc
  y; state
  function (x)
    # Gates
    forget = σ( x * Wxf + y{-1} * Wyf + bf )
    input  = σ( x * Wxi + y{-1} * Wyi + bi )
    output = σ( x * Wxo + y{-1} * Wyo + bo )
    # State update and output
    state′ = tanh( x * Wxc + y{-1} * Wyc + bc )
    state  = forget .* state{-1} + input .* state′
    y = output .* tanh(state)
  end
end

The only unfamiliar part is that we have to define all of the parameters of the LSTM upfront, which adds a few lines at the beginning.
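
Written out, the body above is a direct transcription of the standard LSTM updates, with state playing the role of the cell c and ⊙ (.*) denoting elementwise multiplication:

f_t = σ(x_t Wxf + y_{t-1} Wyf + bf)
i_t = σ(x_t Wxi + y_{t-1} Wyi + bi)
o_t = σ(x_t Wxo + y_{t-1} Wyo + bo)
c̃_t = tanh(x_t Wxc + y_{t-1} Wyc + bc)
c_t = f_t ⊙ c_{t-1} + i_t ⊙ c̃_t
y_t = o_t ⊙ tanh(c_t)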

Flux's mathematical notation generalises well to more complex models. For example, this neural translation model with alignment can be fairly straightforwardly, and recognisably, translated from the paper into Flux code:

# A recurrent model which takes a token and returns a context-dependent
# annotation.

@net type Encoder
  forward
  backward
  token -> hcat(forward(token), backward(token))
end

Encoder(in::Integer, out::Integer) =
  Encoder(LSTM(in, out÷2), flip(LSTM(in, out÷2)))

# A recurrent model which takes a sequence of annotations, attends, and returns
# a predicted output token.

@net type Decoder
  attend
  recur
  state; y; N
  function (anns)
    energies = map(ann -> exp(attend(hcat(state{-1}, ann))[1]), seq(anns, N))
    weights = energies./sum(energies)
    ctx = sum(map((α, ann) -> α .* ann, weights, anns))
    (_, state), y = recur((state{-1}, y{-1}), ctx)
    y
  end
end

Decoder(in::Integer, out::Integer; N = 1) =
  Decoder(Affine(in+out, 1),
          unroll1(LSTM(in, out)),
          param(zeros(1, out)), param(zeros(1, out)), N)

# The model

Nalpha  =  5 # The size of the input token vector
Nphrase =  7 # The length of (padded) phrases
Nhidden = 12 # The size of the hidden state

encode = Encoder(Nalpha, Nhidden)
decode = Chain(Decoder(Nhidden, Nhidden, N = Nphrase), Affine(Nhidden, Nalpha), softmax)

model = Chain(
  unroll(encode, Nphrase, stateful = false),
  unroll(decode, Nphrase, stateful = false, seq = false))
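
In the Encoder, forward and backward read the token sequence in opposite directions, so each annotation carries context from both sides. flip can be thought of as reversing the sequence on the way into and out of the wrapped model. A hedged one-line sketch of the idea (not necessarily Flux's definition):

flip(model) = xs -> reverse(model(reverse(xs)))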

Note that this model exercises some of the more advanced parts of the compiler and isn't stable for general use yet.