diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 6b2ea0ab..00000000 --- a/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*.jl.cov -*.jl.*.cov -*.jl.mem -docs/build/ -docs/site/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 2a9a22ce..00000000 --- a/.travis.yml +++ /dev/null @@ -1,17 +0,0 @@ -# Documentation: http://docs.travis-ci.com/user/languages/julia/ -language: julia -os: - - linux - - osx -julia: - - 0.5 -notifications: - email: false -# uncomment the following lines to override the default test script -script: - - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi - - julia -e 'Pkg.clone("https://github.com/MikeInnes/DataFlow.jl")' - - julia -e 'Pkg.clone(pwd()); Pkg.build("Flux"); Pkg.test("Flux"; coverage=true)' -# after_success: - - julia -e 'Pkg.add("Documenter")' - - julia -e 'cd(Pkg.dir("Flux")); include(joinpath("docs", "make.jl"))' diff --git a/LICENSE.md b/LICENSE.md deleted file mode 100644 index 60fd1522..00000000 --- a/LICENSE.md +++ /dev/null @@ -1,22 +0,0 @@ -The Flux.jl package is licensed under the MIT "Expat" License: - -> Copyright (c) 2016: Mike Innes. -> -> Permission is hereby granted, free of charge, to any person obtaining -> a copy of this software and associated documentation files (the -> "Software"), to deal in the Software without restriction, including -> without limitation the rights to use, copy, modify, merge, publish, -> distribute, sublicense, and/or sell copies of the Software, and to -> permit persons to whom the Software is furnished to do so, subject to -> the following conditions: -> -> The above copyright notice and this permission notice shall be -> included in all copies or substantial portions of the Software. -> -> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -> IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -> CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -> TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -> SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md deleted file mode 100644 index 9d4f0bcf..00000000 --- a/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# Флукс - -[![Build Status](https://travis-ci.org/MikeInnes/Flux.jl.svg?branch=master)](https://travis-ci.org/MikeInnes/Flux.jl) - -Flux is a high-level API for machine learning, implemented in Julia. - -Flux aims to provide a concise and expressive syntax for architectures that are hard to express within other frameworks. The notation should be familiar and extremely close to what you'd find in a paper or description of the model. - -The current focus is on ANNs with TensorFlow or MXNet as a backend. While it's in a very early working-prototype stage, you can see what works so far in the [examples folder](/examples). 
- -## Brief Examples - -Simple multi-layer-perceptron for MNIST: - -```julia -Chain( - Input(784), - Affine(128), relu, - Affine( 64), relu, - Affine( 10), softmax) -``` - -LSTM example: - -```julia -@net type LSTM - Wxf; Wyf; bf - Wxi; Wyi; bi - Wxo; Wyo; bo - Wxc; Wyc; bc - y; state - function (x) - # Gates - forget = σ( x * Wxf + y{-1} * Wyf + bf ) - input = σ( x * Wxi + y{-1} * Wyi + bi ) - output = σ( x * Wxo + y{-1} * Wyo + bo ) - # State update and output - state′ = tanh( x * Wxc + y{-1} * Wyc + bc ) - state = forget .* state{-1} + input .* state′ - y = output .* tanh(state) - end -end - -Chain( - Input(N), - LSTM(N, 256), - LSTM(256, 256), - Affine(256, N), - softmax) -``` diff --git a/REQUIRE b/REQUIRE deleted file mode 100644 index d0ddfa05..00000000 --- a/REQUIRE +++ /dev/null @@ -1,3 +0,0 @@ -julia 0.5- -TensorFlow -Iterators diff --git a/docs/make.jl b/docs/make.jl deleted file mode 100644 index 9ef67abd..00000000 --- a/docs/make.jl +++ /dev/null @@ -1,17 +0,0 @@ -using Documenter, Flux - -makedocs(modules=Module[Flux], - doctest=false, clean=true, - format = :html, - sitename="Flux Documentation", - pages = [ - "Home" => "index.md", - ]) - -deploydocs( - repo = "github.com/MikeInnes/Flux.jl.git", - target = "build", - osname = "linux", - julia = "0.5", - deps = nothing, - make = nothing) diff --git a/docs/src/index.md b/docs/src/index.md deleted file mode 100644 index 23a30b30..00000000 --- a/docs/src/index.md +++ /dev/null @@ -1 +0,0 @@ -# Flux diff --git a/examples/MNIST.jl b/examples/MNIST.jl deleted file mode 100644 index 70d26ee1..00000000 --- a/examples/MNIST.jl +++ /dev/null @@ -1,22 +0,0 @@ -using Flux, MNIST - -data = [(Vector{Float32}(trainfeatures(i)), onehot(Float32, trainlabel(i), 0:9)) for i = 1:60_000] -train = data[1:50_000] -test = data[50_001:60_000] - -m = Chain( - Input(784), - Affine(128), relu, - Affine( 64), relu, - Affine( 10), softmax) - -# Convert to TensorFlow -model = tf(m) - -# An example prediction pre-training -model(data[1][1]) - -@time Flux.train!(model, train, test, η = 1e-3) - -# An example prediction post-training -model(data[1][1]) diff --git a/examples/batching.jl b/examples/batching.jl deleted file mode 100644 index ccbac02f..00000000 --- a/examples/batching.jl +++ /dev/null @@ -1,26 +0,0 @@ -using Flux - -# Traditional Approach - -# 100 samples of sequences of 15 28×28 3-colour images -rand(100, 15, 28, 28, 3) - -# Basic Batching - -data = Batch([collect(reshape(9(i-1):9i-1, 3, 3)) for i = 1:10]) - -Batch(flatten.(data)) - -data |> structure - -Batch(flatten.(data)) |> structure - -# Nested Batching - -# DNA seqence, encoded as a list of [A, T, G, C] -x1 = Seq([[0,1,0,0], [1,0,0,0], [0,0,0,1]]) -x2 = Seq([[0,0,1,0], [0,0,0,1], [0,0,1,0]]) - -data = Batch([x1, x2]) - -data |> structure diff --git a/examples/char-rnn.jl b/examples/char-rnn.jl deleted file mode 100644 index 0bff81cf..00000000 --- a/examples/char-rnn.jl +++ /dev/null @@ -1,38 +0,0 @@ -using Flux -import StatsBase: wsample - -nunroll = 50 -nbatch = 50 - -getseqs(chars, alphabet) = sequences((onehot(Float32, char, alphabet) for char in chars), nunroll) -getbatches(chars, alphabet) = batches((getseqs(part, alphabet) for part in chunk(chars, nbatch))...) 
- -input = readstring("$(homedir())/Downloads/shakespeare_input.txt") -alphabet = unique(input) -N = length(alphabet) - -Xs, Ys = getbatches(input, alphabet), getbatches(input[2:end], alphabet) - -model = Chain( - Input(N), - LSTM(N, 256), - LSTM(256, 256), - Affine(256, N), - softmax) - -m = tf(unroll(model, nunroll)) - -@time Flux.train!(m, Xs, Ys, η = 0.1, epoch = 1) - -string(map(c -> onecold(c, alphabet), m(first(first(Xs))))...) - -function sample(model, n, temp = 1) - s = [rand(alphabet)] - m = tf(unroll(model, 1)) - for i = 1:n - push!(s, wsample(alphabet, softmax(m(Seq((onehot(Float32, s[end], alphabet),)))[1]./temp))) - end - return string(s...) -end - -sample(model, 100) diff --git a/examples/integration.jl b/examples/integration.jl deleted file mode 100644 index 704446e8..00000000 --- a/examples/integration.jl +++ /dev/null @@ -1,70 +0,0 @@ -using Flux, Juno - -# Flux aims to provide high-level APIs that work well across backends, but in -# some cases you may want to take advantage of features specific to a given -# backend (or Flux may simply not have an implementation of that feature yet). -# In these cases it's easy to "drop down" and use the backend's API directly, -# where appropriate. - -# In this example, both things are happening; firstly, Flux doesn't yet support -# ConvNets in the pure-Julia backend, but this is invisible thanks to the use of -# a simple "shim" type, `Conv2D`. This is provided by the library but could easily -# have been user-defined. - -# Secondly, we want to take advantage of TensorFlow.jl's training process and -# optimisers. We can simply call `Tensor` exactly as we would on a regular -# TensorFlow model, and the rest of the process trivially follows -# TensorFlow.jl's usual API. - -conv1 = Chain( - Reshape(28,28,1), - Conv2D((5,5), out = 20), tanh, - MaxPool((2,2), stride = (2,2))) - -conv2 = Chain( - Input(12,12,20), - Conv2D((5,5), in = 20, out = 50), tanh, - MaxPool((2,2), stride = (2,2))) - -lenet = Chain( - conv1, conv2, flatten, - Affine(500), tanh, - Affine(10), softmax) - -#-------------------------------------------------------------------------------- - -# Now we can continue exactly as in plain TensorFlow, following -# https://github.com/malmaud/TensorFlow.jl/blob/master/examples/mnist_full.jl -# (taking only the training and cost logic, not the graph building steps) - -using TensorFlow, Distributions - -include(Pkg.dir("TensorFlow", "examples", "mnist_loader.jl")) -loader = DataLoader() - -session = Session(Graph()) - -x = placeholder(Float32) -y′ = placeholder(Float32) -y = Tensor(lenet, x) - -cross_entropy = reduce_mean(-reduce_sum(y′.*log(y), reduction_indices=[2])) - -train_step = train.minimize(train.AdamOptimizer(1e-4), cross_entropy) - -accuracy = reduce_mean(cast(indmax(y, 2) .== indmax(y′, 2), Float32)) - -run(session, initialize_all_variables()) - -@progress for i in 1:1000 - batch = next_batch(loader, 50) - if i%100 == 1 - train_accuracy = run(session, accuracy, Dict(x=>batch[1], y′=>batch[2])) - info("step $i, training accuracy $train_accuracy") - end - run(session, train_step, Dict(x=>batch[1], y′=>batch[2])) -end - -testx, testy = load_test_set() -test_accuracy = run(session, accuracy, Dict(x=>testx, y′=>testy)) -info("test accuracy $test_accuracy") diff --git a/examples/translation.jl b/examples/translation.jl deleted file mode 100644 index df2559cb..00000000 --- a/examples/translation.jl +++ /dev/null @@ -1,52 +0,0 @@ -# Based on https://arxiv.org/abs/1409.0473 - -using Flux -using Flux: flip - -# A recurrent model 
which takes a token and returns a context-dependent -# annotation. - -@net type Encoder - forward - backward - token -> hcat(forward(token), backward(token)) -end - -Encoder(in::Integer, out::Integer) = - Encoder(LSTM(in, out÷2), flip(LSTM(in, out÷2))) - -# A recurrent model which takes a sequence of annotations, attends, and returns -# a predicted output token. - -@net type Decoder - attend - recur - state; y; N - function (anns) - energies = map(ann -> exp(attend(hcat(state{-1}, ann))[1]), seq(anns, N)) - weights = energies./sum(energies) - ctx = sum(map((α, ann) -> α .* ann, weights, anns)) - (_, state), y = recur((state{-1},y{-1}), ctx) - y - end -end - -Decoder(in::Integer, out::Integer; N = 1) = - Decoder(Affine(in+out, 1), - unroll1(LSTM(in, out)), - param(zeros(1, out)), param(zeros(1, out)), N) - -# The model - -Nalpha = 5 # The size of the input token vector -Nphrase = 7 # The length of (padded) phrases -Nhidden = 12 # The size of the hidden state - -encode = Encoder(Nalpha, Nhidden) -decode = Chain(Decoder(Nhidden, Nhidden, N = Nphrase), Affine(Nhidden, Nalpha), softmax) - -model = Chain( - unroll(encode, Nphrase, stateful = false), - unroll(decode, Nphrase, stateful = false, seq = false)) - -xs = Batch([Seq(rand(Float32, Nalpha) for _ = 1:Nphrase)]) diff --git a/src/Flux.jl b/src/Flux.jl deleted file mode 100644 index 1df87404..00000000 --- a/src/Flux.jl +++ /dev/null @@ -1,36 +0,0 @@ -module Flux - -using MacroTools, Lazy, DataFlow, Juno -using DataFlow: graphm, syntax, prewalk!, postwalk!, prewalk, postwalk, - iscyclic, Constant, constant, isconstant, group, Split, splitnode, - detuple, value, inputs, thread!, value, inputs, Split, splitnode, inputnode, - spliceinputs, bumpinputs, Frame -using Juno: Tree, Row - -# Zero Flux Given - -include("model.jl") -include("utils.jl") -include("data.jl") - -include("compiler/code.jl") -include("compiler/loops.jl") -include("compiler/interp.jl") -include("compiler/shape.jl") - -include("layers/affine.jl") -include("layers/activation.jl") -include("layers/recurrent.jl") -include("layers/shape.jl") -include("layers/chain.jl") -include("layers/shims.jl") - -include("dims/catmat.jl") -include("dims/batching.jl") -include("dims/seq.jl") - -include("cost.jl") - -include("backend/backend.jl") - -end # module diff --git a/src/backend/backend.jl b/src/backend/backend.jl deleted file mode 100644 index 372148f9..00000000 --- a/src/backend/backend.jl +++ /dev/null @@ -1,11 +0,0 @@ -export tf - -function loadtf() - isdefined(Flux, :TF) && return - @eval include(joinpath(dirname($@__FILE__), "tensorflow/tensorflow.jl")) -end - -function tf(args...) - loadtf() - TF.tf(args...) -end diff --git a/src/backend/tensorflow/graph.jl b/src/backend/tensorflow/graph.jl deleted file mode 100644 index bd618223..00000000 --- a/src/backend/tensorflow/graph.jl +++ /dev/null @@ -1,72 +0,0 @@ -using Base: @get! -using DataFlow: Constant, constant, Context, interpret, Split, - interpv, ituple, ilambda, iconst, iline, stack, mux -using Flux: imap -using TensorFlow: RawTensor - -# TODO: implement Julia's type promotion rules - -node(x::Tuple) = map(node, x) -node(x::Tensor) = x -node(x::Variable) = x -node(x::Number) = TensorFlow.constant(Float32(x)) - -graph(::typeof(tuple), args...) = (args...,) -graph(s::Split, t::Tuple) = t[s.n] -graph(::typeof(softmax), x) = nn.softmax(x) -graph(::typeof(relu), x) = nn.relu(x) -graph(::typeof(σ), x) = nn.sigmoid(x) -graph(::typeof(hcat), xs...) 
= concat(1, xs) -graph(::typeof(seq), xs, n) = TensorFlow.unpack(xs, num = n, axis = 1) - -for op in (tanh, *, .*, +, -) - @eval graph(::typeof($op), args...) = $op(node(args)...) -end - -graph(::typeof(.-), args...) = -(node(args)...) - -# reshape hack due to https://github.com/malmaud/TensorFlow.jl/issues/79 -batchsize(x::Tensor) = reduce_sum(slice(TensorFlow.shape(x), [0], [1])) -graph(::typeof(flatten), x) = reshape(x, pack([batchsize(x), Int32(-1)])) -graph(r::Reshape, x) = reshape(x, pack([batchsize(x), map(Int32, r.dims)...])) - -graph(::Input, x) = x - -graph(p::MaxPool, x) = - nn.max_pool(x, [1, p.size..., 1], [1, p.stride..., 1], "VALID") - -graph(op::Op, xs...) = op.f(xs...) - -function graph(ctx::Context, model, args...) - node = graph(model, interpv(ctx, args)...) - isa(node, Tensor) && (ctx[:stacks][node.op.name] = stack(ctx)) - return node -end - -interp(ctx, c::Conv2D, x) = - nn.conv2d(interpv(ctx, x), interp(ctx, Constant(c.filter)), [1,c.stride...,1], "VALID") - -interp{T<:AArray}(ctx, p::Constant{Flux.Param{T}}) = - haskey(ctx[:params], p.value) ? - ctx[:params][p.value] : - (ctx[:params][p.value] = Variable(p.value.x)) - -interp(ctx, p::Constant) = p.value - -function interp(ctx, model, args...) - g = Flux.graph(model) - g == nothing && return graph(ctx, model, args...) - DataFlow.iscyclic(g) && error("This model has a cycle; try unrolling it first.") - interpret(ctx, g, interpv(ctx, args)...) -end - -function tograph(model, args...) - ctx = Context(mux(iline, ilambda, ituple, imap, interp), - params = ObjectIdDict(), stacks = Dict()) - out = interp(ctx, model, map(constant, args)...) - return ctx[:params], ctx[:stacks], out -end - -TensorFlow.Tensor(m::Flux.Model, args...) = tograph(m, args...)[2] - -RawTensor(data::Union{Batch,Seq}) = RawTensor(rawbatch(data)) diff --git a/src/backend/tensorflow/model.jl b/src/backend/tensorflow/model.jl deleted file mode 100644 index ef6d2040..00000000 --- a/src/backend/tensorflow/model.jl +++ /dev/null @@ -1,98 +0,0 @@ -type Model - model::Any - session::Session - params::Dict{Flux.Param,Tensor} - stacks::Dict - inputs::Vector{Tensor} - output::Any -end - -function makesession(model, inputs; session = Session(Graph())) - params, stacks, output = tograph(model, inputs...) - run(session, initialize_all_variables()) - Model(model, session, params, stacks, inputs, output) -end - -function makesession(model, n::Integer; session = Session(Graph())) - makesession(model, [placeholder(Float32) for _ = 1:n], session = session) -end - -tf(model) = makesession(model, 1) - -function storeparams!(sess, params) - for (p, t) in params - p.x = run(sess, t) - end -end - -storeparams!(m::Model) = storeparams!(m.session, m.params) - -ismultioutput(m::Model) = !isa(m.output, Tensor) - -function batch(xs) - dims = ndims(xs)-1 - T = Array{eltype(xs),dims} - B = Array{eltype(xs),dims+1} - Batch{T,B}(xs) -end - -function tferr(model::Model, e) - m = match(r"Node: ([\w\d]+) =", string(e.status)) - m == nothing && return - node = m.captures[1] - if haskey(model.stacks, node) - stk = model.stacks[node] - println("TensorFlow error occured at:") - foreach(l -> println("$(l.file):$(l.line)"), stk) - end -end - -function runmodel(m::Model, args...) - @assert length(args) == length(m.inputs) - try - output = run(m.session, m.output, Dict(zip(m.inputs, args))) - ismultioutput(m) ? (batch.(output)...,) : batch(output) - catch e - isa(e, TensorFlow.TFException) || rethrow(e) - tferr(m, e) - rethrow(e) - end -end - -function (m::Model)(args::Batch...) 
- runmodel(m, args...) -end - -function (m::Model)(args...) - output = m(map(batchone, args)...) - ismultioutput(m) ? map(first, output) : first(output) -end - -for f in :[back!, update!].args - @eval function Flux.$f(m::Model, args...) - error($(string(f)) * " is not yet supported on TensorFlow models") - end -end - -import Juno: info - -function Flux.train!(m::Model, train, test=[]; epoch = 1, η = 0.1, - loss = (y, y′) -> reduce_sum((y - y′).^2)/2, - opt = TensorFlow.train.GradientDescentOptimizer(η)) - i = 0 - Y = placeholder(Float32) - Loss = loss(m.output, Y) - minimize_op = TensorFlow.train.minimize(opt, Loss) - for e in 1:epoch - info("Epoch $e\n") - @progress for (x, y) in train - y, cur_loss, _ = run(m.session, vcat(m.output, Loss, minimize_op), - Dict(m.inputs[1]=>batchone(x), Y=>batchone(y))) - if i % 5000 == 0 - @show y - @show accuracy(m, test) - end - i += 1 - end - end -end diff --git a/src/backend/tensorflow/recurrent.jl b/src/backend/tensorflow/recurrent.jl deleted file mode 100644 index 5abee520..00000000 --- a/src/backend/tensorflow/recurrent.jl +++ /dev/null @@ -1,83 +0,0 @@ -# TODO: refactor, some of this is more general than just the TF backend - -type SeqModel - m::Model - state::Any -end - -cgroup(xs...) = DataFlow.group(map(constant, xs)...) - -function makesession(model::Flux.Unrolled) - sess = Session(Graph()) - input = placeholder(Float32) - inputs = TensorFlow.unpack(input, num = model.steps, axis = 1) - let params, stacks, outputs, instates, outstates - if model.stateful - instates = [placeholder(Float32) for _ in model.state] - params, stacks, (outstates, outputs) = tograph(model, cgroup(instates...), cgroup(inputs...)) - else - params, stacks, outputs = tograph(model, cgroup(inputs...)) - end - output = TensorFlow.pack(outputs, axis = 1) - run(sess, initialize_all_variables()) - sess, params, stacks, (instates, input), (outstates, output) - end -end - -function tf(model::Flux.Unrolled) - sess, params, stacks, (instates, input), (outstates, output) = makesession(model) - SeqModel( - Model(model, sess, params, stacks, - [instates..., input], [outstates..., output]), - model.state) -end - -function batchseq(xs) - dims = ndims(xs)-2 - T = Array{eltype(xs),dims} - S = Array{eltype(xs),dims+1} - B = Array{eltype(xs),dims+2} - Batch{Seq{T,S},B}(xs) -end - -batchseq(xs::Batch) = batchseq(rawbatch(xs)) - -TensorFlow.get_tensors(x::Tuple) = TensorFlow.get_tensors(collect(x)) - -function (m::SeqModel)(x::BatchSeq) - m.m.model.stateful || return batchseq(runmodel(m.m, x)[end]) - if isempty(m.state) || length(first(m.state)) ≠ length(x) - m.state = batchone.(m.m.model.state) - end - output = runmodel(m.m, m.state..., x) - m.state, output = output[1:end-1], output[end] - return batchseq(output) -end - -(m::SeqModel)(x::Seq) = first(m(batchone(x))) - -function Flux.train!(m::SeqModel, Xs, Ys; epoch = 1, η = 0.1, - loss = (y, ŷ) -> -reduce_sum(y .* log(ŷ)), - opt = () -> TensorFlow.train.GradientDescentOptimizer(η)) - batchlen, seqlen = length(first(Xs)), length(first(Xs)[1]) - state = batchone.(m.m.model.state) - sess, params, stacks, (instates, input), (outstates, output) = makesession(m.m.model) - Y = placeholder(Float32) - Loss = loss(Y, output)/batchlen/seqlen - minimize_op = TensorFlow.train.minimize(opt(), Loss) - @progress "training" for e in 1:epoch - info("Epoch $e\n") - @progress "epoch" for (i, (x, y)) in enumerate(zip(Xs,Ys)) - out = run(sess, vcat(outstates..., output, Loss, minimize_op), - merge(Dict(input=>batchone(x), Y=>batchone(y)), - Dict(zip(instates, 
state)))) - state = out[1:length(state)] - loss = out[end-1] - isnan(loss) && error("Loss is NaN") - isinf(loss) && error("Loss is Inf") - (i-1) % 10 == 0 && @show loss - end - end - storeparams!(sess, params) - return -end diff --git a/src/backend/tensorflow/tensorflow.jl b/src/backend/tensorflow/tensorflow.jl deleted file mode 100644 index f2c27f2b..00000000 --- a/src/backend/tensorflow/tensorflow.jl +++ /dev/null @@ -1,21 +0,0 @@ -module TF - -using ..Flux, DataFlow, TensorFlow, Juno -import Flux: accuracy - -export tf - -type Op - f - shape -end - -Op(f) = Op(f, (d...) -> nothing) - -Flux.shape(op::Op, d...) = op.shape(d...) - -include("graph.jl") -include("model.jl") -include("recurrent.jl") - -end diff --git a/src/compiler/code.jl b/src/compiler/code.jl deleted file mode 100644 index 49c7dc65..00000000 --- a/src/compiler/code.jl +++ /dev/null @@ -1,81 +0,0 @@ -import DataFlow: mapconst, cse - -export @net, @ml - -function process_func(ex, params = []) - @capture(shortdef(ex), (args__,) -> body_) - body = @> body MacroTools.flatten liftloops graphm DataFlow.il - body = mapconst(x -> x in params ? :(self.$x) : x, body) - return args, body -end - -function makegraph(graph, args) - @assert length(args) == 1 - graph = prewalk(graph) do v - isa(value(v), Constant) && value(v).value == args[1] ? - inputnode(1) : - v - end - graph = map(graph) do x - isa(x, Offset) ? - :(Flux.Offset($(Expr(:quote, x.name)), $(x.n), self.$(x.name))) : - x - end - vertex(:(Flux.Frame(self)), graph) -end - -function build_type(T, params) - @esc T - ex = quote - type $T <: Model - $(params...) - end - end - if any(x->isexpr(x, Symbol), params) - push!(ex.args, - :($T($(map(x->isexpr(x, Symbol) ? :($x::AArray) : x, params)...)) = - $T($(map(x->isexpr(x, Symbol) ? :(param($x)) : namify(x), params)...)))) - end - ex -end - -import Lazy: groupby - -reifyparams(v::IVertex) = mapconst(x -> isa(x, Param) ? x.x : x, v) - -function process_type(ex) - @capture(ex, type T_ fs__ end) - @destruct [params = false || [], - funcs = true || []] = groupby(x->isexpr(x, :->, :function), fs) - @assert length(funcs) == 1 - pnames = namify.(params) - args, body = process_func(funcs[1], pnames) - @assert length(args) == 1 - self = esc(:self) - quote - $(build_type(T, params)) - $(esc(:(self::$T)))($(args...),) = interpret(reifyparams(graph($self)), $(args...)) - $(esc(:(Flux.update!(self::$T, η)))) = ($(map(p -> :(update!($self.$p, η)), pnames)...);) - $(esc(:(Flux.graph(self::$T)))) = $(DataFlow.constructor(mapconst(esc, makegraph(body, args)))) - nothing - end -end - -macro net(ex) - isexpr(ex, :type) ? process_type(ex) : - isexpr(ex, :->, :function) ? error("@net functions not implemented") : - error("Unsupported model expression $ex") -end - -function process_anon(ex) - args, body = process_func(ex) - @assert length(args) == 1 - :(Flux.Capacitor($(DataFlow.constructor(mapconst(esc, makegraph(body, args)))))) -end - -macro ml(ex) - @capture(shortdef(ex), ((xs__,) -> body_ ) | (f_(xs__,) = body_)) || - error("@ml requires a function definition") - ex = process_anon(:($(xs...,) -> $body)) - f == nothing ? ex : :($(esc(f)) = $ex) -end diff --git a/src/compiler/interp.jl b/src/compiler/interp.jl deleted file mode 100644 index 05572150..00000000 --- a/src/compiler/interp.jl +++ /dev/null @@ -1,26 +0,0 @@ -using DataFlow: mux, interpret, interpv, ituple, ilambda, iconst, Context - -function astuple(xs::Vertex) - isconstant(xs) && isa(value(xs).value, Tuple) ? value(xs).value : - isa(xs, Vertex) && value(xs) == tuple ? 
inputs(xs) : - nothing -end - -astuple(xs::Tuple) = xs - -astuple(xs) = nothing - -function astuples(xs) - xs = [astuple(x) for x in xs] - all(x->!(x==nothing), xs) ? xs : nothing -end - -function imap(cb, ctx, ::typeof(map), f, xs...) - f, xs = interpv(ctx, (f, xs)) - xs′ = astuples(xs) - xs′ ≠ nothing ? - group(map(f, xs′...)...) : - cb(ctx, map, constant(f), xs...) -end - -imap(f, args...) = f(args...) diff --git a/src/compiler/loops.jl b/src/compiler/loops.jl deleted file mode 100644 index e5a49f53..00000000 --- a/src/compiler/loops.jl +++ /dev/null @@ -1,134 +0,0 @@ -export unroll, unroll1 - -type Offset - name::Symbol - n::Int - default::Nullable{Param} -end - -Offset(name, n) = Offset(name, n, nothing) - -Base.:-(o::Offset) = Offset(o.name, -o.n, o.default) - -function liftloops(ex) - ex = DataFlow.normedges(ex) - decls = Dict() - ex = MacroTools.postwalk(ex) do ex - @capture(ex, x_{n_}) || return ex - haskey(decls, (x,n)) && return namify(decls[(x,n)]) - @gensym edge - decls[(x,n)] = :($edge = $(Offset(x,n))($x)) - edge - end - prepend!(ex.args, collect(values(decls))) - ex -end - -function hasloops(model) - g = graph(model) - g == nothing && return false - iscyclic(g) && return true - result = false - map(m -> hasloops(m) && (result = true), g) - return result -end - -function atomise(model) - postwalk(graph(model)) do v - hasloops(value(v)) || return v - spliceinputs(atomise(value(v)), inputs(v)...) - end -end - -function collect_state(v::IVertex) - state = typeof(v)[] - offset = Int[] - default = Param[] - prewalk!(v) do v - isa(value(v), Offset) || return v - if (i = findfirst(state, v[1])) == 0 - push!(state, v[1]) - push!(offset, max(0, -value(v).n)) - push!(default, get(value(v).default)) - else - offset[i] = max(offset[i], -value(v).n) - end - v - end - return state, offset, default -end - -hiddeninput(n) = vertex(Split(n), inputnode(1)) - -function create_steps(v::IVertex, n; seq = true, stateful = true) - [(stateful ? bumpinputs : copy)(seq ? spliceinputs(v, hiddeninput(i)) : v) for i = 1:n] -end - -function getvar(n, step, steps, offset, default; stateful = true) - if stateful && step < 1 - hiddeninput(sum(offset[1:n-1]) + 1 - step) - elseif step ∉ 1:length(steps) - constant(default[n]) - else - steps[step][1,n] - end -end - -function stateout(steps, offset, default) - outs = [] - defaults = [] - for i = 1:length(offset), j = 1:offset[i] - push!(outs, getvar(i, length(steps)-j+1, steps, offset, default)) - push!(defaults, default[i]) - end - group(outs...), defaults -end - -function unrollgraph(v::IVertex, n; seq = true, stateful = true) - state, offset, default = collect_state(v) - v = group(group(state...), v) - steps = create_steps(v, n, seq = seq, stateful = stateful) - for i = 1:n - vars = inputs(steps[i][1]) - postwalk!(steps[i]) do v - isa(value(v), Offset) || return v - varid = findfirst(vars,v[1]) - getvar(varid, value(v).n + i, steps, offset, default, stateful = stateful) - end - end - out = group(map(x->x[2], steps)...) - if stateful - state, defaults = stateout(steps, offset, default) - group(state,out), map(Flux.state, defaults) - else - out, [] - end -end - -unrollgraph(m, n; kws...) = unrollgraph(atomise(m), n; kws...) - -# TODO: perhaps split into SeqModel + StatefulModel -type Unrolled <: Model - model - graph::IVertex{Any} - state::Vector{Any} - stateful::Bool - steps::Int -end - -(m::Unrolled)(xs...) = interpret(reifyparams(m.graph), xs...) 
- -graph(u::Unrolled) = u.graph - -function unroll(model, n; seq = true, stateful = true) - graph, state = unrollgraph(model, n; seq = seq, stateful = stateful) - seq || stateful ? Unrolled(model, graph, state, stateful, n) : Capacitor(graph) -end - -function unroll1(model) - graph, state = unrollgraph(model, 1; seq = false) - graph = group(graph[1], map(x->x[1], inputs(graph)[2:end])...) - Unrolled(model, graph, state, false, 1) -end - -flip(model) = Capacitor(map(x -> isa(x, Offset) ? -x : x, atomise(model))) diff --git a/src/compiler/shape.jl b/src/compiler/shape.jl deleted file mode 100644 index 99d496c9..00000000 --- a/src/compiler/shape.jl +++ /dev/null @@ -1,48 +0,0 @@ -using DataFlow: ilinev, iargs, applylines, Line - -type Hint - typ -end - -DataFlow.tocall(h::Hint, x) = :($x::$(h.typ)) - -function gethint(v::IVertex) - while isa(value(v), Union{Line,Frame}) v = v[1] end - isa(value(v), Hint) && return value(v).typ - return -end - -ihint(f, ctx::Context, h::Hint, x) = vertex(h, x) -ihint(f, args...) = f(args...) - -hintify(c::Constant) = hintify(state(c.value)) -hintify(xs::AbstractArray) = vertex(Hint(size(xs)), constant(:_)) - -interpshape = mux(ilinev, ihint, iargs, ituple, hintify) - -function hintify(f, xs...) - sh = infer(f, map(gethint, xs)...) - sh ≠ nothing ? vertex(Hint(sh), vertex(f, xs...)) : - !any(x->x==nothing, xs) && graph(f) ≠ nothing ? interpret(Context(interpshape), graph(f), xs...) : - vertex(f, xs...) -end - -function shapesv(f, args...) - (g = graph(f)) == nothing && return - ins = [vertex(Hint(d), inputnode(i)) for (i,d) in enumerate(args)] - interpret(Context(interpshape), g, ins...) -end - -shapes(args...) = shapesv(args...) |> syntax |> applylines |> (x->prettify(x, lines=true)) - -# Inference primitives - -infer(f, args...) = graph(f) == nothing ? nothing : gethint(shapesv(f, args...)) - -function infer(::typeof(*), a::NTuple{2}, b::NTuple{2}) - a[2] == b[1] || return nothing - (a[1], b[2]) -end - -# TODO: make correct -infer(::typeof(+), a, b) = a diff --git a/src/cost.jl b/src/cost.jl deleted file mode 100644 index 34267202..00000000 --- a/src/cost.jl +++ /dev/null @@ -1,8 +0,0 @@ -export mse, mse! - -function mse!(Δ, pred, target) - map!(-, Δ, pred, target) - sumabs2(Δ)/2 -end - -mse(pred, target) = mse(similar(pred), pred, target) diff --git a/src/data.jl b/src/data.jl deleted file mode 100644 index be5497b6..00000000 --- a/src/data.jl +++ /dev/null @@ -1,36 +0,0 @@ -export onehot, onecold, chunk, partition, batches, sequences - -""" - onehot('b', ['a', 'b', 'c', 'd']) => [false, true, false, false] - - onehot(Float32, 'c', ['a', 'b', 'c', 'd']) => [0., 0., 1., 0.] - -Produce a one-hot-encoded version of an item, given a list of possible values -for the item. -""" -onehot(T::Type, label, labels) = T[i == label for i in labels] -onehot(label, labels) = onehot(Int, label, labels) - -""" - onecold([0.0, 1.0, 0.0, ...], - ['a', 'b', 'c', ...]) => 'b' - -The inverse of `onehot`; takes an output prediction vector and a list of -possible values, and produces the appropriate value. -""" -onecold(pred, labels = 1:length(pred)) = labels[findfirst(pred, maximum(pred))] - -using Iterators -import Iterators: partition - -export partition - -Base.length(l::Iterators.Partition) = length(l.xs) ÷ l.step - -_partition(r::UnitRange, step::Integer) = (step*(i-1)+1:step*i for i in 1:(r.stop÷step)) -_partition(xs, step) = (xs[i] for i in _partition(1:length(xs), step)) - -chunk(xs, n) = _partition(xs, length(xs)÷n) - -batches(xs...) 
= (Batch(x) for x in zip(xs...)) -sequences(xs, len) = (Seq(x) for x in partition(xs, len)) diff --git a/src/dims/batching.jl b/src/dims/batching.jl deleted file mode 100644 index 8faf50f6..00000000 --- a/src/dims/batching.jl +++ /dev/null @@ -1,21 +0,0 @@ -export Batch, batchone - -immutable Batch{T,S} <: AbstractVector{T} - data::CatMat{T,S} -end - -@forward Batch.data size, eltype, getindex, setindex!, rawbatch - -Batch(xs) = Batch(CatMat(xs)) - -convert{T,S}(::Type{Batch{T,S}},storage::S) = - Batch{T,S}(storage) - -batchone(x) = Batch((x,)) -batchone(x::Batch) = x - -@render Juno.Inline b::Batch begin - Tree(Row(Text("Batch of "), eltype(b), - Juno.fade("[$(length(b))]")), - Juno.trim(collect(b))) -end diff --git a/src/dims/catmat.jl b/src/dims/catmat.jl deleted file mode 100644 index 17592921..00000000 --- a/src/dims/catmat.jl +++ /dev/null @@ -1,50 +0,0 @@ -import Base: eltype, size, getindex, setindex!, convert - -export CatMat, rawbatch - -immutable CatMat{T,S} <: AbstractVector{T} - data::S -end - -convert{T,S}(::Type{CatMat{T,S}},storage::S) = - CatMat{T,S}(storage) - -eltype{T}(::CatMat{T}) = T - -size(b::CatMat) = (size(b.data, 1),) - -getindex(b::CatMat, i)::eltype(b) = slicedim(b.data, 1, i) - -setindex!(b::CatMat, v, i) = b[i, :] = v - -allequal(xs) = all(x -> x == first(xs), xs) - -function (::Type{CatMat{T,S}}){T,S}(xs, storage::S) - @assert @>> xs map(size) allequal - @assert size(storage) == (length(xs), size(first(xs))...) - for i = 1:length(xs) - storage[i, :] = xs[i] - end - return CatMat{T,S}(storage) -end - -function (::Type{CatMat{T}}){T}(xs) - xs′ = map(rawbatch, xs) - storage = similar(first(xs′), (length(xs′), size(first(xs′))...)) - CatMat{T,typeof(storage)}(xs′, storage) -end - -function CatMat(xs) - xs = promote(xs...) 
- CatMat{eltype(xs)}(xs) -end - -@render Juno.Inline b::CatMat begin - Tree(Row(Text("CatMat of "), eltype(b), - Juno.fade("[$(length(b))]")), - Juno.trim(collect(b))) -end - -rawbatch(xs) = xs - -rawbatch(xs::CatMat) = xs.data diff --git a/src/dims/seq.jl b/src/dims/seq.jl deleted file mode 100644 index 3b0d3d02..00000000 --- a/src/dims/seq.jl +++ /dev/null @@ -1,20 +0,0 @@ -export seq, Seq, BatchSeq - -immutable Seq{T,S} <: AbstractVector{T} - data::CatMat{T,S} -end - -@forward Seq.data size, eltype, getindex, setindex!, rawbatch - -Seq(xs) = Seq(CatMat(xs)) - -convert{T,S}(::Type{Seq{T,S}},storage::S) = - Seq{T,S}(storage) - -@render Juno.Inline b::Seq begin - Tree(Row(Text("Seq of "), eltype(b), - Juno.fade("[$(length(b))]")), - Juno.trim(collect(b))) -end - -typealias BatchSeq{T<:Seq} Batch{T} diff --git a/src/layers/activation.jl b/src/layers/activation.jl deleted file mode 100644 index 88cad04c..00000000 --- a/src/layers/activation.jl +++ /dev/null @@ -1,18 +0,0 @@ -export σ, relu, softmax, flatten - -σ(x) = 1 ./ (1 + exp.(-x)) - -back!(::typeof(σ), Δ, x) = Δ .* σ(x)./(1.-σ(x)) - -relu(x) = max(0, x) - -back!(::typeof(relu), Δ, x) = Δ .* (x .< 0) - -softmax(xs) = exp.(xs) ./ sum(exp.(xs)) - -flatten(xs) = reshape(xs, length(xs)) - -shape(::typeof(flatten), in) = prod(in) - -infer(::typeof(softmax), x) = x -infer(::typeof(σ), x) = x diff --git a/src/layers/affine.jl b/src/layers/affine.jl deleted file mode 100644 index 81fef977..00000000 --- a/src/layers/affine.jl +++ /dev/null @@ -1,20 +0,0 @@ -export Affine - -# TODO: type hints for parameters - -@net type Affine - W - b - x -> x*W + b -end - -Affine(in::Integer, out::Integer; init = initn) = - Affine(init(in, out), init(1, out)) - -@net type Sigmoid - layer::Model - x -> σ(layer(x)) -end - -Sigmoid(in::Integer, out::Integer; init = randn) = - Sigmoid(Affine(in, out, init = init)) diff --git a/src/layers/chain.jl b/src/layers/chain.jl deleted file mode 100644 index b4bd6ced..00000000 --- a/src/layers/chain.jl +++ /dev/null @@ -1,32 +0,0 @@ -export Chain - -function inferchain(ms) - chain = [] - sh = nothing - for m in ms - m = init(m, single(sh)) - sh = shape(m, sh) - push!(chain, m) - end - return chain, sh -end - -type Chain <: Model - layers::Vector{Any} - shape - function Chain(ms...) 
- ms, shape = inferchain(ms) - return new(ms, shape) - end -end - -@forward Chain.layers Base.getindex, Base.first, Base.last - -(s::Chain)(x) = foldl((x, m) -> m(x), x, s.layers) -back!(s::Chain, Δ) = foldr((m, Δ) -> back!(m, Δ), Δ, s.layers) -update!(s::Chain, η) = foreach(l -> update!(l, η), s.layers) - -graph(s::Chain) = - foldl((v, m) -> vertex(m, v), constant(inputnode(1)), s.layers) - -shape(c::Chain, in) = c.shape diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl deleted file mode 100644 index e012a215..00000000 --- a/src/layers/recurrent.jl +++ /dev/null @@ -1,51 +0,0 @@ -export Recurrent, GatedRecurrent, LSTM - -@net type Recurrent - Wxy; Wyy; by - y - function (x) - y = tanh( x * Wxy + y{-1} * Wyy + by ) - end -end - -Recurrent(in, out; init = initn) = - Recurrent(init((in, out)), init((out, out)), init(out), init(out)) - -@net type GatedRecurrent - Wxr; Wyr; br - Wxu; Wyu; bu - Wxh; Wyh; bh - y - function (x) - reset = σ( x * Wxr + y{-1} * Wyr + br ) - update = σ( x * Wxu + y{-1} * Wyu + bu ) - y′ = tanh( x * Wxh + (reset .* y{-1}) * Wyh + bh ) - y = (1 .- update) .* y′ + update .* y{-1} - end -end - -GatedRecurrent(in, out; init = initn) = - GatedRecurrent(vcat([[init((in, out)), init((out, out)), init(out)] for _ = 1:3]...)..., - zeros(Float32, out)) - -@net type LSTM - Wxf; Wyf; bf - Wxi; Wyi; bi - Wxo; Wyo; bo - Wxc; Wyc; bc - y; state - function (x) - # Gates - forget = σ( x * Wxf + y{-1} * Wyf + bf ) - input = σ( x * Wxi + y{-1} * Wyi + bi ) - output = σ( x * Wxo + y{-1} * Wyo + bo ) - # State update and output - state′ = tanh( x * Wxc + y{-1} * Wyc + bc ) - state = forget .* state{-1} + input .* state′ - y = output .* tanh(state) - end -end - -LSTM(in, out; init = initn) = - LSTM(vcat([[init((in, out)), init((out, out)), init((1, out))] for _ = 1:4]...)..., - zeros(Float32, out), zeros(Float32, out)) diff --git a/src/layers/shape.jl b/src/layers/shape.jl deleted file mode 100644 index 5fe47cd9..00000000 --- a/src/layers/shape.jl +++ /dev/null @@ -1,47 +0,0 @@ -export Input - -typealias Dims{N} NTuple{N,Int} - -dims(d::Dims) = d - -dims(i...) = (i...,) - -single(i) = i -single(i::Dims) = length(i) == 1 ? first(i) : i - -# Shim for kicking off shape inference - -type ShapeError <: Exception - layer - shape -end - -type Input{N} <: Model - dims::Dims{N} -end - -Input(i...) = Input(dims(i...)) - -(::Input)(x) = x -back!(::Input, Δ, x) = Δ - -# Initialise placeholder - -type Init{F} - f::F -end - -init(i::Init, input...) = i.f(input...) -init(m, input...) 
= m - -# Shape inference API - -shape(x, in) = in - -shape(i::Input, _) = i.dims - -# Implementation for bundled layers - -shape(d::Affine, _) = length(state(d.b)) # TODO: could perhaps infer this - -Affine(out::Integer) = Init(in::Integer -> Affine(in, out)) diff --git a/src/layers/shims.jl b/src/layers/shims.jl deleted file mode 100644 index 8ffc4e1b..00000000 --- a/src/layers/shims.jl +++ /dev/null @@ -1,44 +0,0 @@ -export Conv2D, MaxPool, Reshape - -type Conv2D <: Model - filter::Param{Array{Float32,4}} # [height, width, inchans, outchans] - stride::Dims{2} -end - -Conv2D(size; in = 1, out = 1, stride = (1,1), init = initn) = - Conv2D(param(initn(size..., in, out)), stride) - -shape(c::Conv2D, in::Dims{2}) = - (map(i -> (in[i]-size(c.filter,i))÷c.stride[i]+1, (1,2))..., size(c.filter, 4)) - -shape(c::Conv2D, in::Dims{3}) = - shape(c, (in[1],in[2])) - -type MaxPool <: Model - size::Dims{2} - stride::Dims{2} -end - -MaxPool(size; stride = (1,1)) = - MaxPool(size, stride) - -shape(c::MaxPool, in::Dims{2}) = - map(i -> (in[i]-c.size[i])÷c.stride[i]+1, (1,2)) - -shape(c::MaxPool, in::Dims{3}) = - (shape(c, (in[1],in[2]))..., in[3]) - -shape(c::MaxPool, in) = throw(ShapeError(c, in)) - -immutable Reshape{N} - dims::Dims{N} -end - -Reshape(dims::Integer...) = Reshape(dims) - -function shape(r::Reshape, dims) - prod(dims) == prod(r.dims) || throw(ShapeError(r, dims)) - return r.dims -end - -shape(r::Reshape, ::Void) = r.dims diff --git a/src/model.jl b/src/model.jl deleted file mode 100644 index 9bca1e8c..00000000 --- a/src/model.jl +++ /dev/null @@ -1,111 +0,0 @@ -export Model, back!, update!, param - -# Basic model API - -""" - (m::Model)(X...) => Y - -A "model" is a function with state. For example, a logistic regression is the -function - - x -> σ(x * W + b) - -where `W` and `b` are a trainable matrix and vector of weights repectively. The -`Model` abstract type is used loosely; in general the concept of a model is -closer to a protocol, and models don't need to inherit from this type. Normal -Julia functions are models with 0 parameters, for example. -""" -abstract Model - -""" - back!(m::Model, ΔY, X...) => ΔX - -Backpropagate the gradient `ΔY` through the model `m`, accumulating the -gradients of any parameters. Returns the gradient of the input `X`. Gradients -may be arrays or tuples of arrays (for multiple inputs/outputs). -""" -back!(m::Model, Δ, xs...) = error("Backprop not implemented for $(typeof(m))") - -""" - update!(m::Model, η) => m - -Update the parameters of the model `m` using the accumulated gradients from -`back!`, using the learning rate `η`. -""" -update!(m, η) = m - -""" - graph(m::Model) => ::IVertex{Any} | nothing - -Returns the graph representation of the model, if any. Most models are built -from lower-level components and can simply implement this method to get most of -Flux's functionality. If this method isn't available, functionality like -backpropagation or conversion for backend must be implemented on a case-by-case -basis. Alternatively, one can implement this method and override individual -methods as necessary. -""" -graph(m) = nothing - -# Model parameters - -""" -A `Param` object stores a parameter array along with an accumulated delta to -that array. When converting to backends like TensorFlow, identical `Param`s will -result in identical variable objects, making model reuse trivial. -""" -type Param{T} - x::T - Δx::T -end - -""" - param(x::T) => ::Param{T} - -Convenience method for creating a `Param` object for a given array. 
-""" -param(x) = Param(x, zero(x)) - -state(p::Param) = p.x - -""" - accumulate!(p::Param, Δ) => p - -Accumulates the update `Δ` on `p`. The value of `p` won't change until -`update!`. -""" -function accumulate!(p::Param, Δ) - p.Δx += Δ - return p -end - -""" - update!(p::Param) - -Apply the accumulated updates to the value of the parameter. -""" -function update!(p::Param, η) - p.x .-= p.Δx .* η - p.Δx[:] = 0 - return p -end - -state(x) = x -accumulate!(x, Δ) = x - -@forward Param.x Base.size - -function Base.show(io::IO, p::Param) - print(io, "Param", size(p.x)) -end - -# Anonymous models - -export Capacitor - -type Capacitor <: Model - graph::IVertex{Any} -end - -(m::Capacitor)(xs...) = interpret(reifyparams(m.graph), xs...) - -graph(cap::Capacitor) = cap.graph diff --git a/src/utils.jl b/src/utils.jl deleted file mode 100644 index c685fd4e..00000000 --- a/src/utils.jl +++ /dev/null @@ -1,30 +0,0 @@ -export AArray - -const AArray = AbstractArray - -initn(dims...) = randn(Float32, dims...)/10 - -function train!(m, train, test = []; epoch = 1, batch = 10, η = 0.1) - i = 0 - Δ = zeros(length(train[1][2])) - for _ in 1:epoch - @progress for (x, y) in train - i += 1 - pred = m(x) - any(isnan, pred) && error("NaN") - err = mse!(Δ, pred, y) - back!(m, Δ, x) - i % batch == 0 && update!(m, η) - i % 1000 == 0 && @show accuracy(m, test) - end - end - return m -end - -function accuracy(m, data) - correct = 0 - for (x, y) in data - onecold(m(x)) == onecold(y) && (correct += 1) - end - return correct/length(data) -end diff --git a/test/backend.jl b/test/backend.jl deleted file mode 100644 index e69de29b..00000000 diff --git a/test/basic.jl b/test/basic.jl deleted file mode 100644 index 7209b36f..00000000 --- a/test/basic.jl +++ /dev/null @@ -1,10 +0,0 @@ -xs = randn(10)' # TODO: batching semantics - -d = Affine(10, 20) - -@test d(xs) == xs*d.W.x + d.b.x - -let - @capture(syntax(d), _Frame(_Line(x_[1] * W_ + b_))) - @test isa(x, Input) && isa(W, Param) && isa(b, Param) -end diff --git a/test/recurrent.jl b/test/recurrent.jl deleted file mode 100644 index e69de29b..00000000 diff --git a/test/runtests.jl b/test/runtests.jl deleted file mode 100644 index 810223ec..00000000 --- a/test/runtests.jl +++ /dev/null @@ -1,10 +0,0 @@ -using Flux, DataFlow, MacroTools, Base.Test -using Flux: graph, Param -using DataFlow: Input, Line, Frame - -syntax(v::Vertex) = prettify(DataFlow.syntax(v)) -syntax(x) = syntax(graph(x)) - -include("basic.jl") -include("recurrent.jl") -include("backend.jl")