add cmudict dataset

This commit is contained in:
Mike J Innes 2017-11-01 16:01:34 +00:00
parent 0f8ba87dc6
commit e7a510da9a
5 changed files with 62 additions and 1 deletions

2
.gitignore vendored
View File

@@ -4,4 +4,4 @@
 docs/build/
 docs/site/
 docs/flux.css
-demos
+deps

View File

@@ -29,4 +29,6 @@ include("layers/basic.jl")
 include("layers/recurrent.jl")
 include("layers/normalisation.jl")
+include("data/Data.jl")
 end # module

14
src/data/Data.jl Normal file
View File

@@ -0,0 +1,14 @@
module Data

export CMUDict, cmudict

"""
    deps(path...)

Return the path obtained by joining `path...` onto the `deps` directory that sits
two levels above the directory containing this file. With no arguments, the
`deps` directory itself is returned.
"""
function deps(path...)
    return joinpath(@__DIR__, "..", "..", "deps", path...)
end

# Create the `deps` directory when the module is initialised.
__init__() = mkpath(deps())

include("cmudict.jl")
using .CMUDict

end

42
src/data/cmudict.jl Normal file
View File

@@ -0,0 +1,42 @@
module CMUDict

export cmudict

using ..Data: deps

const version = "0.7b"

# Suffixes of the three files fetched for each dictionary release.
const suffixes = ["", ".phones", ".symbols"]

# Every read and write goes through this helper so that the downloader and the
# readers always agree on the directory name. Mixing "cmudict" and "CMUDict"
# breaks on case-sensitive filesystems.
datapath(file...) = deps("cmudict", file...)

"""
    load()

Ensure the dictionary files for `version` are present under `deps/cmudict`,
downloading any file that is missing. Files already on disk are left untouched.
Returns `nothing`.
"""
function load()
    mkpath(datapath())
    for suffix in suffixes
        target = datapath("cmudict$suffix")
        # Check each file rather than only the directory, so that a download
        # interrupted part-way through is completed on the next call.
        isfile(target) && continue
        download("http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-$version$suffix",
                 target)
    end
    return
end

"""
    phones()

Return a vector of `Symbol`s: the first tab-separated field of every non-empty
line of the `cmudict.phones` file.
"""
function phones()
    load()
    lines = split(readstring(datapath("cmudict.phones")), "\n", keep = false)
    return Symbol.(first.(split.(lines, "\t")))
end

"""
    symbols()

Return a vector of `Symbol`s, one for each non-empty line of the
`cmudict.symbols` file.
"""
function symbols()
    load()
    lines = split(readstring(datapath("cmudict.symbols")), "\n", keep = false)
    return Symbol.(lines)
end

"""
    rawdict()

Parse the `cmudict` file into a `Dict`. Each non-blank line is split on
whitespace; the first field becomes the `String` key and the remaining fields
become the value, a vector of `Symbol`s. No filtering of keys is applied here.
"""
function rawdict()
    load()
    lines = split(readstring(datapath("cmudict")), "\n")
    entries = filter(!isempty, split.(lines))
    return Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in entries)
end

# A word is valid when it consists solely of word characters, '-' and '.'.
validword(s) = ismatch(r"^[\w-\.]+$", s)

"""
    cmudict()

Return the dictionary produced by `rawdict()`, restricted to the entries whose
key satisfies `validword`.
"""
cmudict() = filter((s, ps) -> validword(s), rawdict())

# Upper-case letters, digits, and the three punctuation characters '_', '-', '.'.
alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.']

end

3
test/data.jl Normal file
View File

@@ -0,0 +1,3 @@
using Flux.Data

# The entry for "CATASTROPHE" must map to exactly this sequence of phone symbols.
@test cmudict()["CATASTROPHE"] ==
    [:K, :AH0, :T, :AE1, :S, :T, :R, :AH0, :F, :IY0]