diff --git a/src/data/cmudict.jl b/src/data/cmudict.jl index e6266540..0ed724d4 100644 --- a/src/data/cmudict.jl +++ b/src/data/cmudict.jl @@ -24,18 +24,35 @@ function load() end end +""" + phones() + +Return a `Vector` containing the phones used in the dataset. +""" function phones() load() Symbol.(first.(split.(split(read(deps("cmudict", "cmudict.phones"),String), "\n", keepempty = false), "\t"))) end +""" + symbols() + +Return a `Vector` containing the symbols used in the dataset. +A symbol is a phone with optional auxiliary symbols, indicating for example the +amount of stress on the phone. +""" function symbols() load() Symbol.(split(read(deps("cmudict", "cmudict.symbols"),String), "\n", keepempty = false)) end +""" + rawdict() + +Return the unfiltered CMU Pronouncing Dictionary. +""" function rawdict() load() Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in @@ -44,6 +61,14 @@ end validword(s) = isascii(s) && occursin(r"^[\w\-\.]+$", s) +""" + cmudict() + +Return a filtered CMU Pronouncing Dictionary. + +Only words made up entirely of ASCII word characters (as determined by the regex +engine using `\\w`), '-' and '.' are kept. +""" cmudict() = filter(p -> validword(p.first), rawdict()) alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.'] diff --git a/src/data/sentiment.jl b/src/data/sentiment.jl index ecb1ab8d..058dcf07 100644 --- a/src/data/sentiment.jl +++ b/src/data/sentiment.jl @@ -1,3 +1,4 @@ +"Stanford Sentiment Treebank dataset." module Sentiment using ZipFile @@ -39,8 +40,28 @@ function gettrees(name) return parsetree.(ss) end +""" + train() + +Return the train split of the Stanford Sentiment Treebank. +The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format. +""" train() = gettrees("train") + +""" + test() + +Return the test split of the Stanford Sentiment Treebank. +The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format. 
+""" test() = gettrees("test") + +""" + dev() + +Return the dev split of the Stanford Sentiment Treebank. +The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format. +""" dev() = gettrees("dev") end