Add missing docstrings to `src/data`.

This commit is contained in:
janEbert 2019-08-31 12:49:40 +02:00
parent ba80c2e8ab
commit 740a59d0a6
2 changed files with 46 additions and 0 deletions

View File

@ -24,18 +24,35 @@ function load()
end end
end end
"""
    phones()

Return a `Vector` containing the phones used in the dataset.
"""
function phones()
    # `load()` is defined earlier in this file; presumably it makes sure the dataset
    # files are available before they are read — confirm against its definition.
    load()
    contents = read(deps("cmudict", "cmudict.phones"), String)
    # One entry per non-empty line; the phone name is the first tab-separated field.
    lines = split(contents, "\n", keepempty = false)
    return Symbol.(first.(split.(lines, "\t")))
end
"""
    symbols()

Return a `Vector` containing the symbols used in the dataset.
A symbol is a phone with optional auxiliary symbols, indicating for example the
amount of stress on the phone.
"""
function symbols()
    # `load()` is defined earlier in this file; presumably it makes sure the dataset
    # files are available before they are read — confirm against its definition.
    load()
    contents = read(deps("cmudict", "cmudict.symbols"), String)
    # Every non-empty line of the file is one symbol.
    return Symbol.(split(contents, "\n", keepempty = false))
end
"""
    rawdict()

Return the unfiltered CMU Pronouncing Dictionary.
"""
function rawdict() function rawdict()
load() load()
Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in
@ -44,6 +61,14 @@ end
"""
    validword(s)

Return `true` if `s` is pure ASCII and consists of one or more characters that are
each a regex word character (`\\w`), '-' or '.'; return `false` otherwise.

The empty string is rejected because the pattern requires at least one character.
"""
validword(s) = isascii(s) && occursin(r"^[\w\-\.]+$", s)
"""
    cmudict()

Return a filtered CMU Pronouncing Dictionary.

It is filtered so each word contains only ASCII characters and a combination of
word characters (as determined by the regex engine using `\\w`), '-' and '.'.
"""
function cmudict()
    # Keep only the entries whose key (the word) passes `validword`.
    return filter(entry -> validword(entry.first), rawdict())
end
"""
    alphabet()

Return a `Vector{Char}` holding 'A' through 'Z', '0' through '9', '_', '-' and '.',
in that order.
"""
alphabet() = [('A':'Z')..., ('0':'9')..., '_', '-', '.']

View File

@ -1,3 +1,4 @@
"Stanford Sentiment Treebank dataset."
module Sentiment module Sentiment
using ZipFile using ZipFile
@ -39,8 +40,28 @@ function gettrees(name)
return parsetree.(ss) return parsetree.(ss)
end end
"""
    train()

Return the train split of the Stanford Sentiment Treebank.

The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
"""
function train()
    # Delegates to `gettrees`, selecting the split by its name.
    return gettrees("train")
end
"""
    test()

Return the test split of the Stanford Sentiment Treebank.

The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
"""
function test()
    # Delegates to `gettrees`, selecting the split by its name.
    return gettrees("test")
end
"""
    dev()

Return the dev split of the Stanford Sentiment Treebank.

The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
"""
function dev()
    # Delegates to `gettrees`, selecting the split by its name.
    return gettrees("dev")
end
end end