Add missing docstrings to `src/data`.
This commit is contained in:
parent
ba80c2e8ab
commit
740a59d0a6
|
@ -24,18 +24,35 @@ function load()
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
phones()
|
||||||
|
|
||||||
|
Return a `Vector` containing the phones used in the dataset.
|
||||||
|
"""
|
||||||
function phones()
|
function phones()
|
||||||
load()
|
load()
|
||||||
Symbol.(first.(split.(split(read(deps("cmudict", "cmudict.phones"),String),
|
Symbol.(first.(split.(split(read(deps("cmudict", "cmudict.phones"),String),
|
||||||
"\n", keepempty = false), "\t")))
|
"\n", keepempty = false), "\t")))
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
symbols()
|
||||||
|
|
||||||
|
Return a `Vector` containing the symbols used in the dataset.
|
||||||
|
A symbol is a phone with optional auxiliary symbols, indicating for example the
|
||||||
|
amount of stress on the phone.
|
||||||
|
"""
|
||||||
function symbols()
|
function symbols()
|
||||||
load()
|
load()
|
||||||
Symbol.(split(read(deps("cmudict", "cmudict.symbols"),String),
|
Symbol.(split(read(deps("cmudict", "cmudict.symbols"),String),
|
||||||
"\n", keepempty = false))
|
"\n", keepempty = false))
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
rawdict()
|
||||||
|
|
||||||
|
Return the unfiltered CMU Pronouncing Dictionary.
|
||||||
|
"""
|
||||||
function rawdict()
|
function rawdict()
|
||||||
load()
|
load()
|
||||||
Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in
|
Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in
|
||||||
|
@ -44,6 +61,14 @@ end
|
||||||
|
|
||||||
validword(s) = isascii(s) && occursin(r"^[\w\-\.]+$", s)
|
validword(s) = isascii(s) && occursin(r"^[\w\-\.]+$", s)
|
||||||
|
|
||||||
|
"""
|
||||||
|
cmudict()
|
||||||
|
|
||||||
|
Return a filtered CMU Pronouncing Dictionary.
|
||||||
|
|
||||||
|
It is filtered so each word contains only ASCII characters and a combination of
|
||||||
|
word characters (as determined by the regex engine using `\\w`), '-' and '.'.
|
||||||
|
"""
|
||||||
cmudict() = filter(p -> validword(p.first), rawdict())
|
cmudict() = filter(p -> validword(p.first), rawdict())
|
||||||
|
|
||||||
alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.']
|
alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.']
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
"Stanford Sentiment Treebank dataset."
|
||||||
module Sentiment
|
module Sentiment
|
||||||
|
|
||||||
using ZipFile
|
using ZipFile
|
||||||
|
@ -39,8 +40,28 @@ function gettrees(name)
|
||||||
return parsetree.(ss)
|
return parsetree.(ss)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
train()
|
||||||
|
|
||||||
|
Return the train split of the Stanford Sentiment Treebank.
|
||||||
|
The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
|
||||||
|
"""
|
||||||
train() = gettrees("train")
|
train() = gettrees("train")
|
||||||
|
|
||||||
|
"""
|
||||||
|
test()
|
||||||
|
|
||||||
|
Return the test split of the Stanford Sentiment Treebank.
|
||||||
|
The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
|
||||||
|
"""
|
||||||
test() = gettrees("test")
|
test() = gettrees("test")
|
||||||
|
|
||||||
|
"""
|
||||||
|
dev()
|
||||||
|
|
||||||
|
Return the dev split of the Stanford Sentiment Treebank.
|
||||||
|
The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
|
||||||
|
"""
|
||||||
dev() = gettrees("dev")
|
dev() = gettrees("dev")
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue