Add missing docstrings to src/data.

2019-08-31 12:49:40 +02:00 · 2019-08-31 12:49:40 +02:00 · 740a59d0a6
commit 740a59d0a6
parent ba80c2e8ab
2 changed files with 46 additions and 0 deletions
--- a/src/data/cmudict.jl
+++ b/src/data/cmudict.jl
@ -24,18 +24,35 @@ function load()
  end
 end
 """
     phones()
 Return a `Vector` of `Symbol`s naming the phones used in the dataset.
 `load()` is called first, then the `cmudict.phones` file is read; every
 non-empty line contributes the text that precedes its first tab character.
 """
 function phones()
     load()
     contents = read(deps("cmudict", "cmudict.phones"), String)
     rows = split(contents, "\n", keepempty = false)
     # Only the first tab-separated column of each row is the phone name.
     return [Symbol(first(split(row, "\t"))) for row in rows]
 end
 """
     symbols()
 Return a `Vector` of `Symbol`s, one per non-empty line of `cmudict.symbols`.
 A symbol is a phone with optional auxiliary symbols, indicating for example the
 amount of stress on the phone. `load()` is called before the file is read.
 """
 function symbols()
     load()
     path = deps("cmudict", "cmudict.symbols")
     entries = split(read(path, String), "\n", keepempty = false)
     return map(Symbol, entries)
 end
 """
    rawdict()
 Return the unfiltered CMU Pronouncing Dictionary.
 """
 function rawdict()
  load()
  Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in
@ -44,6 +61,14 @@ end
 # validword(s)
 # Return `true` when `s` is a non-empty, ASCII-only string made up exclusively of
 # word characters (letters, digits, underscore), '-' and '.'; otherwise `false`.
 # The pattern is anchored with `\z` (absolute end of string) rather than `$`:
 # `$` also matches just before a trailing newline, which would let a string
 # such as "ABC\n" through even though it contains a disallowed character.
 validword(s) = isascii(s) && occursin(r"^[\w\-\.]+\z", s)
 """
     cmudict()
 Return the CMU Pronouncing Dictionary restricted to well-formed words.
 Starting from `rawdict()`, only the entries whose word passes `validword` are
 kept: the word must be pure ASCII and consist solely of word characters (as
 determined by the regex engine using `\\w`), '-' and '.'.
 """
 function cmudict()
     return filter(rawdict()) do entry
         validword(first(entry))
     end
 end
 # alphabet()
 # Return a `Vector{Char}` holding the uppercase letters 'A' to 'Z', the digits
 # '0' to '9', and the characters '_', '-' and '.', in that order.
 function alphabet()
     chars = collect('A':'Z')
     append!(chars, '0':'9')
     push!(chars, '_', '-', '.')
     return chars
 end
--- a/src/data/sentiment.jl
+++ b/src/data/sentiment.jl
@ -1,3 +1,4 @@
 "Stanford Sentiment Treebank dataset."
 module Sentiment
 using ZipFile
@ -39,8 +40,28 @@ function gettrees(name)
  return parsetree.(ss)
 end
 """
     train()
 Return the training split of the Stanford Sentiment Treebank, as produced by
 `gettrees("train")`.
 The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
 """
 function train()
     return gettrees("train")
 end
 """
     test()
 Return the test split of the Stanford Sentiment Treebank, as produced by
 `gettrees("test")`.
 The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
 """
 function test()
     return gettrees("test")
 end
 """
     dev()
 Return the dev (validation) split of the Stanford Sentiment Treebank, as
 produced by `gettrees("dev")`.
 The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
 """
 function dev()
     return gettrees("dev")
 end
 end