68 lines
1.7 KiB
Julia
68 lines
1.7 KiB
Julia
"Stanford Sentiment Treebank dataset."
|
|
module Sentiment
|
|
|
|
using ZipFile
|
|
using ..Data: deps, download_and_verify
|
|
|
|
# Make sure the treebank archive exists at `deps("sentiment.zip")`.
# Returns `nothing` when the file is already there; otherwise logs a message and
# returns whatever `download_and_verify` returns.
function load()
    if !isfile(deps("sentiment.zip"))
        @info "Downloading sentiment treebank dataset"
        url = "https://cache.julialang.org/https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip"
        # NOTE(review): presumably the expected checksum of the download —
        # confirm against `download_and_verify`.
        checksum = "5c613a4f673fc74097d523a2c83f38e0cc462984d847b82c7aaf36b01cbbbfcc"
        return download_and_verify(url, deps("sentiment.zip"), checksum)
    end
    return nothing
end
|
|
|
|
"""
    getfile(r, name)

Return the first element of `r.files` whose `name` field equals `name`.

Throws an `ArgumentError` naming the missing entry when nothing matches.
"""
function getfile(r, name)
    idx = findfirst(entry -> entry.name == name, r.files)
    # `findfirst` yields `nothing` on a miss; indexing with it would raise an
    # `ArgumentError` about an "invalid index", which hides the real problem.
    # Keep the exception type but say which entry was not found.
    idx === nothing &&
        throw(ArgumentError("no entry named \"$name\" found in archive"))
    return r.files[idx]
end
|
|
|
|
"""
    getfile(name)

Open the zip archive at `deps("sentiment.zip")`, read the entry `trees/<name>`
and return its contents as a `String`.

The archive reader is closed before returning, including when the lookup or
the read throws.
"""
function getfile(name)
    r = ZipFile.Reader(deps("sentiment.zip"))
    try
        return read(getfile(r, "trees/$name"), String)
    finally
        # Runs on both the normal and the exceptional path, so the reader is
        # never left open.
        close(r)
    end
end
|
|
|
|
using ..Data: Tree
|
|
|
|
# Two-argument form: `n` is parsed as an `Int` and paired with `w` as the
# node's value.
function totree_(n, w)
    label = parse(Int, n)
    return Tree{Any}((label, w))
end

# Three-argument form: the node's value is `(label, nothing)` and the two
# remaining arguments are converted recursively and passed on to `Tree{Any}`.
# NOTE(review): presumably the trailing arguments are the node's children —
# confirm against the `Tree` constructor.
function totree_(n, a, b)
    label = parse(Int, n)
    return Tree{Any}((label, nothing), totree(a), totree(b))
end

# Splat the expression's arguments so dispatch on their count selects the
# matching `totree_` method.
function totree(t::Expr)
    return totree_(t.args...)
end
|
|
|
|
"""
    parsetree(s)

Convert one parenthesised tree string, e.g. `(3 (2 It) (4 works))`, into a tree
by rewriting it as Julia tuple syntax, parsing it with `Meta.parse` and handing
the resulting expression to `totree`.

Every run of characters other than space, newline and parentheses is turned
into a string literal; spaces become tuple separators. The text is only parsed,
never evaluated.
"""
function parsetree(s)
    # Drop all backslashes so the only ones left afterwards are the escapes
    # added below.
    stripped = replace(s, "\\" => "")
    # `\$` would start interpolation inside a string literal; escape it.
    escaped = replace(stripped, "\$" => "\\\$")
    # A bare `"` inside a token would terminate the literal early and make the
    # rewritten source unparsable; escape it as well.
    escaped = replace(escaped, "\"" => "\\\"")
    # Wrap every token in double quotes.
    quoted = replace(escaped, r"[^ \n\(\)]+" => tok -> "\"$tok\"")
    # Separate tuple elements with commas.
    tuplesrc = replace(quoted, " " => ", ")
    return totree(Meta.parse(tuplesrc))
end
|
|
|
|
# Load the split called `name`: ensure the archive is present, read
# `<name>.txt` from it, and parse each non-empty line into a tree.
function gettrees(name)
    load()
    contents = getfile(string(name, ".txt"))
    lines = split(contents, '\n'; keepempty = false)
    return map(parsetree, lines)
end
|
|
|
|
"""
|
|
    train()
|
|
|
|
Return the train split of the Stanford Sentiment Treebank.
|
|
The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
|
|
"""
|
|
function train()
    # Delegate to the shared loader with the "train" split name.
    return gettrees("train")
end
|
|
|
|
"""
|
|
    test()
|
|
|
|
Return the test split of the Stanford Sentiment Treebank.
|
|
The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
|
|
"""
|
|
function test()
    # Delegate to the shared loader with the "test" split name.
    return gettrees("test")
end
|
|
|
|
"""
|
|
    dev()
|
|
|
|
Return the dev split of the Stanford Sentiment Treebank.
|
|
The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
|
|
"""
|
|
function dev()
    # Delegate to the shared loader with the "dev" split name.
    return gettrees("dev")
end
|
|
|
|
end
|