Flux.jl/src/data/sentiment.jl
2020-04-04 18:16:46 +02:00

68 lines
1.7 KiB
Julia

"Stanford Sentiment Treebank dataset."
module Sentiment
using ZipFile
using ..Data: deps, download_and_verify
"""
    load()

Make sure the sentiment treebank archive is available at `deps("sentiment.zip")`.

Returns immediately when that file already exists. Otherwise an info message is
logged and the archive is fetched through `download_and_verify`, which is handed the
source URL, the destination path and the expected hash string.
"""
function load()
    if isfile(deps("sentiment.zip"))
        return
    end
    @info "Downloading sentiment treebank dataset"
    url = "https://cache.julialang.org/https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip"
    expected_hash = "5c613a4f673fc74097d523a2c83f38e0cc462984d847b82c7aaf36b01cbbbfcc"
    return download_and_verify(url, deps("sentiment.zip"), expected_hash)
end
"""
    getfile(r, name)

Return the first element of `r.files` whose `name` field equals `name`.

# Throws
- `ArgumentError` if no element of `r.files` has that name.
"""
function getfile(r, name)
    idx = findfirst(entry -> entry.name == name, r.files)
    # `findfirst` yields `nothing` when there is no match; indexing with it would fail
    # with an opaque "invalid index" message, so report the missing entry explicitly.
    if idx === nothing
        throw(ArgumentError("no entry named \"$name\" found in archive"))
    end
    return r.files[idx]
end
"""
    getfile(name)

Read the entry `trees/<name>` from the archive at `deps("sentiment.zip")` and return
its contents as a `String`.

The archive reader is closed before returning, including when looking up or reading
the entry throws.
"""
function getfile(name)
    r = ZipFile.Reader(deps("sentiment.zip"))
    try
        return read(getfile(r, "trees/$name"), String)
    finally
        # Runs on both the normal and the error path, so the reader is never leaked.
        close(r)
    end
end
using ..Data: Tree
# Leaf case: `n` is the label as a string, `w` is stored next to the parsed label.
function totree_(n, w)
    label = parse(Int, n)
    return Tree{Any}((label, w))
end

# Branch case: the node's value is `(label, nothing)`, and the two sub-expressions
# are converted recursively to become its children.
function totree_(n, a, b)
    label = parse(Int, n)
    left = totree(a)
    right = totree(b)
    return Tree{Any}((label, nothing), left, right)
end

# Convert a parsed expression into a tree; the number of arguments of the expression
# (two or three) selects the leaf or branch method above.
function totree(t::Expr)
    return totree_(t.args...)
end
# Turn one line of treebank text into a tree.
#
# The text is rewritten step by step into something `Meta.parse` accepts as a nested
# tuple expression, which is then converted by `totree`:
#   1. drop every backslash,
#   2. put a backslash in front of every dollar sign (so it is not read as
#      interpolation once the token sits inside a string literal),
#   3. wrap each run of characters other than space, newline and parentheses in
#      double quotes,
#   4. replace each space with a comma followed by a space.
function parsetree(s)
    unescaped = replace(s, "\\" => "")
    dollar_safe = replace(unescaped, "\$" => "\\\$")
    quoted = replace(dollar_safe, r"[^ \n\(\)]+" => tok -> "\"$tok\"")
    tuple_src = replace(quoted, " " => ", ")
    expr = Meta.parse(tuple_src)
    return totree(expr)
end
# Return the parsed trees stored in `<name>.txt`.
#
# `load()` is called first so the archive is in place, then the file's text is split
# on newlines (empty pieces dropped) and every piece is passed through `parsetree`.
function gettrees(name)
    load()
    contents = getfile("$name.txt")
    lines = split(contents, '\n', keepempty = false)
    return broadcast(parsetree, lines)
end
"""
    train()

Return the train split of the Stanford Sentiment Treebank, with one parsed tree per
non-empty line of the split's text file.

The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
"""
function train()
    return gettrees("train")
end
"""
    test()

Return the test split of the Stanford Sentiment Treebank, with one parsed tree per
non-empty line of the split's text file.

The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
"""
function test()
    return gettrees("test")
end
"""
    dev()

Return the dev split of the Stanford Sentiment Treebank, with one parsed tree per
non-empty line of the split's text file.

The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format.
"""
function dev()
    return gettrees("dev")
end
end