Flux.jl/src/data/sentiment.jl

46 lines
1.0 KiB
Julia
Raw Normal View History

2017-11-02 11:41:28 +00:00
module Sentiment
2017-11-06 12:01:47 +00:00
using ZipFile
2017-11-02 11:41:28 +00:00
using ..Data: deps
"""
    load()

Download the sentiment treebank archive to `deps("sentiment.zip")` unless a
file already exists at that path, in which case nothing is done.
"""
function load()
    archive = deps("sentiment.zip")
    # Skip the download when the archive is already present.
    isfile(archive) && return
    @info "Downloading sentiment treebank dataset"
    return download("https://cache.julialang.org/https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip",
                    archive)
end
"""
    getfile(r, name)

Return the first element of `r.files` whose `name` field equals `name`.

Throws an `ArgumentError` naming the missing entry when no element matches.
"""
function getfile(r, name)
    idx = findfirst(x -> x.name == name, r.files)
    # `findfirst` yields `nothing` when there is no match; fail with a
    # message that says which entry was being looked for.
    idx === nothing &&
        throw(ArgumentError("no file named \"$name\" found in archive"))
    return r.files[idx]
end
2017-11-06 12:01:47 +00:00
"""
    getfile(name)

Open the archive at `deps("sentiment.zip")` and return the full contents of
its `trees/<name>` entry as a `String`.
"""
function getfile(name)
    r = ZipFile.Reader(deps("sentiment.zip"))
    try
        # `read(io, String)` replaces `readstring`, which Julia 1.0 removed.
        return read(getfile(r, "trees/$name"), String)
    finally
        # Close the reader even if the lookup or the read throws.
        close(r)
    end
end
2018-02-13 15:45:33 +00:00
using ..Data: Tree
2017-11-06 12:01:47 +00:00
# Two-argument form: build a `Tree{Any}` with no children whose value is the
# tuple `(parse(Int, n), w)`.
function totree_(n, w)
    label = parse(Int, n)
    return Tree{Any}((label, w))
end

# Three-argument form: build a `Tree{Any}` whose value is
# `(parse(Int, n), nothing)` and whose two children are the converted
# sub-expressions `a` and `b`.
function totree_(n, a, b)
    label = parse(Int, n)
    return Tree{Any}((label, nothing), totree(a), totree(b))
end

# Convert a parsed expression into a tree by splatting its arguments into
# `totree_`, which dispatches on the number of arguments.
function totree(t::Expr)
    return totree_(t.args...)
end
"""
    parsetree(s)

Convert one line of parenthesised tree text `s` into a tree via `totree`.

The text is rewritten into Julia tuple syntax and parsed (never evaluated):
every `\$` is escaped, every run of characters that are neither whitespace nor
parentheses is wrapped in double quotes, and each space becomes `", "`.
"""
function parsetree(s)
    # Escape `\$` so the parser does not treat it as string interpolation
    # once the token is wrapped in quotes.
    s = replace(s, "\$" => "\\\$")
    # Quote every token (anything that is not whitespace or a parenthesis).
    # NOTE(review): backslashes and double quotes inside tokens are not
    # escaped here — confirm the input never contains them.
    s = replace(s, r"[^\s\(\)]+" => tok -> "\"$tok\"")
    # Separate tuple elements with commas.
    s = replace(s, " " => ", ")
    # `Meta.parse` replaces the bare `parse(::String)`, which Julia 1.0 removed.
    return totree(Meta.parse(s))
end
"""
    gettrees(name)

Run `load()`, read `<name>.txt` through `getfile`, and return the result of
applying `parsetree` to each non-empty line.
"""
function gettrees(name)
    load()
    # `keepempty` replaces the `keep` keyword, which Julia 1.0 removed;
    # empty lines (e.g. after a trailing newline) are dropped.
    lines = split(getfile("$name.txt"), '\n', keepempty = false)
    return parsetree.(lines)
end
"""
    train()

Return the trees produced by `gettrees("train")`.
"""
function train()
    return gettrees("train")
end

"""
    test()

Return the trees produced by `gettrees("test")`.
"""
function test()
    return gettrees("test")
end

"""
    dev()

Return the trees produced by `gettrees("dev")`.
"""
function dev()
    return gettrees("dev")
end
2017-11-02 11:41:28 +00:00
end