Flux.jl/src/data/sentiment.jl

47 lines
1.2 KiB
Julia
Raw Normal View History

2017-11-02 11:41:28 +00:00
module Sentiment
2017-11-06 12:01:47 +00:00
using ZipFile
2019-01-30 11:56:54 +00:00
using ..Data: deps, download_and_verify
2017-11-02 11:41:28 +00:00
"""
    load()

Make sure the sentiment treebank archive exists at `deps("sentiment.zip")`.

Returns right away when a file is already present at that path. Otherwise an
info message is logged and the source URL, the target path and a hex digest
string are handed to `download_and_verify` (presumably the expected checksum of
the archive — confirm against `download_and_verify`).
"""
function load()
    if isfile(deps("sentiment.zip"))
        return
    end
    @info "Downloading sentiment treebank dataset"
    url = "https://cache.julialang.org/https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip"
    digest = "5c613a4f673fc74097d523a2c83f38e0cc462984d847b82c7aaf36b01cbbbfcc"
    return download_and_verify(url, deps("sentiment.zip"), digest)
end
"""
    getfile(r, name)

Return the first element of `r.files` whose `name` field equals `name`.

# Throws
- `ArgumentError` naming the requested entry when no element matches.
"""
function getfile(r, name)
    idx = findfirst(f -> f.name == name, r.files)
    # Indexing with `nothing` would also raise an `ArgumentError`, but its
    # message ("invalid index: nothing") does not say which entry was missing.
    idx === nothing && throw(ArgumentError("no entry named \"$name\" found in archive"))
    return r.files[idx]
end
2017-11-06 12:01:47 +00:00
"""
    getfile(name)

Open the archive at `deps("sentiment.zip")`, read the entry `trees/<name>` and
return its contents as a `String`.

The archive reader is closed before returning, including when looking up or
reading the entry throws.
"""
function getfile(name)
    r = ZipFile.Reader(deps("sentiment.zip"))
    try
        return read(getfile(r, "trees/$name"), String)
    finally
        # Release the archive handle even if the entry is missing or the read fails.
        close(r)
    end
end
2018-02-13 15:45:33 +00:00
using ..Data: Tree
2017-11-06 12:01:47 +00:00
# Leaf node: `n` is parsed as an `Int` and stored together with the word `w`.
function totree_(n, w)
    label = parse(Int, n)
    return Tree{Any}((label, w))
end

# Inner node: `n` is parsed as an `Int` and stored with `nothing` in the word
# slot; `a` and `b` are converted recursively and passed as further arguments.
function totree_(n, a, b)
    label = parse(Int, n)
    return Tree{Any}((label, nothing), totree(a), totree(b))
end

# Convert a parsed expression by splatting its arguments into `totree_`, which
# picks the leaf (2-argument) or inner-node (3-argument) method.
function totree(t::Expr)
    return totree_(t.args...)
end
"""
    parsetree(s)

Convert the textual tree `s` into a tree via `totree`.

The text is rewritten, in this order, so that `Meta.parse` can read it:
backslashes are removed, `\$` characters are backslash-escaped (so they are not
read as string interpolation), every run of characters other than space,
newline and parentheses is wrapped in double quotes, and each space becomes
`", "`. The parsed expression is then passed to `totree`.
"""
function parsetree(s)
    # The order of these rewrites matters: each one works on the previous result.
    unescaped = replace(s, "\\" => "")
    dollarsafe = replace(unescaped, "\$" => "\\\$")
    quoted = replace(dollarsafe, r"[^ \n\(\)]+" => tok -> "\"$tok\"")
    separated = replace(quoted, " " => ", ")
    expr = Meta.parse(separated)
    return totree(expr)
end
"""
    gettrees(name)

Call `load()`, read `<name>.txt` through `getfile`, split the text on newlines
(dropping empty pieces) and return the result of `parsetree` for each piece.
"""
function gettrees(name)
    load()
    text = getfile("$name.txt")
    rows = split(text, '\n', keepempty = false)
    return map(parsetree, rows)
end
"""
    train()

Return `gettrees("train")`.
"""
function train()
    return gettrees("train")
end

"""
    test()

Return `gettrees("test")`.
"""
function test()
    return gettrees("test")
end

"""
    dev()

Return `gettrees("dev")`.
"""
function dev()
    return gettrees("dev")
end
2017-11-02 11:41:28 +00:00
end