From 197a1a70c09deba9f4d5ae1bf74bc12a86314288 Mon Sep 17 00:00:00 2001
From: pranjaldatta
Date: Fri, 7 Feb 2020 03:47:19 +0530
Subject: [PATCH 1/2] added BostonHousing dataset and testing

---
 src/data/Data.jl    |   3 +
 src/data/housing.jl | 160 ++++++++++++++++++++++++++++++++++++++++++++
 test/data.jl        |   7 ++
 3 files changed, 170 insertions(+)
 create mode 100644 src/data/housing.jl

diff --git a/src/data/Data.jl b/src/data/Data.jl
index d7cd0303..88af9549 100644
--- a/src/data/Data.jl
+++ b/src/data/Data.jl
@@ -42,4 +42,7 @@ using .Sentiment
 include("iris.jl")
 export Iris
 
+include("housing.jl")
+export Housing
+
 end
diff --git a/src/data/housing.jl b/src/data/housing.jl
new file mode 100644
index 00000000..0d167dc0
--- /dev/null
+++ b/src/data/housing.jl
@@ -0,0 +1,160 @@
+"""
+    Housing
+
+Loader for the Boston Housing dataset.
+
+1. Title: Boston Housing Data
+
+2. Sources:
+   (a) Origin:  This dataset was taken from the StatLib library which is
+                maintained at Carnegie Mellon University.
+   (b) Creator: Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the
+                demand for clean air', J. Environ. Economics & Management,
+                vol.5, 81-102, 1978.
+   (c) Date: July 7, 1993
+
+3. Number of Instances: 506
+
+4. Number of Attributes: 13 continuous attributes (including "class"
+                         attribute "MEDV"), 1 binary-valued attribute.
+
+5. Attribute Information:
+
+    1. CRIM      per capita crime rate by town
+    2. ZN        proportion of residential land zoned for lots over
+                 25,000 sq.ft.
+    3. INDUS     proportion of non-retail business acres per town
+    4. CHAS      Charles River dummy variable (= 1 if tract bounds
+                 river; 0 otherwise)
+    5. NOX       nitric oxides concentration (parts per 10 million)
+    6. RM        average number of rooms per dwelling
+    7. AGE       proportion of owner-occupied units built prior to 1940
+    8. DIS       weighted distances to five Boston employment centres
+    9. RAD       index of accessibility to radial highways
+    10. TAX      full-value property-tax rate per 10,000 dollars
+    11. PTRATIO  pupil-teacher ratio by town
+    12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks
+                 by town
+    13. LSTAT    % lower status of the population
+    14. MEDV     Median value of owner-occupied homes in 1000's of dollars
+
+Downloaded From: https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data
+"""
+module Housing
+
+using DelimitedFiles
+using ..Data: deps, download_and_verify
+
+#Uncomment if package exists
+#const cache_prefix = "https://cache.julialang.org/"
+const cache_prefix = ""
+
+"""
+    load()
+
+Make sure the comma-separated copy of the dataset exists at
+`deps("housing.data")`, downloading and converting it on first use.
+
+The download is converted by replacing each run of spaces with a comma. It is
+fetched into a separate file, and the result is moved to `deps("housing.data")`
+only once the conversion has finished, so an interrupted run does not leave an
+unconverted file under the final name.
+"""
+function load()
+    target = deps("housing.data")
+    isfile(target) && return
+
+    raw = deps("housing.data.raw")
+    converted = deps("housing.data.tmp")
+
+    @info "Downloading the Boston housing Dataset"
+    download_and_verify("$(cache_prefix)https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
+                        raw,
+                        "baadf72995725d76efe787b664e1f083388c79ba21ef9a7990d87f774184735a")
+
+    # "w" truncates whatever an earlier, failed conversion may have left behind.
+    open(converted, "w") do fout
+        for line in eachline(raw)
+            println(fout, replace(lstrip(line), r" +" => ","))
+        end
+    end
+    mv(converted, target, force=true)
+    rm(raw, force=true)
+    return nothing
+end
+
+# Read the cached table as a `Float64` matrix, fetching the data first if needed.
+function readtable()
+    load()
+    return readdlm(deps("housing.data"), ',', Float64)
+end
+
+"""
+    targets()
+
+Get the targets for the Boston housing dataset, a 506×1 `Float64` matrix holding
+the target (the last column, MEDV) for each example.
+
+```julia-repl
+julia> using Flux
+
+julia> target = Flux.Data.Housing.targets();
+
+julia> summary(target)
+506×1 Array{Float64,2}
+
+julia> target[1]
+24.0
+```
+"""
+function targets()
+    housing = readtable()
+    return housing[:, end:end]
+end
+
+"""
+    feature_names()
+
+Get the names of the features provided in the dataset, in column order.
+"""
+function feature_names()
+    return ["crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax",
+            "ptratio", "b", "lstat"]
+end
+
+"""
+    features()
+
+Get the features of the Boston Housing Dataset. This is a 506x13 Matrix of Float64
+datatypes with one row per example; the columns are in the order
+["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"].
+
+```julia-repl
+julia> using Flux
+
+julia> features = Flux.Data.Housing.features();
+
+julia> summary(features)
+506×13 Array{Float64,2}
+
+julia> features[1, :]
+13-element Array{Float64,1}:
+   0.00632
+  18.0
+   2.31
+   0.0
+   0.538
+   ⋮
+ 296.0
+  15.3
+ 396.9
+   4.98
+```
+"""
+function features()
+    housing = readtable()
+    return housing[:, 1:end-1]
+end
+
+
+end
\ No newline at end of file
diff --git a/test/data.jl b/test/data.jl
index 6b777873..aa913806 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -16,7 +16,14 @@ using Test
 @test Data.Sentiment.train() isa Vector{Data.Tree{Any}}
 
 @test Iris.features() isa Matrix
 @test size(Iris.features()) == (4,150)
 
 @test Iris.labels() isa Vector{String}
 @test size(Iris.labels()) == (150,)
+
+@test Housing.features() isa Matrix
+@test size(Housing.features()) == (506, 13)
+@test length(Housing.feature_names()) == 13
+
+@test Housing.targets() isa Array{Float64}
+@test size(Housing.targets()) == (506, 1)
\ No newline at end of file
From 569021a9f1f9910f7f2e9ac6869bb149b9da7023 Mon Sep 17 00:00:00 2001
From: pranjaldatta
Date: Wed, 26 Feb 2020 15:05:23 +0530
Subject: [PATCH 2/2] added newlines at end of file

---
 src/data/housing.jl | 2 +-
 test/data.jl        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/data/housing.jl b/src/data/housing.jl
index 0d167dc0..61391304 100644
--- a/src/data/housing.jl
+++ b/src/data/housing.jl
@@ -157,4 +157,4 @@ function features()
 end
 
 
-end
\ No newline at end of file
+end
diff --git a/test/data.jl b/test/data.jl
index aa913806..6c012a93 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -26,4 +26,4 @@ using Test
 @test length(Housing.feature_names()) == 13
 
 @test Housing.targets() isa Array{Float64}
-@test size(Housing.targets()) == (506, 1)
\ No newline at end of file
+@test size(Housing.targets()) == (506, 1)