# julia_coursera/Week4_Working_with_data.jl

# 202 lines
# 4.8 KiB
# Julia

# WORKING WITH DATA
# -----------------
# I Distributions
# * Data point values for a distribution usually follow a pattern
# * Such patterns are called distributions
# * Distributions are either discrete or continuous
# * The Distributions.jl package contains most of the common
# data distributions
# 1 Importing Distributions.jl
using Distributions
using Random
# 2 The standard normal distribution
# Fix the seed of the global pseudo-random number generator so the draws repeat
Random.seed!(1234)
# Keep the standard normal distribution in an object and reuse it below
n = Normal()
# Parameter values of n
params(n)
# Draw 10 elements at random from n
var1 = rand(n, 10)
# Sample mean and sample standard deviation of var1
mean(var1)
std(var1)
# Value of the probability density function at x = 0.3
pdf(n, 0.3)
# Value of the cumulative distribution function at x = 0.25
cdf(n, 0.25)
# Quantiles at probabilities 0.025 and 0.975
quantile(n, 0.025)
quantile(n, 0.975)
# 3 The normal distribution
# Field names of the Normal type, i.e. the names of its parameters
fieldnames(Normal)
# Draw 100 data point values from a normal distribution
# with a mean of 100 and a standard deviation of 10
var2 = rand(Normal(100, 10), 100)
# Sample mean and sample standard deviation of var2
mean(var2)
std(var2)
# Estimate the parameters of a Normal distribution from the sample with fit()
fit(Normal, var2)
# 3 Skewness and kurtosis
skewness(var2)
kurtosis(var2)
# 4 Beta distribution
b = Beta(1, 1)
params(b)
# Draw 100 values from b, then fit a Beta distribution back to them
var3 = rand(b, 100)
fit(Beta, var3)
# 5 χ2 distribution
c = Chisq(1)
# Field names of the Chisq type (degrees of freedom)
fieldnames(Chisq)
var4 = rand(c, 100)
# 6 Distribution types are hierarchical
# `subtypes` is provided by the InteractiveUtils standard library. The REPL
# loads it automatically, but a script run with `julia file.jl` does not, so
# it is loaded explicitly here.
using InteractiveUtils
# Immediate parent type of Normal
supertype(Normal)
# Direct subtypes of the continuous and the discrete univariate distribution types
subtypes(Distribution{Univariate,Continuous})
subtypes(Distribution{Univariate,Discrete})
# * Search for help in the REPL
# II DataFrames
using DataFrames
# * Allows for creation of a flat data structure (rows and columns)
# * Columns are variables
# * Rows are subjects (examples)
# 1 Create a DataFrame
typeof(var2)
# Create an empty DataFrame
df = DataFrame()
# 2 Add a column with data point values (rows)
# Columns are added with property syntax; the single-index form
# `df[:Var2] = ...` was removed from DataFrames.jl
df.Var2 = var2
# View first five rows (`first` replaces the removed `head`)
first(df, 5)
# 3 Add another column
df.Var3 = var3
# View last three rows (`last` replaces the removed `tail`)
last(df, 3)
# 4 Dimensions of a DataFrame
size(df)
rows = size(df, 1)
columns = size(df, 2)
# 5 Inspect content
# Per-column element type and missing count (`showcols` was removed)
describe(df, :eltype, :nmissing)
# Data type only (`eltypes` was removed)
eltype.(eachcol(df))
# Descriptive statistics
describe(df)
# Print in console
print(describe(df))
# 6 Create a bigger DataFrame
# Build all columns in one constructor call; the single-index assignment
# `df2[:A] = ...` was removed from DataFrames.jl. Keyword arguments are
# evaluated left to right, so the random draws happen in the same order
# as before (C, then D).
df2 = DataFrame(
    A = 1:10,
    B = ["I", "II", "II", "I", "II", "I", "II", "II", "I", "II"],
    C = rand(Normal(), 10),
    D = rand(Chisq(1), 10),
)
# 7 Slicing
# First three rows, all columns
df2[1:3, :]
# All rows, columns 1 and 3 selected by position
df2[:, [1, 3]]
# The same selection by column name
df2[:, [:A, :C]]
# III Importing data files
# 1 Set working directory in Atom settings under Julia tab
pwd()
# 2 Import CSV
using CSV
# 3 Import csv file (in same directory)
# CSV.read requires the sink type as second argument; passing DataFrame
# materializes the file as a DataFrame
data1 = CSV.read("CCS.csv", DataFrame)
# 4 Explore the data
typeof(data1)
# First five rows (`first` replaces the removed `head`)
first(data1, 5)
# Per-column element type and missing count (`showcols` was removed)
describe(data1, :eltype, :nmissing)
# Column element types (`eltypes` was removed)
eltype.(eachcol(data1))
# Descriptive statistics
describe(data1)
# 5 Combining DataFrames
# Creating DataFrames that share the key column :Number
subjects = DataFrame(Number = [100, 101, 102, 103], Stage = ["I", "III", "II", "I"])
treatment = DataFrame(Number = [103, 102, 101, 100], Treatment = ["A", "B", "A", "B"])
subjects
treatment
# Joining
# `join(...; kind = ...)` was removed from DataFrames.jl; each join kind is
# now its own function. The old default kind was :inner.
df3 = innerjoin(subjects, treatment, on = :Number);
df3
# Adding a longer list of subjects (104 and 105 have no treatment row)
subjects = DataFrame(Number = [100, 101, 102, 103, 104, 105], Stage = ["I", "III", "II", "I", "II", "II"])
# Inner join: only subjects present in both tables
df4 = innerjoin(subjects, treatment, on = :Number);
df4
# Outer join: empty fields filled with missing
df5 = outerjoin(subjects, treatment, on = :Number);
df5
# 6 Grouping
# Creating a new DataFrame with a categorical column and two numeric columns
df6 = DataFrame(Group = rand(["A", "B", "C"], 15), Variable1 = randn(15), Variable2 = rand(15));
df6
# Split-apply-combine: `by` was removed from DataFrames.jl and is replaced
# by combine(groupby(...), ...). Here `size` is applied to each group.
combine(groupby(df6, :Group), size)
# Count rows per unique value in the :Group column
combine(groupby(df6, :Group), nrow => :Count)
# Descriptive statistics per group (`aggregate` was removed): mean and
# standard deviation of each numeric column
print(combine(
    groupby(df6, :Group),
    [:Variable1, :Variable2] .=> mean,
    [:Variable1, :Variable2] .=> std,
))
# Group
groupby(df6, :Group)
# Number of groups
length(groupby(df6, :Group))
# Second group
groupby(df6, :Group)[2]
# 7 Selection
# Boolean mask: true where Variable1 is positive
df6.Variable1 .> 0
# Non-copying view of the matching rows (`view` replaces the removed `sub`)
view(df6, df6.Variable1 .> 0, :)
# 8 New DataFrame by selection
# Row indexing copies the data, so df6A is not disturbed by the in-place
# sorts of df6 below (a view would silently show different rows afterwards)
df6A = df6[df6.Group .== "A", :];
df6A
# 9 Sorting
# sort! takes the columns positionally (the `cols` keyword was removed) and
# reorders df6 itself; the result is copied so df6S keeps this ordering
# even after df6 is sorted again
df6S = copy(sort!(df6, [:Group, :Variable1], rev = true));
df6S
# Per-column direction: Group and Variable1 ascending, Variable2 descending
df6S2 = copy(sort!(df6, [:Group, :Variable1, :Variable2], rev = [false, false, true]));
df6S2
# 10 Unique rows
# Creating a DataFrame with an obvious duplicate row (rows 2 and 3)
df7 = DataFrame(A = [1, 2, 2, 3, 4, 5], B = [11, 12, 12, 13, 14, 15], C = ["A", "B", "B", "C", "D", "E"]);
df7
# Only unique rows; returns a new DataFrame and leaves df7 unchanged
unique(df7)
df7
# Permanent change: drops the duplicate row from df7 itself
unique!(df7)
df7
# 11 Delete rows
# Permanently remove rows 1 and 5 of the five remaining rows
# (`deleteat!` replaces the removed `deleterows!`)
deleteat!(df7, [1, 5])
df7