[ADD] honors material

This commit is contained in:
Eduardo Cueto Mendoza 2020-05-30 15:58:12 -06:00
parent f1f66b5d2e
commit 9c0c146183
7 changed files with 7720 additions and 0 deletions

2223
Week4_Collections.ipynb Normal file

File diff suppressed because it is too large Load Diff

244
Week4_Collections.jl Normal file
View File

@ -0,0 +1,244 @@
# COLLECTIONS
# -----------
# * Collections are groups of elements
# * Elements are values of different Julia data types
# * Storing elements in collections is one of the most useful
# operations in computing
# I ARRAYS
# * Arrays are collections of values separated with commas and
# placed inside square brackets
# * They are represented in column or in row form
# 1 Like a column vector (click on the downward arrow)
#Comma-separated values build a one-dimensional Vector
array1 = [1, 2, 3]
typeof(array1)
# 2 Like a row vector (click on the downward arrow)
#Space-separated values build a 1 x 3 Matrix
array2 = [1 2 3]
typeof(array2)
# 3 The transpose converts between the two
transpose(array1)
#The apostrophe is an alternative notation
#(it is the adjoint, which equals the transpose for real-valued arrays)
array1'
# 4 Boolean logic (==)
transpose(array1) == array1'
# 5 Data type promotion
#With a mix of types, all the elements are promoted to the "highest" common type
#Note that array2 is rebound here and no longer refers to the row vector above
array2 = [1, 2, 3.0]
#Index for one of the original integers will be Float64
array2[1]
# 6 Column-wise entry of multidimensional array
#Space-separated vectors are placed side by side, each becoming one column
array3 = [[1, 2, 3] [4, 5, 6] [7, 8, 9]]
# 7 Row-wise entry of multidimensional array
#Semicolon-separated row vectors are stacked, each becoming one row
array4 = [[1 2 3]; [4 5 6]; [7 8 9]]
# 8 Length of array (total number of elements)
length(array3)
length(array4)
# 9 Index order of column-wise array
#eachindex() yields every valid linear index of the array, so the bounds are
#never hard-coded; linear indices walk down each column in turn
for i in eachindex(array3)
    println("Element $(i) is ", array3[i])
end
# 10 Index order of row-wise array
#The storage order is still column by column, whatever the entry style was
for i in eachindex(array4)
    println("Element $(i) is ", array4[i])
end
# 11 Using repeat() to repeat elements
repeat([1, 2], 3)
# 12 Using range(start, step = ..., length = ...)
#step and length are keyword arguments; the result is a lazy range, not an array
range(1, step = 1, length = 10)
typeof(range(1, step = 1, length = 10))
# 13 Create collections using the collect() function
collect(range(1, step = 1, length = 10))
#Short-hand
collect(1:10)
typeof(1:10)
#Add step size (start:step:stop)
collect(2:2:10)
typeof(collect(2:2:10))
# 14 Creating empty array with two rows and three columns
#Every element is initialised to missing; the element type also allows Int
array5 = Array{Union{Missing, Int}}(missing, 2, 3)
# 15 Reshaping
#The same six elements viewed as three rows and two columns
reshape(array5, 3, 2)
# 16 Indexing (slicing)
#Random integers drawn uniformly from the closed domain [10,20]
#Shape 10 x 5
array6 = rand(10:20, 10, 5)
#All rows in first column
array6[:, 1]
#Rows two through five of second column
array6[2:5, 2]
#Values in rows 2, 4, 6, and in columns 1 and 5
array6[[2, 4, 6], [1, 5]]
#Values in row 1 from column 3 to the last column
array6[1, 3:end]
# Boolean logic (returning only true and false)
#The dot makes the comparison elementwise over the first column
array6[:, 1] .> 12
# 17 Changing element values
array7 = [1, 2, 3, 4, 5]
#Permanently append 10 to end of array (the ! marks a mutating function)
push!(array7, 10)
#Remove last element
#Only the removed value will be displayed
pop!(array7)
array7
#Change second element value to 1000
array7[2] = 1000
array7
# 18 List comprehension
array8 = [3 * i for i in 1:5]
#Two iteration variables build a matrix: a indexes the rows, b indexes the columns
array9 = [a * b for a in 1:3, b in 1:3]
# 19 Arithmetic on arrays
#Elementwise addition of a scalar using dot notation
array8 .+ 1
#Elementwise addition of similar sized arrays
#(array7 is back to five elements after the push!/pop! pair above)
array7 + array8
# 20 Missing
# * missing is the single value of the Julia data type Missing
# * Provides a placeholder for missing data in a statistical sense
# * Propagates automatically
# * Identity and equality can still be tested with === and isequal()
# * Sorting is possible since missing is seen as greater than other values
#Propagation
missing + 1
missing > 1
[1, 2, 3, missing, 5] + [10, 20, 30, 40 ,50]
#Cannot return true or false since value is not known
missing == missing
#Equality
missing === missing
isequal(missing, missing)
#Sorting with isless()
isless(1, missing)
isless(Inf, missing)
# 21 Array of integer zeros
array11 = zeros(Int8, 3, 3)
# 22 Array of floating point ones
array12 = ones(Float16, 3, 3)
# 23 Array of true (bit array) values
array13 = trues(3, 3)
# 24 Fill an array of the given dimensions with the value x
array14 = fill(10, 3, 3)
# 25 Convert elements to a different data type
#The dot applies convert() to each element in turn
convert.(Float16, array14)
# 26 Concatenation
#Concatenate arrays along dimension 1 (stacks them vertically into one longer column vector)
array15 = [1, 2, 3]
array16 = [10, 20, 30]
cat(array15, array16, dims = 1)
#Same as above
vcat(array15, array16)
#Concatenate arrays along dimension 2 (places them side by side as the columns of a matrix)
cat(array15, array16, dims = 2)
#Same as above
hcat(array15, array16)
# II TUPLES
# * Tuples are immutable collections
# 1 Tuples with mixed types
tuple1 = (1, 2, 3, 4, "Julia")
#For loop to look at value and type of each element
#eachindex() yields the valid indices of the tuple, so the bounds are not hard-coded
for i in eachindex(tuple1)
    println(" The value of the tuple at index number $(i) is $(tuple1[i]) and the type is $(typeof(tuple1[i])).")
end
# 2 Each element can be unpacked into its own variable
#The names on the left are assigned the tuple elements in order
a, b, c, seven = (1, 3, 5, 7)
a
seven
# 3 Reverse order index (can be done with arrays too)
#end:-1:1 walks from the last index down to the first
tuple1[end:-1:1]
# 4 Nested tuples of mixed length
tuple2 = ((1, 2, 3), 1, 2, (3, 100, 1))
#Element 4
tuple2[4]
#Element 2 in element 4
tuple2[4][2]
# III DICTIONARIES
# * Dictionaries are collections of key-value pairs
# 1 Example of a dictionary
dictionary1 = Dict(1 => 77, 2 => 66, 3 => 1)
#The => is shorthand for the Pair() function
dictionary2 = Dict(Pair(1,100), Pair(2,200), Pair(3,300))
# 2 Specifying types
#Dict{K, V} fixes the key type K and the value type V; Any accepts everything
dictionary3 = Dict{Any, Any}(1 => 77, 2 => 66, 3 => "three")
#We can get a bit crazy: keys of different types, including a tuple
dictionary4 = Dict{Any, Any}("a" => 1, (2, 3) => "hello")
# 3 Using symbols as keys
dictionary5 = Dict(:A => 300, :B => 305, :C => 309)
dictionary5[:A]
# 4 Using in() to check on key-value pairs
in((:A => 300), dictionary5)
# 5 Changing an existing value
dictionary5[:C] = 1000
dictionary5
# 6 Using the delete!() function
#Removes the entry with key :A from dictionary5 in place
delete!(dictionary5, :A)
# 7 The keys of a dictionary
keys(dictionary5)
# 8 The values of a dictionary
values(dictionary5)
# 9 Creating a dictionary with automatic keys
procedure_vals = ["Appendectomy", "Colectomy", "Cholecystectomy"]
#Concrete key and value types: every key and every value stored below is a String
procedure_dict = Dict{String,String}()
#pairs() yields (index, value) tuples; the index is used to build the key
for (position, procedure) in pairs(procedure_vals)
    procedure_dict["x_$(position)"] = procedure
end
#Procedure_dict is now a dictionary
procedure_dict
# 10 Iterating through a dictionary by key and value
#A Dict has no guaranteed iteration order, so the lines may print in any order
for (k, v) in procedure_dict
    println(k, " is ",v)
end
# 11 Sorting
dictionary6 = Dict("a"=> 1,"b"=>2 ,"c"=>3 ,"d"=>4 ,"e"=>5 ,"f"=>6)
# Sorting using a for loop
#The keys are collected into a vector first, because only that vector can be sorted
for k in sort(collect(keys(dictionary6)))
    println("$(k) is $(dictionary6[k])")
end

2252
Week4_Functions.ipynb Normal file

File diff suppressed because it is too large Load Diff

104
Week4_Functions.jl Normal file
View File

@ -0,0 +1,104 @@
#Operators are ordinary functions: + can be called with prefix (function call) syntax
+(2, 2)
#Print the Julia version and platform details of the running session
versioninfo()
# FUNCTIONS IN JULIA 1.0 (0.7+)
# ----------------------
# * A function maps a tuple of arguments to a return value
# I Creating basic Functions
# 1 using the function keyword
"""
    my_addition(x, y)

Return the sum of the two arguments `x` and `y`.
"""
function my_addition(x, y)
    total = x + y
    return total
end
# 2 Calling a function
#Supply one value for each of the two arguments
my_addition(3, 4)
# 3 Built-in functions
#The plus symbol, +, like the other arithmetic symbols, is a built-in function
+(3, 4)
# 4 Using a Unicode symbol as a function name
"""
    Σ(x, y)

Return the sum of `x` and `y`.
"""
function Σ(x, y)
    #Call + in its function form, as shown above
    return +(x, y)
end
Σ(3, 4)
# II Anonymous functions
# * Functions can be assigned to variables
# * Functions can be used as arguments
# * Functions can be returned as values
# 1 An anonymous function
#The argument is on the left of ->, the function body on the right
x -> x^2 + 3
# 2 A function as an argument
#Passing the round() function as argument to the map() function
map(round, [2.1, 3.4, 7.9])
#Passing an anonymous function as an argument to the map() function
map(x -> x^2, [2, 3, 4])
# III Tuples and functions
# * Tuples are immutable collections
# 1 Examples of tuples
my_tuple = (1, "Julia", 7)
typeof(my_tuple)
# * Single value tuple must have a comma
#Without the trailing comma, (4) would simply be the integer 4
my_second_tuple = (4,)
typeof(my_second_tuple)
# 2 Indexing a tuple
length(my_tuple)
my_tuple[2]
# 3 Named tuples
# * Named tuple creates a name for each element
my_other_tuple = (a = 4, b = "Julia", c = 3)
#Indexing tuple by name
my_other_tuple.b
# 4 Function returns
# * Multiple return values of a function are tuples
"""
    my_function(a, b)

Return the tuple `(a + b, a - b)`: the sum and the difference of the arguments.
"""
function my_function(a, b)
    total = a + b
    difference = a - b
    return (total, difference)
end
#Calling the function
my_function(10, 5)
#Looking up the type of the function return
typeof(my_function(10, 5))
#The returned tuple can be unpacked, giving each element its own variable
(r, s) = my_function(10, 5)
r
s
# IV Functions with keyword arguments
# * Keyword arguments are listed after a semicolon
# * They may be supplied in any order
# * They carry default values
# 1 Creating a function with a keyword argument
"""
    my_keyword_function(x, y; z = 3)

Return `x + y + z`, where the keyword argument `z` defaults to 3.
"""
function my_keyword_function(x, y; z = 3)
    positional_sum = x + y
    return positional_sum + z
end
# * Leaving the keyword argument out uses its default value
my_keyword_function(1, 2)
# * A keyword argument must be passed by name
my_keyword_function(1, 2, z = 10)
# 2 Use of dot notation for functions
# * Passes a collection elementwise to a function
# * Use instead of map()
#NOTE(review): the last element repeats π; 2π was probably intended to finish the cycle - confirm
sin.([0., π/2., π, 3/2. * π, π])

569
Week4_PR_Template.ipynb Normal file
View File

@ -0,0 +1,569 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Week 4 Peer Review"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. In a code cell below, import the required packages: Distributions, DataFrames, and Random (install these packages via the REPL if required)."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Import the required packages\n",
"using Distributions, DataFrames, Random"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Seed the random number generator\n",
"Random.seed!(1234);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"3. In a code cell below, create a dataframe named df1, with 30 rows and 4 columns (variables). Call the first column ID. It should hold the values 1 through 30 (to make up 30 rows). Use three rand() function calls to generate three more columns named var1, var2, and var3. The second column (var1) should consist of 30 values from a standard normal distribution (mean of 0 and standard deviation of 1). The third column (var2) should consist of 30 random values from a normal distribution with a mean of 10 and a standard deviation of 2. The last column (var3) should contain 30 random values chosen from a range of integers between (and including) 5 and 15."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table class=\"data-frame\"><thead><tr><th></th><th>ID</th><th>var1</th><th>var2</th><th>var3</th></tr><tr><th></th><th>Int64</th><th>Float64</th><th>Float64</th><th>Int64</th></tr></thead><tbody><p>30 rows × 4 columns</p><tr><th>1</th><td>1</td><td>0.867347</td><td>7.44066</td><td>14</td></tr><tr><th>2</th><td>2</td><td>-0.901744</td><td>11.9946</td><td>13</td></tr><tr><th>3</th><td>3</td><td>-0.494479</td><td>10.6048</td><td>12</td></tr><tr><th>4</th><td>4</td><td>-0.902914</td><td>9.92711</td><td>9</td></tr><tr><th>5</th><td>5</td><td>0.864401</td><td>10.2839</td><td>15</td></tr><tr><th>6</th><td>6</td><td>2.21188</td><td>11.0425</td><td>14</td></tr><tr><th>7</th><td>7</td><td>0.532813</td><td>11.7935</td><td>15</td></tr><tr><th>8</th><td>8</td><td>-0.271735</td><td>8.97294</td><td>9</td></tr><tr><th>9</th><td>9</td><td>0.502334</td><td>8.4704</td><td>9</td></tr><tr><th>10</th><td>10</td><td>-0.516984</td><td>6.91715</td><td>8</td></tr><tr><th>11</th><td>11</td><td>-0.560501</td><td>9.83968</td><td>15</td></tr><tr><th>12</th><td>12</td><td>-0.0192918</td><td>7.81756</td><td>14</td></tr><tr><th>13</th><td>13</td><td>0.128064</td><td>8.83897</td><td>11</td></tr><tr><th>14</th><td>14</td><td>1.85278</td><td>9.36913</td><td>10</td></tr><tr><th>15</th><td>15</td><td>-0.827763</td><td>7.2771</td><td>15</td></tr><tr><th>16</th><td>16</td><td>0.110096</td><td>9.77109</td><td>15</td></tr><tr><th>17</th><td>17</td><td>-0.251176</td><td>10.3317</td><td>6</td></tr><tr><th>18</th><td>18</td><td>0.369714</td><td>9.18312</td><td>5</td></tr><tr><th>19</th><td>19</td><td>0.0721164</td><td>7.98043</td><td>12</td></tr><tr><th>20</th><td>20</td><td>-1.50343</td><td>8.91239</td><td>13</td></tr><tr><th>21</th><td>21</td><td>1.56417</td><td>7.54655</td><td>14</td></tr><tr><th>22</th><td>22</td><td>-1.39674</td><td>8.91657</td><td>5</td></tr><tr><th>23</th><td>23</td><td>1.1055</td><td>8.62701</td><td>8</td></tr><tr><th>24</th><td>24</td><td>-1.10673</td><td>8.57414</td><td>9</td></tr>
<tr><th>25</th><td>25</td><td>-3.21136</td><td>9.34588</td><td>5</td></tr><tr><th>26</th><td>26</td><td>-0.0740145</td><td>11.0297</td><td>9</td></tr><tr><th>27</th><td>27</td><td>0.150976</td><td>14.8349</td><td>10</td></tr><tr><th>28</th><td>28</td><td>0.769278</td><td>9.38405</td><td>14</td></tr><tr><th>29</th><td>29</td><td>-0.310153</td><td>12.4906</td><td>15</td></tr><tr><th>30</th><td>30</td><td>-0.602707</td><td>9.9001</td><td>7</td></tr></tbody></table>"
],
"text/latex": [
"\\begin{tabular}{r|cccc}\n",
"\t& ID & var1 & var2 & var3\\\\\n",
"\t\\hline\n",
"\t& Int64 & Float64 & Float64 & Int64\\\\\n",
"\t\\hline\n",
"\t1 & 1 & 0.867347 & 7.44066 & 14 \\\\\n",
"\t2 & 2 & -0.901744 & 11.9946 & 13 \\\\\n",
"\t3 & 3 & -0.494479 & 10.6048 & 12 \\\\\n",
"\t4 & 4 & -0.902914 & 9.92711 & 9 \\\\\n",
"\t5 & 5 & 0.864401 & 10.2839 & 15 \\\\\n",
"\t6 & 6 & 2.21188 & 11.0425 & 14 \\\\\n",
"\t7 & 7 & 0.532813 & 11.7935 & 15 \\\\\n",
"\t8 & 8 & -0.271735 & 8.97294 & 9 \\\\\n",
"\t9 & 9 & 0.502334 & 8.4704 & 9 \\\\\n",
"\t10 & 10 & -0.516984 & 6.91715 & 8 \\\\\n",
"\t11 & 11 & -0.560501 & 9.83968 & 15 \\\\\n",
"\t12 & 12 & -0.0192918 & 7.81756 & 14 \\\\\n",
"\t13 & 13 & 0.128064 & 8.83897 & 11 \\\\\n",
"\t14 & 14 & 1.85278 & 9.36913 & 10 \\\\\n",
"\t15 & 15 & -0.827763 & 7.2771 & 15 \\\\\n",
"\t16 & 16 & 0.110096 & 9.77109 & 15 \\\\\n",
"\t17 & 17 & -0.251176 & 10.3317 & 6 \\\\\n",
"\t18 & 18 & 0.369714 & 9.18312 & 5 \\\\\n",
"\t19 & 19 & 0.0721164 & 7.98043 & 12 \\\\\n",
"\t20 & 20 & -1.50343 & 8.91239 & 13 \\\\\n",
"\t21 & 21 & 1.56417 & 7.54655 & 14 \\\\\n",
"\t22 & 22 & -1.39674 & 8.91657 & 5 \\\\\n",
"\t23 & 23 & 1.1055 & 8.62701 & 8 \\\\\n",
"\t24 & 24 & -1.10673 & 8.57414 & 9 \\\\\n",
"\t25 & 25 & -3.21136 & 9.34588 & 5 \\\\\n",
"\t26 & 26 & -0.0740145 & 11.0297 & 9 \\\\\n",
"\t27 & 27 & 0.150976 & 14.8349 & 10 \\\\\n",
"\t28 & 28 & 0.769278 & 9.38405 & 14 \\\\\n",
"\t29 & 29 & -0.310153 & 12.4906 & 15 \\\\\n",
"\t30 & 30 & -0.602707 & 9.9001 & 7 \\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
"30×4 DataFrame\n",
"│ Row │ ID │ var1 │ var2 │ var3 │\n",
"│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mInt64\u001b[39m │\n",
"├─────┼───────┼────────────┼─────────┼───────┤\n",
"│ 1 │ 1 │ 0.867347 │ 7.44066 │ 14 │\n",
"│ 2 │ 2 │ -0.901744 │ 11.9946 │ 13 │\n",
"│ 3 │ 3 │ -0.494479 │ 10.6048 │ 12 │\n",
"│ 4 │ 4 │ -0.902914 │ 9.92711 │ 9 │\n",
"│ 5 │ 5 │ 0.864401 │ 10.2839 │ 15 │\n",
"│ 6 │ 6 │ 2.21188 │ 11.0425 │ 14 │\n",
"│ 7 │ 7 │ 0.532813 │ 11.7935 │ 15 │\n",
"│ 8 │ 8 │ -0.271735 │ 8.97294 │ 9 │\n",
"│ 9 │ 9 │ 0.502334 │ 8.4704 │ 9 │\n",
"│ 10 │ 10 │ -0.516984 │ 6.91715 │ 8 │\n",
"⋮\n",
"│ 20 │ 20 │ -1.50343 │ 8.91239 │ 13 │\n",
"│ 21 │ 21 │ 1.56417 │ 7.54655 │ 14 │\n",
"│ 22 │ 22 │ -1.39674 │ 8.91657 │ 5 │\n",
"│ 23 │ 23 │ 1.1055 │ 8.62701 │ 8 │\n",
"│ 24 │ 24 │ -1.10673 │ 8.57414 │ 9 │\n",
"│ 25 │ 25 │ -3.21136 │ 9.34588 │ 5 │\n",
"│ 26 │ 26 │ -0.0740145 │ 11.0297 │ 9 │\n",
"│ 27 │ 27 │ 0.150976 │ 14.8349 │ 10 │\n",
"│ 28 │ 28 │ 0.769278 │ 9.38405 │ 14 │\n",
"│ 29 │ 29 │ -0.310153 │ 12.4906 │ 15 │\n",
"│ 30 │ 30 │ -0.602707 │ 9.9001 │ 7 │"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = DataFrame(ID = 1:30, var1 = rand(Normal(0,1),30), var2 = rand(Normal(10,2),30), var3 = rand(5:15,30))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"4. In code cells below, write the code to calculate the mean and variance of each column in the dataframe. For example, for the first variable this could be done using the println function and referring to each column (variable) by its symbol notation. Try to shorten the code with a for-loop, iterating over the variable names (in symbol format)."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The mean of var1 is: -0.061674963752526096, the variance is: 1.1790054448274625\n",
"The mean of var2 is: 9.580613055613338, the variance is: 2.948790077536739\n",
"The mean of var3 is: 11.0, the variance is: 11.724137931034482\n"
]
}
],
"source": [
"for s in [:var1,:var2,:var3] #names(df)\n",
" colname = String(s)\n",
" meancol = mean(df[!, s])\n",
" variancecol = var(df[!, s])\n",
" println(\"The mean of $colname is: $meancol, the variance is: $variancecol\")\n",
"end"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"5. In a code cell below, create a new DataFrame named df2 from the last 20 rows of the original DataFrame, df1."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table class=\"data-frame\"><thead><tr><th></th><th>ID</th><th>var1</th><th>var2</th><th>var3</th></tr><tr><th></th><th>Int64</th><th>Float64</th><th>Float64</th><th>Int64</th></tr></thead><tbody><p>20 rows × 4 columns</p><tr><th>1</th><td>11</td><td>-0.560501</td><td>9.83968</td><td>15</td></tr><tr><th>2</th><td>12</td><td>-0.0192918</td><td>7.81756</td><td>14</td></tr><tr><th>3</th><td>13</td><td>0.128064</td><td>8.83897</td><td>11</td></tr><tr><th>4</th><td>14</td><td>1.85278</td><td>9.36913</td><td>10</td></tr><tr><th>5</th><td>15</td><td>-0.827763</td><td>7.2771</td><td>15</td></tr><tr><th>6</th><td>16</td><td>0.110096</td><td>9.77109</td><td>15</td></tr><tr><th>7</th><td>17</td><td>-0.251176</td><td>10.3317</td><td>6</td></tr><tr><th>8</th><td>18</td><td>0.369714</td><td>9.18312</td><td>5</td></tr><tr><th>9</th><td>19</td><td>0.0721164</td><td>7.98043</td><td>12</td></tr><tr><th>10</th><td>20</td><td>-1.50343</td><td>8.91239</td><td>13</td></tr><tr><th>11</th><td>21</td><td>1.56417</td><td>7.54655</td><td>14</td></tr><tr><th>12</th><td>22</td><td>-1.39674</td><td>8.91657</td><td>5</td></tr><tr><th>13</th><td>23</td><td>1.1055</td><td>8.62701</td><td>8</td></tr><tr><th>14</th><td>24</td><td>-1.10673</td><td>8.57414</td><td>9</td></tr><tr><th>15</th><td>25</td><td>-3.21136</td><td>9.34588</td><td>5</td></tr><tr><th>16</th><td>26</td><td>-0.0740145</td><td>11.0297</td><td>9</td></tr><tr><th>17</th><td>27</td><td>0.150976</td><td>14.8349</td><td>10</td></tr><tr><th>18</th><td>28</td><td>0.769278</td><td>9.38405</td><td>14</td></tr><tr><th>19</th><td>29</td><td>-0.310153</td><td>12.4906</td><td>15</td></tr><tr><th>20</th><td>30</td><td>-0.602707</td><td>9.9001</td><td>7</td></tr></tbody></table>"
],
"text/latex": [
"\\begin{tabular}{r|cccc}\n",
"\t& ID & var1 & var2 & var3\\\\\n",
"\t\\hline\n",
"\t& Int64 & Float64 & Float64 & Int64\\\\\n",
"\t\\hline\n",
"\t1 & 11 & -0.560501 & 9.83968 & 15 \\\\\n",
"\t2 & 12 & -0.0192918 & 7.81756 & 14 \\\\\n",
"\t3 & 13 & 0.128064 & 8.83897 & 11 \\\\\n",
"\t4 & 14 & 1.85278 & 9.36913 & 10 \\\\\n",
"\t5 & 15 & -0.827763 & 7.2771 & 15 \\\\\n",
"\t6 & 16 & 0.110096 & 9.77109 & 15 \\\\\n",
"\t7 & 17 & -0.251176 & 10.3317 & 6 \\\\\n",
"\t8 & 18 & 0.369714 & 9.18312 & 5 \\\\\n",
"\t9 & 19 & 0.0721164 & 7.98043 & 12 \\\\\n",
"\t10 & 20 & -1.50343 & 8.91239 & 13 \\\\\n",
"\t11 & 21 & 1.56417 & 7.54655 & 14 \\\\\n",
"\t12 & 22 & -1.39674 & 8.91657 & 5 \\\\\n",
"\t13 & 23 & 1.1055 & 8.62701 & 8 \\\\\n",
"\t14 & 24 & -1.10673 & 8.57414 & 9 \\\\\n",
"\t15 & 25 & -3.21136 & 9.34588 & 5 \\\\\n",
"\t16 & 26 & -0.0740145 & 11.0297 & 9 \\\\\n",
"\t17 & 27 & 0.150976 & 14.8349 & 10 \\\\\n",
"\t18 & 28 & 0.769278 & 9.38405 & 14 \\\\\n",
"\t19 & 29 & -0.310153 & 12.4906 & 15 \\\\\n",
"\t20 & 30 & -0.602707 & 9.9001 & 7 \\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
"20×4 DataFrame\n",
"│ Row │ ID │ var1 │ var2 │ var3 │\n",
"│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mInt64\u001b[39m │\n",
"├─────┼───────┼────────────┼─────────┼───────┤\n",
"│ 1 │ 11 │ -0.560501 │ 9.83968 │ 15 │\n",
"│ 2 │ 12 │ -0.0192918 │ 7.81756 │ 14 │\n",
"│ 3 │ 13 │ 0.128064 │ 8.83897 │ 11 │\n",
"│ 4 │ 14 │ 1.85278 │ 9.36913 │ 10 │\n",
"│ 5 │ 15 │ -0.827763 │ 7.2771 │ 15 │\n",
"│ 6 │ 16 │ 0.110096 │ 9.77109 │ 15 │\n",
"│ 7 │ 17 │ -0.251176 │ 10.3317 │ 6 │\n",
"│ 8 │ 18 │ 0.369714 │ 9.18312 │ 5 │\n",
"│ 9 │ 19 │ 0.0721164 │ 7.98043 │ 12 │\n",
"│ 10 │ 20 │ -1.50343 │ 8.91239 │ 13 │\n",
"│ 11 │ 21 │ 1.56417 │ 7.54655 │ 14 │\n",
"│ 12 │ 22 │ -1.39674 │ 8.91657 │ 5 │\n",
"│ 13 │ 23 │ 1.1055 │ 8.62701 │ 8 │\n",
"│ 14 │ 24 │ -1.10673 │ 8.57414 │ 9 │\n",
"│ 15 │ 25 │ -3.21136 │ 9.34588 │ 5 │\n",
"│ 16 │ 26 │ -0.0740145 │ 11.0297 │ 9 │\n",
"│ 17 │ 27 │ 0.150976 │ 14.8349 │ 10 │\n",
"│ 18 │ 28 │ 0.769278 │ 9.38405 │ 14 │\n",
"│ 19 │ 29 │ -0.310153 │ 12.4906 │ 15 │\n",
"│ 20 │ 30 │ -0.602707 │ 9.9001 │ 7 │"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = df[11:end,:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"6. In a code cell below, show the results of computing simple descriptive statistics on this new DataFrame using the describe() function."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table class=\"data-frame\"><thead><tr><th></th><th>variable</th><th>mean</th><th>min</th><th>median</th><th>max</th><th>nunique</th><th>nmissing</th><th>eltype</th></tr><tr><th></th><th>Symbol</th><th>Float64</th><th>Real</th><th>Float64</th><th>Real</th><th>Nothing</th><th>Nothing</th><th>DataType</th></tr></thead><tbody><p>4 rows × 8 columns</p><tr><th>1</th><td>ID</td><td>20.5</td><td>11</td><td>20.5</td><td>30</td><td></td><td></td><td>Int64</td></tr><tr><th>2</th><td>var1</td><td>-0.187058</td><td>-3.21136</td><td>-0.0466532</td><td>1.85278</td><td></td><td></td><td>Float64</td></tr><tr><th>3</th><td>var2</td><td>9.49853</td><td>7.2771</td><td>9.2645</td><td>14.8349</td><td></td><td></td><td>Float64</td></tr><tr><th>4</th><td>var3</td><td>10.6</td><td>5</td><td>10.5</td><td>15</td><td></td><td></td><td>Int64</td></tr></tbody></table>"
],
"text/latex": [
"\\begin{tabular}{r|cccccccc}\n",
"\t& variable & mean & min & median & max & nunique & nmissing & eltype\\\\\n",
"\t\\hline\n",
"\t& Symbol & Float64 & Real & Float64 & Real & Nothing & Nothing & DataType\\\\\n",
"\t\\hline\n",
"\t1 & ID & 20.5 & 11 & 20.5 & 30 & & & Int64 \\\\\n",
"\t2 & var1 & -0.187058 & -3.21136 & -0.0466532 & 1.85278 & & & Float64 \\\\\n",
"\t3 & var2 & 9.49853 & 7.2771 & 9.2645 & 14.8349 & & & Float64 \\\\\n",
"\t4 & var3 & 10.6 & 5 & 10.5 & 15 & & & Int64 \\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
"4×8 DataFrame. Omitted printing of 2 columns\n",
"│ Row │ variable │ mean │ min │ median │ max │ nunique │\n",
"│ │ \u001b[90mSymbol\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mReal\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mReal\u001b[39m │ \u001b[90mNothing\u001b[39m │\n",
"├─────┼──────────┼───────────┼──────────┼────────────┼─────────┼─────────┤\n",
"│ 1 │ ID │ 20.5 │ 11 │ 20.5 │ 30 │ │\n",
"│ 2 │ var1 │ -0.187058 │ -3.21136 │ -0.0466532 │ 1.85278 │ │\n",
"│ 3 │ var2 │ 9.49853 │ 7.2771 │ 9.2645 │ 14.8349 │ │\n",
"│ 4 │ var3 │ 10.6 │ 5 │ 10.5 │ 15 │ │"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"describe(df2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"7. In a code cell below, add a column named cat1 to the df2 DataFrame consisting of a random selection of 20 values from the sample space GroupA and GroupB."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table class=\"data-frame\"><thead><tr><th></th><th>ID</th><th>var1</th><th>var2</th><th>var3</th><th>Col1</th></tr><tr><th></th><th>Int64</th><th>Float64</th><th>Float64</th><th>Int64</th><th>String</th></tr></thead><tbody><p>20 rows × 5 columns</p><tr><th>1</th><td>11</td><td>-0.560501</td><td>9.83968</td><td>15</td><td>GroupA</td></tr><tr><th>2</th><td>12</td><td>-0.0192918</td><td>7.81756</td><td>14</td><td>GroupB</td></tr><tr><th>3</th><td>13</td><td>0.128064</td><td>8.83897</td><td>11</td><td>GroupB</td></tr><tr><th>4</th><td>14</td><td>1.85278</td><td>9.36913</td><td>10</td><td>GroupB</td></tr><tr><th>5</th><td>15</td><td>-0.827763</td><td>7.2771</td><td>15</td><td>GroupB</td></tr><tr><th>6</th><td>16</td><td>0.110096</td><td>9.77109</td><td>15</td><td>GroupA</td></tr><tr><th>7</th><td>17</td><td>-0.251176</td><td>10.3317</td><td>6</td><td>GroupB</td></tr><tr><th>8</th><td>18</td><td>0.369714</td><td>9.18312</td><td>5</td><td>GroupA</td></tr><tr><th>9</th><td>19</td><td>0.0721164</td><td>7.98043</td><td>12</td><td>GroupB</td></tr><tr><th>10</th><td>20</td><td>-1.50343</td><td>8.91239</td><td>13</td><td>GroupA</td></tr><tr><th>11</th><td>21</td><td>1.56417</td><td>7.54655</td><td>14</td><td>GroupB</td></tr><tr><th>12</th><td>22</td><td>-1.39674</td><td>8.91657</td><td>5</td><td>GroupB</td></tr><tr><th>13</th><td>23</td><td>1.1055</td><td>8.62701</td><td>8</td><td>GroupA</td></tr><tr><th>14</th><td>24</td><td>-1.10673</td><td>8.57414</td><td>9</td><td>GroupA</td></tr><tr><th>15</th><td>25</td><td>-3.21136</td><td>9.34588</td><td>5</td><td>GroupA</td></tr><tr><th>16</th><td>26</td><td>-0.0740145</td><td>11.0297</td><td>9</td><td>GroupA</td></tr><tr><th>17</th><td>27</td><td>0.150976</td><td>14.8349</td><td>10</td><td>GroupA</td></tr><tr><th>18</th><td>28</td><td>0.769278</td><td>9.38405</td><td>14</td><td>GroupA</td></tr><tr><th>19</th><td>29</td><td>-0.310153</td><td>12.4906</td><td>15</td><td>GroupA</td></tr><tr><th>20</th><td>30</td><td>-0.602707</td><td>9.9
001</td><td>7</td><td>GroupA</td></tr></tbody></table>"
],
"text/latex": [
"\\begin{tabular}{r|ccccc}\n",
"\t& ID & var1 & var2 & var3 & Col1\\\\\n",
"\t\\hline\n",
"\t& Int64 & Float64 & Float64 & Int64 & String\\\\\n",
"\t\\hline\n",
"\t1 & 11 & -0.560501 & 9.83968 & 15 & GroupA \\\\\n",
"\t2 & 12 & -0.0192918 & 7.81756 & 14 & GroupB \\\\\n",
"\t3 & 13 & 0.128064 & 8.83897 & 11 & GroupB \\\\\n",
"\t4 & 14 & 1.85278 & 9.36913 & 10 & GroupB \\\\\n",
"\t5 & 15 & -0.827763 & 7.2771 & 15 & GroupB \\\\\n",
"\t6 & 16 & 0.110096 & 9.77109 & 15 & GroupA \\\\\n",
"\t7 & 17 & -0.251176 & 10.3317 & 6 & GroupB \\\\\n",
"\t8 & 18 & 0.369714 & 9.18312 & 5 & GroupA \\\\\n",
"\t9 & 19 & 0.0721164 & 7.98043 & 12 & GroupB \\\\\n",
"\t10 & 20 & -1.50343 & 8.91239 & 13 & GroupA \\\\\n",
"\t11 & 21 & 1.56417 & 7.54655 & 14 & GroupB \\\\\n",
"\t12 & 22 & -1.39674 & 8.91657 & 5 & GroupB \\\\\n",
"\t13 & 23 & 1.1055 & 8.62701 & 8 & GroupA \\\\\n",
"\t14 & 24 & -1.10673 & 8.57414 & 9 & GroupA \\\\\n",
"\t15 & 25 & -3.21136 & 9.34588 & 5 & GroupA \\\\\n",
"\t16 & 26 & -0.0740145 & 11.0297 & 9 & GroupA \\\\\n",
"\t17 & 27 & 0.150976 & 14.8349 & 10 & GroupA \\\\\n",
"\t18 & 28 & 0.769278 & 9.38405 & 14 & GroupA \\\\\n",
"\t19 & 29 & -0.310153 & 12.4906 & 15 & GroupA \\\\\n",
"\t20 & 30 & -0.602707 & 9.9001 & 7 & GroupA \\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
"20×5 DataFrame\n",
"│ Row │ ID │ var1 │ var2 │ var3 │ Col1 │\n",
"│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mString\u001b[39m │\n",
"├─────┼───────┼────────────┼─────────┼───────┼────────┤\n",
"│ 1 │ 11 │ -0.560501 │ 9.83968 │ 15 │ GroupA │\n",
"│ 2 │ 12 │ -0.0192918 │ 7.81756 │ 14 │ GroupB │\n",
"│ 3 │ 13 │ 0.128064 │ 8.83897 │ 11 │ GroupB │\n",
"│ 4 │ 14 │ 1.85278 │ 9.36913 │ 10 │ GroupB │\n",
"│ 5 │ 15 │ -0.827763 │ 7.2771 │ 15 │ GroupB │\n",
"│ 6 │ 16 │ 0.110096 │ 9.77109 │ 15 │ GroupA │\n",
"│ 7 │ 17 │ -0.251176 │ 10.3317 │ 6 │ GroupB │\n",
"│ 8 │ 18 │ 0.369714 │ 9.18312 │ 5 │ GroupA │\n",
"│ 9 │ 19 │ 0.0721164 │ 7.98043 │ 12 │ GroupB │\n",
"│ 10 │ 20 │ -1.50343 │ 8.91239 │ 13 │ GroupA │\n",
"│ 11 │ 21 │ 1.56417 │ 7.54655 │ 14 │ GroupB │\n",
"│ 12 │ 22 │ -1.39674 │ 8.91657 │ 5 │ GroupB │\n",
"│ 13 │ 23 │ 1.1055 │ 8.62701 │ 8 │ GroupA │\n",
"│ 14 │ 24 │ -1.10673 │ 8.57414 │ 9 │ GroupA │\n",
"│ 15 │ 25 │ -3.21136 │ 9.34588 │ 5 │ GroupA │\n",
"│ 16 │ 26 │ -0.0740145 │ 11.0297 │ 9 │ GroupA │\n",
"│ 17 │ 27 │ 0.150976 │ 14.8349 │ 10 │ GroupA │\n",
"│ 18 │ 28 │ 0.769278 │ 9.38405 │ 14 │ GroupA │\n",
"│ 19 │ 29 │ -0.310153 │ 12.4906 │ 15 │ GroupA │\n",
"│ 20 │ 30 │ -0.602707 │ 9.9001 │ 7 │ GroupA │"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"insertcols!(df2,:Col1 => rand([\"GroupA\",\"GroupB\"],20))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"8. In a code cell below, create a DataFrame named df3 with columns named *id*, var4 and var5 such that id contains the values 11 through 30, var4 contains the values 21 through 40 and var5 contains the values 41 through 60."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table class=\"data-frame\"><thead><tr><th></th><th>ID</th><th>var4</th><th>var5</th></tr><tr><th></th><th>Int64</th><th>Int64</th><th>Int64</th></tr></thead><tbody><p>20 rows × 3 columns</p><tr><th>1</th><td>11</td><td>21</td><td>41</td></tr><tr><th>2</th><td>12</td><td>22</td><td>42</td></tr><tr><th>3</th><td>13</td><td>23</td><td>43</td></tr><tr><th>4</th><td>14</td><td>24</td><td>44</td></tr><tr><th>5</th><td>15</td><td>25</td><td>45</td></tr><tr><th>6</th><td>16</td><td>26</td><td>46</td></tr><tr><th>7</th><td>17</td><td>27</td><td>47</td></tr><tr><th>8</th><td>18</td><td>28</td><td>48</td></tr><tr><th>9</th><td>19</td><td>29</td><td>49</td></tr><tr><th>10</th><td>20</td><td>30</td><td>50</td></tr><tr><th>11</th><td>21</td><td>31</td><td>51</td></tr><tr><th>12</th><td>22</td><td>32</td><td>52</td></tr><tr><th>13</th><td>23</td><td>33</td><td>53</td></tr><tr><th>14</th><td>24</td><td>34</td><td>54</td></tr><tr><th>15</th><td>25</td><td>35</td><td>55</td></tr><tr><th>16</th><td>26</td><td>36</td><td>56</td></tr><tr><th>17</th><td>27</td><td>37</td><td>57</td></tr><tr><th>18</th><td>28</td><td>38</td><td>58</td></tr><tr><th>19</th><td>29</td><td>39</td><td>59</td></tr><tr><th>20</th><td>30</td><td>40</td><td>60</td></tr></tbody></table>"
],
"text/latex": [
"\\begin{tabular}{r|ccc}\n",
"\t& ID & var4 & var5\\\\\n",
"\t\\hline\n",
"\t& Int64 & Int64 & Int64\\\\\n",
"\t\\hline\n",
"\t1 & 11 & 21 & 41 \\\\\n",
"\t2 & 12 & 22 & 42 \\\\\n",
"\t3 & 13 & 23 & 43 \\\\\n",
"\t4 & 14 & 24 & 44 \\\\\n",
"\t5 & 15 & 25 & 45 \\\\\n",
"\t6 & 16 & 26 & 46 \\\\\n",
"\t7 & 17 & 27 & 47 \\\\\n",
"\t8 & 18 & 28 & 48 \\\\\n",
"\t9 & 19 & 29 & 49 \\\\\n",
"\t10 & 20 & 30 & 50 \\\\\n",
"\t11 & 21 & 31 & 51 \\\\\n",
"\t12 & 22 & 32 & 52 \\\\\n",
"\t13 & 23 & 33 & 53 \\\\\n",
"\t14 & 24 & 34 & 54 \\\\\n",
"\t15 & 25 & 35 & 55 \\\\\n",
"\t16 & 26 & 36 & 56 \\\\\n",
"\t17 & 27 & 37 & 57 \\\\\n",
"\t18 & 28 & 38 & 58 \\\\\n",
"\t19 & 29 & 39 & 59 \\\\\n",
"\t20 & 30 & 40 & 60 \\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
"20×3 DataFrame\n",
"│ Row │ ID │ var4 │ var5 │\n",
"│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │\n",
"├─────┼───────┼───────┼───────┤\n",
"│ 1 │ 11 │ 21 │ 41 │\n",
"│ 2 │ 12 │ 22 │ 42 │\n",
"│ 3 │ 13 │ 23 │ 43 │\n",
"│ 4 │ 14 │ 24 │ 44 │\n",
"│ 5 │ 15 │ 25 │ 45 │\n",
"│ 6 │ 16 │ 26 │ 46 │\n",
"│ 7 │ 17 │ 27 │ 47 │\n",
"│ 8 │ 18 │ 28 │ 48 │\n",
"│ 9 │ 19 │ 29 │ 49 │\n",
"│ 10 │ 20 │ 30 │ 50 │\n",
"│ 11 │ 21 │ 31 │ 51 │\n",
"│ 12 │ 22 │ 32 │ 52 │\n",
"│ 13 │ 23 │ 33 │ 53 │\n",
"│ 14 │ 24 │ 34 │ 54 │\n",
"│ 15 │ 25 │ 35 │ 55 │\n",
"│ 16 │ 26 │ 36 │ 56 │\n",
"│ 17 │ 27 │ 37 │ 57 │\n",
"│ 18 │ 28 │ 38 │ 58 │\n",
"│ 19 │ 29 │ 39 │ 59 │\n",
"│ 20 │ 30 │ 40 │ 60 │"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3 = DataFrame(ID = collect(11:30), var4 = collect(21:40), var5 = collect(41:60))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"9. In a code cell below, do a join of DataFrames df2 and df3 on the ID column and save the result as a new DataFrame called df4"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table class=\"data-frame\"><thead><tr><th></th><th>ID</th><th>var1</th><th>var2</th><th>var3</th><th>Col1</th><th>var4</th><th>var5</th></tr><tr><th></th><th>Int64</th><th>Float64</th><th>Float64</th><th>Int64</th><th>String</th><th>Int64</th><th>Int64</th></tr></thead><tbody><p>20 rows × 7 columns</p><tr><th>1</th><td>11</td><td>-0.560501</td><td>9.83968</td><td>15</td><td>GroupA</td><td>21</td><td>41</td></tr><tr><th>2</th><td>12</td><td>-0.0192918</td><td>7.81756</td><td>14</td><td>GroupB</td><td>22</td><td>42</td></tr><tr><th>3</th><td>13</td><td>0.128064</td><td>8.83897</td><td>11</td><td>GroupB</td><td>23</td><td>43</td></tr><tr><th>4</th><td>14</td><td>1.85278</td><td>9.36913</td><td>10</td><td>GroupB</td><td>24</td><td>44</td></tr><tr><th>5</th><td>15</td><td>-0.827763</td><td>7.2771</td><td>15</td><td>GroupB</td><td>25</td><td>45</td></tr><tr><th>6</th><td>16</td><td>0.110096</td><td>9.77109</td><td>15</td><td>GroupA</td><td>26</td><td>46</td></tr><tr><th>7</th><td>17</td><td>-0.251176</td><td>10.3317</td><td>6</td><td>GroupB</td><td>27</td><td>47</td></tr><tr><th>8</th><td>18</td><td>0.369714</td><td>9.18312</td><td>5</td><td>GroupA</td><td>28</td><td>48</td></tr><tr><th>9</th><td>19</td><td>0.0721164</td><td>7.98043</td><td>12</td><td>GroupB</td><td>29</td><td>49</td></tr><tr><th>10</th><td>20</td><td>-1.50343</td><td>8.91239</td><td>13</td><td>GroupA</td><td>30</td><td>50</td></tr><tr><th>11</th><td>21</td><td>1.56417</td><td>7.54655</td><td>14</td><td>GroupB</td><td>31</td><td>51</td></tr><tr><th>12</th><td>22</td><td>-1.39674</td><td>8.91657</td><td>5</td><td>GroupB</td><td>32</td><td>52</td></tr><tr><th>13</th><td>23</td><td>1.1055</td><td>8.62701</td><td>8</td><td>GroupA</td><td>33</td><td>53</td></tr><tr><th>14</th><td>24</td><td>-1.10673</td><td>8.57414</td><td>9</td><td>GroupA</td><td>34</td><td>54</td></tr><tr><th>15</th><td>25</td><td>-3.21136</td><td>9.34588</td><td>5</td><td>GroupA</td><td>35</td><td>55</td></tr><tr><th>16</th><td>26</td><td
>-0.0740145</td><td>11.0297</td><td>9</td><td>GroupA</td><td>36</td><td>56</td></tr><tr><th>17</th><td>27</td><td>0.150976</td><td>14.8349</td><td>10</td><td>GroupA</td><td>37</td><td>57</td></tr><tr><th>18</th><td>28</td><td>0.769278</td><td>9.38405</td><td>14</td><td>GroupA</td><td>38</td><td>58</td></tr><tr><th>19</th><td>29</td><td>-0.310153</td><td>12.4906</td><td>15</td><td>GroupA</td><td>39</td><td>59</td></tr><tr><th>20</th><td>30</td><td>-0.602707</td><td>9.9001</td><td>7</td><td>GroupA</td><td>40</td><td>60</td></tr></tbody></table>"
],
"text/latex": [
"\\begin{tabular}{r|ccccccc}\n",
"\t& ID & var1 & var2 & var3 & Col1 & var4 & var5\\\\\n",
"\t\\hline\n",
"\t& Int64 & Float64 & Float64 & Int64 & String & Int64 & Int64\\\\\n",
"\t\\hline\n",
"\t1 & 11 & -0.560501 & 9.83968 & 15 & GroupA & 21 & 41 \\\\\n",
"\t2 & 12 & -0.0192918 & 7.81756 & 14 & GroupB & 22 & 42 \\\\\n",
"\t3 & 13 & 0.128064 & 8.83897 & 11 & GroupB & 23 & 43 \\\\\n",
"\t4 & 14 & 1.85278 & 9.36913 & 10 & GroupB & 24 & 44 \\\\\n",
"\t5 & 15 & -0.827763 & 7.2771 & 15 & GroupB & 25 & 45 \\\\\n",
"\t6 & 16 & 0.110096 & 9.77109 & 15 & GroupA & 26 & 46 \\\\\n",
"\t7 & 17 & -0.251176 & 10.3317 & 6 & GroupB & 27 & 47 \\\\\n",
"\t8 & 18 & 0.369714 & 9.18312 & 5 & GroupA & 28 & 48 \\\\\n",
"\t9 & 19 & 0.0721164 & 7.98043 & 12 & GroupB & 29 & 49 \\\\\n",
"\t10 & 20 & -1.50343 & 8.91239 & 13 & GroupA & 30 & 50 \\\\\n",
"\t11 & 21 & 1.56417 & 7.54655 & 14 & GroupB & 31 & 51 \\\\\n",
"\t12 & 22 & -1.39674 & 8.91657 & 5 & GroupB & 32 & 52 \\\\\n",
"\t13 & 23 & 1.1055 & 8.62701 & 8 & GroupA & 33 & 53 \\\\\n",
"\t14 & 24 & -1.10673 & 8.57414 & 9 & GroupA & 34 & 54 \\\\\n",
"\t15 & 25 & -3.21136 & 9.34588 & 5 & GroupA & 35 & 55 \\\\\n",
"\t16 & 26 & -0.0740145 & 11.0297 & 9 & GroupA & 36 & 56 \\\\\n",
"\t17 & 27 & 0.150976 & 14.8349 & 10 & GroupA & 37 & 57 \\\\\n",
"\t18 & 28 & 0.769278 & 9.38405 & 14 & GroupA & 38 & 58 \\\\\n",
"\t19 & 29 & -0.310153 & 12.4906 & 15 & GroupA & 39 & 59 \\\\\n",
"\t20 & 30 & -0.602707 & 9.9001 & 7 & GroupA & 40 & 60 \\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
"20×7 DataFrame\n",
"│ Row │ ID │ var1 │ var2 │ var3 │ Col1 │ var4 │ var5 │\n",
"│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │\n",
"├─────┼───────┼────────────┼─────────┼───────┼────────┼───────┼───────┤\n",
"│ 1 │ 11 │ -0.560501 │ 9.83968 │ 15 │ GroupA │ 21 │ 41 │\n",
"│ 2 │ 12 │ -0.0192918 │ 7.81756 │ 14 │ GroupB │ 22 │ 42 │\n",
"│ 3 │ 13 │ 0.128064 │ 8.83897 │ 11 │ GroupB │ 23 │ 43 │\n",
"│ 4 │ 14 │ 1.85278 │ 9.36913 │ 10 │ GroupB │ 24 │ 44 │\n",
"│ 5 │ 15 │ -0.827763 │ 7.2771 │ 15 │ GroupB │ 25 │ 45 │\n",
"│ 6 │ 16 │ 0.110096 │ 9.77109 │ 15 │ GroupA │ 26 │ 46 │\n",
"│ 7 │ 17 │ -0.251176 │ 10.3317 │ 6 │ GroupB │ 27 │ 47 │\n",
"│ 8 │ 18 │ 0.369714 │ 9.18312 │ 5 │ GroupA │ 28 │ 48 │\n",
"│ 9 │ 19 │ 0.0721164 │ 7.98043 │ 12 │ GroupB │ 29 │ 49 │\n",
"│ 10 │ 20 │ -1.50343 │ 8.91239 │ 13 │ GroupA │ 30 │ 50 │\n",
"│ 11 │ 21 │ 1.56417 │ 7.54655 │ 14 │ GroupB │ 31 │ 51 │\n",
"│ 12 │ 22 │ -1.39674 │ 8.91657 │ 5 │ GroupB │ 32 │ 52 │\n",
"│ 13 │ 23 │ 1.1055 │ 8.62701 │ 8 │ GroupA │ 33 │ 53 │\n",
"│ 14 │ 24 │ -1.10673 │ 8.57414 │ 9 │ GroupA │ 34 │ 54 │\n",
"│ 15 │ 25 │ -3.21136 │ 9.34588 │ 5 │ GroupA │ 35 │ 55 │\n",
"│ 16 │ 26 │ -0.0740145 │ 11.0297 │ 9 │ GroupA │ 36 │ 56 │\n",
"│ 17 │ 27 │ 0.150976 │ 14.8349 │ 10 │ GroupA │ 37 │ 57 │\n",
"│ 18 │ 28 │ 0.769278 │ 9.38405 │ 14 │ GroupA │ 38 │ 58 │\n",
"│ 19 │ 29 │ -0.310153 │ 12.4906 │ 15 │ GroupA │ 39 │ 59 │\n",
"│ 20 │ 30 │ -0.602707 │ 9.9001 │ 7 │ GroupA │ 40 │ 60 │"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df4 = innerjoin(df2,df3,on = :ID)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Julia 1.2.0",
"language": "julia",
"name": "julia-1.2"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "1.2.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

File diff suppressed because it is too large Load Diff

201
Week4_Working_with_data.jl Normal file
View File

@ -0,0 +1,201 @@
# WORKING WITH DATA
# -----------------
# I Distributions
# * Data point values for a variable usually follow a pattern
# * Such patterns are called distributions
# * Distributions are either discrete or continuous
# * The Distributions.jl package contains most of the common
#   data distributions
# 1 Importing Distributions.jl
using Distributions
using Random
#InteractiveUtils provides subtypes() (used in step 7); the REPL loads it
# automatically, but a script run with `julia file.jl` does not
using InteractiveUtils
# 2 The standard normal distribution
#Seed the pseudo-random number generator so the draws below are reproducible
Random.seed!(1234)
#Saving the standard normal distribution as an object
n = Normal()
#Parameter values of the standard normal distribution
params(n)
#Select 10 elements at random from n
var1 = rand(n, 10)
#Calculating the mean and standard deviation of var1
mean(var1)
std(var1)
#Probability density function value at x = 0.3
pdf(Normal(), 0.3)
#Cumulative distribution function at x = 0.25
cdf(Normal(), 0.25)
#Quantiles
quantile(Normal(), 0.025)
quantile(Normal(), 0.975)
# 3 The normal distribution
#Returning the names of the fields (parameters) of the Normal type
fieldnames(Normal)
#Creating 100 data point values from a normal distribution
# with a mean of 100 and a standard deviation of 10
var2 = rand(Normal(100, 10), 100)
#Calculating the mean and standard deviation of var2
mean(var2)
std(var2)
#Using fit() to estimate the parameters of a distribution from data
fit(Normal, var2)
# 4 Skewness and kurtosis
skewness(var2)
kurtosis(var2)
# 5 Beta distribution
b = Beta(1, 1)
params(b)
var3 = rand(b, 100)
fit(Beta, var3)
# 6 χ2 distribution
c = Chisq(1)
var4 = rand(c, 100)
fieldnames(Chisq) # Degrees of freedom
# 7 Distribution types are hierarchical
supertype(Normal)
subtypes(Distribution{Univariate,Continuous})
subtypes(Distribution{Univariate,Discrete})
# * Search for help in the REPL
# II DataFrames
using DataFrames
# * Allows for creation of a flat data structure (rows and columns)
# * Columns are variables
# * Rows are subjects (examples)
# 1 Create a DataFrame
typeof(var2)
#Create an empty DataFrame
df = DataFrame()
# 2 Add a column with data point values (rows)
#A row selector is required: df[:Var2] is no longer accepted by DataFrames.jl.
# `!` selects the column itself (no copy); df.Var2 = var2 is equivalent
df[!, :Var2] = var2
#View first five rows (first() replaces the removed head())
first(df, 5)
# 3 Add another column
df[!, :Var3] = var3
#View last three rows (last() replaces the removed tail())
last(df, 3)
# 4 Dimensions of a DataFrame
size(df)
rows = size(df, 1)
columns = size(df, 2)
# 5 Inspect content
#Column names, element types and missing counts (replaces the removed showcols())
describe(df, :eltype, :nmissing)
#Data type only (replaces the deprecated eltypes())
eltype.(eachcol(df))
#Descriptive statistics
describe(df)
#Print in console
print(describe(df))
# 6 Create a bigger DataFrame
df2 = DataFrame()
df2[!, :A] = 1:10
df2[!, :B] = ["I", "II", "II", "I", "II", "I", "II", "II", "I", "II"]
df2[!, :C] = rand(Normal(), 10)
df2[!, :D] = rand(Chisq(1), 10)
# 7 Slicing
#First three rows
df2[1:3, :]
#All rows columns 1 and 3
df2[:, [1, 3]]
#Same selection by column name
df2[:, [:A, :C]]
# III Importing data files
# 1 Set working directory in Atom settings under Julia tab
pwd()
# 2 Import CSV
using CSV
# 3 Import csv file (in same directory)
#CSV.read() must be told what to build; the second argument is the sink type
data1 = CSV.read("CCS.csv", DataFrame)
# 4 Explore the data
typeof(data1)
#First six rows (first() replaces the removed head())
first(data1, 6)
#Column names, element types and missing counts (replaces the removed showcols())
describe(data1, :eltype, :nmissing)
#Element type of every column (replaces the deprecated eltypes())
eltype.(eachcol(data1))
describe(data1)
# 5 Combining DataFrames
#Creating DataFrames
subjects = DataFrame(Number = [100, 101, 102, 103], Stage = ["I", "III", "II", "I"])
treatment = DataFrame(Number = [103, 102, 101, 100], Treatment = ["A", "B", "A", "B"])
subjects
treatment
#Joining: join(...; kind = ...) was replaced by one function per join type;
# the old default kind was :inner
df3 = innerjoin(subjects, treatment, on = :Number);
df3
#Adding a longer list of subjects
subjects = DataFrame(Number = [100, 101, 102, 103, 104, 105],
                     Stage = ["I", "III", "II", "I", "II", "II"])
#Inner join: only the Numbers present in both tables are kept
df4 = innerjoin(subjects, treatment, on = :Number);
df4
#Outer join: empty fields filled with missing
df5 = outerjoin(subjects, treatment, on = :Number);
df5
# 6 Grouping
#Creating a new DataFrame
df6 = DataFrame(Group = rand(["A", "B", "C"], 15),
                Variable1 = randn(15),
                Variable2 = rand(15));
df6
#Size of each group; by() was removed, so split with groupby() and apply
# a function to every sub-DataFrame with combine()
combine(groupby(df6, :Group)) do sdf
    DataFrame(Rows = size(sdf, 1), Columns = size(sdf, 2))
end
#Count the rows in each :Group
combine(groupby(df6, :Group), nrow => :Count)
#Descriptive statistics per group (replaces the removed aggregate());
# the result columns are named Variable1_mean, Variable2_mean, Variable1_std, ...
print(combine(groupby(df6, :Group),
              [:Variable1, :Variable2] .=> mean,
              [:Variable1, :Variable2] .=> std))
#Group
groupby(df6, :Group)
#Number of groups
length(groupby(df6, :Group))
#Second group only
groupby(df6, :Group)[2]
# 7 Selection
#Boolean mask of the rows where Variable1 is positive
df6.Variable1 .> 0
#View of the selected rows (view() replaces the removed sub())
view(df6, df6.Variable1 .> 0, :)
# 8 New DataFrame by selection
#Indexing with a row mask copies the matching rows, so df6A is not affected
# by the in-place sorting of df6 below
df6A = df6[df6.Group .== "A", :];
df6A
# 9 Sorting
#sort! reorders df6 itself and returns it, so df6S and df6S2 are the same object as df6
#The sort columns are a positional argument (the cols keyword was removed)
df6S = sort!(df6, [:Group, :Variable1], rev = true);
df6S
#One rev flag per sort column, given as a vector
df6S2 = sort!(df6, [:Group, :Variable1, :Variable2], rev = [false, false, true]);
df6S2
# 10 Unique rows
#Creating a DataFrame with an obvious duplicate row
df7 = DataFrame(A = [1, 2, 2, 3, 4, 5],
                B = [11, 12, 12, 13, 14, 15],
                C = ["A", "B", "B", "C", "D", "E"]);
df7
#Only unique rows (returns a new DataFrame)
unique(df7)
#df7 itself still contains the duplicate
df7
#Permanent change
unique!(df7)
df7
# 11 Delete rows
#Permanently; deleterows! was renamed (delete! in DataFrames 0.21, deleteat! since 1.3)
deleteat!(df7, [1, 5])
df7