my net house

WAHEGURU….!

Learning Dataframes in Julia

Week4_DataF

Week 4 – Working with Distributions and DataFrames.

In [1]:
# Import the required packages
using Distributions, DataFrames
In [2]:
# Seed the random number generator
srand(1234);
In [3]:
# Question 4: Create the 3 x 30 array named array_1
# 30 rows and 3 columns array
array_1 = [rand(30) rand(30) rand(30)]
size(array_1)
array_1
Out[3]:
30×3 Array{Float64,2}:
 0.590845   0.931115   0.643704 
 0.766797   0.438939   0.401421 
 0.566237   0.246862   0.525057 
 0.460085   0.0118196  0.61201  
 0.794026   0.0460428  0.432577 
 0.854147   0.496169   0.082207 
 0.200586   0.732      0.199058 
 0.298614   0.299058   0.576082 
 0.246837   0.449182   0.218177 
 0.579672   0.875096   0.362036 
 0.648882   0.0462887  0.204728 
 0.0109059  0.698356   0.932984 
 0.066423   0.365109   0.827263 
 ⋮                              
 0.0566425  0.404953   0.0396356
 0.842714   0.499531   0.79041  
 0.950498   0.658815   0.431188 
 0.96467    0.515627   0.137658 
 0.945775   0.260715   0.60808  
 0.789904   0.59552    0.255054 
 0.82116    0.292462   0.498734 
 0.0341601  0.28858    0.0940369
 0.0945445  0.61816    0.52509  
 0.314926   0.66426    0.265511 
 0.12781    0.753508   0.110096 
 0.374187   0.0368842  0.834362
In [4]:
# Question 5: Mean and variance of column 1
mean_column_1 = mean(array_1[:,1])
var_column_1=var(array_1[:,1])
println("mean=",mean_column_1)
println("var=",var_column_1)
mean=0.5014887976938368
var=0.10653465363277906
In [5]:
# Question 5 (continued): Mean and variance of column 2
mean_column_2 = mean(array_1[:,2])
var_column_2=var(array_1[:,2])
println("mean=",mean_column_2)
println("var=",var_column_2)
mean=0.4160447968360426
var=0.06360439983290869
In [6]:
# Question 5 (continued): Mean and variance of column 3
mean_column_3 = mean(array_1[:,3])
var_column_3=var(array_1[:,3])
println("mean=",mean_column_3)
println("var=",var_column_3)
mean=0.4372634519427959
var=0.07568707224628725
In [7]:
# Question 6: Import array_1 into a DataFrame named df
df = DataFrame(array_1)
Out[7]:
x1 x2 x3
1 0.5908446386657102 0.9311151512445586 0.6437042811826996
2 0.7667970365022592 0.43893895933102156 0.40142056533714965
3 0.5662374165061859 0.24686248047491066 0.5250572942486489
4 0.4600853424625171 0.011819583479107054 0.6120098074984683
5 0.7940257103317943 0.046042826396498704 0.43257652982765626
6 0.8541465903790502 0.496168672722459 0.0822070287962946
7 0.20058603493384108 0.7320003814997245 0.19905799020907944
8 0.2986142783434118 0.29905752670238184 0.5760819730593403
9 0.24683718661000897 0.4491821088563024 0.21817706596841413
10 0.5796722333690416 0.8750962647851142 0.3620355262053865
11 0.6488819502093455 0.046288741031345504 0.20472832290217324
12 0.010905889635595356 0.6983555060532487 0.93298350850828
13 0.06642303695533736 0.3651093677271471 0.8272627957034728
14 0.9567533636029237 0.3024777928234499 0.09929915955881308
15 0.646690981531646 0.3725754415996787 0.6342997886044144
16 0.11248587118714015 0.15050782744925795 0.1327153585755645
17 0.2760209506672211 0.14732938279328955 0.7751941503856596
18 0.6516642063795697 0.2834013103457036 0.8692366891234362
19 0.05664246860321187 0.40495283364883794 0.039635617270926904
20 0.8427136165865521 0.49953074411487797 0.7904095314876494
21 0.9504984071553011 0.6588147837334961 0.43118828904466633
22 0.9646697763820897 0.5156272179795256 0.1376583132625555
23 0.9457754052519123 0.26071522632820776 0.6080803126880718
24 0.7899036826169576 0.5955204840509289 0.2550540600167448
25 0.8211604203482923 0.2924615242315285 0.4987340031883092
26 0.03416010848943718 0.2885798506061561 0.09403688346569439
27 0.09454448946400307 0.6181597973815087 0.5250899072103514
28 0.31492622391998415 0.6642598175011505 0.2655109248498748
29 0.12780989889368866 0.7535081177709988 0.11009621399607639
30 0.374186714831074 0.03688418241886171 0.8343616661080064
In [8]:
# check available names and fieldnames in Julia, Python's alternative
f_name =fieldnames(df)
name=names(df)
println(f_name,name)
Symbol[:columns, :colindex]Symbol[:x1, :x2, :x3]
In [9]:
# Accessing different columns of df
df[:x3]
Out[9]:
30-element Array{Float64,1}:
 0.643704 
 0.401421 
 0.525057 
 0.61201  
 0.432577 
 0.082207 
 0.199058 
 0.576082 
 0.218177 
 0.362036 
 0.204728 
 0.932984 
 0.827263 
 ⋮        
 0.0396356
 0.79041  
 0.431188 
 0.137658 
 0.60808  
 0.255054 
 0.498734 
 0.0940369
 0.52509  
 0.265511 
 0.110096 
 0.834362
In [10]:
# Question 7: Change the names of the columns to Var1, Var2, and Var3
rename!(df,Dict(:x1=>:Var1,:x2=>:Var2,:x3=>:Var))
Out[10]:
Var1 Var2 Var
1 0.5908446386657102 0.9311151512445586 0.6437042811826996
2 0.7667970365022592 0.43893895933102156 0.40142056533714965
3 0.5662374165061859 0.24686248047491066 0.5250572942486489
4 0.4600853424625171 0.011819583479107054 0.6120098074984683
5 0.7940257103317943 0.046042826396498704 0.43257652982765626
6 0.8541465903790502 0.496168672722459 0.0822070287962946
7 0.20058603493384108 0.7320003814997245 0.19905799020907944
8 0.2986142783434118 0.29905752670238184 0.5760819730593403
9 0.24683718661000897 0.4491821088563024 0.21817706596841413
10 0.5796722333690416 0.8750962647851142 0.3620355262053865
11 0.6488819502093455 0.046288741031345504 0.20472832290217324
12 0.010905889635595356 0.6983555060532487 0.93298350850828
13 0.06642303695533736 0.3651093677271471 0.8272627957034728
14 0.9567533636029237 0.3024777928234499 0.09929915955881308
15 0.646690981531646 0.3725754415996787 0.6342997886044144
16 0.11248587118714015 0.15050782744925795 0.1327153585755645
17 0.2760209506672211 0.14732938279328955 0.7751941503856596
18 0.6516642063795697 0.2834013103457036 0.8692366891234362
19 0.05664246860321187 0.40495283364883794 0.039635617270926904
20 0.8427136165865521 0.49953074411487797 0.7904095314876494
21 0.9504984071553011 0.6588147837334961 0.43118828904466633
22 0.9646697763820897 0.5156272179795256 0.1376583132625555
23 0.9457754052519123 0.26071522632820776 0.6080803126880718
24 0.7899036826169576 0.5955204840509289 0.2550540600167448
25 0.8211604203482923 0.2924615242315285 0.4987340031883092
26 0.03416010848943718 0.2885798506061561 0.09403688346569439
27 0.09454448946400307 0.6181597973815087 0.5250899072103514
28 0.31492622391998415 0.6642598175011505 0.2655109248498748
29 0.12780989889368866 0.7535081177709988 0.11009621399607639
30 0.374186714831074 0.03688418241886171 0.8343616661080064
In [11]:
### we can also tail function see last required entries
tail(df,20)
Out[11]:
Var1 Var2 Var
1 0.6488819502093455 0.046288741031345504 0.20472832290217324
2 0.010905889635595356 0.6983555060532487 0.93298350850828
3 0.06642303695533736 0.3651093677271471 0.8272627957034728
4 0.9567533636029237 0.3024777928234499 0.09929915955881308
5 0.646690981531646 0.3725754415996787 0.6342997886044144
6 0.11248587118714015 0.15050782744925795 0.1327153585755645
7 0.2760209506672211 0.14732938279328955 0.7751941503856596
8 0.6516642063795697 0.2834013103457036 0.8692366891234362
9 0.05664246860321187 0.40495283364883794 0.039635617270926904
10 0.8427136165865521 0.49953074411487797 0.7904095314876494
11 0.9504984071553011 0.6588147837334961 0.43118828904466633
12 0.9646697763820897 0.5156272179795256 0.1376583132625555
13 0.9457754052519123 0.26071522632820776 0.6080803126880718
14 0.7899036826169576 0.5955204840509289 0.2550540600167448
15 0.8211604203482923 0.2924615242315285 0.4987340031883092
16 0.03416010848943718 0.2885798506061561 0.09403688346569439
17 0.09454448946400307 0.6181597973815087 0.5250899072103514
18 0.31492622391998415 0.6642598175011505 0.2655109248498748
19 0.12780989889368866 0.7535081177709988 0.11009621399607639
20 0.374186714831074 0.03688418241886171 0.8343616661080064
In [12]:
# Creatring Second DataFrame
df2=DataFrame(tail(df,20))
Out[12]:
Var1 Var2 Var
1 0.6488819502093455 0.046288741031345504 0.20472832290217324
2 0.010905889635595356 0.6983555060532487 0.93298350850828
3 0.06642303695533736 0.3651093677271471 0.8272627957034728
4 0.9567533636029237 0.3024777928234499 0.09929915955881308
5 0.646690981531646 0.3725754415996787 0.6342997886044144
6 0.11248587118714015 0.15050782744925795 0.1327153585755645
7 0.2760209506672211 0.14732938279328955 0.7751941503856596
8 0.6516642063795697 0.2834013103457036 0.8692366891234362
9 0.05664246860321187 0.40495283364883794 0.039635617270926904
10 0.8427136165865521 0.49953074411487797 0.7904095314876494
11 0.9504984071553011 0.6588147837334961 0.43118828904466633
12 0.9646697763820897 0.5156272179795256 0.1376583132625555
13 0.9457754052519123 0.26071522632820776 0.6080803126880718
14 0.7899036826169576 0.5955204840509289 0.2550540600167448
15 0.8211604203482923 0.2924615242315285 0.4987340031883092
16 0.03416010848943718 0.2885798506061561 0.09403688346569439
17 0.09454448946400307 0.6181597973815087 0.5250899072103514
18 0.31492622391998415 0.6642598175011505 0.2655109248498748
19 0.12780989889368866 0.7535081177709988 0.11009621399607639
20 0.374186714831074 0.03688418241886171 0.8343616661080064
In [13]:
# Question 9: Calculate simple descriptive statistics of all the columns in df2 using the describe() function
describe(df2)
Var1
Summary Stats:
Mean:           0.484341
Minimum:        0.010906
1st Quartile:   0.108001
Median:         0.510439
3rd Quartile:   0.826549
Maximum:        0.964670
Length:         20
Type:           Float64

Var2
Summary Stats:
Mean:           0.397753
Minimum:        0.036884
1st Quartile:   0.277730
Median:         0.368842
3rd Quartile:   0.601180
Maximum:        0.753508
Length:         20
Type:           Float64

Var
Summary Stats:
Mean:           0.453279
Minimum:        0.039636
1st Quartile:   0.136423
Median:         0.464961
3rd Quartile:   0.778998
Maximum:        0.932984
Length:         20
Type:           Float64

In [14]:
# Question 10: Add a column to df2 named Cat1 to df2 consisting of randomly selecting either the strings GroupA or GroupB
df2 = hcat(df2, rand(["GroupA","GroupB"],20))
rename!(df2,Dict(:x1=>:Cat1))
Out[14]:
Var1 Var2 Var Cat1
1 0.6488819502093455 0.046288741031345504 0.20472832290217324 GroupB
2 0.010905889635595356 0.6983555060532487 0.93298350850828 GroupB
3 0.06642303695533736 0.3651093677271471 0.8272627957034728 GroupA
4 0.9567533636029237 0.3024777928234499 0.09929915955881308 GroupA
5 0.646690981531646 0.3725754415996787 0.6342997886044144 GroupA
6 0.11248587118714015 0.15050782744925795 0.1327153585755645 GroupA
7 0.2760209506672211 0.14732938279328955 0.7751941503856596 GroupB
8 0.6516642063795697 0.2834013103457036 0.8692366891234362 GroupB
9 0.05664246860321187 0.40495283364883794 0.039635617270926904 GroupB
10 0.8427136165865521 0.49953074411487797 0.7904095314876494 GroupB
11 0.9504984071553011 0.6588147837334961 0.43118828904466633 GroupA
12 0.9646697763820897 0.5156272179795256 0.1376583132625555 GroupB
13 0.9457754052519123 0.26071522632820776 0.6080803126880718 GroupA
14 0.7899036826169576 0.5955204840509289 0.2550540600167448 GroupB
15 0.8211604203482923 0.2924615242315285 0.4987340031883092 GroupA
16 0.03416010848943718 0.2885798506061561 0.09403688346569439 GroupB
17 0.09454448946400307 0.6181597973815087 0.5250899072103514 GroupB
18 0.31492622391998415 0.6642598175011505 0.2655109248498748 GroupA
19 0.12780989889368866 0.7535081177709988 0.11009621399607639 GroupA
20 0.374186714831074 0.03688418241886171 0.8343616661080064 GroupA
In [15]:
# Question 11: Create a new DataFrame named df3
df3 = DataFrame(A=1:20,B=21:40,C=41:60)
Out[15]:
A B C
1 1 21 41
2 2 22 42
3 3 23 43
4 4 24 44
5 5 25 45
6 6 26 46
7 7 27 47
8 8 28 48
9 9 29 49
10 10 30 50
11 11 31 51
12 12 32 52
13 13 33 53
14 14 34 54
15 15 35 55
16 16 36 56
17 17 37 57
18 18 38 58
19 19 39 59
20 20 40 60
In [16]:
# Question 12: Change indicated values to empty entries
#In a code cells below, change the values in df3 of the following cells to NA: row 10, column 1, row 15, column 2 and row #19, column 3
df3[10,1] = NA
df3[15,2] = NA 
df3[19,3] = NA
df3
Out[16]:
A B C
1 1 21 41
2 2 22 42
3 3 23 43
4 4 24 44
5 5 25 45
6 6 26 46
7 7 27 47
8 8 28 48
9 9 29 49
10 NA 30 50
11 11 31 51
12 12 32 52
13 13 33 53
14 14 34 54
15 15 NA 55
16 16 36 56
17 17 37 57
18 18 38 58
19 19 39 NA
20 20 40 60
In [17]:
# Question 13: Create DataFrame df4 that contains no rows with NaN (NA) values
df4 = completecases!(df3)
Out[17]:
A B C
1 1 21 41
2 2 22 42
3 3 23 43
4 4 24 44
5 5 25 45
6 6 26 46
7 7 27 47
8 8 28 48
9 9 29 49
10 11 31 51
11 12 32 52
12 13 33 53
13 14 34 54
14 16 36 56
15 17 37 57
16 18 38 58
17 20 40 60

 

 

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: