Advanced Data Analysis in R organised by NUGS-China¶

Session: Two

Date: 18/03/2021

Intructor: Clement Twumasi

Website: https://twumasiclement.wixsite.com/website

YouTube channel: https://www.youtube.com/channel/UCxrpjLYi_Akmbc2QICDvKaQ

Objectives¶

What is the general motivation of loops and apply functions?

For loops with conditions (single, double loops, etc).
While loops with conditions.
Saving results from loops.
Introduction to simulating probability distributions and plotting with loops.
Use of apply functions (apply, lapply, sapply, tapply, parLapply and mclapply).

1a. Introduction to loops (for loop & while loop) in R¶

NB: Avoid loops if not needed. Only use them when necessary.

You don't need a loop to add, substract, multiple or divide entries of two or more vectors.

Disclaimer: Loops can be used for more complicated tasks in simulations (eg. Monte Carlo siumations, stochastic modelling, etc.), plotting different items and for more simple or complex functions iteratively. The addition is used here as a toy example.

x<- 1:100
sum(x)

total=0
for(i in seq_along(x)){
    total=total+i
}

total

#Addition without loops
x <- seq(5,1000, length.out=30) #generating 30 numbers from 5 to 1000
y <- seq(1,150,by=5) #generating 30 numbers from 1 to 150 with contants increments of 5

sum_x_y_no_loop<- x+y

#sum_x_y_no_loop
print(sum_x_y_no_loop)
#y-x
#y/x

 [1]    6.00000   45.31034   84.62069  123.93103  163.24138  202.55172
 [7]  241.86207  281.17241  320.48276  359.79310  399.10345  438.41379
[13]  477.72414  517.03448  556.34483  595.65517  634.96552  674.27586
[19]  713.58621  752.89655  792.20690  831.51724  870.82759  910.13793
[25]  949.44828  988.75862 1028.06897 1067.37931 1106.68966 1146.00000

print(round(sum_x_y_no_loop, 2))# rounding the sum to 2 d.p

 [1]    6.00   45.31   84.62  123.93  163.24  202.55  241.86  281.17  320.48
[10]  359.79  399.10  438.41  477.72  517.03  556.34  595.66  634.97  674.28
[19]  713.59  752.90  792.21  831.52  870.83  910.14  949.45  988.76 1028.07
[28] 1067.38 1106.69 1146.00

For loops¶

The syntax for for loop is as follows:

for (val in sequence)

{

statement

}

Addition with for loops as saving results as a vector (as an example)

#print(x)

1:length(x)
seq_along(x)

#NB: seq_along(x) returns the index position of each value of x
print(seq_along(x))

 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30

For loop: Method 1

Saving loop results as a vector

x <- seq(5,1000, length.out=30) #generating 30 numbers from 5 to 1000
y <- seq(1,150,by=5) #generating 30 numbers from 1 to 150 with contants increments of 5

# sum_loop1: For storing results for each iteration
sum_loop1<- numeric(length(x)) #returns 0's of length equal to the total elements of the vector x

 #Alternatively
# sum_loop1<- rep(NA,length=length(x)) or sum_loop1<- rep("Clement",length=length(x))

for(K in seq_along(x)){
   sum_loop1[K]<- x[K]+y[K]  # you can also use the function:  sum(x[i],y[i]) instead of x[i]+y[i]
}

print(sum_loop1)

 [1]    6.00000   45.31034   84.62069  123.93103  163.24138  202.55172
 [7]  241.86207  281.17241  320.48276  359.79310  399.10345  438.41379
[13]  477.72414  517.03448  556.34483  595.65517  634.96552  674.27586
[19]  713.58621  752.89655  792.20690  831.51724  870.82759  910.13793
[25]  949.44828  988.75862 1028.06897 1067.37931 1106.68966 1146.00000

For loop: Method 2

Saving loop results as a vector

sum_loop2<- rep("clement",length=length(x)) #or sum_loop2<- rep(FALSE,length=length(x))

for(i in 1:length(x)){  #NB: 1:length(x) is similar to seq_along(x). However, seq_along() function is preferable.
   sum_loop2[i]<- sum(x[i],y[i])
}

print(sum_loop2)

class(sum_loop2)

 [1] "6"                "45.3103448275862" "84.6206896551724" "123.931034482759"
 [5] "163.241379310345" "202.551724137931" "241.862068965517" "281.172413793103"
 [9] "320.48275862069"  "359.793103448276" "399.103448275862" "438.413793103448"
[13] "477.724137931034" "517.034482758621" "556.344827586207" "595.655172413793"
[17] "634.965517241379" "674.275862068965" "713.586206896552" "752.896551724138"
[21] "792.206896551724" "831.51724137931"  "870.827586206897" "910.137931034483"
[25] "949.448275862069" "988.758620689655" "1028.06896551724" "1067.37931034483"
[29] "1106.68965517241" "1146"

print(as.numeric(sum_loop2))

 [1]    6.00000   45.31034   84.62069  123.93103  163.24138  202.55172
 [7]  241.86207  281.17241  320.48276  359.79310  399.10345  438.41379
[13]  477.72414  517.03448  556.34483  595.65517  634.96552  674.27586
[19]  713.58621  752.89655  792.20690  831.51724  870.82759  910.13793
[25]  949.44828  988.75862 1028.06897 1067.37931 1106.68966 1146.00000

For loop: Method 3

Saving loop results as a list

sum_loop3<- NULL #Alternatively: sum_loop3<- NULL  or sum_loop3<- c()

for(i in seq_along(x)){
   sum_loop3[[i]]<- x[i]+y[i]
}

sum_loop3

#To extract say the 5 and 30 list; use double square brackets [[ ]] to extract lists 
# and single square bracket [ ] to extract vectors
sum_loop3[[5]]
sum_loop3[[30]]

#Unlist change the list to a vector
unlist(sum_loop3)

Using "while loops" instead of "for loops"¶

Method 4

While loops¶

The syntax for while loop is as follows:

while (condition of interest)

{

statement

}

x <- seq(5,1000, length.out=30) #generating 30 numbers from 5 to 1000
y <- seq(1,150,by=5) #generating 30 numbers from 1 to 150 with contants increments of 5
sum_loop4<- c() #another way of saving each iterative as a list

i=1
while(i<= length(x)){
  sum_loop4[[i]]<- x[i]+y[i]  # you can also use the function:  sum(x[i],y[i]) instead of x[i]+y[i]
    
    i=i+1 #without adding this line, you will get an indefinite or unending loop since there won't be any iterations.
}

 
print(sum_loop4)

 [1]    6.00000   45.31034   84.62069  123.93103  163.24138  202.55172
 [7]  241.86207  281.17241  320.48276  359.79310  399.10345  438.41379
[13]  477.72414  517.03448  556.34483  595.65517  634.96552  674.27586
[19]  713.58621  752.89655  792.20690  831.51724  870.82759  910.13793
[25]  949.44828  988.75862 1028.06897 1067.37931 1106.68966 1146.00000

Using loops with plots¶

options(repr.plot.width=8, repr.plot.height=8,repr.plot.res = 300) #Setting plot size

Introduction to simulating probability distributions

Simulating 5000 samples from binomial, Poisson, uniform, normal, lognormal and exponential distributions.

rpois(n=100,lambda=50)

set.seed(1)
rpois(n=100,lambda=50)

set.seed(123) #for reproducibility
names_prob_distn<-c("binomial","poisson","uniform","exponential","normal","lognormal")
prob_distn=NULL #list( )# c()

prob_distn[[1]]<-rbinom(n=5000,size=30,prob=0.5) #binomial sample
prob_distn[[2]]<-rpois(n=5000, lambda=2)#poisson_sample
prob_distn[[3]]<-runif(n=5000,min=0, max=5)#uniform sample
prob_distn[[4]]<-rexp(n=5000, rate=2)#exponential sample
prob_distn[[5]]<-rnorm(n=5000, mean = 0, sd = 1)#normal sample
prob_distn[[6]]<-rlnorm(n=5000, meanlog = 0, sdlog = 1)#lognormal sample

seq_along(names_prob_distn)

1:length(names_prob_distn)

par(mfrow=c(3,2),mar=c(4,4,1,0))

colours<- c("red","blue","green","yellow","violet","brown")
for(j in seq_along(names_prob_distn)){
    # you can replace col=colours[j] by just col=j (each colour in R has a unique index)
    #there are other packages for customized colours like package "RColorBrewer"
hist(prob_distn[[j]],col=colours[j],main=paste(names_prob_distn[j]),xlab="Samples",ylab="Frequency")
}

#trying n=1000 and increase n to 100000 (returns an elliptical shape for bivariate normality)
set.seed(1)
normal_sample1<- rnorm(n=100000, mean = 2, sd = 1) 
normal_sample2<- rnorm(n=100000, mean = 3, sd = 1)

plot(normal_sample1,normal_sample2,col="red",type="p",xlab="Normal samples 1",ylab="Normal sample 2")

Use of Apply functions¶

The apply functions are:

lapply(): lapply can be used for looping as well more efficiently as lists

sapply(): sapply is an efficient form of lapply() by storing list as vectors & matrices

tapply(): tapply() computes a measure (mean, median, min, max, etc..) or a function for each factor variable in a vector

apply(): apply() computes a measure (mean, median, min, max, etc..) or a function for numerical variables across either each row or each column of a dataframe

parlapply() for windows machines or mclapply() for linux machines : These parlapply() or mclapply() is used for parallelizing codes to run on multiple cores/processors of your computer to increase computational speed for much complex problems; but mclapply() is much powerful.

Recall the previous loop below for instance

#Recall the previous loop below
x <- seq(5,1000, length.out=30) #generating 30 numbers from 5 to 1000
y <- seq(1,150,by=5) #generating 30 numbers from 1 to 150 with contants increments of 5


sum_loop2<- rep(NA,length=length(x)) # it was previously: rep("Clement",length=length(x))

for(i in seq_along(x)){  #NB: 1:length(x) is similar to seq_along(x). However, seq_along() function is preferable.
   sum_loop2[i]<- sum(x[i],y[i]) #x[i]+y[i]
}

print(sum_loop2)

 [1]    6.00000   45.31034   84.62069  123.93103  163.24138  202.55172
 [7]  241.86207  281.17241  320.48276  359.79310  399.10345  438.41379
[13]  477.72414  517.03448  556.34483  595.65517  634.96552  674.27586
[19]  713.58621  752.89655  792.20690  831.51724  870.82759  910.13793
[25]  949.44828  988.75862 1028.06897 1067.37931 1106.68966 1146.00000

Using lapply for the same tasks above

#It saves results as a list in a much simpler manner
sum_loop5<- lapply(seq_along(x),function(k) sum(x[k],y[k])) #Or: lapply(seq_along(x),function(i) x[i]+y[i])
sum_loop5

Using sapply for the same tasks above

#Sapply saved results as a vector instead of a list
sum_loop6<- sapply(seq_along(x),function(i) x[i]+y[i] )
sum_loop6

Let's compare the computational speed/time (in secs) between for loops and lapply & sapply functions¶

For for loop

sample_sizes<- c(10000,300000,5000000,9000000,90000000)
normal_samples1<- NULL #to save results as list for each sample size

set.seed(1) #for reproducibility

#For for loop
start_time<- proc.time()
for(k in seq_along(sample_sizes))   normal_samples1[[k]]<- rnorm(n=sample_sizes[k],mean=5, sd=2)
    
end_time<- proc.time()-start_time

print(end_time)

   user  system elapsed 
   8.42    0.11    8.67

print(normal_samples1[[1]][1:100]) #to view first 100 normal samples at first sample size

#to view normal samples at first sample size
#print(normal_samples1[[1]]) 

#to view normal samples at third sample size
# print(normal_samples1[[3]])

  [1] 3.7470924 5.3672866 3.3287428 8.1905616 5.6590155 3.3590632 5.9748581
  [8] 6.4766494 6.1515627 4.3892232 8.0235623 5.7796865 3.7575188 0.5706002
 [15] 7.2498618 4.9101328 4.9676195 6.8876724 6.6424424 6.1878026 6.8379547
 [22] 6.5642726 5.1491300 1.0212966 6.2396515 4.8877425 4.6884090 2.0584952
 [29] 4.0436999 5.8358831 7.7173591 4.7944245 5.7753432 4.8923899 2.2458809
 [36] 4.1700109 4.2114201 4.8813732 7.2000507 6.5263515 4.6709528 4.4932766
 [43] 6.3939268 6.1133264 3.6224886 3.5850097 5.7291639 6.5370658 4.7753076
 [50] 6.7622155 5.7962118 3.7759472 5.6822394 2.7412738 7.8660474 8.9607998
 [57] 4.2655570 2.9117307 6.1394393 4.7298908 9.8032355 4.9215200 6.3794787
 [64] 5.0560043 3.5134536 5.3775846 1.3900827 7.9311097 5.3065067 9.3452233
 [71] 5.9510191 3.5801071 6.2214527 3.1318047 2.4927332 5.5828925 4.1134163
 [78] 5.0022107 5.1486826 3.8209581 3.8626625 4.7296428 7.3561740 1.9528664
 [85] 6.1878924 5.6659007 7.1261997 4.3916322 5.7400376 5.5341976 3.9149599
 [92] 7.4157356 7.3208052 6.4004273 8.1736669 6.1169729 2.4468156 3.8534692
 [99] 2.5507748 4.0531987

For lapply

#For lapply
start_time<- proc.time()
normal_samples2<- lapply(seq_along(sample_sizes), function(k) rnorm(n=sample_sizes[k],mean=5, sd=2))
end_time<- proc.time()-start_time
print(end_time)

   user  system elapsed 
   8.01    0.05    8.10

print(normal_samples2[[1]][1:100]) #to view first 100 normal samples at first sample size

#to view normal samples at first sample size
#print(normal_samples2[[1]]) 

#to view normal samples at third sample size
# print(normal_samples2[[3]])

  [1]  6.9374489  5.2069029  4.3122163  3.9115143  3.1293012  7.6795842
  [7]  7.0554352  5.6831158  4.9502476  5.2117229  6.7472558  9.8100738
 [13]  2.4562446  0.3314382  4.9483727  5.0979897 11.7191751  2.2796151
 [19]  7.7365746  5.9370868  6.1704643  5.6420747  4.1323065  8.6589243
 [25]  4.6611309  5.9995899  6.0013003  3.9354306  6.0004457  4.6587521
 [31]  4.9398255  5.3184170  4.0283645  2.9875326  5.4851867  3.8101859
 [37]  2.5716092  3.5866949  6.6951080  1.6759173  5.9592057  4.1595915
 [43]  4.6398186  3.6817163  5.1407931  6.2954271  6.9869326  4.6461554
 [49]  3.6766414  3.0293257  5.4502918  1.5736696  6.7363325  6.4551526
 [55]  2.2556055  5.7527906  7.5040078  5.4692807  5.4564573  6.0867920
 [61]  5.7024115  2.7183722  4.8726119  5.9365661  5.8339562  4.6308998
 [67]  5.3000306  5.4218274  0.8895970  3.0018588  5.0737620  5.7474361
 [73]  3.9113246  7.0761162  6.6738332  2.2569671  4.6505383  5.0675559
 [79]  3.0489150  4.4207956  8.8239129  4.7279426  6.2695764  7.8687539
 [85]  0.4928368  4.5099937 -0.1718006  4.6363367  5.9192034  3.9353943
 [91]  6.2584999  5.6037305  1.2978323  4.4250210  3.4686359  4.0281820
 [97]  3.5207554  3.5695827  8.3916057  5.0573394

print(normal_samples2[[3]][1:100]) #to view first 100 normal samples at 3rd sample size

  [1]  8.78842452  4.72474208  7.56198361  5.43499298  4.70063167  7.10035096
  [7]  3.51060468  3.07422934  5.70744295  4.21315665  3.83956671 10.94687259
 [13]  6.89484314  6.18873914  5.55125231 10.30528442  7.27691521  1.45517028
 [19]  0.19661623  2.93971889  5.74695139  4.83837413  9.16130085  6.54553534
 [25]  5.71381202  4.68674049  7.99097337  5.03712500  2.50246605  5.64863588
 [31]  6.86977695  6.01929433  5.55393215  3.90796749  6.74291152  4.53076579
 [37]  7.06232293  6.39348096  5.90421702  3.74997710  5.09896747  4.42026115
 [43]  5.38729384  6.09818747  7.45377723  3.78761096  4.85850295  1.50703587
 [49]  4.23439851  5.25878305  1.92190722  7.76437618  7.01046067  0.08227576
 [55]  5.22518629  6.58700601  6.72570046  2.18664460  7.09256987  2.98545734
 [61]  0.22103885  3.42936789  3.85556978  7.28374771  5.15464512  3.57983003
 [67]  5.76955796  4.00071360  4.94726015  7.12035310  3.44769042  9.14649429
 [73]  4.78418180  5.62352013  5.31993963  5.96592268  3.52734374  5.85613164
 [79]  1.71868617 -0.29963524  3.12787835  3.96141844  4.76689760  5.72946127
 [85]  6.54588295  5.32690005  5.51740610  1.97753742  2.65163514  8.08966809
 [91]  2.85377631  4.84286614  1.34995186  3.50336220  4.96646163  3.98342810
 [97]  4.12826629  7.57502759  5.48164062 10.28188609

For sapply

#For sapply
start_time<- proc.time()
normal_sample3<- sapply(seq_along(sample_sizes), function(k) rnorm(n=sample_sizes[k],mean=5, sd=2))
end_time<- proc.time()-start_time
print(end_time)

   user  system elapsed 
   7.87    0.12    8.02

print(normal_sample3[[1]][1:100]) #to view first 100 normal samples at first sample size

#to view normal samples at first sample size
#print(normal_sample3[[1]]) 

#to view normal samples at third sample size
# print(normal_sample3[[3]])

  [1]  4.829131  6.954489  4.922750  5.655671  4.686172  6.766904  7.908523
  [8]  7.187910  4.777816  5.429329  3.901003  6.600824  2.862909  4.552807
 [15]  6.706373  3.642072  5.408924  7.717857  7.465483  5.164684  4.533913
 [22]  8.032069  3.487182  3.769250  4.345548  5.009717  3.295832  5.770777
 [29]  4.037874  7.334999  5.287437 11.482118  7.557251  6.335800  3.968419
 [36]  5.407528  3.092221  5.114010  5.598206  6.544105  6.086862  7.604950
 [43]  3.231629  5.466490  4.370631  5.792476  1.301517  8.870107  3.268895
 [50]  5.658108  2.492119  4.115655  4.012276  4.575528  4.208562  5.862882
 [57]  5.552489  2.867058  3.475000  5.633297  2.572082  5.837646  2.954129
 [64]  6.266442  4.855227  7.328758  6.209392  5.628997  3.099826  5.496738
 [71]  5.115542  5.541282  4.590284  3.565649  5.409454  5.667402  6.594403
 [78]  3.448784  4.342199  3.909395  3.476031  3.486316  6.021867  2.497778
 [85]  3.305525  5.641889  4.496831  5.287264  3.980621  3.632434  4.382841
 [92]  7.451975  3.079897  4.665415  5.519484  5.948513  2.970681  4.540660
 [99]  8.174342  8.106671

Parallelization of codes¶

For parLapply and mclappy functions (parallelizing the code to run on multiple cores)

#install.packages("parallel")
library(parallel)

#To detect the number of processors or cores of your computer 
numCores<- detectCores()
numCores

For mclapply¶

NB: mclapply() function works on Linux machines for any number of detected cores$\geq 1$

However, for windows machines, mclapply() function only works when the number of detected cores is strictly set at 1.

#For mclapply (run this on only linux machines for number of cores>1)
start_time<- proc.time()
normal_samples4<- mclapply(seq_along(sample_sizes), function(k) rnorm(n=sample_sizes[k],mean=5, sd=2),mc.cores=numCores)
end_time<- proc.time()-start_time
print(end_time)

Error in mclapply(seq_along(sample_sizes), function(k) rnorm(n = sample_sizes[k], : 'mc.cores' > 1 is not supported on Windows
Traceback:

1. mclapply(seq_along(sample_sizes), function(k) rnorm(n = sample_sizes[k], 
 .     mean = 5, sd = 2), mc.cores = Number_cores)
2. stop("'mc.cores' > 1 is not supported on Windows")

#For mclapply (for windows, it only works with one core)
start_time<- proc.time()
normal_samples4<- mclapply(seq_along(sample_sizes), function(k) rnorm(n=sample_sizes[k],mean=5, sd=2),mc.cores=1)
end_time<- proc.time()-start_time
print(end_time)

   user  system elapsed 
   7.91    0.31    8.28

print(normal_samples4[[1]][1:100]) #to view first 100 normal samples at first sample size

#to view normal samples at first sample size
#print(normal_samples4[[1]]) 

#to view normal samples at third sample size
# print(normal_samples4[[3]])

  [1]  4.8943872  3.7132102  5.0106242  4.9756401  3.7049026  6.8179066
  [7]  3.4802291  5.1318394  8.1596523  3.2461804  3.6059390  7.4517973
 [13]  5.1895605  3.8815565  5.4972884  6.6559866  3.5933359  9.5930180
 [19]  5.9826078  5.4836546  4.9178209  1.5392019  6.9956902  1.1929255
 [25]  7.6666729  2.6923182  6.9310336  3.1259885  4.4042657  3.9576004
 [31]  3.9500897  7.8745859  4.8673913  5.8924704  2.9960236  3.9722747
 [37]  3.3042267 11.2162399  6.2198085  3.9396766  8.0889728  6.1556146
 [43]  7.9571161  5.5745569  1.7291490  4.9445142  1.2388796  2.3332612
 [49]  6.5490404  5.8906866  4.2417554  2.2510295  4.7209824  7.1936231
 [55]  6.3905606  6.2134278  5.5197559 11.2932014  8.7257056  5.8732445
 [61]  4.2684778  4.8949918 10.2840566  4.9783102  5.6107502  3.9424728
 [67]  6.7173721  2.8135508  3.6218324  7.1321417  6.2337128  5.3246559
 [73]  4.2598127  8.6084281  4.6285549  7.8829529  3.5290462  4.6355375
 [79]  4.0580794  5.4686175  5.5942441  1.9602261  5.9387112  7.2281950
 [85]  3.6821882  4.5842195  6.6085469  3.7961262  4.7002934  0.2890575
 [91]  2.7759910  1.5061394  7.0431978  3.5994650  6.6684191  4.9008872
 [97]  7.0090975  3.2281337  1.2710850  6.8688184

For parLapply¶

NB: parLapply() function works on windows machines for any number of detected cores$\geq 1$

However, for linux machines, use mclapply(). mclapply() on linux machines is generally effective than parLapply() on windows.

NB: We also have parSapply()

#For parlapply, first makeCluster
clusters <- makeCluster(numCores)
clusters

socket cluster with 4 nodes on host 'localhost'

sample_sizes<- c(10000,300000,5000000,9000000,90000000)

#For parLapply (for windows)

index<- 1:length(sample_sizes)
#Defining global variables
clusterExport(cl=clusters, varlist=c("index","sample_sizes"), envir=environment())


start_time<- proc.time() 

normal_samples5<- parLapply(cl = clusters, index, function(x) rnorm(n=sample_sizes[x],mean=5, sd=2))
end_time<- proc.time()-start_time
                            
print(end_time)

#You need to stop cluster after running parLapply()
stopCluster(clusters)

   user  system elapsed 
   2.34    1.13   13.87

print(normal_samples5[[1]][1:100]) #to view first 100 normal samples at first sample size

#to view normal samples at first sample size
#print(normal_samples4[[1]]) 

#to view normal samples at third sample size
# print(normal_samples4[[3]])

  [1]  5.27150422  5.62720929  7.61251799  3.94512172  5.48606198  3.75143538
  [7]  7.19382470  5.92394698  7.83139878  4.84131314  5.24495795  7.21076062
 [13]  4.83650873  7.02714817  2.28515156  2.43054763  3.94236758  3.50695375
 [19]  2.28413265  4.41191269  6.03178124  3.29556561  1.58753156  6.57001748
 [25]  5.79880094  4.04243477  2.47109774  4.68392117  1.96028803  8.98412605
 [31]  4.57637206  5.08157992  3.22343411  5.83654742 10.15425304  4.35738172
 [37]  4.76873821  5.30171311  7.50986008  6.36523463  7.13545537  4.23937054
 [43]  5.95368847  5.70489773  6.41713725  6.32452712  5.07371831  0.08193126
 [49]  4.91542085  1.76608833  4.09712025  6.15317564  2.28265341  4.45357309
 [55]  4.52117361  9.58116775  3.08931491  1.80803936  7.64568799  7.47805983
 [61]  5.39458346  3.49272343  7.60451020  8.16075232  4.93976428  6.21607861
 [67]  8.08404473  5.26328870  3.67557022  6.05614850  3.95623115  6.02878636
 [73]  4.84931831  7.68778857  6.52633515  7.67487503  5.59092654  5.65273474
 [79]  5.20728314  5.77096013  7.71758168  5.91092286  6.12194433  5.65750411
 [85]  7.02860384  6.71824515  2.89661450  4.05618654  2.18111806  4.07193769
 [91]  8.30986566  0.98120993  7.38757664  1.65884011  5.61330266  2.23281746
 [97]  6.68885028  4.67192393  1.95047931  3.01777884

Importing a data from your working directory to implement tapply() and apply() functions¶

setwd("C:/Users/user/Desktop/DataAnalysis_results_R/NUGSChina_R_class")

#Importing CSV data
MurderRates<-read.csv("MurderRates_data.csv")
head(MurderRates,n=10) # view first 10 rows

#Check the variables names of the data
names(MurderRates)
MurderRates[1:3,1:3 ]

apply() function¶

NB: Setting MARGIN=2 evaluate the function for each column or variable

NB: Setting MARGIN=1 evaluate the function for each row of the data

#Setting MARGIN=2 evaluate the function for each column or variable
apply(X=MurderRates[ ,c(1,2,3,4,5,6,7)],MARGIN=2,FUN=mean)

#Setting MARGIN=2 evaluate the function for each column or variable
print(apply(MurderRates[,c(1,2,3,4,5,6,7)],MARGIN=2,mean))

        rate  convictions   executions         time       income          lfp 
  5.40363636   0.26047727   0.06034091 136.52272727   1.78090909  53.06590909 
     noncauc 
  0.10559091

#Setting MARGIN=1 evaluate the function for each row of the data
print(apply(MurderRates[,c(1,2,3,4,5,6)],MARGIN=1,mean))

 [1] 19.79817 19.22633 23.43217 26.69633 46.56033 37.45883 37.35067 22.76100
 [9] 45.80750 23.27800 45.37500 40.41717 40.41117 27.42067 31.06483 41.54333
[17] 26.75700 49.87117 25.78000 39.68833 31.49750 19.13250 25.80350 42.88967
[25] 59.15033 34.31617 22.36183 35.67083 39.46717 34.35433 31.04650 26.11033
[33] 29.67317 34.55867 31.59983 56.43517 24.43417 16.69867 26.64967 26.53067
[41] 43.01500 26.48800 26.91000 29.86433

Creating a function to return the mean and variance of a variable

Mean_variance<-function(x){
    mean_value<- mean(x)
    variance_value<- var(x)
    return(list(mean=mean_value,variance=variance_value))
}


results_mean_var<- apply(X=MurderRates[,c(1,2,3,4,5,6)],MARGIN=2,FUN=Mean_variance)
print(results_mean_var)

$rate
$rate$mean
[1] 5.403636

$rate$variance
[1] 19.92254


$convictions
$convictions$mean
[1] 0.2604773

$convictions$variance
[1] 0.02007965


$executions
$executions$mean
[1] 0.06034091

$executions$variance
[1] 0.004709951


$time
$time$mean
[1] 136.5227

$time$variance
[1] 3795.092


$income
$income$mean
[1] 1.780909

$income$variance
[1] 0.1568782


$lfp
$lfp$mean
[1] 53.06591

$lfp$variance
[1] 6.150671

#extracting results for the variable rate
results_mean_var$lfp

#extracting only mean results for the variable rate
results_mean_var$rate$mean

tapply() function¶

names(MurderRates)

?tapply #to check the arguments of the in-built function tapply

print(tapply(X=MurderRates$rate,INDEX=MurderRates$southern,FUN=mean))

      no      yes 
 2.97069 10.10733

tapply(X=MurderRates$rate,INDEX=MurderRates$southern,FUN=Mean_variance)

names(MurderRates)

results_numericVariables<- lapply(1:6,function(variable_position) tapply(X=MurderRates[,variable_position],
                                INDEX=MurderRates$southern,FUN=Mean_variance))

index=1
names(MurderRates)[index]                                  
results_numericVariables[[index]]

index=2
names(MurderRates)[index]                                  
results_numericVariables[[index]]

	rate	convictions	executions	time	income	lfp	noncauc	southern
	<dbl>	<dbl>	<dbl>	<int>	<dbl>	<dbl>	<dbl>	<fct>
1	19.25	0.204	0.035	47	1.10	51.2	0.321	yes
2	7.53	0.327	0.081	58	0.92	48.5	0.224	yes
3	5.66	0.401	0.012	82	1.72	50.8	0.127	no
4	3.21	0.318	0.070	100	2.18	54.4	0.063	no
5	2.80	0.350	0.062	222	1.75	52.4	0.021	no
6	1.41	0.283	0.100	164	2.26	56.7	0.027	no
7	6.18	0.204	0.050	161	2.07	54.6	0.139	yes
8	12.15	0.232	0.054	70	1.43	52.7	0.218	yes
9	1.34	0.199	0.086	219	1.92	52.3	0.008	no
10	3.71	0.138	0.000	81	1.82	53.0	0.012	no

	rate	convictions	executions
	<dbl>	<dbl>	<dbl>
1	19.25	0.204	0.035
2	7.53	0.327	0.081
3	5.66	0.401	0.012