Session: Two
Date: 18/03/2021
Intructor: Clement Twumasi
Website: https://twumasiclement.wixsite.com/website
YouTube channel: https://www.youtube.com/channel/UCxrpjLYi_Akmbc2QICDvKaQ
What is the general motivation of loops and apply functions?
For loops with conditions (single, double loops, etc).
While loops with conditions.
Saving results from loops.
Introduction to simulating probability distributions and plotting with loops.
Use of apply functions (apply, lapply, sapply, tapply, parLapply and mclapply).
NB: Avoid loops if not needed. Only use them when necessary.
You don't need a loop to add, substract, multiple or divide entries of two or more vectors.
Disclaimer: Loops can be used for more complicated tasks in simulations (eg. Monte Carlo siumations, stochastic modelling, etc.), plotting different items and for more simple or complex functions iteratively. The addition is used here as a toy example.
x<- 1:100
sum(x)
total=0
for(i in seq_along(x)){
total=total+i
}
total
#Addition without loops
x <- seq(5,1000, length.out=30) #generating 30 numbers from 5 to 1000
y <- seq(1,150,by=5) #generating 30 numbers from 1 to 150 with contants increments of 5
sum_x_y_no_loop<- x+y
#sum_x_y_no_loop
print(sum_x_y_no_loop)
#y-x
#y/x
print(round(sum_x_y_no_loop, 2))# rounding the sum to 2 d.p
Addition with for loops as saving results as a vector (as an example)
#print(x)
1:length(x)
seq_along(x)
#NB: seq_along(x) returns the index position of each value of x
print(seq_along(x))
For loop: Method 1
Saving loop results as a vector
x <- seq(5,1000, length.out=30) #generating 30 numbers from 5 to 1000
y <- seq(1,150,by=5) #generating 30 numbers from 1 to 150 with contants increments of 5
# sum_loop1: For storing results for each iteration
sum_loop1<- numeric(length(x)) #returns 0's of length equal to the total elements of the vector x
#Alternatively
# sum_loop1<- rep(NA,length=length(x)) or sum_loop1<- rep("Clement",length=length(x))
for(K in seq_along(x)){
sum_loop1[K]<- x[K]+y[K] # you can also use the function: sum(x[i],y[i]) instead of x[i]+y[i]
}
print(sum_loop1)
For loop: Method 2
Saving loop results as a vector
sum_loop2<- rep("clement",length=length(x)) #or sum_loop2<- rep(FALSE,length=length(x))
for(i in 1:length(x)){ #NB: 1:length(x) is similar to seq_along(x). However, seq_along() function is preferable.
sum_loop2[i]<- sum(x[i],y[i])
}
print(sum_loop2)
class(sum_loop2)
print(as.numeric(sum_loop2))
For loop: Method 3
Saving loop results as a list
sum_loop3<- NULL #Alternatively: sum_loop3<- NULL or sum_loop3<- c()
for(i in seq_along(x)){
sum_loop3[[i]]<- x[i]+y[i]
}
sum_loop3
#To extract say the 5 and 30 list; use double square brackets [[ ]] to extract lists
# and single square bracket [ ] to extract vectors
sum_loop3[[5]]
sum_loop3[[30]]
#Unlist change the list to a vector
unlist(sum_loop3)
x <- seq(5,1000, length.out=30) #generating 30 numbers from 5 to 1000
y <- seq(1,150,by=5) #generating 30 numbers from 1 to 150 with contants increments of 5
sum_loop4<- c() #another way of saving each iterative as a list
i=1
while(i<= length(x)){
sum_loop4[[i]]<- x[i]+y[i] # you can also use the function: sum(x[i],y[i]) instead of x[i]+y[i]
i=i+1 #without adding this line, you will get an indefinite or unending loop since there won't be any iterations.
}
print(sum_loop4)
options(repr.plot.width=8, repr.plot.height=8,repr.plot.res = 300) #Setting plot size
Introduction to simulating probability distributions
Simulating 5000 samples from binomial, Poisson, uniform, normal, lognormal and exponential distributions.
rpois(n=100,lambda=50)
set.seed(1)
rpois(n=100,lambda=50)
set.seed(123) #for reproducibility
names_prob_distn<-c("binomial","poisson","uniform","exponential","normal","lognormal")
prob_distn=NULL #list( )# c()
prob_distn[[1]]<-rbinom(n=5000,size=30,prob=0.5) #binomial sample
prob_distn[[2]]<-rpois(n=5000, lambda=2)#poisson_sample
prob_distn[[3]]<-runif(n=5000,min=0, max=5)#uniform sample
prob_distn[[4]]<-rexp(n=5000, rate=2)#exponential sample
prob_distn[[5]]<-rnorm(n=5000, mean = 0, sd = 1)#normal sample
prob_distn[[6]]<-rlnorm(n=5000, meanlog = 0, sdlog = 1)#lognormal sample
seq_along(names_prob_distn)
1:length(names_prob_distn)
par(mfrow=c(3,2),mar=c(4,4,1,0))
colours<- c("red","blue","green","yellow","violet","brown")
for(j in seq_along(names_prob_distn)){
# you can replace col=colours[j] by just col=j (each colour in R has a unique index)
#there are other packages for customized colours like package "RColorBrewer"
hist(prob_distn[[j]],col=colours[j],main=paste(names_prob_distn[j]),xlab="Samples",ylab="Frequency")
}
#trying n=1000 and increase n to 100000 (returns an elliptical shape for bivariate normality)
set.seed(1)
normal_sample1<- rnorm(n=100000, mean = 2, sd = 1)
normal_sample2<- rnorm(n=100000, mean = 3, sd = 1)
plot(normal_sample1,normal_sample2,col="red",type="p",xlab="Normal samples 1",ylab="Normal sample 2")
The apply functions are:
lapply(): lapply can be used for looping as well more efficiently as lists
sapply(): sapply is an efficient form of lapply() by storing list as vectors & matrices
tapply(): tapply() computes a measure (mean, median, min, max, etc..) or a function for each factor variable in a vector
apply(): apply() computes a measure (mean, median, min, max, etc..) or a function for numerical variables across either each row or each column of a dataframe
parlapply() for windows machines or mclapply() for linux machines : These parlapply() or mclapply() is used for parallelizing codes to run on multiple cores/processors of your computer to increase computational speed for much complex problems; but mclapply() is much powerful.
Recall the previous loop below for instance
#Recall the previous loop below
x <- seq(5,1000, length.out=30) #generating 30 numbers from 5 to 1000
y <- seq(1,150,by=5) #generating 30 numbers from 1 to 150 with contants increments of 5
sum_loop2<- rep(NA,length=length(x)) # it was previously: rep("Clement",length=length(x))
for(i in seq_along(x)){ #NB: 1:length(x) is similar to seq_along(x). However, seq_along() function is preferable.
sum_loop2[i]<- sum(x[i],y[i]) #x[i]+y[i]
}
print(sum_loop2)
Using lapply for the same tasks above
#It saves results as a list in a much simpler manner
sum_loop5<- lapply(seq_along(x),function(k) sum(x[k],y[k])) #Or: lapply(seq_along(x),function(i) x[i]+y[i])
sum_loop5
Using sapply for the same tasks above
#Sapply saved results as a vector instead of a list
sum_loop6<- sapply(seq_along(x),function(i) x[i]+y[i] )
sum_loop6
For for loop
sample_sizes<- c(10000,300000,5000000,9000000,90000000)
normal_samples1<- NULL #to save results as list for each sample size
set.seed(1) #for reproducibility
#For for loop
start_time<- proc.time()
for(k in seq_along(sample_sizes)) normal_samples1[[k]]<- rnorm(n=sample_sizes[k],mean=5, sd=2)
end_time<- proc.time()-start_time
print(end_time)
print(normal_samples1[[1]][1:100]) #to view first 100 normal samples at first sample size
#to view normal samples at first sample size
#print(normal_samples1[[1]])
#to view normal samples at third sample size
# print(normal_samples1[[3]])
For lapply
#For lapply
start_time<- proc.time()
normal_samples2<- lapply(seq_along(sample_sizes), function(k) rnorm(n=sample_sizes[k],mean=5, sd=2))
end_time<- proc.time()-start_time
print(end_time)
print(normal_samples2[[1]][1:100]) #to view first 100 normal samples at first sample size
#to view normal samples at first sample size
#print(normal_samples2[[1]])
#to view normal samples at third sample size
# print(normal_samples2[[3]])
print(normal_samples2[[3]][1:100]) #to view first 100 normal samples at 3rd sample size
For sapply
#For sapply
start_time<- proc.time()
normal_sample3<- sapply(seq_along(sample_sizes), function(k) rnorm(n=sample_sizes[k],mean=5, sd=2))
end_time<- proc.time()-start_time
print(end_time)
print(normal_sample3[[1]][1:100]) #to view first 100 normal samples at first sample size
#to view normal samples at first sample size
#print(normal_sample3[[1]])
#to view normal samples at third sample size
# print(normal_sample3[[3]])
For parLapply and mclappy functions (parallelizing the code to run on multiple cores)
#install.packages("parallel")
library(parallel)
#To detect the number of processors or cores of your computer
numCores<- detectCores()
numCores
NB: mclapply() function works on Linux machines for any number of detected cores$\geq 1$
However, for windows machines, mclapply() function only works when the number of detected cores is strictly set at 1.
#For mclapply (run this on only linux machines for number of cores>1)
start_time<- proc.time()
normal_samples4<- mclapply(seq_along(sample_sizes), function(k) rnorm(n=sample_sizes[k],mean=5, sd=2),mc.cores=numCores)
end_time<- proc.time()-start_time
print(end_time)
#For mclapply (for windows, it only works with one core)
start_time<- proc.time()
normal_samples4<- mclapply(seq_along(sample_sizes), function(k) rnorm(n=sample_sizes[k],mean=5, sd=2),mc.cores=1)
end_time<- proc.time()-start_time
print(end_time)
print(normal_samples4[[1]][1:100]) #to view first 100 normal samples at first sample size
#to view normal samples at first sample size
#print(normal_samples4[[1]])
#to view normal samples at third sample size
# print(normal_samples4[[3]])
NB: parLapply() function works on windows machines for any number of detected cores$\geq 1$
However, for linux machines, use mclapply(). mclapply() on linux machines is generally effective than parLapply() on windows.
NB: We also have parSapply()
#For parlapply, first makeCluster
clusters <- makeCluster(numCores)
clusters
sample_sizes<- c(10000,300000,5000000,9000000,90000000)
#For parLapply (for windows)
index<- 1:length(sample_sizes)
#Defining global variables
clusterExport(cl=clusters, varlist=c("index","sample_sizes"), envir=environment())
start_time<- proc.time()
normal_samples5<- parLapply(cl = clusters, index, function(x) rnorm(n=sample_sizes[x],mean=5, sd=2))
end_time<- proc.time()-start_time
print(end_time)
#You need to stop cluster after running parLapply()
stopCluster(clusters)
print(normal_samples5[[1]][1:100]) #to view first 100 normal samples at first sample size
#to view normal samples at first sample size
#print(normal_samples4[[1]])
#to view normal samples at third sample size
# print(normal_samples4[[3]])
setwd("C:/Users/user/Desktop/DataAnalysis_results_R/NUGSChina_R_class")
#Importing CSV data
MurderRates<-read.csv("MurderRates_data.csv")
head(MurderRates,n=10) # view first 10 rows
#Check the variables names of the data
names(MurderRates)
MurderRates[1:3,1:3 ]
NB: Setting MARGIN=2 evaluate the function for each column or variable
NB: Setting MARGIN=1 evaluate the function for each row of the data
#Setting MARGIN=2 evaluate the function for each column or variable
apply(X=MurderRates[ ,c(1,2,3,4,5,6,7)],MARGIN=2,FUN=mean)
#Setting MARGIN=2 evaluate the function for each column or variable
print(apply(MurderRates[,c(1,2,3,4,5,6,7)],MARGIN=2,mean))
#Setting MARGIN=1 evaluate the function for each row of the data
print(apply(MurderRates[,c(1,2,3,4,5,6)],MARGIN=1,mean))
Creating a function to return the mean and variance of a variable
Mean_variance<-function(x){
mean_value<- mean(x)
variance_value<- var(x)
return(list(mean=mean_value,variance=variance_value))
}
results_mean_var<- apply(X=MurderRates[,c(1,2,3,4,5,6)],MARGIN=2,FUN=Mean_variance)
print(results_mean_var)
#extracting results for the variable rate
results_mean_var$lfp
#extracting only mean results for the variable rate
results_mean_var$rate$mean
names(MurderRates)
?tapply #to check the arguments of the in-built function tapply
print(tapply(X=MurderRates$rate,INDEX=MurderRates$southern,FUN=mean))
tapply(X=MurderRates$rate,INDEX=MurderRates$southern,FUN=Mean_variance)
names(MurderRates)
results_numericVariables<- lapply(1:6,function(variable_position) tapply(X=MurderRates[,variable_position],
INDEX=MurderRates$southern,FUN=Mean_variance))
index=1
names(MurderRates)[index]
results_numericVariables[[index]]
index=2
names(MurderRates)[index]
results_numericVariables[[index]]