Below is the outline of today's class
Solving a few selected questions in the last week's assignment tasks
a) Extract multiples of 12 from 10 to 100000 (save results as a vector & list respectively) using i) for loop and ii) while loop
b) Create a Monte Carlo Simulation to describe the outcome of pregnancy for 100 women at birth (number of runs=100) who can either give birth to a male or female. Note that male and female babies are equally likely to be born (probability of 0.5 respectively). Record the total number of woman who gave birth to males and female respectively out of the 100 women from the Monte Carlo Simulation. Using a barplot, plot the proportion of male and female borns. Increase the number of women to 1000, and compare the findings with that of the 100 women.
Simulating samples from probability distributions (exponential, normal, lognormal, Poisson, uniform and binomial distributions).
Prove the Central Limit theorem of at least the mean based on simulated samples from different probability distributions.
Use Bootstrapping & Jackknife estimation techniques to estimate and bound unknown parameters such as population means, population proportions, etc.
Save results as a vector & list respectively using:
i) for loop and ii) while loop
Using For Loop to find all multiples of 12 from 10 to 100000
# Q1a) Extract multiples of 12 from 10 to 100000 and storing results as a vector
#Using For loop
n <- 10:100000 #seq(10,100000,by=1)
multiple_12<-rep(NA,length=length(n)) #creating a vector of NA to store results
for (i in seq_along(n)) {# for(i in 1:length(n))
if(n[i]%%12 == 0) {
multiple_12[i]<- n[i]
}
}
#multiple_12 #This willcontain some NA
multiples_of_12<-na.omit(multiple_12) #stores all multiples of 12 eliminating NAs
print(paste("Total multiples of 12=",length(multiples_of_12))) #total multiples of 12
print(multiples_of_12[1:100])# printing first 100 multiple of 12.
# Q1b) Extract multiples of 12 from 10 to 100000 and storing results as a list
#Using For loop
n <- 10:100000
multiple_12<-list()# c() #creating a empty list to store results
for (i in seq_along(n)) {
if(n[i]%%12 == 0) {
multiple_12[[i]]<- n[i]
}
}
multiples_of_12<-unlist(multiple_12) #unlisting to create a vector of only multiples of 12
print(paste("Total multiples of 12=",length(multiples_of_12))) #total multiples of 12
print(multiples_of_12[1:100]) # printing first 100 multiple of 12.
Using While Loop to find all multiples of 12 from 10 to 100000
# Q1 c) Extract multiples of 12 from 10 to 100000 and storing results as a list
#Using For While
multiple_12<- list() #creating a empty list to store results
#multiple_12=rep(NA, length(n))
last_number=100000
n <- 10
while (n<=last_number){
if(n%%12 == 0) {
multiple_12[[n]]<- n
}
n=n+1
}
#multiple_12 #This willcontain some NA
multiples_of_12<-unlist(multiple_12) #stores all multiples of 12 eliminating NAs
print(paste("Total multiples of 12=",length(multiples_of_12))) #total multiples of 12
print(multiples_of_12[1:100]) # printing first 100 multiple of 12.
Create a Monte Carlo Simulation to describe the outcome of pregnancy for 100 women at birth (number of runs=100) who can either give birth to a male or female. Note that male and female babies are equally likely to be born (probability of 0.5 respectively). Record the total number of woman who gave birth to males and female respectively out of the 100 women from the Monte Carlo Simulation. Using a barplot, plot the proportion of male and female borns. Increase the number of women to 1000, and compare the findings with that of the 100 women.
Outcome of birth for a single Mother/birth
#Create a Monte Carlo Simulation to describe the outcome of pregnancy
#for a single mother at birth (number of runs=1) who can either give birth to
#a male or female. Note that male and female babies are equally
#likely to be born (probability of 0.5 respectively)
count=rep(0,2) # or numeric(2) #to stores the total of the two outcomes (male or female borns)
rand=runif(n=1,0,1) #a random variable to decide which type of birth occurs ( for a single woman)
for (i in rand){
if (i>=0 & i<=1/2){ #or just if(i<=1/2)
count[1]=count[1]+1
print("Boy")
} else if(i>1/2 & i<=1){ #or just else{}
count[2]=count[2]+1
print("Girl")
}
}
Outcome of birth for 100 Mothers/births
#Create a Monte Carlo Simulation to describe the outcome of pregnancy
#for 100 women at birth (number of runs=100) who can either give birth to
#a male or female. Note that male and female babies are equally
#likely to be born (probability of 0.5 respectively)
count=rep(0,2) # or numeric(2) #to stores the total of the two outcomes (male or female borns)
rand=runif(n=100,0,1) #a random variable to decide which type of birth occurs (100 women)
for (i in rand){
if (i>=0 & i<=1/2){ #or just if(i<=1/2)
count[1]=count[1]+1
#print("Boy")
} else if(i>1/2 & i<=1){ #or just else{}
count[2]=count[2]+1
#print("Girl")
}
}
#Printing total number of male and female borns
type_born<- c("Boy","Girl")
for (i in seq_along(count)) {
print(paste(type_born[i],"", "=",count[i]))
}
Outcome of birth for 1000 Mothers/births
#Create a Monte Carlo Simulation to describe the outcome of pregnancy
#for 1000 women at birth (number of runs=100) who can either give birth to
#a male or female. Note that male and female babies are equally
#likely to be born (probability of 0.5 respectively)
count=rep(0,2) # or numeric(2) #to stores the total of the two outcomes (male or female borns)
#set.seed(1)
rand=runif(n=1000,0,1) #a random variable to decide which type of birth occurs (1000 women)
for (i in rand){
if (i>=0 && i<=1/2){ #or just if(i<=1/2)
count[1]=count[1]+1
#print("Boy")
} else if(i>1/2 && i<=1){ #or just else{}
count[2]=count[2]+1
#print("Girl")
}
}
#Printing total number of male and female borns
type_born<- c("Boy","Girl")
for (i in seq_along(count)) {
print(paste(type_born[i],"", "=",count[i]))
}
options(repr.plot.width=8, repr.plot.height=8,repr.plot.res = 300) #Setting plot size & resolution
total_women=1000
barplot(count/total_women,col="blue",xlab="Sex of baby",horiz = FALSE,
names.arg =c("Boy","Girl"))
text(.7, 0.45, paste(round((count[1]/total_women*100),3),"%", "(","n=",count[1],")"),col="black") #pasting percentage of male borns
text(2, 0.4, paste(round((count[2]/total_women*100),3),"%","(","n=",count[2],")"),col="black")#pasting percentage of female borns
The Central Limit Theorem states that the sampling distribution of the sample means approaches a normal distribution as the sample size gets larger — no matter what the shape of the population distribution or distribution of the population. This fact holds especially true for sample sizes at least 30.
Simulating samples from probability distributions for CLT
Exponential, normal, lognormal, Poisson, uniform and binomial distributions.
set.seed(123) #for reproducibility
names_prob_distn<-c("binomial","poisson","uniform","exponential","normal","lognormal")
prob_distn=NULL #list( )# c()
prob_distn[[1]]<-rbinom(n=1000,size=10,prob=0.5) #binomial sample
prob_distn[[2]]<-rpois(n=1000, lambda=1)#poisson_sample
prob_distn[[3]]<-runif(n=1000,min=0, max=1)#uniform sample
prob_distn[[4]]<-rexp(n=1000, rate=1)#exponential sample
prob_distn[[5]]<-rnorm(n=1000, mean = 0, sd = 1)#normal sample
prob_distn[[6]]<-rlnorm(n=1000, meanlog = 0, sdlog = 1)#lognormal sample
par(mfrow=c(3,2),mar=c(4,4,1,0))
for(i in seq_along(names_prob_distn)){
hist(prob_distn[[i]],col=i,main=paste(names_prob_distn[i]),xlab=paste(names_prob_distn[i],"","Samples"))
}
For a large number of times (100 thousand replicates each)
#Simulating 100000 different samples of size 30 from different population distribution respectively
names_prob_distn<-c("exponential","poisson","uniform","normal","lognormal","binomial")
#create 6 lists as objects for each distribution
probability_samples=vector("list",length=length(names_prob_distn))
for(i in seq_along(names_prob_distn)){ #creating list for each list
probability_samples[[i]]=list()
}
nsim=100000
for(k in 1:nsim){ #100000 replicates/repititions for each probability distribution
probability_samples[[1]][[k]]<- rexp(n=30, rate=1) # 30 exponential samples with mean=1
probability_samples[[2]][[k]]<- rpois(n=30, lambda=1)#poisson_sample
probability_samples[[3]][[k]]<- runif(n=30,min=0, max=1)#uniform sample
probability_samples[[4]][[k]]<- rnorm(n=30, mean = 0, sd = 1)#normal sample
probability_samples[[5]][[k]]<- rlnorm(n=30, meanlog = 0, sdlog = 1)#lognormal sample
probability_samples[[6]][[k]]<- rbinom(n=30,size=10,prob=0.5) #binomial sample
}
probability_samples[[1]][[1]]#view first exponential sample
probability_samples[[1]][[2]]#view second exponential sample
Finding mean for each exponential sample
#create 6 lists as objects for means of each distribution
means_distribution=vector("list",length=length(names_prob_distn))
#Create vector of 0 of size 10000 for each list
for(i in seq_along(names_prob_distn)){
means_distribution[[i]]=numeric(nsim)
}
for(i in seq_along(names_prob_distn)){
for(k in 1:nsim){
means_distribution[[i]][k]<- mean(probability_samples[[i]][[k]]) # 30 exponential samples with mean=1
}
}
Plotting the sampling distribution of means for each probability distribution
par(mfrow=c(3,2),mar=c(4,4,1,0))
for(i in seq_along(names_prob_distn)){
hist(means_distribution[[i]],col=i,main=paste(names_prob_distn[i]),xlab="Sampling distribution of means")
}
sample=c(2,9,7,10)
sample
sort(sample)
par(mfrow=c(3,2),mar=c(4,4,1,0))
for(i in seq_along(names_prob_distn)){
qqnorm(means_distribution[[i]], col=i, pch = 1, frame = FALSE,main=paste("Normal Q-Q plot:",names_prob_distn[i]))
qqline(means_distribution[[i]], col = "steelblue", lwd = 3)
}
Shapiro-Wilk test (parameteric test for large samples at most 5000: recommended)
Anderson-Darling test (non-parameteric test)
Lilliefors test
Kolmogorov-Smirnov test (non-parameteric test for large samples more than 5000). Cannot consider ties in the data.
Shapiro-Francia test (parameteric test)
Skewness-Kurtosis test (non-parameteric test)
Jarque–Bera test (for time series data)
The hypothesis for the normality test
H0: The data is normal
H1: The data is not normal
NB: The three common procedures in assessing whether a random sample of independent observations of size n come from a population with a normal distribution are: graphical methods (histograms, boxplots, Q-Q-plots), numerical methods (skewness and kurtosis indices) and formal normality tests.
Use Bootstrapping & Jackknife estimation techniques or resampling method to estimate and bound unknown parameters such as population means, population proportions, etc.
NB: Bootstrap samples are simple samples obtained with replacement
Simulating a 100 samples or data from exponential distribution with parameter=2 to represent your population from which bootstrap samples will be obtained
### Finding the bootstrap and Jackknife means of samples from exponential distribution###
#Simulating a 100 samples or data from exponential distribution for instance
set.seed(1)#setting seed number for reproducibility
Exponential_samples<- rexp(n=100,rate=2)
print(Exponential_samples)
mean(Exponential_samples)
x=c(1,4,7,3,8,0)
x
sample(x,size=length(x),replace=TRUE)
Bootstrap_samples=list()
for(i in 1:10000) Bootstrap_samples[[i]]<- sample(Exponential_samples,size=100, replace=TRUE)
Bootstrap_samples[[1]][1:10] #view first 10 data point of 1st bootstrap sample
Bootstrap_samples[[2]][1:10] ##view first 10 data point of 2nd bootstrap sample
Finding means of each Bootstrap samples
means_bootstrap_samples<- numeric(10000)
for (i in seq_along(means_bootstrap_samples)) means_bootstrap_samples[i]<- mean(Bootstrap_samples[[i]])
NB: We are expecting a symmetrical distribution of the means for the Bootstrappping technique to be accurate or good.
hist(means_bootstrap_samples,col="blue",main="Sampling distribution of means from the Bootstrap resampling",xlab="Means",
ylim=c(0,2000))
abline(v=mean(means_bootstrap_samples),lwd=3,col="red",lty=2)
text(0.58,1800,paste("True mean=",1/2),col="black")
text(0.58,1740,paste("Bootstrap mean=",round(mean(means_bootstrap_samples),4)),col="red")
text(0.61,1690,paste("Average of the actual data=",round(mean(Exponential_samples),4)),col="green")
The bootstrap mean is the overall mean across all the means from the bootstrap samples
paste("The bootstrap mean=",mean(means_bootstrap_samples))
prediction_interval_95=quantile(means_bootstrap_samples, c(0.025,0.975))
paste("The 95% prediction interval:","[" ,prediction_interval_95[1],";",prediction_interval_95[2],"]" )
paste("Bias of bootstrap estimate=",mean(means_bootstrap_samples)-0.5)
leave_one_out_means=numeric(length(Exponential_samples))
for(i in seq_along(Exponential_samples)){
leave_one_out_means[i]<- mean(Exponential_samples[-i])
}
Jackknife_estimate<- mean(leave_one_out_means)
paste("The Jackknife mean=",Jackknife_estimate)
prediction_interval_95=quantile(leave_one_out_means, c(0.025,0.975))
paste("The 95% prediction interval:","[" ,prediction_interval_95[1],";",prediction_interval_95[2],"]" )
paste("Bias of jackknife estimate=",Jackknife_estimate-0.5)
hist(leave_one_out_means,main="Sampling distribution of means from Jackknife resampling",xlab="means",col="blue")
abline(v=Jackknife_estimate,lwd=3,col="yellow",lty=2)
text(0.505,55,paste("True mean=",1/2),col="black")
text(0.505,50,paste("Jackknife mean=",round(Jackknife_estimate,4)),col="yellow")
text(0.505,45,paste("Bootstrap mean=",round(mean(means_bootstrap_samples),4)),col="red")
text(0.505,40,paste("Average of the actual data=",round(mean(Exponential_samples),4)),col="green")