Session: Three
Date: 19/03/2021
Intructor: Clement Twumasi
Website: https://twumasiclement.wixsite.com/website
YouTube channel: https://www.youtube.com/channel/UCxrpjLYi_Akmbc2QICDvKaQ
Why do you need to create your own functions and why data visualization & analysis in R
Create a complex function to compute descriptive summaries of all variables of data with appropriate graphs.
Perform correlation analysis based on all the numerical variables with informative correlation matrix plots.
To download in-built data in R for use or practice, click below:
https://vincentarelbundock.github.io/Rdatasets/datasets.html
Basic syntax of functions in R
function_name<- function(arguments) {
perform task 1
perform task 2 etc.
.......
return (expected results)
}
NB:* A function can take as many arguments as you wish and you can return as many output as you want.
Let's create our own function to find the mean $\bar{x}$ and variance $\sigma^{2}(x)$ of a variable
#Let's create any vector to find its mean
set.seed(1) #for reproducibity
data<- rnorm(n=500, mean=300,sd=50) # simulating 500 normal data/random samples with mean 300 and sd=50
#method 1
my.mean1<- function(x){
n=length(x) #the number of elements/observations
mean_value<- sum(x)/n #computing the mean
return(mean_value) # returning the output of interest
}
my.mean1(x=data) #find the mean using function my.mean1
#In-built mean function in R
mean(x=data)
#method 2
my.mean2<- function(x){
n=length(x) #the number of elements/observations
mean_value<- sum(x)/n #computing the mean
mean_value # returning the output of interest without bring return()
}
my.mean2(x=data) #find the mean using function my.mean1
#method 2
my.mean2<- function(x){
n=length(x); mean_value<- sum(x)/n
mean_value
}
my.mean2(x=data) #find the mean using function my.mean1
##Never forget to return your results
#Try running this (No results will come out since it doesn't know what to return)
#method 2
my.mean<- function(x){
n=length(x) #the number of elements/observations
mean_value<- sum(x)/n #computing the mean
}
my.mean(x=data) #find the mean using function my.mean1
#method 3: for such a simple function you can create it like this
my.mean3<- function(x){
a=sum(x)/length(x)
return(a)
}
my.mean3(x=data) # run: my.mean2(data)
#method 4: for such a simple function you can create it like this
my.mean4<- function(x) return(sum(x)/length(x))
my.mean4(x=data) # run: my.mean4(data)
Formula 1:
$$\sigma_{1}^{2}(x)=\frac{\sum \limits_{i=1}^{n}(x_i-\bar{x})^2}{n-1} \qquad \text{and} \qquad \sigma_{1}(x)=\sqrt{\frac{\sum \limits_{i=1}^{n}(x_i-\bar{x})^2}{n-1} } $$OR
Formula 2:
$$\sigma_{2}^{2}(x)=\frac{\sum \limits_{i=1}^{n}x_i^{2}-\frac{\left(\sum \limits_{i=1}^nX_i \right)^2}{n}}{n-1} \qquad \text{and} \qquad \sigma_{2}(x)= \sqrt{\frac{\sum \limits_{i=1}^{n}x_i^{2}-\frac{\left(\sum \limits_{i=1}^nX_i \right)^2}{n}}{n-1}} $$Creating a function to return your own mean as well as variance & standard deviation given by formula 1
Formula 1:
$$\sigma_{1}^{2}(x)=\frac{\sum \limits_{i=1}^{n}(x_i-\bar{x})^2}{n-1} \qquad \text{and} \qquad \sigma_{1}(x)=\sqrt{\frac{\sum \limits_{i=1}^{n}(x_i-\bar{x})^2}{n-1} } $$x=5
2*x
sqrt(4)
summary_func1= function(x){
n=length(x)
xbar<- my.mean1(x) #compute mean with our own mean function my.mean1()
sigma_squared<- sum((x-xbar)^2)/(n-1) # compute the variance given by formula 1
sigma<- sqrt(sigma_squared) # compute the standard deviation given by formula 1
dataframe1<-data.frame(mean=xbar, variance=sigma_squared,std_deviation=sigma)
return(list(mean=xbar, variance=sigma_squared,std_deviation=sigma,df= dataframe1)) #return mean, variance and std dev as a list
#return(data.frame(mean=xbar, variance=sigma_squared,std_deviation=sigma,df= dataframe1)) #return mean, variance and std dev as a dataframe
}
#Finding the mean, variance and std deviation using our own function summary_func1
summary_func1(x=data)
results1<-summary_func1(x=data)
results1$mean
results1<-summary_func1(x=data)
results1$mean #extract only the mean from the list of output for example
Creating a function to return your own mean as well as variance & standard deviation given by formula 2
Formula 2:
$$\sigma_{2}^{2}(x)=\frac{\sum \limits_{i=1}^{n}x_i^{2}-\frac{\left(\sum \limits_{i=1}^nX_i \right)^2}{n}}{n-1} \qquad \text{and} \qquad \sigma_{2}(x)= \sqrt{\frac{\sum \limits_{i=1}^{n}x_i^{2}-\frac{\left(\sum \limits_{i=1}^nX_i \right)^2}{n}}{n-1}} $$summary_func2= function(x){
n=length(x)
xbar<- my.mean1(x) #compute mean with our own mean function my.mean1()
sigma_squared<-(sum(x^2)-(sum(x)^2 /n))/ (n-1) # compute the variance given by formula 2
sigma<- sqrt(sigma_squared) # compute the standard deviation given by formula 2
#return(list(mean=xbar, variance=sigma_squared,std_deviation=sigma)) #return mean, variance and std dev as a list
return(data.frame(mean=xbar, variance=sigma_squared,std_deviation=sigma)) #return mean, variance and std dev as a dataframe
}
#Finding the mean, variance and std deviation using our own function summary_func2
summary_func2(x=data)
x=rnorm(800,mean=50,sd=10)
y=rexp(800,rate=50)
cor(x,y)
Create your own function to find the Pearson correlation coefficient ($r_{xy}$) between two variable $x$ and $y$ and compare with the in-built cor() function in R
where
$$ r_{xy}= \frac{n \sum_{i=1}^{n} x_iy_i -\sum_{i=1}^{n} x_i \sum y_i}{\sqrt{ \left[n \sum_{i=1}^{n} x_{i}^{2}- \left(\sum_{i=1}^{n} x_i \right)^2 \right] \times \left[n \sum_{i=1}^{n} y_{i}^{2}- \left(\sum_{i=1}^{n} y_i \right)^2 \right]} } $$where $x$ & $y$ are the two variables, and $n$=sample size.
Specific tasks:
Set a seed number at 123 for reproducibility using:
set.seed(123); x<- rexp(n=800, rate=50)
set.seed(3); y<- rnorm(n=800, mean=50, sd=10)
Generate 100 numbers between 1 to 10000 [recall: seq(1,10000,length.out=100)]
Obtain a combined plot of the four functions and choose different colours for each.
x<- seq(1,10000,length.out=100)
set.seed(1)
y<-rexp(1000)
f_1=function(x) 1/x
par(mfrow=c(2,2))
plot(f_1(x=y),type="l",col="red",xlab="",ylab="sample")
plot(x^2+3*x,type="l")
Import MurderRate CSV data from your working directory
Description of data to be used:
Add a new categorical variable called income level with levels Low level if income is less than 1.5 (in thousand), Medium level if income is from 1.5 but less than 2 (in thousand) and High level if greater or equal to 2 (in thousand).
Compute descriptive summaries of all the variables with appropriate graphs
Perform correlation analysis based on all the numerial varibles with informative correlation matrix plots
#Intalling our first R package (NB: You will need an internet to install packages only)
#install.packages("moments") #run this to install the package called "moments"
library("moments") # load the package to be able to calculate skewness and kurtosis only
library("RColorBrewer")
library("PerformanceAnalytics") #for correlation matrix plot
#setting working directory
setwd("C:/Users/user/Desktop/DataAnalysis_results_R/NUGSChina_R_class")
#Setting plot size for large view and specific resolution
options(repr.plot.width=8, repr.plot.height=8,repr.plot.res = 300)
Data_murderRates<-read.csv(file="MurderRates_data.csv")
head(Data_murderRates, n=10) #to view firs t 10 rows of the data
Creating the income level variable
min(Data_murderRates$income)
max(Data_murderRates$income)
thresholds<- c(0, 1.5, 2, 3)
Data_murderRates$income_levels= cut(Data_murderRates$income,breaks=thresholds, right=FALSE,labels=c("Low","Medium","High"))
#View updated data
head(Data_murderRates, n=10) #to view first 6 rows of the data
The summary statistics function should first:
Function to compute mode
# Creating the function to compute mode in R
getmode <- function(x) {
uniq_x <- unique(x)
return(uniq_x[which.max(tabulate(match(x, uniq_x)))])
}
x<-c(2,5,8,10)
var(x)
mode(x)
#Creating a function to estimate summary statistics of the data
#by determining the type of variable
#If it's numeric find & return mean, median, mode standard deviation, standard error,
#skewness, kurtosis and 95% quantile recorded to 2 decimal places
#But if its categorical find & return percentages for all categories/levels
#(in 1 decimal place) and the name of the categories as a dataframe
Summary_stats<-function(data, variable_index){
Variable_name<-names(data)[variable_index]
Variable<-(data)[,variable_index]
if(is.numeric(Variable)==TRUE){ #if variable is numeric/quantitative
#compute mean, median, standard deviation, standard error,
#skewness and kurtosis
mean_value<-mean(Variable) #compute mean
median_value<-median(Variable) #compute median
modal_value<-getmode(Variable)
std<-sd(Variable) #compute standard deviation
standard_error<-std/sqrt(length(Variable)) #compute standard error
skewness<-skewness(Variable) #compute skewness
kurtosis<-kurtosis(Variable) #compute kurtosis
quantile_95percent<- as.vector(quantile(Variable,c(0.025,0.975))) #compute 95% quantile
graph<-hist(Variable,xlab=paste(Variable_name),col=variable_index, main="")
#returns the mean, median, standard deviation, standard error,skewness and kurtosis
numerical_summaries<- data.frame(mean=round(mean_value,2),median=round(median_value,2),mode=modal_value,std=round(std,2),SE=round(standard_error,2),
skewness=round(skewness,2),kurtosis=round(kurtosis,2),quantile_95percent_lower=round(quantile_95percent,2)[1],
quantile_95percent_upper=round(quantile_95percent,2)[2] )
return(list(Variable_name=Variable_name,numerical_summaries=numerical_summaries ,histogram=graph$histogram))
} else if(is.factor(Variable)==TRUE){ #else if categorical
#compute the percentages rounded in 1 decimal place
percentage<-paste(round((table(Variable)/dim(data)[1])*
100,1),"%")
levels_variable<-levels(Variable)
output<-data.frame(Categories=levels_variable,percentage=percentage)#storing output as dataframe
#Plotting the pie chart for the categorical variable
Percentage_values<- round((table(Variable)/dim(data)[1])*100,1)
labels_variables <- paste(levels(Variable),":", Percentage_values) # add percents to labels
labels_variables <- paste( labels_variables,"%",sep="") # ad % to labels
#Deciding how many colours to choose if the number of categories is < 3 or >=3 before plot
if(length(levels_variable)==2){
#colours_two_categories<- c("red","blue")
colours_two_categories<- rainbow(n=2)
pie_chart<- pie(x=Percentage_values, labels =labels_variables,radius =.7,cex=0.71,main="",
col =colours_two_categories,font=2,clockwise = TRUE,init.angle=90)
} else if(length(levels_variable)>=3){
#colours_categories<-brewer.pal(n = length(Percentage_values), name = "Paired")
colours_categories<-rainbow(n =length(Percentage_values))
pie_chart<- pie(x=Percentage_values, labels =labels_variables,radius =.7,cex=0.71,main="",
col =colours_categories,font=2,clockwise = TRUE,init.angle=90)
}
#return variable name and a dataframe of percentages for each category
return(list(Variable_name=Variable_name,output=output, pie_chart= pie_chart))
}
}
Summary_stats(data=Data_murderRates, variable_index=1)
names(Data_murderRates)
Summary_stats(data=Data_murderRates, variable_index=5)
Summary_stats(data=Data_murderRates, variable_index=8)
A function to extract all quantative variables of your data
Quantitative_variabes<-function(data){
Variable_name=list()
for (variable_index in seq_along(names(data))){
Variable<-(data)[,variable_index]
if(is.numeric(Variable)==TRUE){ #if variable is numeric/quantitative
Variable_name[[variable_index]]<-names(data)[variable_index]
}
}
return(unlist(Variable_name))
}
Quantitative_variabes(data=Data_murderRates)
A function to extract all categorical variables of your data
Categorical_variabes<-function(data){
Variable_name=list()
for (variable_index in seq_along(names(data))){
Variable<-(data)[,variable_index]
if(is.numeric(Variable)==FALSE){ #if variable is numeric/quantitative
Variable_name[[variable_index]]<-names(data)[variable_index]
}
}
return(unlist(Variable_name))
}
Categorical_variabes(data=Data_murderRates)
From PerformanceAnalytics R package
Data_numeric=Data_murderRates[ ,Quantitative_variabes(data=Data_murderRates)]
#colnames(Data_numeric)=c("rate","convictions","executions","time","income","labour fp","Prop NC")
#method = c("pearson", "kendall", "spearman")
#options(warn=-1) #ignore warnings for instance with running non-parametric correlation due to ties
chart.Correlation(Data_numeric, histogram=TRUE, pch=19,method = c("pearson"))