To download in-built data in R for use, click below:
https://vincentarelbundock.github.io/Rdatasets/datasets.html
Multiple (Linear) Regression
NB: The goal is to find out the Determinants of Murder Rates in the United States based on existing empirical data.
NB: A data frame containing 44 observations on 8 variables.
rate: Murder rate per 100,000 (FBI estimate, 1950).
convictions: Number of convictions divided by number of murders in 1950.
executions: Average number of executions during 1946–1950 divided by convictions in 1950.
time: Median time served (in months) of convicted murderers released in 1951.
income: Median family income in 1949 (in 1,000 USD).
lfp: Labor force participation rate in 1950 (in percent).
noncauc: Proportion of population that is non-Caucasian in 1950.
southern: Factor indicating region or residential status.
Add a new categorical variable called income level with levels Low level if income is less than 1.5 (in thousand), Medium level if income is from 1.5 but less than 2 (in thousand) and High level if greater or equal to 2 (in thousand).
Compute descriptive summaries of all the 8 variables with appropriate graphs
Perform correlation analysis based on all the numerical variables with informative correlation matrix plots
Find the relationship between the income level and their residential status (whether in southern area or not). NB: A Chi-square test in this case since both variables are categorical.
Fit a regression model to find out the Determinants of Murder Rates.
NB: Based on the variables: convictions, executions, time, income, lfp, noncauc and southern/residential status
setwd("C:/Users/user/Desktop/DataAnalysis_results_R/DataAnalysis_NUGSA_Shangai")
# NOTE(review): setwd() with an absolute path makes this script machine-specific;
# prefer project-relative paths so it runs on other machines.
#Installing our first R package (NB: You will need internet access to install packages only)
#install.packages("moments") #run this to install the package called "moments"
library("moments") # load the package to be able to calculate skewness and kurtosis only
library("RColorBrewer") # colour palettes (optional alternative for the pie charts below)
library("PerformanceAnalytics") # provides chart.Correlation() used later
#Setting plot size for large view and specific resolution (repr.* options are read by Jupyter's IRkernel)
options(repr.plot.width=8, repr.plot.height=8,repr.plot.res = 300)
Data_murderRates<-read.csv(file="MurderRates_data.csv")
head(Data_murderRates, n=10) #to view first 10 rows of the data
Checking for missing values
#To check whether there is any missing value anywhere in the data frame
any(is.na(Data_murderRates)) #it returned FALSE, implying no missing data
Creating the income level variable
# Inspect the range of income to choose sensible break points for the categories
min(Data_murderRates$income)
max(Data_murderRates$income)
# BUG FIX: the original called detach(Data_murderRates) here, but the data frame
# was never attach()-ed, so that call errors at runtime; it has been removed.
Data_murderRates$rate # quick look at the response variable
# Breaks (income in 1,000 USD): [0, 1.5) = Low, [1.5, 2) = Medium, [2, 3) = High
thresholds <- c(0, 1.5, 2, 3)
Data_murderRates$income_levels <- cut(Data_murderRates$income, breaks = thresholds,
                                      right = FALSE, labels = c("Low", "Medium", "High"))
#View updated data
head(Data_murderRates, n = 10) #to view first 10 rows of the data
levels(Data_murderRates$income_levels)
table(Data_murderRates$income_levels)
dim(Data_murderRates)
dim(Data_murderRates)[1] # number of observations
# Cross-tabulation of income level vs southern residence:
# counts, then proportions, then percentages, then percentages rounded to 2 d.p.
table(Data_murderRates$income_levels, Data_murderRates$southern)
table(Data_murderRates$income_levels, Data_murderRates$southern) / dim(Data_murderRates)[1]
table(Data_murderRates$income_levels, Data_murderRates$southern) / dim(Data_murderRates)[1] * 100
round(table(Data_murderRates$income_levels, Data_murderRates$southern) / dim(Data_murderRates)[1] * 100, 2)
Before building the summary statistics function, we first need a helper:
Function to compute mode
# Helper: statistical mode (most frequent value) of a vector.
# Ties are broken in favour of the value that appears first in x.
getmode <- function(x) {
  distinct_vals <- unique(x)
  counts <- tabulate(match(x, distinct_vals))
  distinct_vals[which.max(counts)]
}
getmode(Data_murderRates$income) # modal income value
table(Data_murderRates$income)   # frequency of each income value
# BUG FIX: the column is named "income_levels"; "$income_level" only worked via
# R's partial matching on data frames, which is fragile and silently breaks if
# a column named "income_level" is ever added.
class(Data_murderRates$income_levels)
Summary statistic function
#Creating a function to estimate summary statistics of the data
#by determining the type of variable
#If it's numeric find & return mean, median, mode standard deviation, standard error,
#skewness, kurtosis and 95% quantile recorded to 2 decimal places
#But if its categorical find & return percentages for all categories/levels
#(in 1 decimal place) and the name of the categories as a dataframe
Summary_stats <- function(data, variable_index){
  # Summarise one column of `data`, selected by position.
  #
  # Numeric columns: returns the variable name plus a data frame of mean,
  #   median, mode, sd, standard error, skewness, kurtosis (moments package)
  #   and the 2.5%/97.5% quantiles, all to 2 d.p., and draws a histogram.
  # Factor (or character) columns: returns the variable name plus a data frame
  #   of category percentages (1 d.p.) and draws a pie chart.
  Variable_name <- names(data)[variable_index]
  Variable <- data[, variable_index]
  if (is.character(Variable)) {
    # read.csv() no longer converts strings to factors by default (R >= 4.0),
    # so coerce here to keep the categorical branch working for such columns.
    Variable <- factor(Variable)
  }
  if (is.numeric(Variable)) { # quantitative variable
    mean_value <- mean(Variable)
    median_value <- median(Variable)
    modal_value <- getmode(Variable)
    std <- sd(Variable)
    standard_error <- std / sqrt(length(Variable))
    # renamed locals so results do not shadow moments::skewness / moments::kurtosis
    skew_value <- skewness(Variable)
    kurt_value <- kurtosis(Variable)
    quantile_95percent <- as.vector(quantile(Variable, c(0.025, 0.975)))
    graph <- hist(Variable, xlab = paste(Variable_name), col = variable_index, main = "")
    numerical_summaries <- data.frame(
      mean = round(mean_value, 2), median = round(median_value, 2),
      mode = modal_value, std = round(std, 2), SE = round(standard_error, 2),
      skewness = round(skew_value, 2), kurtosis = round(kurt_value, 2),
      quantile_95percent_lower = round(quantile_95percent[1], 2),
      quantile_95percent_upper = round(quantile_95percent[2], 2)
    )
    # BUG FIX: the original returned graph$histogram, which does not exist
    # (hist() returns an object of class "histogram"); return the object itself.
    return(list(Variable_name = Variable_name,
                numerical_summaries = numerical_summaries,
                histogram = graph))
  } else if (is.factor(Variable)) { # categorical variable
    # percentages rounded to 1 decimal place
    Percentage_values <- round((table(Variable) / nrow(data)) * 100, 1)
    percentage <- paste(Percentage_values, "%")
    levels_variable <- levels(Variable)
    output <- data.frame(Categories = levels_variable, percentage = percentage)
    # slice labels like "Low : 25%"
    labels_variables <- paste(levels_variable, ":", Percentage_values)
    labels_variables <- paste0(labels_variables, "%")
    # BUG FIX: the original only drew a pie for 2 or >= 3 categories and left
    # `pie_chart` undefined for a single-level factor (runtime error on return);
    # rainbow(n) covers every category count with the same colours as before.
    pie_chart <- NULL
    if (length(levels_variable) >= 1) {
      pie_chart <- pie(x = Percentage_values, labels = labels_variables,
                       radius = .7, cex = 0.71, main = "",
                       col = rainbow(n = length(Percentage_values)),
                       font = 2, clockwise = TRUE, init.angle = 90)
    }
    # NOTE: pie() returns NULL invisibly, so pie_chart is always NULL; the chart
    # itself is drawn as a side effect (kept for backward compatibility).
    return(list(Variable_name = Variable_name, output = output, pie_chart = pie_chart))
  }
}
# Column-by-column summaries; indices 1-8 are the 8 original variables
# (presumably rate, convictions, executions, time, income, lfp, noncauc,
# southern -- confirm against names(Data_murderRates)).
Summary_stats(data=Data_murderRates, variable_index=1)
Summary_stats(data=Data_murderRates, variable_index=2)
Summary_stats(data=Data_murderRates, variable_index=3)
Summary_stats(data=Data_murderRates, variable_index=4)
Summary_stats(data=Data_murderRates, variable_index=5)
Summary_stats(data=Data_murderRates, variable_index=6)
Summary_stats(data=Data_murderRates, variable_index=7)
Summary_stats(data=Data_murderRates, variable_index=8)
Let's take the pie chart above for income levels and save it to the working directory using code.
#SAVING PLOT AS A PNG
# Step 1: open the PNG device first
png(file = "Income_level_PieChart.png")
# BUG FIX: par() must come AFTER png() -- graphical parameters apply to the
# currently open device, so setting margins before opening the device had no effect.
par(mar = c(4, 4, 1, 1))
# Step 2: draw the plot (the pie chart is drawn as a side effect of the call;
# the $pie_chart element itself is NULL because pie() returns invisibly)
Summary_stats(data = Data_murderRates, variable_index = 9)$pie_chart
# Step 3: run dev.off() to close the device and write the file
dev.off()
Based on only the quantitative variables.
Automated function to decide on the quantitative variables
names(Data_murderRates) # all column names, including the new income_levels
Quantitative_variabes <- function(data){
  # Return the names of all numeric columns in `data`.
  # Returns NULL when no column is numeric (matching the original loop-based
  # implementation, whose unlist() of an empty list yields NULL).
  is_num <- vapply(data, is.numeric, logical(1))
  quantitative_names <- names(data)[is_num]
  if (length(quantitative_names) == 0) {
    return(NULL)
  }
  quantitative_names
}
Categorical_variabes <- function(data){
  # Collect the names of every non-numeric (categorical) column in `data`.
  # Returns NULL when all columns are numeric.
  non_numeric_names <- list()
  for (col_index in seq_along(data)) {
    if (!is.numeric(data[[col_index]])) {
      non_numeric_names[[length(non_numeric_names) + 1]] <- names(data)[col_index]
    }
  }
  unlist(non_numeric_names)
}
# List the categorical (non-numeric) column names
Categorical_variabes(data=Data_murderRates)
#printing all numerical variables in any data
Quantitative_variabes(data=Data_murderRates)
Determining the normality of all the numeric variables
shapiro.test(Data_murderRates$rate) # p = 0.0000874 < 0.001 => rate deviates from normality
shapiro.test(Data_murderRates$convictions)
#shapiro.test(Data_murderRates$rate) #normality test for only the variable: rate
# Run a Shapiro-Wilk normality test on every numeric column of `data`.
# Returns a list of htest objects (one per numeric variable) and prints an
# index -> variable-name key so results can be matched to columns.
Normality_test <- function(data){
  # BUG FIX: the original hard-coded the global Data_murderRates on this line,
  # so the function silently ignored its own `data` argument.
  Numerical_variables <- Quantitative_variabes(data = data)
  #Using lapply function instead of a for loop
  ShapiroWilk_test <- function(k) shapiro.test(data[, Numerical_variables[k]])
  results <- lapply(seq_along(Numerical_variables), ShapiroWilk_test)
  for (i in seq_along(Numerical_variables)) {
    print(paste("Index=", i, "", Numerical_variables[i]))
  }
  return(results)
}
Normality_test(data = Data_murderRates)
Correlation matrix plot
# Keep only the numeric columns for the correlation analysis
Data_numeric=Data_murderRates[,Quantitative_variabes(data=Data_murderRates)]
#colnames(Data_numeric)=c("rate","convictions","executions","time","income","labour fp","Prop NC")
#method = c("pearson", "kendall", "spearman")
#options(warn=-1) #ignore warnings for instance with running non-parametric correlation due to ties
# Scatterplot matrix with histograms and Pearson correlations (PerformanceAnalytics)
chart.Correlation(Data_numeric, histogram=TRUE, pch=19,method = c("pearson"))
Relationship between the income level and their residential status
# Chi-square test of independence between income level and southern residence
# (both categorical). NB: with only 44 observations some expected cell counts
# may be small, so R may warn that the chi-square approximation is inaccurate.
chisq.test(Data_murderRates$income_levels, Data_murderRates$southern)
#Cross tabulation between income levels and residential status
table(Data_murderRates$income_levels, Data_murderRates$southern)
#Cross tabulation between income levels and residential status (as proportions)
table(Data_murderRates$income_levels, Data_murderRates$southern)/dim( Data_murderRates)[1]
#Cross tabulation between income levels and residential status (as percentages)
table(Data_murderRates$income_levels, Data_murderRates$southern)/dim( Data_murderRates)[1]*100
#Cross tabulation between income levels and residential status (as percentages in 2 d.p)
round(table(Data_murderRates$income_levels, Data_murderRates$southern)/dim( Data_murderRates)[1]*100,
2)
Fit a regression model to find out the Determinants of Murder Rates
Based on the variables: convictions, executions, time, income, lfp, noncauc and southern/residential status
names(Data_murderRates) # check the available predictor names
# Gaussian GLM with identity link -- mathematically equivalent to the OLS fit below
Glm_gaussian= glm(rate ~ convictions+executions+time+income+lfp
+noncauc+southern, data = Data_murderRates,family="gaussian")
summary(Glm_gaussian)
# Ordinary least squares fit of the same specification
OLS_regression <- lm(rate ~ convictions+executions+time+income+lfp
+noncauc+southern, data = Data_murderRates)
summary(OLS_regression)
coef(OLS_regression) # coefficients should match the gaussian GLM above
coef(Glm_gaussian)
par(mfrow=c(2,2)) # 2x2 grid for the four standard lm diagnostic plots
plot(OLS_regression)
Taking the influential observation 28 out and refitting the OLS model
# Drop row 28 (flagged as influential in the diagnostic plots) and refit the model
Data_without_Obs_28<- Data_murderRates[-28, ]
OLS_regression_without_Obs_28 <- lm(rate ~ convictions+executions+time+income+lfp
+noncauc+southern, data = Data_without_Obs_28)
summary(OLS_regression_without_Obs_28)
coef(OLS_regression_without_Obs_28)[8] # 8th coefficient (intercept + 7 terms) -- presumably southern; verify with names(coef(...))
par(mfrow=c(2,2)) # diagnostics for the refitted model
plot(OLS_regression_without_Obs_28)
Comparing the coefficient of determination $R^2$ between the two OLS models
That's, with or without the influential observation 28
# Compare R-squared with and without the influential observation 28.
# FIX: "inflential" typo corrected in the printed messages.
paste("R-square of model with influential observation=", summary(OLS_regression)$r.squared)
paste("R-square of model without influential observation=",
      summary(OLS_regression_without_Obs_28)$r.squared)
#Regression coefficients
best_model <- OLS_regression_without_Obs_28 # model chosen after comparing R-squared values
coef(best_model)
Data_murderRates # NOTE(review): prints the entire data set -- likely leftover debugging output
names(Data_without_Obs_28)
## Binomial logistic model: which variables predict southern residence?
# BUG FIX: in the original formula `southern` appeared as both the response AND
# a predictor (southern ~ ... + southern + ...), which makes the model
# degenerate (the response perfectly predicts itself); removed from the RHS.
logitic_model <- glm(southern ~ convictions+executions+time+income_levels+lfp
+noncauc+rate, data =Data_without_Obs_28 , family = binomial)
summary(logitic_model)