Facilitator: Clement Twumasi (Postdoctoral Researcher, Oxford University Statistics Department, UK).
Date: July 1, 2022.
Personal Website: https://twumasiclement.wixsite.com/website
YouTube Channel on Advanced R programming videos: https://www.youtube.com/channel/UCxrpjLYi_Akmbc2QICDvKaQ/videos
NB: The class/type of statistical tests and models to fit are predominantly dependent on the type/nature of your data.
Link to some of the aforementioned univariate tests in R: https://rpubs.com/sujith/IS
Multivariate analysis in R: https://little-book-of-r-for-multivariate-analysis.readthedocs.io/en/latest/src/multivariateanalysis.html
http://www.sthda.com/english/wiki/manova-test-in-r-multivariate-analysis-of-variance
PCA, SVD, Correspondence Analysis and Multidimensional Scaling:
https://web.stanford.edu/class/bios221/labs/multivariate/lab_5_multivariate.html
Note some of the class of regression models available. Knowing which one is appropriate for any given data and research problem is very imperative.
Introduction to linear regression: https://www.machinelearningplus.com/machine-learning/complete-introduction-linear-regression-r/
OLS linear regression in R: http://r-statistics.co/Linear-Regression.html
Different class of linear models in R: https://www.statmethods.net/advstats/glm.html
Time series analysis in R:
https://ourcodingclub.github.io/tutorials/time/
https://www.analyticsvidhya.com/blog/2015/12/complete-tutorial-time-series-modeling/
###How to install R packages#####
#install.packages("parallel") #a package for parallel computing
#install.packages("glmnet") # a package for performing penalised or regularised regression models
#install.packages("ggplot2") #for other customised data visualisation
#install.packages("PerformanceAnalytics") #for customised correlation matrix plots
####Loading installed packages of interest#######
library("parallel")
library("glmnet")
library("ggplot2")
library("PerformanceAnalytics")
library("foreign")#for importing data such as SAS, Spss and Stata, etc.
library(haven) #for also importing data such as SAS, Spss and Stata, etc.
library("readxl") #package for loading excel data
options(repr.plot.width=8, repr.plot.height=8,repr.plot.res = 300) #Setting plot size
#setting working directory to import and/or export data
setwd("C:/Users/user/Desktop/DataAnalysis_results_R/NUGSChina_R_class2022")
#Importing CSV data
MurderRates<-read.csv("MurderRates_data.csv")
head(MurderRates,n=10) # view first 10 rows
tail(MurderRates,n=6) # view last 6 rows
#Importing excel data directly without changing to CSV
library("readxl")
Excel_data<- read_excel("Transformed_data.xlsx")
head(Excel_data)
Excel_data<- Excel_data[,-1]
head(Excel_data)
#Importing SPSS data into R
SPSS_data<- read.spss("Combined_data_SPSS.sav", use.value.label=TRUE, to.data.frame=TRUE)
head(SPSS_data)
#With Haven package
#install.packaes("haven")
#library("haven")
ADP_data <- read_dta("ADP.dta")
head(ADP_data)
write.csv(ADP_data,"ADP_excel.csv")
#Importing Stata data into R using package "foreign"
Stata_data <- read.dta("imm23.dta")
head(Stata_data )
Run in SAS to convert data into CSV before importing into R (Long approach) :)
proc export data=dataset
outfile="datast.csv"
dbms=csv;
run;
And then, run this in R
df <- read.csv("dataset.csv",header=T,as.is=T)
Alternatively (simple approach) using package haven
#Alternatively (simple approach) using package haven
#library(haven)
SAS_data<- read_sas("imm10.sas7bdat")
head(SAS_data)
Declaring some variables in data called SAS_data as categorical with specific levels/categories
#Declaring some variables in data called SAS_data as categorical with specific levels/categories
SAS_data$sex<- factor(SAS_data$sex,levels=c(1,2),labels=c("male","female"))
head(SAS_data)
#Frequency across sex
cat("Frequencies across sex")
table(SAS_data$sex)
cat("Proprotions across sex")
table(SAS_data$sex)/sum(table(SAS_data$sex))
print(SAS_data$race)
paste("Frequencies across race")
table(as.factor(SAS_data$race))
paste("Percentages across race (%)")
(table(as.factor(SAS_data$race))/sum(table(as.factor(SAS_data$race))))*100
SAS_data$race<- factor(SAS_data$race,levels=c(1,2,3,4),labels=c("asian","Hispanic","Black","White"))
print(SAS_data$race)
#Crosstabulations
table(SAS_data$race,SAS_data$sex)
print(SAS_data$public)
SAS_data$public<- factor(SAS_data$public,levels=c(0,1),labels=c("public","non-public"))
head(SAS_data)
#Cross-tabulation
table(SAS_data$race,SAS_data$sex,SAS_data$public)
NB: No package is required for that
#No package is required
#export SAS data as CSV
write.csv(SAS_data,"SAS_data_updatedCSV.csv")
NB: A package is required for that
#install.packages("writexl")
#export SAS data as excel
#Package "writexl" is required
writexl::write_xlsx(SAS_data,"SAS_data_updatedExcel.xlsx")
#Loading an external script called "Novel_summary_computation_script.R"
setwd("C:/Users/user/Desktop/DataAnalysis_results_R/NUGSChina_R_class2022")
source("Novel_summary_computation_script.R")
The summary statistics function should first:
Summarizing a few of the variables using my novel function
head(SAS_data)
#Exclude data on ID
SAS_data<- SAS_data[, -1]
head(SAS_data)
class(SAS_data)
#Converting SAS_data to data.frame
SAS_data_df<- as.data.frame(SAS_data)
#**Summarizing a few of the variables using my novel function**
Summary_stats(data=SAS_data_df, variable_index=2)
Summary_stats(data=SAS_data_df, variable_index=8)
Summary_stats(data=SAS_data_df, variable_index=13)
A function created to extract all quantative variables of your data
#A function created to extract all quantative variables of your data
Quantitative_variabes<-function(data){
Variable_name=list()
for (variable_index in seq_along(names(data))){
Variable<-(data)[,variable_index]
if(is.numeric(Variable)==TRUE){ #if variable is numeric/quantitative
Variable_name[[variable_index]]<-names(data)[variable_index]
}
}
return(unlist(Variable_name))
}
Quantitative_variabes(data=MurderRates)
A function created to extract all categorical variables of your data
Categorical_variabes<-function(data){
Variable_name=list()
for (variable_index in seq_along(names(data))){
Variable<-(data)[,variable_index]
if(is.numeric(Variable)==FALSE){ #if variable is numeric/quantitative
Variable_name[[variable_index]]<-names(data)[variable_index]
}
}
return(unlist(Variable_name))
}
Categorical_variabes(data=MurderRates)
From PerformanceAnalytics R package
Data_numeric=MurderRates[ ,Quantitative_variabes(data=MurderRates)]
#colnames(Data_numeric)=c("rate","convictions","executions","time","income","labour fp","Prop NC")
#method = c("pearson", "kendall", "spearman")
#options(warn=-1) #ignore warnings for instance with running non-parametric correlation due to ties
chart.Correlation(Data_numeric, histogram=TRUE, pch=19,method = c("pearson"))
Description of the problem
Authors: Tiffany Treadway and Clement Twumasi (2021).
URL Link: https://exarc.net/ark:/88735/10595
#Importing SPSS data into R
Data<- read.spss("Data_task.sav", use.value.label=F, to.data.frame=TRUE)
head(Data)
Comparison between the differences in each group’s BMI and the produced incision area (height/width) were tested using a Multivariate analysis of variance (MANOVA). The BMI groups or fixed factors were divided into three ranges: 18-22.5 (Group A), 22.5-27 (Group B), and 27-31.5 (Group C).
#Install packages before loading/use
library("mvnormtest")
library("MVTests")
#For Non-parametric pairwise comparison
library("PMCMRplus")
library(ggpubr)
library(ggplot2)
library(gridExtra)
# Multivariate normality test
#The data is multivariate normal
mshapiro.test(t(Data[, 4:5]))
shapiro.test(Data[, 4])
shapiro.test(Data[, 5])
p1=ggqqplot(Data, "Area_Knife", facet.by = c("Groups")) +labs(x="",y="Sample quantile (Dagger)")
p2=ggqqplot(Data, "Area_Spear", facet.by = c("Groups")) +labs(x="Normal quantile",y="Sample quantile (Spear)")
grid.arrange(p1,p2,ncol=1)
Tests the null hypothesis that the observed covariance matrices of the dependent variables are equal across groups
The hypotheses are defined as H0:The Covariance matrices are homogeneous and H1:The Covariance matrices are not homogeneous
results <- BoxM(data=Data[, 4:5],group=Data$Groups)
summary(results)
par(mfrow=c(2,1),mar=c(4,4,1,1))
boxplot(Data$Area_Knife~Data$Groups,
main = "",
ylim=c(0,150),
ylab=expression(paste("Incision area in mm"^"2" )),
xlab="",
names = c("Group A (BMI: 18-22.5)","Group B (BMI: 22.5-27)","Group C (BMI: 27-31.5)"),
las = 1,
col = c("red","blue","green"),
border = "black",
horizontal = F,
notch = F
)
mtext(text = "Dagger",
side = 3, #side 2 = left
line = 0,cex=1,font=2,adj =0.01)
boxplot(Data$Area_Spear~Data$Groups,
main = "",
ylim=c(0,200),
ylab=expression(paste("Incision area in mm"^"2 " )),
xlab="",
names = c("Group A (BMI: 18-22.5)","Group B (BMI: 22.5-27)","Group C (BMI: 27-31.5)"),
las = 1,
col = c("red","blue","green"),
border = "black",
horizontal = F,
notch = F
)
mtext(text = "Spear",
side = 3, #side 2 = left
line = 0,cex=1,font=2,adj =0.01)
#18-22.5 (Group A), 22.5-27 (Group B), and 27-31.5 (Group C).
Creating a function for Multivariate Kruskal Wallis test
The function returns the test statistic and its p-value
multkw<- function(group,y,simplify=FALSE){
### sort data by group ###
o<-order(group)
group<-group[o]
y<-as.matrix(y[o,])
n<-length(group)
k<-dim(y)[2] #k=p
if (dim(y)[1] != n)
return("number of observations not equal to length of group")
groupls<-unique(group)
g<-length(groupls) #number of groups (Number of fish-parasite combination)#
groupind<-sapply(groupls,"==",group) #group indicator#
ni<-colSums(groupind) #num of subj of each group (Number of fish in each group)#
r<-apply(y,2,rank) #corresponding rank variable (Parasite at each bodyparts)#
### calculation of statistic ###
r.ik<-t(groupind)%*%r*(1/ni) #gxp, mean rank of kth variate in ith group#
m<- (n+1)/2 #expected value of rik#
u.ik<-t(r.ik-m)
U<-as.vector(u.ik)
V<-1/(n-1)*t(r-m)%*%(r-m) #pooled within-group cov matrix
Vstar<-Matrix::bdiag(lapply(1/ni,"*",V))
W2<-as.numeric(t(U)%*%solve(Vstar)%*%U)
### return stat and p-value ###
returnlist<-data.frame(statistic=W2,d.f.=k*(g-1),
p.value=pchisq(W2,k*(g-1),lower.tail=F))
if (simplify==TRUE) return (W2)
else return (returnlist)
}
Cut and paste the MKW test function in a seperate R script and call it in R using the source() function.
MKW_result<- multkw(group=Data$Group,y=Data[,4:5],simplify=F)
MKW_result
Determing whether an injury was caused by a knife or a spear
#Importing the data
Weapon_classification<- read.csv(file="Training_Stabs.csv")
head(Weapon_classification,n=10)# view the first 10 columns
min(Weapon_classification$BMI)
max(Weapon_classification$BMI)
names(Weapon_classification)#the variables of the data
table(Weapon_classification$Weapon_Type)
#Re-categorizing BMI into 3 BMI statuses
Weapon_classification$BMI_status<- cut(Weapon_classification$BMI,
breaks=c(0,18,25.1,31.5),right=T,labels=c("A","B","C"))
#Create Weapon_type as 0 (Knife) and 1 (Spear)
re_categorize_func<- function(variable){
var<- as.numeric(variable)
for(i in seq_along(variable)){
if(var[i]==1) var[i]<- 0 #if a knife
if(var[i]==2) var[i]<- 1 # if a spear
}
return(var)
}
Weapon_classification$Weapon_type<- re_categorize_func(Weapon_classification$Weapon_Type)
Weapon_classification$Weapon_type<-factor(Weapon_classification$Weapon_type,
levels=c(0,1),label=c("knife","spear"))
head(Weapon_classification,n=10)# view the first 10 columns
dim(Weapon_classification)#dimension of the data
#Establishing training and validation set
set.seed(1)# for reproducility
train_index <- caret::createDataPartition(Weapon_classification$Weapon_type, p=0.8, list=FALSE)
training_data<-Weapon_classification[train_index,] #training data
Validation_data<-Weapon_classification[-train_index,]# Validation data
Dependent variable: Weapon type
Independent variales: BMI and stab length
#Setting knife as the reference categories
training_data$Weapon_type_new<- relevel(training_data$Weapon_type,ref="knife")
training_data$BMI_status<- relevel(training_data$BMI_status,ref="A")
#Fitting logitic regression model
Logit_model<-glm(Weapon_type~BMI+Lengths,family="binomial",data=training_data)
summary(Logit_model)
Estimating the Gini coefficient (otained from the AUC of the ROC curve), the Kolmogorov Sminorv statistics and the percentage of correct classification (obtained from the confusion matrix) as accurcacy measures.
ROC means Receiver Operator Characteristic Curve
#loading R packages
library("ROCR")# loading the ROCR package to obtai the ROC curve
library(pROC)
pred_weapon<- predict(Logit_model, Validation_data,type="response")
## library(ROCR)
df=data.frame(predictions= pred_weapon,labels=Validation_data$Weapon_type)
#logistic regression
pred <- prediction(df$predictions, df$labels)
perf <- performance(pred,"tpr","fpr")
#plot(perf,col="green",main="ROC Curve using Validation Data",xlab="Specificity",ylab="Sensitivity")
#abline(0,1)#add a 45 degree line
pROC_obj <- roc(df$labels,df$predictions,
smoothed = TRUE,
# arguments for ci
ci=TRUE, ci.alpha=0.9, stratified=FALSE,
# arguments for plot
plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
print.auc=TRUE, show.thres=TRUE)
#sens.ci <- ci.se(pROC_obj)
#plot(sens.ci, type="shape", col="lightblue")
## Warning in plot.ci.se(sens.ci, type = "shape", col = "lightblue"): Low
## definition shape.
#plot(sens.ci, type="bars")
Specificity_sensitivity=data.frame(cbind(c(perf@y.values),c(perf@x.values)))
names(Specificity_sensitivity)=c("Sensitivity","Specificity")
head(Specificity_sensitivity)
#Area under the curve
print(paste("Area under the first ROC curve is:",performance(pred,"auc")@"y.values"))
Gini_Coefficient=function(AUROC){
return((AUROC-0.5)/0.5)}
area_under_curve<- as.numeric(performance(pred,"auc")@"y.values")
print(paste("Gini Coefficient of ROC=",Gini_Coefficient(area_under_curve)))
TPRfromROCR=unlist(perf@y.values)
FPRfromROCR=unlist(perf@x.values)
diff_TPRFPR=TPRfromROCR-FPRfromROCR
KS=max(diff_TPRFPR)
print(paste("Kolmogorov Sminorv Value for ROC curve is:", KS))
# use caret and compute a confusion matrix
#Spear=True since it was kept as reference
confusion_matrix<- table(Validation_data$Weapon_type, pred_weapon > 0.5)
confusion_matrix
accuracy= ((confusion_matrix[1]+confusion_matrix[4])/sum(confusion_matrix))*100
print(paste("The percentage of accurcate classification is","",accuracy,"%"))
Creating a function to estimate the probability of weapon being a spear from the binomial logitic model
coef(Logit_model)
Prob_spear=function(model,bmi, stab_length){
b0=as.numeric(coef(model)[1])
b1=as.numeric(coef(model)[2])
b2=as.numeric(coef(model)[3])
Numerator=exp(b0+(b1*bmi)+(b2*stab_length))
return(Numerator/(1+Numerator)) #return probability in percent and in 1 decimal place
}
#Assuming values for the bmi are based on the empirical data
bmi_values<- seq(18,31.5,by=.5)
bmi_values
length_values_LM<- c(30,35,60)
length_values_HW<- c(7.5,15,20,21,30,31)
pred_spear_bmi_LM=pred_knife_bmi_LM=NULL
pred_spear_bmi_HW=pred_knife_bmi_HW=NULL
for(i in seq_along(length_values_LM)){
pred_spear_bmi_LM[[i]]<-Prob_spear(model=Logit_model,bmi=bmi_values, stab_length=length_values_LM[i])
pred_knife_bmi_LM[[i]]<- 1-pred_spear_bmi_LM[[i]]
}
for(i in seq_along(length_values_HW)){
pred_spear_bmi_HW[[i]]<-Prob_spear(model=Logit_model,bmi=bmi_values, stab_length=length_values_HW[i])
pred_knife_bmi_HW[[i]]<- 1-pred_spear_bmi_HW[[i]]
}
colour_LM<- c("blue","red","green")
#par(mfrow=c(2,1),mar=c(4,4,1,1))
plot(bmi_values,pred_spear_bmi_LM[[1]],type="l",lwd=3,
xlab="Possible BMI values",ylab="Probability of a weapon being a spear relative to a dagger",main="Mummy Man",col=colour_LM[1]
,ylim=c(0,1),xlim=c(18,32))
abline(h=0.5,col="black",lwd=3)
text(28,0.5,"Probability=0.5",col="black",font=2,pos=3)
for(i in seq_along(length_values_LM)[-1]){
lines(bmi_values,pred_spear_bmi_LM[[i]],type="l",lwd=3,col=colour_LM[i])
}
text_LM<-c("wound length= 30mm","wound length= 35mm","wound length= 60mm")
legend(x=24,y=.8,text_LM,
col=colour_LM,bty="n",cex=.9,box.lwd = 2,fill=colour_LM)
#plot(length_values,pred_spear_length,type="l",col="red",lwd=3,xlab="Different stab length values",ylab="Probability of a weapon being a spear")
#lines(bmi_values,pred_knife,type="l",col="red",lwd=3)
#legend("center",c("Spear","Knife"),col=c("blue","red"),bty="n",cex=.9,box.lwd = 2,fill=c("blue","red"))
Prob_LM<- as.data.frame(do.call("cbind",pred_spear_bmi_LM))
names(Prob_LM)<-text_LM
Prob_LM$bmi_values<- bmi_values
Prob_LM
write.csv(Prob_LM,"Predicted_probabilities_LM.csv")
colour_HW<- c("green","yellow","orange","pink","blue","red")
#par(mfrow=c(2,1),mar=c(4,4,1,1))
plot(bmi_values,pred_spear_bmi_LM[[1]],type="l",lwd=3,
xlab="Possible BMI values",ylab="Probability of a weapon being a spear relative to a dagger",main="Mummy Woman",col=colour_HW[1]
,ylim=c(0,.8),xlim=c(18,32))
abline(h=0.5,col="black",lwd=3)
text(28,0.5,"Probability=0.5",col="black",font=2,pos=3)
for(i in seq_along(length_values_HW)[-1]){
lines(bmi_values,pred_spear_bmi_HW[[i]],type="l",lwd=3,col=colour_HW[i])
}
text_HW<-c("wound length= 7.5mm","wound length= 15mm","wound length= 20mm",
"wound length= 21mm","wound length= 30mm","wound length= 31mm")
legend(x=24,y=.81,text_HW,
col=colour_HW,bty="n",cex=.9,box.lwd = 2,fill=colour_HW)
Prob_HW<- as.data.frame(do.call("cbind",pred_spear_bmi_HW))
names(Prob_HW)<-text_HW
Prob_HW$bmi_values<- bmi_values
Prob_HW
write.csv(Prob_HW,"Predicted_probabilities_HW.csv")