#You are welcome to Clement's School on Data Analysis
#Setting work directory: Create a folder on your desktop & call name it
# DataAnalysis_results_R and put the csv data named School_data.csv into it.
#Create variables called BMI=weight/(height)^2 and BMI_categories with levels
#1: below 18.5 - you're in the underweight range
#2: between 18.5 and 24.9 - you're in the healthy weight range
#3: between 25 and 29.9 - you're in the overweight range
#4: between >=30 - you're in the obese range
#Sex_school: 1=male 2=female and 3=mixed school
#Region_school: 1=Ashanti 2=Central & 3=Greater Accra
#Creating your own function to compute correlation between number of student &teachers
#Then compare your answer with the in-built cor() function (This will done nextweek)
Setting working directory
#Setting working directory (desktop>Folder called DataAnalysis_results_R )
setwd("C:/Users/user/Desktop/DataAnalysis_results_R")
options(repr.plot.width=8, repr.plot.height=8,repr.plot.res = 300) #Setting plot size
#Intalling our first R package (NB: You will need an internet to install packages only)
#install.packages("moments") #run this to install the package called "moments"
library("moments") # load the package to be able to calculate skewness and kurtosis only
library("ggplot2") #for plot graphs using ggplot
library("dplyr")
library("RColorBrewer")
library("PerformanceAnalytics")
library("tidyr")
1-Dimensional 2-Dimensional
#Importing data from the working directory
Data_school<-read.csv(file="School_data.csv")
head(Data_school,n=10)#view first 10 rows of the data with object name: "Data_school"
dim(Data_school)#To view the dimension of the data 19 rows and 8 columns
#To check whether there is any missing data
any(is.na(Data_school)) #it returned false implies no missing data
#Assigning variable to categorical variable sex of school & region of school
#which as being dummy coded as 1,2 and 3.
Data_school$Sex_school<-factor(Data_school$Sex_school,levels =c(1,2,3),
labels = c("Male","Female","Mixed"))
Data_school$Region_school<-factor(Data_school$Region_school,levels =c(1,2,3),
labels = c("Ashanti","Central","Greater Accra"))
Cross tabulations expressed as percentages between Region and Sex of School
#Returns the cross tabulations expressed as percentages (over 100)
(table(Data_school$Region_school,Data_school$Sex_school)/dim(Data_school)[1])*100
Data_school$BMI=Data_school$Avg_weight/((Data_school$Avg_height)^2)
head(Data_school)
BMI status must have categories:
i) underweight (if BMI < 18.5)
ii) normal weight (if 18.5<BMI<24.9)
iii) overweight (if 25.0 <BMI<29.9)
iv) obese (if BMI>30).
max(Data_school$BMI)#check the max value of BMI first
thresholds<- c(0,18.5,24.9,29.9,50) #threshold for converting the continous BMI into BMI status
Data_school$BMI_status<-cut(Data_school$BMI,breaks=thresholds,right=FALSE,
labels = c("Underweight", "Normal weight", "Overweight", "Obese"))
Data_school
#Saving the updated data directory into your repository
write.csv(Data_school,"Data_school_new.csv")
quantile(Data_school[,2],c(0.025,0.975))
# Creating the function to compute mode in R
getmode <- function(x) {
uniq_x <- unique(x)
return(uniq_x[which.max(tabulate(match(x, uniq_x)))])
}
The mode of the number of teachers is clearing 15 since it appears more frequently (5 times)
table(Data_school$NO..TEACHERS)
Now let's use of created mode function to get the mode directly
#Run this to get the mode directly
getmode(x=Data_school$NO..TEACHERS)
The summary statistics function should first
Determine the type of variable
If it's numeric find & return mean, median, mode, standard deviation, standard error, skewness, kurtosis and 95% quantile recorded to 2 decimal places as well as histogram plot of the numeric variable coloured by blue colour.
Else if it's categorical, it should find & return percentages for all categories/levels (in 1 decimal place) and the name of the categories as a dataframe.
#Creating a function to estimate summary statistics of the data
#by determining the type of variable
#If it's numeric find & return mean, median, standard deviation, standard error,
#skewness, kurtosis and 95% quantile recorded to 2 decimal places
#But if its categorical find & return percentages for all categories/levels
#(in 1 decimal place) and the name of the categories as a dataframe
Summary_stats<-function(data, variable_index){
Variable_name<-names(data)[variable_index]
Variable<-(data)[,variable_index]
if(is.numeric(Variable)==TRUE){ #if variable is numeric/quantitative
#compute mean, median, standard deviation, standard error,
#skewness and kurtosis
mean_value<-mean(Variable) #compute mean
median_value<-median(Variable) #compute median
modal_value<-getmode(Variable)
std<-sd(Variable) #compute standard deviation
standard_error<-std/sqrt(length(Variable)) #compute standard error
skewness<-skewness(Variable) #compute skewness
kurtosis<-kurtosis(Variable) #compute kurtosis
quantile_95percent<-quantile(Variable,c(0.025,0.975)) #compute 95% quantile
graph<-hist(Variable,xlab=paste(Variable_name),col="blue", main="")
#returns the mean, median, standard deviation, standard error,skewness and kurtosis
return(list(Variable_name=Variable_name,
mean=round(mean_value,2),median=round(median_value,2),mode=modal_value,std=round(std,2),SE=round(standard_error,2),
skewness=round(skewness,2),kurtosis=round(kurtosis,2),quantile_95percent=round(quantile_95percent,2),histogram=graph))
} else if(is.factor(Variable)==TRUE){ #else if categorical
#compute the percentages rounded in 1 decimal place
percentage<-paste(round((table(Variable)/dim(data)[1])*
100,1),"%")
levels_variable<-levels(Variable)
output<-data.frame(Categories=levels_variable,percentage=percentage)#storing output as dataframe
#return variable name and a dataframe of percentages for each category
return(list(Variable_name=Variable_name,output=output))
}
}
for(i in 1:dim(Data_school)[2])
print(paste(names(Data_school)[i],"","Index=",i))
#Viewing summary stats of 9th variable
Summary_stats(data=Data_school, variable_index=9)
names(Data_school)
Data_school$NO..OF.STUDENTS
# Increase bottom margin
par(mar=c(6,4,4,4))
my_bar <- barplot(round(Data_school$NO..OF.STUDENTS*100/sum(Data_school$NO..OF.STUDENTS), 1) , names.arg=Data_school$NAME.OF.SCHOOL ,
las=2 , ylim=c(0,40),
col=c(brewer.pal(n = 12, name = "Paired"),"black","violet","pink","grey","blue4","magenta","cyan"), #choosen 18 different colours
main="Number of students across schools" ,cex.names=0.8,ylab="Percent")
# Add the text
text(my_bar,round(Data_school$NO..OF.STUDENTS*100/sum(Data_school$NO..OF.STUDENTS), 1)+1,
paste(round(Data_school$NO..OF.STUDENTS*100/sum(Data_school$NO..OF.STUDENTS), 1),"%",sep=""),cex=1)
legend("topright", legend = paste("n=","",Data_school$NO..OF.STUDENTS) ,
col =c(brewer.pal(n = 12, name = "Paired"),"black","violet","pink","grey","blue4","magenta","cyan") ,
bty = "n", pch=20 , pt.cex = 2, cex = 0.8, inset = c(0.05, 0.05),ncol=2)
dim(Data_school)
Data_school[Data_school$Sex_school=="Male",]
Total_numbers_students<-c(sum(Data_school$NO..OF.STUDENTS[Data_school$Sex_school=="Male"]),
sum(Data_school$NO..OF.STUDENTS[Data_school$Sex_school=="Female"]),
sum(Data_school$NO..OF.STUDENTS[Data_school$Sex_school=="Mixed"]))
Prop_school=round(Total_numbers_students*100/sum(Data_school$NO..OF.STUDENTS), 1)
lbls <- paste(levels(Data_school$Sex_school),":", Prop_school) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(x=Prop_school, labels =lbls,radius =.7,cex=0.71,main="Distribution of students among sex of schools",
col =c("red","blue","green"),lty=1,
font=2,clockwise = TRUE,init.angle=90)
cor(Data_school[,5],Data_school[,9])
Data=Data_school[,c(5,6,9,2,3)] #extracting height, weight and BMI
colnames(Data)=c("Height","Weight","BMI","Num of Sch","Num of teacher")
chart.Correlation(Data, histogram=TRUE, pch=19)
par(mfrow=c(3,2),mar=c(4,4,1,0))
#par(mfrow=c(3,2))
#1st plot row=1 column=1
plot(Data_school$Avg_weight,Data_school$BMI, type="p",col="red",lwd=3,xlab="Weight",ylab="BMI")
abline(lm(Data_school$BMI~Data_school$Avg_weight),col="red",lwd=3) #adding line of best fit
#2nd plot row=1 column=2
plot(Data_school$Avg_height,Data_school$BMI, type="p",col="blue",lwd=3,xlab="Weight",ylab="BMI")
abline(lm(Data_school$BMI~Data_school$Avg_height),col="blue",lwd=3) #adding line of best fit
#3rd plot row=2 column=1
hist(Data_school$BMI, breaks=4, main="Histogram with breaks=4",col="yellow",xlab="BMI")
#4th plot row=2 column=2
hist(Data_school$BMI, breaks=10, main="Histogram with breaks=8",col="green",freq=FALSE,xlab="BMI")
counts <- table(Data_school$BMI_status)
barplot(counts, main="BMI status", horiz=TRUE, names.arg=levels(Data_school$BMI_status), cex.names=0.65,col="violet")
counts <- table(Data_school$BMI_status)
barplot(counts, main="BMI status", horiz=FALSE, names.arg=levels(Data_school$BMI_status), cex.names=0.65,col="grey")
50 different ggplots in R with codes can be found here: http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html
ggplot(Data_school, aes(x=Region_school, group=Sex_school)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill=" Type") +
facet_grid(~Sex_school) +
scale_y_continuous(labels = scales::percent)+xlab(" Region of school")+ylab("Percentage")+ theme(legend.position = "none")
names(Data_school)
print(paste("The correlation between Height and BMI points=",
round(cor(Data_school$Avg_height,Data_school$BMI),3)))
ggplot(Data_school, aes(x =Avg_height,y =BMI, color =BMI_status)) + geom_point()+
labs(colour = "BMI status")+xlab("Height")+
ylab("BMI")
#You can also try the commented code below by remove the #sign
#ggplot(Data_school, aes(x =Avg_height,y =BMI, color =BMI_status)) + geom_point()+
#labs(colour = "BMI status")+xlab("Height")+
#ylab("BMI")+ geom_smooth(method="lm", se=F)
ggplot(Data_school, aes(x =Region_school, y = NO..OF.STUDENTS)) + geom_boxplot()+xlab("Region of school")+
ylab("Number of students")+theme_bw()
#You can also try the commented code below by remove the #sign
#ggplot(Data_school, aes(x =Region_school, y = NO..OF.STUDENTS)) + geom_boxplot()+xlab("Region of school")+
#ylab("Number of students")
ggplot(Data_school, aes(x=NAME.OF.SCHOOL, y=NO..TEACHERS)) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4,color="blue") +
coord_flip() +xlab("Name of school") +ylab("Number of teachers")+
theme_bw()
ggplot(Data_school,aes(x=BMI_status, y=NABCO.NSP, fill=Region_school)) +
geom_boxplot() +
xlab("class") +
xlab("BMI status") +ylab("Number of NABCO NSP")+labs(fill = "Region")
where $x$ & $y$ are the two variables, and $n$=sample size
my_correlation<-function(x,y){
n=length(x) #or n=length(y)
numerator<- n*sum(x*y)- (sum(x)*sum(y))
denominator<- sqrt((n*sum(x^2)-(sum(x))^2 ) * (n*sum(y^2)-(sum(y))^2 ))
results<-numerator/denominator
return(paste("Correlation based on own function=", results))
}
my_correlation(x=Data_school$Avg_height,y=Data_school$BMI)
#Based on in-built function
cor(x=Data_school$Avg_height,y=Data_school$BMI)