Codes for the second R Class¶

#You are welcome to Clement's School on Data Analysis

#Setting work directory: Create a folder on your desktop & call name it 
# DataAnalysis_results_R and put the csv data named School_data.csv into it.

#Create variables called BMI=weight/(height)^2 and BMI_categories with levels
#1: below 18.5 - you're in the underweight range
#2: between 18.5 and 24.9 - you're in the healthy weight range
#3: between 25 and 29.9 - you're in the overweight range
#4: between >=30 - you're in the obese range 

#Sex_school: 1=male  2=female and 3=mixed school
#Region_school: 1=Ashanti   2=Central & 3=Greater Accra

#Creating your own function to compute correlation between number of student &teachers
#Then compare your answer with the in-built cor() function (This will done nextweek)

Setting working directory

#Setting working directory (desktop>Folder called DataAnalysis_results_R )
setwd("C:/Users/user/Desktop/DataAnalysis_results_R")

options(repr.plot.width=8, repr.plot.height=8,repr.plot.res = 300) #Setting plot size

Loading R packages¶

#Intalling our first R package (NB: You will need an internet to install packages only)
#install.packages("moments") #run this to install the package called "moments"
library("moments") # load the package to be able to calculate skewness and kurtosis only
library("ggplot2") #for plot graphs using ggplot
library("dplyr")
library("RColorBrewer")
library("PerformanceAnalytics")
library("tidyr")

ARRAY¶

1-Dimensional 2-Dimensional

Importing the data into R¶

#Importing data from the working directory 
Data_school<-read.csv(file="School_data.csv")
head(Data_school,n=10)#view first 10 rows of the data with object name: "Data_school"

dim(Data_school)#To view the dimension of the data 19 rows and 8 columns

#To check whether there is any missing data
any(is.na(Data_school)) #it returned false implies no missing data

Assigning variable to categorical variable sex of school & region of school¶

#Assigning variable to categorical variable sex of school & region of school
#which as being dummy coded as 1,2 and 3.
Data_school$Sex_school<-factor(Data_school$Sex_school,levels =c(1,2,3),
                               labels = c("Male","Female","Mixed"))

Data_school$Region_school<-factor(Data_school$Region_school,levels =c(1,2,3),
                               labels = c("Ashanti","Central","Greater Accra"))

Cross tabulations expressed as percentages between Region and Sex of School

#Returns the cross tabulations expressed as percentages (over 100)
(table(Data_school$Region_school,Data_school$Sex_school)/dim(Data_school)[1])*100

               
                     Male    Female     Mixed
  Ashanti       31.578947 10.526316  5.263158
  Central        0.000000 21.052632  0.000000
  Greater Accra 10.526316 21.052632  0.000000

Creating $\text{BMI}=\frac{\text{Weight}}{\text{Height}^2}$ as an additional variable ($kg/m^2$)¶

Data_school$BMI=Data_school$Avg_weight/((Data_school$Avg_height)^2)

head(Data_school)

Creating additional BMI status as a additional variable based on the continuous BMI¶

BMI status must have categories:

i) underweight (if BMI < 18.5)

ii) normal weight (if 18.5<BMI<24.9)

iii) overweight (if 25.0 <BMI<29.9)

iv) obese (if BMI>30).

max(Data_school$BMI)#check the max value of BMI first

thresholds<- c(0,18.5,24.9,29.9,50) #threshold for converting the continous BMI into BMI status
Data_school$BMI_status<-cut(Data_school$BMI,breaks=thresholds,right=FALSE,
                            labels = c("Underweight", "Normal weight", "Overweight", "Obese"))

Data_school

#Saving the updated data directory into your repository
write.csv(Data_school,"Data_school_new.csv")

quantile(Data_school[,2],c(0.025,0.975))

Creating the function to compute mode in R¶

# Creating the function to compute mode in R
getmode <- function(x) {
   uniq_x <- unique(x)
   return(uniq_x[which.max(tabulate(match(x, uniq_x)))])
}

The mode of the number of teachers is clearing 15 since it appears more frequently (5 times)

table(Data_school$NO..TEACHERS)

 8  9 11 12 14 15 16 17 18 19 
 1  1  4  2  1  5  1  1  1  2

Now let's use of created mode function to get the mode directly

#Run this to get the mode directly
getmode(x=Data_school$NO..TEACHERS)

Creating own function to compute descriptive/summary statistics¶

The summary statistics function should first

Determine the type of variable
If it's numeric find & return mean, median, mode, standard deviation, standard error, skewness, kurtosis and 95% quantile recorded to 2 decimal places as well as histogram plot of the numeric variable coloured by blue colour.
Else if it's categorical, it should find & return percentages for all categories/levels (in 1 decimal place) and the name of the categories as a dataframe.

#Creating a function to estimate summary statistics of the data
#by determining the type of variable
#If it's numeric find & return mean, median, standard deviation, standard error, 
#skewness, kurtosis and 95% quantile recorded to 2 decimal places
#But if its categorical find & return percentages for all categories/levels 
#(in 1 decimal place) and the name of the categories as a dataframe

Summary_stats<-function(data, variable_index){
  Variable_name<-names(data)[variable_index]
  Variable<-(data)[,variable_index]
  if(is.numeric(Variable)==TRUE){ #if variable is numeric/quantitative
    #compute mean, median, standard deviation, standard error, 
    #skewness and kurtosis
  mean_value<-mean(Variable) #compute mean
  median_value<-median(Variable) #compute median
  modal_value<-getmode(Variable)
  std<-sd(Variable) #compute standard deviation
  standard_error<-std/sqrt(length(Variable)) #compute standard error
  skewness<-skewness(Variable) #compute skewness
  kurtosis<-kurtosis(Variable) #compute kurtosis
  quantile_95percent<-quantile(Variable,c(0.025,0.975)) #compute 95% quantile
    graph<-hist(Variable,xlab=paste(Variable_name),col="blue", main="")
  #returns the mean, median, standard deviation, standard error,skewness and kurtosis
  return(list(Variable_name=Variable_name,
    mean=round(mean_value,2),median=round(median_value,2),mode=modal_value,std=round(std,2),SE=round(standard_error,2),
    skewness=round(skewness,2),kurtosis=round(kurtosis,2),quantile_95percent=round(quantile_95percent,2),histogram=graph))           

  } else if(is.factor(Variable)==TRUE){ #else if categorical
    #compute the percentages rounded in 1 decimal place
  percentage<-paste(round((table(Variable)/dim(data)[1])*
                           100,1),"%")
  levels_variable<-levels(Variable)
  output<-data.frame(Categories=levels_variable,percentage=percentage)#storing output as dataframe
  
  #return variable name and a dataframe of percentages for each category
  return(list(Variable_name=Variable_name,output=output))
       }
  }

for(i in 1:dim(Data_school)[2])
print(paste(names(Data_school)[i],"","Index=",i))

[1] "NAME.OF.SCHOOL  Index= 1"
[1] "NO..OF.STUDENTS  Index= 2"
[1] "NO..TEACHERS  Index= 3"
[1] "NABCO.NSP  Index= 4"
[1] "Avg_height  Index= 5"
[1] "Avg_weight  Index= 6"
[1] "Sex_school  Index= 7"
[1] "Region_school  Index= 8"
[1] "BMI  Index= 9"
[1] "BMI_status  Index= 10"

#Viewing summary stats of 9th variable
Summary_stats(data=Data_school, variable_index=9)

$Variable_name
[1] "BMI"

$mean
[1] 18.75

$median
[1] 15.68

$mode
[1] 24.57787

$std
[1] 9.85

$SE
[1] 2.26

$skewness
[1] 1.3

$kurtosis
[1] 1.19

$quantile_95percent
 2.5% 97.5% 
 7.37 40.40 

$histogram
$breaks
 [1]  5 10 15 20 25 30 35 40 45 50

$counts
[1] 2 6 5 2 1 2 0 0 1

$density
[1] 0.02105263 0.06315789 0.05263158 0.02105263 0.01052632 0.02105263 0.00000000
[8] 0.00000000 0.01052632

$mids
[1]  7.5 12.5 17.5 22.5 27.5 32.5 37.5 42.5 47.5

$xname
[1] "Variable"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"

names(Data_school)

Data_school$NO..OF.STUDENTS

Barplot of number of student across names & sex of schools¶

# Increase bottom margin
par(mar=c(6,4,4,4))

my_bar <- barplot(round(Data_school$NO..OF.STUDENTS*100/sum(Data_school$NO..OF.STUDENTS), 1) , names.arg=Data_school$NAME.OF.SCHOOL , 
                  las=2 , ylim=c(0,40),
col=c(brewer.pal(n = 12, name = "Paired"),"black","violet","pink","grey","blue4","magenta","cyan"), #choosen 18 different colours
                  main="Number of students across schools" ,cex.names=0.8,ylab="Percent")

# Add the text 
text(my_bar,round(Data_school$NO..OF.STUDENTS*100/sum(Data_school$NO..OF.STUDENTS), 1)+1, 
    paste(round(Data_school$NO..OF.STUDENTS*100/sum(Data_school$NO..OF.STUDENTS), 1),"%",sep=""),cex=1) 


legend("topright", legend = paste("n=","",Data_school$NO..OF.STUDENTS) , 
     col =c(brewer.pal(n = 12, name = "Paired"),"black","violet","pink","grey","blue4","magenta","cyan") , 
     bty = "n", pch=20 , pt.cex = 2, cex = 0.8, inset = c(0.05, 0.05),ncol=2)

dim(Data_school)

Data_school[Data_school$Sex_school=="Male",]

Total_numbers_students<-c(sum(Data_school$NO..OF.STUDENTS[Data_school$Sex_school=="Male"]),
sum(Data_school$NO..OF.STUDENTS[Data_school$Sex_school=="Female"]),
sum(Data_school$NO..OF.STUDENTS[Data_school$Sex_school=="Mixed"]))

Prop_school=round(Total_numbers_students*100/sum(Data_school$NO..OF.STUDENTS), 1)
lbls <- paste(levels(Data_school$Sex_school),":", Prop_school) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels


pie(x=Prop_school, labels =lbls,radius =.7,cex=0.71,main="Distribution of students among sex of schools",
    col =c("red","blue","green"),lty=1,
   font=2,clockwise = TRUE,init.angle=90)

cor(Data_school[,5],Data_school[,9])

Data=Data_school[,c(5,6,9,2,3)] #extracting height, weight and BMI
colnames(Data)=c("Height","Weight","BMI","Num of Sch","Num of teacher")
chart.Correlation(Data, histogram=TRUE, pch=19)

par(mfrow=c(3,2),mar=c(4,4,1,0)) 

#par(mfrow=c(3,2))   

#1st plot row=1 column=1
plot(Data_school$Avg_weight,Data_school$BMI, type="p",col="red",lwd=3,xlab="Weight",ylab="BMI")
abline(lm(Data_school$BMI~Data_school$Avg_weight),col="red",lwd=3) #adding line of best fit

#2nd plot row=1 column=2
plot(Data_school$Avg_height,Data_school$BMI, type="p",col="blue",lwd=3,xlab="Weight",ylab="BMI")
abline(lm(Data_school$BMI~Data_school$Avg_height),col="blue",lwd=3) #adding line of best fit


#3rd plot row=2 column=1
hist(Data_school$BMI, breaks=4, main="Histogram with breaks=4",col="yellow",xlab="BMI")

#4th plot row=2 column=2
hist(Data_school$BMI, breaks=10, main="Histogram with breaks=8",col="green",freq=FALSE,xlab="BMI")


counts <- table(Data_school$BMI_status)
barplot(counts, main="BMI status", horiz=TRUE, names.arg=levels(Data_school$BMI_status), cex.names=0.65,col="violet")


counts <- table(Data_school$BMI_status)
barplot(counts, main="BMI status", horiz=FALSE, names.arg=levels(Data_school$BMI_status), cex.names=0.65,col="grey")

Plotting using "ggplot2" package¶

50 different ggplots in R with codes can be found here: http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html

ggplot(Data_school, aes(x=Region_school,  group=Sex_school)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill=" Type") +
    facet_grid(~Sex_school) +
    scale_y_continuous(labels = scales::percent)+xlab(" Region of school")+ylab("Percentage")+ theme(legend.position = "none")

names(Data_school)

print(paste("The correlation between Height and BMI points=",
            round(cor(Data_school$Avg_height,Data_school$BMI),3)))

ggplot(Data_school, aes(x =Avg_height,y =BMI, color =BMI_status)) + geom_point()+ 
labs(colour = "BMI status")+xlab("Height")+
ylab("BMI")


#You can also try the commented code below by remove the #sign
#ggplot(Data_school, aes(x =Avg_height,y =BMI, color =BMI_status)) + geom_point()+ 
#labs(colour = "BMI status")+xlab("Height")+
#ylab("BMI")+ geom_smooth(method="lm", se=F)

[1] "The correlation between Height and BMI points= -0.679"

ggplot(Data_school, aes(x =Region_school, y = NO..OF.STUDENTS)) + geom_boxplot()+xlab("Region of school")+
ylab("Number of students")+theme_bw()

#You can also try the commented code below by remove the #sign
#ggplot(Data_school, aes(x =Region_school, y = NO..OF.STUDENTS)) + geom_boxplot()+xlab("Region of school")+
#ylab("Number of students")

ggplot(Data_school, aes(x=NAME.OF.SCHOOL, y=NO..TEACHERS)) +
    geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4,color="blue") +
    coord_flip() +xlab("Name of school") +ylab("Number of teachers")+
    theme_bw()

ggplot(Data_school,aes(x=BMI_status, y=NABCO.NSP, fill=Region_school)) + 
    geom_boxplot() +
    xlab("class") +
   xlab("BMI status") +ylab("Number of NABCO NSP")+labs(fill = "Region")

Creating own function to compute Pearson's Correlation coefficient¶

$$ r_{xy}= \frac{n \sum_{i=1}^{n} x_iy_i -\sum_{i=1}^{n} x_i \sum y_i}{\sqrt{ \left[n \sum_{i=1}^{n} x_{i}^{2}- \left(\sum_{i=1}^{n} x_i \right)^2 \right] \times \left[n \sum_{i=1}^{n} y_{i}^{2}- \left(\sum_{i=1}^{n} y_i \right)^2 \right]} } $$

where $x$ & $y$ are the two variables, and $n$=sample size

my_correlation<-function(x,y){
    n=length(x) #or  n=length(y) 
    numerator<- n*sum(x*y)- (sum(x)*sum(y))
    denominator<- sqrt((n*sum(x^2)-(sum(x))^2 ) * (n*sum(y^2)-(sum(y))^2 ))
    results<-numerator/denominator
    return(paste("Correlation based on own function=", results))
}

my_correlation(x=Data_school$Avg_height,y=Data_school$BMI)

#Based on in-built function
cor(x=Data_school$Avg_height,y=Data_school$BMI)

	NAME.OF.SCHOOL	NO..OF.STUDENTS	NO..TEACHERS	NABCO.NSP	Avg_height	Avg_weight	Sex_school	Region_school
	<fct>	<int>	<int>	<int>	<dbl>	<int>	<int>	<int>
1	ASASS	315	15	13	1.77	77	3	1
2	OWASS	400	18	12	1.86	74	1	1
3	Prempeh	334	14	11	1.89	56	1	1
4	Wesley girls	273	15	9	2.12	63	2	2
5	KASS	530	19	8	1.90	46	2	3
6	Kumasi Girls	175	15	16	1.29	56	2	2
7	PRESEC	260	12	11	1.60	43	2	1
8	Aquinas	115	9	5	1.65	36	1	1
9	Adisadel	315	11	10	1.32	80	1	1
10	Mfantsipem	394	15	3	1.86	65	1	1

	NAME.OF.SCHOOL	NO..OF.STUDENTS	NO..TEACHERS	NABCO.NSP	Avg_height	Avg_weight	Sex_school	Region_school	BMI
	<fct>	<int>	<int>	<int>	<dbl>	<int>	<fct>	<fct>	<dbl>
1	ASASS	315	15	13	1.77	77	Mixed	Ashanti	24.57787
2	OWASS	400	18	12	1.86	74	Male	Ashanti	21.38976
3	Prempeh	334	14	11	1.89	56	Male	Ashanti	15.67705
4	Wesley girls	273	15	9	2.12	63	Female	Central	14.01744
5	KASS	530	19	8	1.90	46	Female	Greater Accra	12.74238
6	Kumasi Girls	175	15	16	1.29	56	Female	Central	33.65182