Dataset description/ Description of variables:
Major – program of freshman
Language – the language of the program
Type – type of program: A = scholarship, K = tuition fee
Enrolment_Points – points of freshman (maximum is 500)
High level mathematics – Mathematics A level or maturity test at a high level
Mathematics point – maximum is 100 Tasks (marks out of 100)
Residence: 1 means South; 2 means North; 3 means East; 4 means West
Create a new numerical variable called Total_points; which is obtained after adding both the Enrolment_Points and Mathematics points for each student & add this new variable (Total_points) to the original data.
Convert their Mathematics points to a categorical variable called Mathematics_grade with categories: i) Low class if Mathematics point is less than 40%, ii) Medium class if Mathematics point is from 40% to 70%, and iii) High class if Mathematics point > 70%.
Save the new data as a CSV file directly into your working directory/DataAnalysis_results_R folder.
Create your own function to compute descriptive/summary statistics (just update what I created already in the second meeting dated October 24, 2020). The summary statistics function should: i) First determine the type of variable, ii). If it's numeric find & return mean, median, mode, variance, standard deviation, maximum value, minimum value, standard error, skewness, kurtosis and 95% quantile recorded to 2 decimal places as well as histogram plot of the numeric variable coloured by “green” colour. iii) Else if it is categorical, it should find & return percentages for all categories/levels (in 1 decimal place) and the name of the categories as a data frame as well as plot a pie chart with percentages for each category of the variable with different colours.
# Work from the DataAnalysis_results_R folder on the desktop so that the
# relative file names used below (input and output CSV files) resolve there.
setwd("C:/Users/user/Desktop/DataAnalysis_results_R")
# Plot width, height and resolution for a large, sharp view
# (stored in the `repr.plot.*` options).
options(
  repr.plot.width = 8,
  repr.plot.height = 8,
  repr.plot.res = 300
)
# Installing our first R package (NB: an internet connection is needed only when installing packages)
#install.packages("moments") #run this to install the package called "moments"
# Loading R packages
library("moments") # load the package to be able to calculate skewness and kurtosis only
library("ggplot2") #for plot graphs using ggplot
library("RColorBrewer") #To select customized colours (sequential, etc.) but there are lot's of colours without packages
library("PerformanceAnalytics") #for correlation matrix plot
library(gridExtra)
library("tidyr")
library("dplyr")
library(forcats)
# Import the dataset from the working directory
Data_uni <- read.csv("University_data.csv")
# Show the first 6 rows of the imported data (object name: "Data_uni")
head(Data_uni, 6)
# The last 10 rows can be viewed with:
# tail(Data_uni, n = 10)
To check whether there is any missing data
# Check whether any value in the data is missing
# (FALSE means there is no missing data)
anyNA(Data_uni)
To view the dimension of the data
# Number of rows and number of columns of the data
# (2331 rows and 7 columns for the original file)
c(nrow(Data_uni), ncol(Data_uni))
Names of variables in the data
# Column (variable) names of the data
colnames(Data_uni)
1. Assign the names to the levels of the categorical variables (Residence) where 1 means South; 2 means North; 3 means East; 4 means West.
# Inspect the distinct codes currently stored in Residence
levels(as.factor(Data_uni$Residence))
# Replace the numeric codes by readable labels:
# 1 = South, 2 = North, 3 = East, 4 = West
Data_uni$Residence <- factor(
  Data_uni$Residence,
  levels = 1:4,
  labels = c("South", "North", "East", "West")
)
# Show the first 8 rows of the updated data
Data_uni[seq_len(8), ]
2. Create a new numerical variable called Total_points; which is obtained after adding both the Enrolment_Points and Mathematics points for each student & add this new variable (Total_points) to the original data.
# Add the new numeric variable
# Total_points = Enrolment_Points + Mathematics.points (one value per student)
Data_uni[["Total_points"]] <- with(
  Data_uni,
  Enrolment_Points + Mathematics.points
)
# Show the first 4 rows of the updated data
Data_uni[seq_len(4), ]
3. Convert their Mathematics points to a categorical variable called Mathematics_grade with categories: i) Low class if Mathematics point is less than 40%, ii) Medium class if Mathematics point is from 40% to 70%, and iii) High class if Mathematics point > 70%.
# Convert Mathematics.points to the ordered categories of Mathematics_grade:
#   i)   "Low class"    if the score is below 40
#   ii)  "Medium class" if the score is from 40 to 70 (both inclusive)
#   iii) "High class"   if the score is above 70
# The cut-offs are fixed values rather than being derived from the observed
# minimum/maximum, so the classification does not depend on the data at hand
# and fractional scores such as 70.5 are classified as "High class".
threshold <- c(40, 70)
Data_uni$Mathematics_grade <- factor(
  ifelse(
    Data_uni$Mathematics.points < threshold[1], "Low class",
    ifelse(Data_uni$Mathematics.points <= threshold[2],
           "Medium class", "High class")
  ),
  # explicit levels keep the Low < Medium < High display order
  levels = c("Low class", "Medium class", "High class")
)
# Show the first 10 rows of the updated data
head(Data_uni, n = 10)
4. Save the updated data as a CSV file called University_data_Updated directly into your working directory/DataAnalysis_results_R folder.
To confirm that your data has been saved, you can check your folder.
# Save the updated data as a CSV file directly into the
# working directory/DataAnalysis_results_R folder.
# row.names = FALSE: the automatic row numbers are not part of the data, and
# writing them would add an extra unnamed column to the file.
write.csv(Data_uni, "University_data_Updated.csv", row.names = FALSE)
5. Create your own function to compute descriptive/summary statistics (just update what I created already in the second meeting dated October 24, 2020). The summary statistics function should: i) First determine the type of variable, ii). If it's numeric find & return mean, median, mode, variance, standard deviation, maximum value, minimum value, standard error, skewness, kurtosis and 95% quantile recorded to 2 decimal places as well as histogram plot of the numeric variable coloured by “green” colour. iii) Else if it is categorical, it should find & return percentages for all categories/levels (in 1 decimal place) and the name of the categories as a data frame as well as plot a pie chart with percentages for each category of the variable with different colours.
Creating a function to compute the mode
It is important to note that the mode may not be relevant for some numeric/quantity variable and may not exist (or could be more than one value). Mode for categorical variables can sometimes be important.
# getmode: most frequent value of a vector.
# When several values share the highest frequency, the one that appears first
# in `x` is returned.
getmode <- function(x) {
  distinct_values <- unique(x)
  # position of every element of x within the distinct values, then the
  # number of occurrences of each distinct value
  occurrences <- tabulate(match(x, distinct_values))
  distinct_values[which.max(occurrences)]
}
# Most frequent enrolment score
getmode(Data_uni[["Enrolment_Points"]])
# Column names of the updated data
colnames(Data_uni)
# Number of students in each mathematics grade
table(Data_uni[["Mathematics_grade"]])
The summary statistics function should first:
# Summary_stats: descriptive statistics and a plot for one column of a data set.
#
# Arguments:
#   data           - a data frame.
#   variable_index - index of the column to summarise.
#
# Value:
#   * Numeric column: a list with the variable name, mean, median, mode,
#     standard deviation (std), standard error (SE), skewness, kurtosis, the
#     2.5% and 97.5% quantiles (quantile_95percent), the object returned by
#     hist(), and the variance, maximum and minimum. Statistics are rounded to
#     2 decimal places and computed on the non-missing values. A green
#     histogram is drawn.
#   * Categorical column (factor, character or logical): a list with the
#     variable name, a data frame of categories and their percentages (1
#     decimal place, relative to the number of rows of `data`) and the value
#     returned by pie(). A pie chart labelled with the percentages is drawn.
#   * Any other column type: a warning is raised and NULL is returned invisibly.
Summary_stats <- function(data, variable_index) {
  Variable_name <- names(data[variable_index])
  Variable <- data[[variable_index]]

  if (is.numeric(Variable)) { # numeric/quantitative variable
    # Missing values would make quantile() fail and turn the other statistics
    # into NA, so the statistics are computed on the observed values only.
    values <- Variable[!is.na(Variable)]
    if (length(values) == 0) {
      stop("Column '", Variable_name, "' has no non-missing values.",
           call. = FALSE)
    }
    std <- sd(values)
    standard_error <- std / sqrt(length(values))
    # PerformanceAnalytics is attached after moments in this file and masks
    # skewness()/kurtosis(); the namespace prefix makes sure the moments
    # versions are the ones used.
    skewness_value <- moments::skewness(values)
    kurtosis_value <- moments::kurtosis(values)
    # bounds of the central 95% of the observations
    quantile_95percent <- quantile(values, c(0.025, 0.975))
    graph <- hist(values, xlab = Variable_name, col = "green", main = "")
    # New elements are appended after the existing ones so that both the
    # names and the positions of the previously returned elements are kept.
    return(list(
      Variable_name = Variable_name,
      mean = round(mean(values), 2),
      median = round(median(values), 2),
      mode = round(getmode(values), 2),
      std = round(std, 2),
      SE = round(standard_error, 2),
      skewness = round(skewness_value, 2),
      kurtosis = round(kurtosis_value, 2),
      quantile_95percent = round(quantile_95percent, 2),
      histogram = graph,
      variance = round(var(values), 2),
      maximum = round(max(values), 2),
      minimum = round(min(values), 2)
    ))
  }

  if (is.factor(Variable) || is.character(Variable) || is.logical(Variable)) {
    # read.csv() keeps text columns as character (default since R 4.0), so
    # they are converted here instead of being skipped.
    if (!is.factor(Variable)) {
      Variable <- factor(Variable)
    }
    levels_variable <- levels(Variable)
    # percentage of rows in each category, rounded to 1 decimal place
    Percentage_values <- round((table(Variable) / nrow(data)) * 100, 1)
    output <- data.frame(
      Categories = levels_variable,
      percentage = paste(Percentage_values, "%")
    )
    labels_variables <- paste0(levels_variable, " : ", Percentage_values, "%")

    # One colour per category: red/blue for up to two categories, the
    # "Paired" palette for 3 to 12, and an interpolation of that palette
    # beyond 12 (the palette itself has only 12 colours).
    n_levels <- length(levels_variable)
    if (n_levels <= 2) {
      slice_colours <- c("red", "blue")[seq_len(n_levels)]
    } else if (n_levels <= 12) {
      slice_colours <- RColorBrewer::brewer.pal(n = n_levels, name = "Paired")
    } else {
      slice_colours <- colorRampPalette(
        RColorBrewer::brewer.pal(n = 12, name = "Paired")
      )(n_levels)
    }
    pie_chart <- pie(
      x = Percentage_values, labels = labels_variables, radius = .7,
      cex = 0.71, main = "", col = slice_colours, font = 2,
      clockwise = TRUE, init.angle = 90
    )
    # variable name and a data frame of percentages for each category
    return(list(
      Variable_name = Variable_name,
      output = output,
      pie_chart = pie_chart
    ))
  }

  warning("Summary_stats() does not support column '", Variable_name,
          "' of class '", class(Variable)[1], "'.", call. = FALSE)
  invisible(NULL)
}
# Print the index of every variable so that it can be passed to
# Summary_stats() as `variable_index`.
# seq_len() is used instead of 1:n so the loop is skipped (rather than run
# with indices 1 and 0) if the data has no columns.
for (i in seq_len(ncol(Data_uni))) {
  print(paste(names(Data_uni)[i], "", "Index=", i))
}
# Example: Summary_stats(data=Data_uni, variable_index=4)
# 1st variable
Summary_stats(data = Data_uni, variable_index = 1)
# 2nd variable
Summary_stats(data = Data_uni, variable_index = 2)
# 3rd variable
Summary_stats(data = Data_uni, variable_index = 3)
# 4th variable
Summary_stats(data = Data_uni, variable_index = 4)
# 5th variable
Summary_stats(data = Data_uni, variable_index = 5)
# 6th variable
Summary_stats(data = Data_uni, variable_index = 6)
# 7th variable
Summary_stats(data = Data_uni, variable_index = 7)
# 8th variable
Summary_stats(data = Data_uni, variable_index = 8)
# 9th variable
Summary_stats(data = Data_uni, variable_index = 9)
6. Depict at least 8 figures/plots (it can be more than 8) using only ggplot2 package to describe the data the best you can as a Data scientist (example of the plots are line graphs, scatter plots, barplot, boxplot, histograms of numeric variables across different categories, barchart of categorical variables distinguishing between different categories variables, etc)
http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html
NB: facet_wrap() in the ggplot functions below helps plotting/splitting across the different categories as seen with the variable Language using facet_wrap(~language) for example.
# Column names available for plotting
names(Data_uni)
# NB: facet_grid()/facet_wrap() split a plot into one panel per category of
# the given variable, e.g. facet_grid(~Language) gives one panel per language.
# First plot: percentage of each program type within each program language
# (group = Language, so the proportions are computed separately per language).
# All axis/legend labels are set in a single labs() call; the y label used to
# be set twice with two different texts, the later one winning.
# NOTE(review): the `..prop..` notation is deprecated since ggplot2 3.4.0 in
# favour of after_stat(prop); switch once the minimum ggplot2 version is known.
ggplot(Data_uni, aes(x = Type, group = Language)) +
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat = "count") +
  geom_text(
    aes(label = scales::percent(..prop..), y = ..prop..),
    stat = "count", vjust = -.5
  ) +
  facet_grid(~Language) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = " Type of program",
    y = "Percentage",
    fill = " Type",
    caption = "Clement's School on Advanced Data Analysis"
  ) +
  theme(legend.position = "none")
# 2nd ggplot: percentage of each residence within each mathematics grade
# (group = Mathematics_grade, so the proportions are computed per grade).
# All labels are set in a single labs() call; the y label used to be set
# twice with two different texts, the later one winning.
# NOTE(review): the `..prop..` notation is deprecated since ggplot2 3.4.0 in
# favour of after_stat(prop); switch once the minimum ggplot2 version is known.
ggplot(Data_uni, aes(x = Residence, group = Mathematics_grade)) +
  geom_bar(aes(y = ..prop.., fill = factor(..x..))) +
  geom_text(
    aes(label = scales::percent(..prop..), y = ..prop..),
    stat = "count", vjust = -.5
  ) +
  facet_grid(~Mathematics_grade) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Residence",
    y = "Percentage",
    fill = "Residence",
    caption = "Clement's School on Advanced Data Analysis"
  ) +
  theme(legend.position = "none")
# 3rd & 4th ggplot: mathematics points by language and enrolment points by
# residence, bars dodged by program type, one panel per mathematics grade.
# Type is converted with factor() before asking for its levels: levels() of a
# character column is NULL, and factor(Type, levels = NULL) would turn every
# fill value into NA.
Type_levels <- levels(factor(Data_uni$Type))
g1 <- ggplot(Data_uni, aes(x = Language, y = Mathematics.points)) +
  geom_bar(
    aes(fill = factor(Type, levels = Type_levels)),
    position = position_dodge(), stat = "identity"
  ) +
  facet_wrap(~Mathematics_grade) +
  labs(fill = "Type") +
  ylab("Mathematics points")
g2 <- ggplot(Data_uni, aes(x = Residence, y = Enrolment_Points)) +
  geom_bar(
    aes(fill = factor(Type, levels = Type_levels)),
    position = position_dodge(), stat = "identity"
  ) +
  facet_wrap(~Mathematics_grade) +
  labs(fill = "Type") +
  ylab("Enrolment points") +
  xlab("Residence")
# Stack the two plots in one column
grid.arrange(g1, g2, nrow = 2, ncol = 1)
# Largest total score (full column name, no reliance on partial matching)
max(Data_uni$Total_points)
# 5th ggplot: total points per program major as horizontal bars.
# With stat = "identity" the rows of each major are stacked, so a bar shows
# the sum of Total_points / 1000 over that major; the axis label states the
# scaling so the values are not read as raw points.
ggplot(Data_uni, aes(x = Major.program, y = Total_points / 1000)) +
  geom_bar(stat = "identity", alpha = .6, width = .4, color = "blue") +
  coord_flip() +
  xlab("Program Major") +
  ylab("Total points (thousands)") +
  theme_bw()
# Boxplots of enrolment points per program language, split by program type
ggplot(Data_uni, aes(x = Language, y = Enrolment_Points, fill = Type)) +
  geom_boxplot() +
  xlab("Language of program") +
  ylab("Enrolment points") +
  labs(
    fill = "Program type",
    caption = "Clement's School on Advanced Data Analysis"
  )
# Boxplots of enrolment points per program language, split by residence
ggplot(Data_uni, aes(x = Language, y = Enrolment_Points, fill = Residence)) +
  geom_boxplot() +
  xlab("Language of program") +
  ylab("Enrolment points") +
  labs(fill = "Residence")
# Smoothed line of total points against enrolment points for all students,
# without splitting by any additional variable
ggplot(Data_uni) +
  geom_smooth(aes(x = Enrolment_Points, y = Total_points), method = "auto") +
  xlab("Enrolment points") +
  ylab("Total points")
# Same relationship with one smoothed line per mathematics grade
p1 <- ggplot(Data_uni) +
  geom_smooth(
    aes(
      x = Enrolment_Points, y = Total_points,
      color = Mathematics_grade, group = Mathematics_grade
    ),
    method = "auto"
  ) +
  xlab("Enrolment points") +
  ylab("Total points") +
  labs(colour = "Mathematics grade")
# Scatter plot of the same two variables, coloured by mathematics grade
p2 <- ggplot(
  Data_uni,
  aes(x = Enrolment_Points, y = Total_points, color = Mathematics_grade)
) +
  geom_point() +
  labs(colour = "Mathematics grades") +
  xlab("Enrolment points") +
  ylab("Total points")
# Stack the two plots in one column
grid.arrange(p1, p2, nrow = 2, ncol = 1)
# Density of enrolment points, one filled curve per residence
g1 <- ggplot(Data_uni, aes(x = Enrolment_Points)) +
  geom_density(aes(fill = factor(Residence)), alpha = 0.8) +
  labs(
    title = "Density plot",
    subtitle = "Enrolment points grouped by Residence",
    caption = "Clement's School on Advanced Data Analysis",
    x = "Enrolment points",
    fill = "Residence"
  )
# Density of total points, one filled curve per mathematics grade
g2 <- ggplot(Data_uni, aes(x = Total_points)) +
  geom_density(aes(fill = factor(Mathematics_grade)), alpha = 0.8) +
  labs(
    title = "Density plot",
    subtitle = "Total points grouped by Mathematics grade",
    caption = "Clement's School on Advanced Data Analysis",
    x = "Total points",
    fill = "Mathematics grade"
  )
# Stack the two plots in one column
grid.arrange(g1, g2, nrow = 2, ncol = 1)
# Correlation matrix plot of the three numeric score variables.
names(Data_uni)
# The columns are selected by name rather than by position, so the selection
# does not silently pick other variables if the column order changes.
Data <- Data_uni[, c("Enrolment_Points", "Mathematics.points", "Total_points")]
# Readable names for the panels of the chart
colnames(Data) <- c("Enrolment points", "Mathematics points", "Total points")
chart.Correlation(Data, histogram = TRUE, pch = 19)