Session: Four (last session for the program)
Date: 20/03/2021
Instructor: Clement Twumasi
Website: https://twumasiclement.wixsite.com/website
YouTube channel: https://www.youtube.com/channel/UCxrpjLYi_Akmbc2QICDvKaQ
The most important info you need to know about statistics and data analysis: https://www.youtube.com/watch?v=SRGpeSYyOoA&t=3467s
Time series analysis in R (we shall simulate time series data, learn how to declare time series data, and fit time series models).
Other objectives:
Introduction to other advanced statistical tests (univariate and multivariate) & different models (e.g., machine learning algorithms) in R
NB: The class/type of statistical tests and models to fit depends predominantly on the type/nature of your data.
Link to some of the aforementioned univariate tests in R: https://rpubs.com/sujith/IS
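For example, here is a minimal sketch (not from the session code) of how the nature of the data guides the choice of a univariate test, using R's built-in sleep data: check normality first, then run either a two-sample t-test or its non-parametric alternative.
#Illustrative sketch only: parametric vs non-parametric two-group comparison
data(sleep) #built-in dataset: extra hours of sleep under two drugs (group 1 vs 2)
if (shapiro.test(sleep$extra)$p.value > 0.05) {
  t.test(extra ~ group, data = sleep) #data look roughly normal: two-sample t-test
} else {
  wilcox.test(extra ~ group, data = sleep) #otherwise: Wilcoxon rank-sum test
}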
Multivariate analysis in R: https://little-book-of-r-for-multivariate-analysis.readthedocs.io/en/latest/src/multivariateanalysis.html
http://www.sthda.com/english/wiki/manova-test-in-r-multivariate-analysis-of-variance
PCA, SVD, Correspondence Analysis and Multidimensional Scaling:
https://web.stanford.edu/class/bios221/labs/multivariate/lab_5_multivariate.html
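As a quick illustration of the multivariate methods linked above, a minimal PCA sketch on R's built-in iris data (illustrative only, not part of the session code):
#Minimal PCA sketch on the built-in iris data (numeric columns only)
pca_fit <- prcomp(iris[, 1:4], center = TRUE, scale. = TRUE)
summary(pca_fit) #proportion of variance explained by each principal component
biplot(pca_fit) #biplot of the first two principal components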
Note some of the classes of regression models available. Knowing which one is appropriate for a given dataset and research problem is imperative (a minimal sketch follows the links below).
Introduction to linear regression: https://www.machinelearningplus.com/machine-learning/complete-introduction-linear-regression-r/
OLS linear regression in R: http://r-statistics.co/Linear-Regression.html
Different class of linear models in R: https://www.statmethods.net/advstats/glm.html
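A minimal sketch of the two most common cases (illustrative only, using R's built-in mtcars data): OLS linear regression for a continuous outcome and logistic regression for a binary outcome.
#OLS linear regression: continuous outcome (miles per gallon)
ols_fit <- lm(mpg ~ wt + hp, data = mtcars)
summary(ols_fit)
#Logistic regression (a GLM): binary outcome (automatic vs manual transmission)
logit_fit <- glm(am ~ wt + hp, data = mtcars, family = binomial)
summary(logit_fit)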
Time series analysis in R:
https://ourcodingclub.github.io/tutorials/time/
https://www.analyticsvidhya.com/blog/2015/12/complete-tutorial-time-series-modeling/
options(repr.plot.width=8, repr.plot.height=8,repr.plot.res = 300) #Setting plot size
#setting working directory to import and/or export data
setwd("C:/Users/user/Desktop/DataAnalysis_results_R/NUGSChina_R_class")
We shall simulate our own time series data for this session.
Simulating a time series of 300 data points from an ARIMA(2,1,1) model
#Simulating data from ARIMA(2,1,1)
#NB: set a working directory (as above) if you wish to export the simulated data later
set.seed(1)
chosen_length <- 300; burnin <- 100
n <- chosen_length + burnin # n = 400 (the 100 burn-in points are discarded later)
e <- rnorm(n, 0, 1) #white-noise innovations
y <- e
mu <- 0.2; phi1 <- 0.3; phi2 <- 0.4; theta1 <- 0.5; sigma <- 1.2
for (i in 3:n) {
  #ARMA(2,1) recursion: two AR terms (phi1, phi2) and one MA term (theta1)
  y[i] <- mu + phi1*y[i-1] + phi2*y[i-2] + sigma*e[i] - theta1*e[i-1]
}
S_y <- cumsum(y) #integration step: the cumulative sum gives the ARIMA(2,1,1) series
data <- S_y[(burnin+1):n] #discard the burn-in period, keeping 300 points
print(data)
Saving and exporting the data as a CSV file into your working directory, for later use if you wish
#Saving and exporting data as a csv file into your working directory
write.csv(data,"Simulated_ts_data.csv")
data_monthly<- ts(data,start=c(1997,1),end=c(2021,12),frequency=12)
data_monthly
write.csv(data_monthly,"Simulated_ts_Monthlydata.csv")
plot(data_monthly,type="l",lwd=3,ylab="Monthly time series",col="blue")
data_quarterly<- aggregate(data_monthly,nfrequency=4)
data_quarterly
data_yearly<- aggregate(data_monthly,nfrequency=1)
data_yearly
par(mfrow=c(2,2),mar=c(4,4,1,1))
#First plot (top left)
plot(data_monthly,type="l",lwd=3,ylab="Time series",xlab="Time",col="blue",las=1)
text(2000,300,"Monthly series",col="blue",font=1)
plot(data_quarterly,type="l",lwd=3,ylab="Time series",col="red",las=1)
text(2000,890,"Quarterly series",col="red",font=1)
plot(data_yearly,type="l",lwd=3,ylab="Time series",col="green",las=1)
text(2000,3500,"Yearly series",col="green",font=1)
plot(data_yearly,type="l",lwd=3,ylab="Time series",col="green",ylim=c(0,3600),las=1)
lines(data_quarterly,type="l",lwd=3,col="red")
lines(data_monthly,type="l",lwd=3,col="blue")
legend(x=1997,y=3650,legend = c("Yearly","Quarterly","Monthly"),col=c("green","red","blue"),
cex=1, horiz = FALSE,pt.cex = 1,fill=c("green","red","blue"),ncol=1,
title="Time frequency",box.lwd = 2)
#You can save the quarterly and yearly data into your working directory for later use if you want
write.csv(data_quarterly,"Simulated_ts_Quarterlydata.csv")
write.csv(data_yearly,"Simulated_ts_Yearlydata.csv")
Practice task: Fit the same classes of time series models using the quarterly and yearly ts data, respectively (see the starter sketch below).
NB: There are lots of other classes of machine learning algorithms for time series prediction.
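A minimal starter sketch for the practice task (assuming the quarterly CSV exported above, whose second column holds the series values; the model choice here is only illustrative):
#Starter sketch for the practice task (illustrative only)
library(forecast)
quarterly_df <- read.csv("Simulated_ts_Quarterlydata.csv") #file exported above
quarterly_ts <- ts(quarterly_df[, 2], start = c(1997, 1), frequency = 4) #declare quarterly ts
practice_fit <- auto.arima(quarterly_ts, seasonal = FALSE) #same approach as for the monthly series
summary(practice_fit)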
#Don't forget to install each of the packages with install.packages("") before loading it with library()
#install.packages("tseries") #installing package tseries
library(tseries) #for time series
library("tsoutliers")#identifying outliers in time series
library(ggplot2) #for other types of customized plots
library(forecast) # forecasting methods
library(gridExtra)# for combining ggplots using the grid.arrange() function
library("seastests")# for testing seasonality
library("TTR")#for simple moving average model
library(zoo)
Finding mean values for each year
year<- as.character(1997:2021)
means_year<- aggregate(data_monthly, nfrequency = 1, FUN = mean)
data.frame(year=year,mean=means_year)
The types of outliers the tso() function detects by default: "AO" (additive outliers), "LS" (level shifts), and "TC" (temporary changes); "IO" (innovative outliers) and "SLS" (seasonal level shifts) can also be selected.
?tso #to see the arguments of the function "tso" if you wish
outlier.monthlydata<- tso(data_monthly,maxit.iloop=10,tsmethod = "auto.arima")
outlier.monthlydata
If the data had outliers, you could use tsclean() to replace them with reasonable estimates.
NB: For this data, cleaning is not necessary since there are no outliers.
cleaned_tseries<- tsclean(data_monthly)
cleaned_tseries
#Check whether the original series (which has no outliers) has the same values as the cleaned series
all(cleaned_tseries==data_monthly)
Why do we need to identify patterns? Because they can suggest which class of models to use/consider.
A time series with additive trend, seasonal, and irregular components can be decomposed using the stl() function.
NB: STL stands for Seasonal-Trend decomposition using LOESS
decomp_ts <- stl(data_monthly, s.window = "periodic", robust = TRUE)$time.series
#print(decomp_ts)
Seasonal_component<- decomp_ts[,1]
Trend_component<- decomp_ts[,2]
Random_component <- decomp_ts[,3]
Deseasonalized_data<- data_monthly-Seasonal_component
par(mfrow=c(4,1),mar=c(4,4,1,1))
plot(data_monthly,ylab="Observed data",col="blue")
plot(Seasonal_component,col="blue",ylab="Seasonal")
plot(Trend_component,col="blue",ylab="Trend")
plot(Random_component,col="blue",ylab="Random")
Testing for stationarity using the Augmented Dickey-Fuller unit root test (ADF test)
H0: The series is non-stationary.
H1: The series is stationary.
Conclusion: The original monthly series is non-stationary (i.e., at lag 0, before any differencing).
Clearly, the time series is non-stationary, i.e., its behaviour changes over time.
adf.test(data_monthly)
Notice that the series is stationary after first differencing ($\Delta Y_t$) according to the ADF test,
where $\Delta Y_t = Y_t - Y_{t-1}$
adf.test(diff(data_monthly,lag=1)) # NB: diff(data_monthly,lag=1) gives first differenced series Y_t-Y_{t-1}
first_diff_series<- diff(data_monthly,lag=1)
par(mfrow=c(2,1),mar=c(4,4,1,1))
plot(data_monthly,type="l",lwd=2,col="blue",ylab="Original series")
plot(first_diff_series,type="l",lwd=2,col="blue",ylab="First differenced series")
abline(h=0,col="black",lwd=3)
Seasonality testing: the WO test
By default, the WO test is used to assess the seasonality of a time series and returns a boolean. Alternatively, the QS test (test='qs'), Friedman test (test='fried'), Kruskal-Wallis test (test='kw'), F-test on seasonal dummies (test='seasdum') or the Welch test (test='welch') can be used (see the sketch below).
Webel, K. and Ollech, D. (2019). An overall seasonality test. Deutsche Bundesbank’s Discussion Paper series.
#The p-values indicate the p-values of the underlying test, i.e. the QS-test, the QS-R test and the KW-R-test.
#Testing for monthly seasonality
summary(wo(data_monthly,freq=12))
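A minimal sketch of the alternative seasonality tests mentioned above, via the seastests function isSeasonal(), which returns TRUE/FALSE (illustrative only; test names as listed in the description above):
#Alternative seasonality tests (each returns TRUE/FALSE)
isSeasonal(data_monthly, test = "qs", freq = 12) #QS test
isSeasonal(data_monthly, test = "fried", freq = 12) #Friedman test
isSeasonal(data_monthly, test = "kw", freq = 12) #Kruskal-Wallis test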
#Clearly, there are no seasonal patterns
#But the series generally increases over time within each year
seasonplot(data_monthly, col=rainbow(12), year.labels=TRUE,main="Seasonal plots across years")
ggseasonplot(data_monthly, col=rainbow(n=25),polar=TRUE) +
ylab("Series") +
ggtitle("Polar seasonal plot")
ACF and PACF plots using ggplot2 via the forecast package
options(warn=-1)#avoid title warning message in ggAcf or ggPacf function
acf_plot1=ggAcf(data_monthly,main="Original series")
pacf_plot1=ggPacf(data_monthly,main="Original series")
acf_plot2=ggAcf(first_diff_series,main="First differenced series")
pacf_plot2=ggPacf(first_diff_series,main="First differenced series")
grid.arrange(acf_plot1,pacf_plot1,acf_plot2,pacf_plot2,ncol=2)
train_set<- window(data_monthly, start=c(1997, 1), end=c(2015, 12)) #1997 to 2015 for training
test_set<- window(data_monthly, start=c(2016, 1), end=c(2021, 12)) #2016 to 2021 for testing
Fitting ARIMA time series model
Automatic_ARIMA_model<- auto.arima(y=train_set,seasonal = FALSE)
Automatic_ARIMA_model
AutoArimaModel_forecasts=forecast(Automatic_ARIMA_model, h=length(test_set), level=c(95))
#accuracy measures (columns 2, 3 and 5 = RMSE, MAE and MAPE)
accuracy(AutoArimaModel_forecasts,test_set)[,c(2,3,5)]
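Optionally (not part of the session code), a quick residual check for the fitted ARIMA model; checkresiduals() from the forecast package plots the residuals and their ACF and runs a Ljung-Box test:
#Optional residual diagnostics for the fitted ARIMA model
checkresiduals(Automatic_ARIMA_model) #residual plots + Ljung-Box test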
Fitting Naive time series model (seasonal naive, via snaive())
naive_model<- snaive(y=train_set,h=length(test_set))
#accuracy measures
accuracy(naive_model,test_set)[,c(2,3,5)]
Fitting non-seasonal Holt-Winters model (HW), i.e., exponential smoothing without trend (beta=FALSE) and seasonal (gamma=FALSE) components
HW_model <- HoltWinters(train_set, beta=FALSE, gamma=FALSE)
HW_model
HW_forecasts=forecast(HW_model, h=length(test_set), level=c(95))
#accuracy measures
accuracy(HW_forecasts,test_set)[,c(2,3,5)]
Fitting Simple Exponential Smoothing model
ses_model <- ses(train_set, h = length(test_set))
#accuracy measures
accuracy(ses_model,test_set)[,c(2,3,5)]
p1=autoplot(AutoArimaModel_forecasts,ylab="Series",xlab="Time",main="Non-seasonal ARIMA(2,1,1) model")
p2=autoplot(naive_model,ylab="Series",xlab="Time",main="Naive model")
p3=autoplot(HW_forecasts,ylab="Series",xlab="Time",main="Non-seasonal Holt-Winters model")
p4=autoplot(ses_model,ylab="Series",xlab="Time",main="Simple Exponential Smoothing model")
grid.arrange(p1,p2,p3,p4,ncol=2)
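Finally, a minimal sketch (illustrative only) of comparing the four fitted models on the test set by RMSE, to pick the best-performing one:
#Comparing test-set RMSE of the four fitted models (smaller is better)
rmse_comparison <- c(
  ARIMA = accuracy(AutoArimaModel_forecasts, test_set)["Test set", "RMSE"],
  Naive = accuracy(naive_model, test_set)["Test set", "RMSE"],
  HoltWinters = accuracy(HW_forecasts, test_set)["Test set", "RMSE"],
  SES = accuracy(ses_model, test_set)["Test set", "RMSE"]
)
sort(rmse_comparison) #smallest RMSE first (best out-of-sample accuracy)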