SOCR ≫ | TCIU Website ≫ | TCIU GitHub ≫ |
library(dplyr)
library(arm)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(plot3D)
library(scatterplot3d)
library(plotly)
library(fastDummies)
library(forecast)
Load the data and preprocess the data.
#setwd("E:/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics")
eu <- read.csv("Master_Aggregate_EU_Econ_Data_11_29_2018_TimeTransform.csv", stringsAsFactors = F)[,-5]
colnames(eu) <- c("country","time","feature","value")
eu <- filter(eu,!country %in% c("European Union (25 countries)","D1_Country",""))
eu$value <- sapply(c(1:nrow(eu)),function(x) as.numeric(gsub(":|,","",eu$value[x])))
eu <- filter(eu, feature != "")
dim(eu)
unq_country <- sort(unique(eu$country))
unq_time <- sort(unique(eu$time))
unq_fea <- sort(unique(eu$feature))
num_country <- length(unq_country)
num_time <- length(unq_time)
num_fea <- length(unq_fea)
eu <- arrange(eu,country,time,feature)
eu_3d_array <- array(NA,dim = c(num_country,num_time,num_fea),dimnames = list(unq_country,unq_time,unq_fea))
for (i in 1:num_country){
for (j in 1:num_time){
for (k in 1:num_fea){
eu_3d_array[i,j,k] = eu$value[(i-1)*num_time*num_fea + (j-1)*num_fea + k]
}
}
}
eu_3d_array[1:10,1:10,1]
#Data visualization
eu <- arrange(eu,time,feature,country)
eu_visualization <- select(eu,time,feature,country,value)
eu_visualization$time <- sapply(c(1:nrow(eu_visualization)),function(x) as.numeric(gsub("Q",".",eu_visualization$time[x])))
eu_visualization$feature <- as.factor(eu_visualization$feature)
eu_visualization$country <- as.factor(eu_visualization$country)
eu_visualization$value <- as.numeric(eu_visualization$value)
eu_visualization$feature <- sapply(c(1:nrow(eu_visualization)),function(x) substr(eu_visualization$feature[x],1,20))
plot_ly(eu_visualization, x = ~time, y = ~country, z = ~value, color = ~feature,split = ~ country,type = 'scatter3d', mode = 'lines')
plot_ly(eu_visualization, x = ~time, y = ~country, z = ~value, color = ~feature,split = ~ country,type = 'scatter3d', mode = 'lines')
#Find the duplicates
eu_time_series <- na.omit(eu)
allFeatures = as.character(unique(eu_time_series$feature))
allTime = unique(eu_time_series$time)
allCountry = as.character(unique(eu_time_series$country))
allCombination = length(allFeatures)*length(allTime)*length(allCountry)
dup = c()
for (i in 1:length(allFeatures)){
for (j in 1:length(allCountry)){
for (k in 1:length(allTime)){
if (nrow(filter(eu_time_series,country == allCountry[j] & feature == allFeatures[i] & time == allTime[k]))>1){
dup = c(dup,as.character(allFeatures[i]))
break
}
}
break
}
}
dup #These features have mutiple observations at the same time point
Remove duplicates.
removeDup = filter(eu_time_series, feature != "Employment by sex, age and educational attainment level, Total, From 15 to 64 years, All ISCED 2011 levels" &
feature != "Labor cost for LCI excluding bonuses" &
feature != "Labor costs other than wages or salaries" &
feature != "Labour cost for LCI (compensation of employees plus taxes minus subsidies)" &
feature != "Labour cost for LCI excluding bonuses" &
feature != "Labour costs other than wages and salaries" &
feature != "Wages and salaries (total)")
time_series = spread(removeDup,feature,value)
dim(time_series)
Extract Belgium data.
#Chose Belgium and fit the arima
belgium = filter(time_series,country == "Belgium")
belgium = belgium[, colSums(is.na(belgium)) != nrow(belgium)] #delete the feature that is missing at all the time point
belgium = select(belgium,-time,-country)
dim(belgium)
Manually clean (preprocess) the Belgium data and fit ARIMAX model \[Y = BelguimSuperSample\$'Unemployment,\ Females,\ From\ 15-64\ years,\ Total',\]
\[X = XReg\ (all\ other\ covariates\ -starts\_with('Unemployment')).\]
#super sample the dataset
cleardata <- function(mat) {
for (i in 1:ncol(mat)) {
mat[is.na(mat[,i]),i]<-mean(mat[,i],na.rm = T) + rnorm(sum(is.na(mat[,i])),sd = sd(mat[,i],na.rm = T))
}
return(mat)
}
#use spline regression model to expand the dataset
splinecreate <- function(mat) {
res<-NULL
for (i in 1:ncol(mat)) {
sp<-smooth.spline(seq(1:72),mat[,i])
spresult<-predict(sp,seq(from=0,by=1/5,length.out = 360))
spfeat<-spresult$y+rnorm(360,sd=sd(mat[,i]))
res<-cbind(res,spfeat)
}
colnames(res)<-colnames(mat)
return(res)
}
BelguimMatrix = as.matrix(belgium)
BelguimMatrix = cleardata(BelguimMatrix)
BelguimMatrix = splinecreate(BelguimMatrix)
BelguimSuperSample = as.data.frame(BelguimMatrix)
#############################
# Outcome Features to examine, predict, forecast, spacekime-analyze
# "Gross domestic product at market prices"
# "Unemployment , Females, From 15-64 years, Total"
# "Capital transfers, payable"
# "Capital transfers, receivable"
# "Debt securities"
# "Government consolidated gross debt"
# ... View(colnames(BelguimMatrix))
#############################
################################################
# "Unemployment , Females, From 15-64 years, Total"
Y = dplyr::select(BelguimSuperSample, "Unemployment , Females, From 15-64 years, Total"); dim(Y)
X = dplyr::select(BelguimSuperSample, -starts_with("Unemployment")); dim(X)
fitArimaX = auto.arima(Y, xreg=data.matrix(X[,-112])); fitArimaX$arma
################################################
# "Government consolidated gross debt"
#Y = select(BelguimSuperSample, "Government consolidated gross debt"); dim(Y)
#X = select(BelguimSuperSample, -matches("debt|Debt")); dim(X)
# X_scale <- scale(X); Matrix::rankMatrix(X_scale); any(is.na(X_scale)); any(is.infinite(X_scale))
# remove columns with infinite or missing value
# X_scale_1 <- X_scale[ , !is.infinite(colSums(X_scale)) & !is.na(colSums(X_scale))]
#X_1 <- X[ , !is.infinite(colSums(X)) & !is.na(colSums(X))]; dim(X_1); Matrix::rankMatrix(X_1)
#X_2 <- X_1[, qr(X_1)$pivot[seq_len(qr(X_1)$rank)]]; dim(X_2)
#
#fitArimaX = auto.arima(Y, xreg=X_2, method = "CSS", # "SANN", method = "CSS", optim.method = "BFGS"
# optim.control=list(maxit = 20000, temp = 20), optim.method = "BFGS")
#fitArimaX; View(sort(fitArimaX$coef)[1:10]); fitArimaX$arma
################################################
# "Gross domestic product at market prices"
Y = select(BelguimSuperSample, "Gross domestic product at market prices"); dim(Y)
X = select(BelguimSuperSample, -matches("debt|Debt")); dim(X) # 360 167
X <- X[, qr(X)$pivot[seq_len(qr(X)$rank)]]; dim(X)
ts_Y <- ts(Y, start=c(2000,1), end=c(2017, 20), frequency = 20); length(ts_Y)
set.seed(1234)
fitArimaX = auto.arima(ts_Y, xreg=data.matrix(X[ , -c(50:60)])); fitArimaX$arma
# 5 0 2 0 20 0 0
# sigma^2 estimated as 57.04: log likelihood=-1132.78 AIC=2593.55 AICc=2871.09 BIC=3230.88
pred_arimaX_5_0_2_Y_Belgium_train300_Belgium_test60 <-
predict(fitArimaX, n.ahead = 60, newxreg = data.matrix(X[301:360 , -c(50:60)]))$pred
plot(forecast(fitArimaX, xreg = data.matrix(X[301:360 , -c(50:60)])), # ARIMA forecast
include=120, lwd=4, lty=3, xlab="Time", ylab="GDP", ylim=c(50, 150),
main = "ARIMAX Analytics (Train: 2000-2017; Test: 2018-2020) GDP Forecasting\n
based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
lines(pred_arimaX_5_0_2_Y_Belgium_train300_Belgium_test60, col = "red", lwd = 4, lty=3)
legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2017)",
"ARIMAX(5,0,2)-model GDP Forecasting (2018-2020)",
"ARIMAX(5,0,2)-model GDP Forecasting (2018-2020)"),
col=c("black", "blue", "red"),
lty=c(3,3,3), lwd=c(4,4,4), cex=1.2, x.intersp=1.5, y.intersp=0.6)
text(2015, 60, expression(atop(paste("Training Region (2000-2017)"),
paste(Model(Unempl) %->% "ARIMAX(p, q, r) ; ",
XReg %==% X[i], " ", i %in% {1 : 167}))), cex=1.5)
text(2019.5, 60, expression(atop(paste("Validation Region (2018-2020)"),
paste(hat(Unempl) %<-% "ARIMAX(5,0 ,2); ",
XReg %==% X[i], " ", i %in% {1 : 167}))), cex=1.5)
GDP unit of measure represents the Current prices, euro per capita. The volume index of GDP per capita in Purchasing Power Standards (PPS) is intended for cross-country comparisons rather than for temporal comparisons. GDP per capita when expressed in PPS eliminates the differences in price levels between countries allowing meaningful volume comparisons of GDP between countries. Expressed in relation to the European Union (EU27 GDP = 100), a country with an index that is higher than \(100\) or lower than \(100\) correspond to that this country’s level of GDP per head being higher or lower than the EU average, respectively.
Define a new function that (1) cleans the data, and (2) fits the ARIMA model estimating the seasonal and non-seasonal time-series parameters \((p,d,q)\) and the effect-sizes (\(\beta\)’s) for the exogenous regression features (\(X\)).
#write a function of clean the data and fit the ARIMA model
Fit_ARIMA <- function(countryData=Belgium, start=2000, end=2017, frequency=20,
feature="Unemployment , Females, From 15-64 years, Total")
{
#delete features that are missing at all time points
countryData = countryData[, colSums(is.na(countryData)) != nrow(countryData)]
countryData = select(countryData, -time, -country)
DataMatrix = as.matrix(countryData)
DataMatrix = cleardata(DataMatrix)
DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove feature that only has one value
DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove feature that all the values are 0
DataMatrix = splinecreate(DataMatrix)
DataSuperSample = as.data.frame(DataMatrix)
if (feature=="Unemployment , Females, From 15-64 years, Total") {
Y = select(DataSuperSample, "Unemployment , Females, From 15-64 years, Total")
X = select(DataSuperSample, -starts_with("Unemployment"))
} else if (feature=="Gross domestic product at market prices") {
Y = select(DataSuperSample, "Gross domestic product at market prices"); dim(Y)
X = select(DataSuperSample, -matches("debt|Debt")); dim(X) # 360 167
print(paste0("dim(X)=(", dim(X)[1], ",", dim(X)[2], "); ",
" dim(Y)=(", dim(Y)[1], ",", dim(Y)[2], ") ..."))
X <- X[, qr(X)$pivot[seq_len(qr(X)$rank)]]; dim(X) # ensure full-rank design matrix, X
}
else {
print(paste0("This feature ", feature, " is not imlemented yet! Exiting Fit_ARIMA() method ..."))
return(NULL)
}
ts_Y <- ts(Y, start=c(start, 1), end=c(end, frequency), frequency = frequency); length(ts_Y)
set.seed(1234)
fitArimaX = auto.arima(ts_Y, xreg=data.matrix(X))
return(fitArimaX)
}
Bulgaria = filter(time_series,country == "Bulgaria")
BulgariaARIMA = Fit_ARIMA(countryData=Bulgaria, start=2000, end=2017, frequency=20,
feature="Gross domestic product at market prices")
BulgariaARIMA$arma
# Extend the Fit-ARIMA method to ensure testing-training modeling/assessment for 2 countries works
preprocess_ARIMA <- function(countryData=Belgium, start=2000, end=2017, frequency=20,
feature="Unemployment , Females, From 15-64 years, Total")
{
#delete features that are missing at all time points
countryData = countryData[, colSums(is.na(countryData)) != nrow(countryData)]
countryData = select(countryData, -time, -country)
DataMatrix = as.matrix(countryData)
DataMatrix = cleardata(DataMatrix)
DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove features with only 1 value
DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove features with all values=0
DataMatrix = splinecreate(DataMatrix)
DataSuperSample = as.data.frame(DataMatrix) # super-Sample the data
print(paste0("Processing feature: ...", feature, "... "))
if (feature=="Unemployment , Females, From 15-64 years, Total") {
Y = select(DataSuperSample, "Unemployment , Females, From 15-64 years, Total")
X = select(DataSuperSample, -starts_with("Unemployment"))
} else if (feature=="Gross domestic product at market prices") {
Y = select(DataSuperSample, "Gross domestic product at market prices"); dim(Y)
X = select(DataSuperSample, -matches("debt|Debt"));
X <- X [, -c(50:80)]; dim(X) # 360 167
} else {
print(paste0("This feature: ...", feature, "... is not imlemented yet! Exiting preprocess_ARIMA() method ..."))
return(NULL)
}
# reduce the number of observations (matrix rows) to specified timerange
len_1 <- (end + 1 - start) * frequency; print(paste0("dim(X)[1]=", len_1))
X <- X[1:len_1 , qr(X[1:len_1 , ])$pivot[seq_len(qr(X[1:len_1 , ])$rank)]]; dim(X)
# ensure full-rank design matrix, X
Y <- as.data.frame(Y[1:len_1 , ])
print(paste0("dim(X)=(", dim(X)[1], ",", dim(X)[2], "); ", # 300 136
" dim(Y)=(", dim(Y)[1], ",", dim(Y)[2], ") ...")) # 300 1
return(list("X"=X, "Y"=Y))
}
# Outcome Variable to be modelled, as a timeseries: 2000 - 2017 (18 years, Quarterly measures)
# Spline interpolation *5; 2000-01 - 2014-20 (300 observations for training): 60 observations (2015-2017) for Testing
Belgium <- filter(time_series, country == "Belgium")
Bulgaria <- filter(time_series, country == "Bulgaria")
Netherlands <- filter(time_series, country == "Netherlands")
# Test preprocess_ARIMA()
#preprocess_Belgium <- preprocess_ARIMA(countryData=Belgium, start=2000, end=2014,
# frequency=20, feature="Gross domestic product at market prices")
#preprocess_Bulgaria <- preprocess_ARIMA(countryData=Bulgaria, start=2000, end=2014,
# frequency=20, feature="Gross domestic product at market prices")
# General function that ensures the XReg predictors for 2 countries are homologous
homologousX_features <- function (X_Country1, X_Country2){
# Check if the Belgium and Bilgaria Xreg are homologous (same feature columns)
common_cols <- intersect(colnames(X_Country1), colnames(X_Country2))
X_Country1 <- subset(X_Country1, select = common_cols)
X_Country2 <- subset(X_Country2, select = common_cols)
print(paste0("dim(X1)=(", dim(X_Country1)[1], ",", dim(X_Country1)[2], "); ", # 300 131
" dim(X2)=(", dim(X_Country2)[1], ",", dim(X_Country2)[2], ")!")) # 300 131
return(list("X_Country1"=X_Country1, "X_Country2"=X_Country2))
}
# Test homologousX_features
# homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
# X_Belgium <- homoFeat$X_Country1
# X_Bulgaria <- homoFeat$X_Country2
fit_ARIMA <- function(country1Data=Belgium, country2Data=Bulgaria,
start=2000, end=2014, frequency=20,
feature="Gross domestic product at market prices") {
preprocess_Country1 <- preprocess_ARIMA(countryData=country1Data,
start=start, end=end, frequency=frequency, feature=feature)
preprocess_Country2 <- preprocess_ARIMA(countryData=country2Data,
start=start, end=end, frequency=frequency, feature=feature)
ts_Y_Country1 <- ts(preprocess_Country1$Y, start=c(start, 1),
end=c(end, frequency), frequency = frequency); length(ts_Y_Country1)
homoFeat <- homologousX_features(preprocess_Country1$X, preprocess_Country2$X)
X_Country1 <- homoFeat$X_Country1
X_Country2 <- homoFeat$X_Country2
set.seed(1234)
fitArimaX_Country1 = auto.arima(ts_Y_Country1, xreg=data.matrix(X_Country1))
return(fitArimaX_Country1)
}
# Belgium = filter(time_series,country == "Belgium")
BelgiumARIMA = fit_ARIMA(country1Data=Belgium,
country2Data=Bulgaria, # country2Data=Netherlands,
start=2000, end=2014, frequency=20,
feature="Gross domestic product at market prices")
BelgiumARIMA$arma # [1] 4 0 2 0 20 0 0
# sigma^2 estimated as 45.99: log likelihood=-919.13 AIC=2116.26 AICc=2359.51 BIC=2631.09
# Outcome Variable to be modelled, as a timeseries: 2000 - 2017 (18 years, Quarterly measures)
# Spline interpolation x5; 2000-01 - 2014-20 (300 observations for training)
# ts_Y_Belgium_train <- ts(Y_Belgium_train, start=c(2000,1), end=c(2014, 20), frequency=20)
# length(ts_Y_Belgium_train)
# ts_Y_Belgium_test <- ts(Y_Belgium_test, start=c(2015,1), end=c(2017, 20), frequency = 20)
#length(ts_Y_Belgium_test)
# Find ARIMAX model
# arimaX_Belgium_train <- auto.arima(ts_Y_Belgium_train, xreg=X_Belgium_train); # arimaX_Belgium_train$arma
Predict Y={Gross domestic product at market prices}
, using all other features not directly related to *GDP. Recall the core data organization:
Xreg
Predictors and 1 Outcome Variable to be modeled as a timeseries: 2000 - 2014 (15 years, Quarterly measures, 5-fold spline interpolation, \(15*4*5=300\));training
). The remaining 60 timepoints used for testing
(2015-01 to 2017-20).Report ARIMA(p,d,q)
params and quality metrics AIC/BIC.
# Previously, we already extracted the Belgium and Bulgaria Data, preprocess it,
# and fit ARIMAX (Belgium-training) model. Now, we will assess the model on Bulgaria_testing sets
# BelgiumARIMA = fit_ARIMA(country1Data=Belgium, country2Data=Bulgaria,
# start=2000, end=2014, frequency=20,
# feature="Gross domestic product at market prices")
# BelgiumARIMA$arma # [1] 4 0 2 0 20 0 0
# View rank-ordered ARIMAX effects:
# View(BelgiumARIMA$coef[order(BelgiumARIMA$coef)])
sort(BelgiumARIMA$coef)[1:10]
# Unemployment , Females, From 15-64 years, From 18 to 23 months
# -1.5203281
# ar2
# -1.1808472
# Labor cost other than wages and salaries
# -0.8380554
# Unemployment , Females, From 15-64 years, From 12 to 17 months
# -0.7037336
# ar4
# -0.6360880
# sar2
# -0.6260002
# Unemployment , Females, From 15-64 years, From 3 to 5 months
# -0.5262376
# Labor cost for LCI (compensation of employees plus taxes minus subsidies)
# -0.2885038
# Unemployment , Females, From 15-64 years, 48 months or over
# -0.2773650
# Unemployment , Males, From 15-64 years, from 3 to 5 months
# -0.2567934
#Get the Prospective Xreg=X design matrices ready (2015-2017, 60 rows)
preprocess_Belgium <- preprocess_ARIMA(countryData=Belgium,
start=2000, end=2017, frequency=20,
feature="Gross domestic product at market prices")
preprocess_Bulgaria <- preprocess_ARIMA(countryData=Bulgaria, #Netherlands
start=2000, end=2017, frequency=20,
feature="Gross domestic product at market prices")
homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
X_Belgium_test <- homoFeat$X_Country1[301:360, ]; dim(X_Belgium_test)
X_Bulgaria_test <- homoFeat$X_Country2[301:360, ]; dim(X_Bulgaria_test)
# Get Predictions
pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60 <-
forecast(BelgiumARIMA, xreg = X_Bulgaria_test)$mean
pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60 <-
forecast(BelgiumARIMA, xreg = X_Belgium_test)$mean
pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60 <-
forecast(BelgiumARIMA, xreg = X_Bulgaria_test)$mean +
mean(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60) -
mean(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60)
cor(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, ts_Y_Belgium_test) # 0.11
mean(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60) # [1] 118
# Alternative predictions:
# X_Country1 <- X_Belgium_test; X_Country2 <- X_Bulgaria_test
# pred_arimaX_1_0_2_Y_Belgium_train300_Bulgaria_test60 <- predict(BelgiumARIMA, n.ahead = 60, newxreg = X_Bulgaria_test)$pred
# pred_arimaX_1_0_2_Y_Belgium_train300_Belgium_test60 <- predict(BelgiumARIMA, n.ahead = 60, newxreg = X_Belgium_test)$pred
Plot only the last 5-years of training (2010-2014), \(100TimePoints=5Years\times 4Quarters\times 5SuperSample\) and the 3-year prospective forecasting \(60TimePoints=3Years\times 4Quarters\times 5SuperSample\).
ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ],
start=c(2015,1), end=c(2017, 20), frequency = 20)
length(ts_Y_Belgium_test)
## [1] 60
# windows(width=14, height=10)
plot(forecast(BelgiumARIMA, xreg = data.matrix(X_Belgium_test)), # ARIMA forecast
include=100, lwd=4, lty=3, xlab="Time", ylab="GPD Purchasing Power Standards (PPS)",
ylim=c(25, 150),cex.main=0.8,
main = "ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n
based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, col = "green", lwd = 4, lty=2) # Belgium train+test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60, col = "purple", lwd = 4, lty=1) # Belgium train+Bulgaria test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60, col = "orange", lwd = 4, lty=1)
# Belgium train+ Offset Bulgaria test: 188.3753 - 416.5375
lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1) # Observed Y_Test timeseries
legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)",
"ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)",
"ARIMAX(4,0,2) Belgium train + XReg=Belgium test (2015-2017)",
"ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
"Offset ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
"Belgium Official Reported GDP (2015-2017)"),
col=c("black", "blue", "green", "purple", "orange", "red"),
lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7)
text(2012.5, 30, expression(atop(paste("Training Region (2000-2014)"),
paste(Model(GDP) %->% "ARIMAX(p, q, r) ; ",
XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
text(2016.5, 30, expression(atop(paste("Validation Region (2015-2017)"),
paste(hat(GDP) %<-% "ARIMAX(4, 0, 2); ",
XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
Plot entire 15-year training time-span (2000-2014), \(300TimePoints=15Years\times 4Quarters\times 5SuperSample\) and the 3-year prospective forecasting \(60TimePoints=3Years\times 4Quarters\times 5SuperSample\).
ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ],
start=c(2015,1), end=c(2017, 20), frequency = 20)
length(ts_Y_Belgium_test)
## [1] 60
# windows(width=14, height=10)
plot(forecast(BelgiumARIMA, xreg = data.matrix(X_Belgium_test)), # ARIMA forecast
lwd=4, lty=3, xlab="Time", ylab="GPD Purchasing Power Standards (PPS)",
ylim=c(25, 150),cex.main=0.8,
main = "ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n
based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, col = "green", lwd = 4, lty=2) # Belgium train+test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60, col = "purple", lwd = 4, lty=1) # Belgium train+Bulgaria test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60, col = "orange", lwd = 4, lty=1)
# Belgium train+ Offset Bulgaria test: 188.3753 - 416.5375
lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1) # Observed Y_Test timeseries
legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)",
"ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)",
"ARIMAX(4,0,2) Belgium train + XReg=Belgium test (2015-2017)",
"ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
"Offset ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
"Belgium Official Reported GDP (2015-2017)"),
col=c("black", "blue", "green", "purple", "orange", "red"),
lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7)
text(2005, 30, expression(atop(paste("Training Region (2000-2014)"),
paste(Model(GDP) %->% "ARIMAX(p, q, r) ; ",
XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
text(2015, 30, expression(atop(paste("Validation Region (2015-2017)"),
paste(hat(GDP) %<-% "ARIMAX(4, 0, 2); ",
XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
Let’s start by defining the generic k-space transformation.
# FT/Spacekime Analytics
# 1D timeseries FFT SHIFT
fftshift1D <- function(img_ff) {
rows <- length(img_ff)
rows_half <- ceiling(rows/2)
return(append(img_ff[(rows_half+1):rows], img_ff[1:rows_half]))
}
# Generic function to Transform Data ={all predictors (X) and outcome (Y)} to k-space (Fourier domain)
kSpaceTransform <- function(data, inverse = FALSE, reconPhases = NULL) {
# ForwardFT (rawData, FALSE, NULL)
# InverseFT(magnitudes, TRUE, reconPhasesToUse) or InverseFT(FT_data, TRUE, NULL)
FT_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
mag_FT_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
phase_FT_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
IFT_reconPhases_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
for (i in 1:dim(data)[2]) {
if (inverse == FALSE | is.null(reconPhases)) {
FT_data[ , i] <- fft(data[ , i], inverse)
X2 <- FT_data[ , i]
# plot(fftshift1D(log(Re(X2)+2)), main = "log(fftshift1D(Re(FFT(timeseries))))")
mag_FT_data[ , i] <- sqrt(Re(X2)^2+Im(X2)^2);
# plot(log(fftshift1D(Re(mag_FT_MCSI_data))), main = "log(Magnitude(FFT(timeseries)))")
phase_FT_data[ , i] <- atan2(Im(X2), Re(X2));
# plot(Re(fftshift1D(phase_FT_MCSI_data[ , 1])), main = "Shift(Phase(FFT(timeseries)))")
}
else { # for IFT synthesis using user-provided Phases, typically from kime-phase aggregators
Real <- data[ , i] * cos(reconPhases[ , i])
Imaginary <- data[ , i] * sin(reconPhases[ , i])
IFT_reconPhases_data[ ,i] <-
Re(fft(Real+1i*Imaginary, inverse = TRUE)/length(data[ , i]))
}
}
######### Test the FT-IFT analysis-synthesis back-and-forth transform process
# to confirm calculations
# X2 <- FT_data[ , 1]; mag_FT_data[ , 1] <- sqrt(Re(X2)^2+Im(X2)^2);
# phase_FT_data[ , 1] <- atan2(Im(X2), Re(X2));
# Real2 = mag_FT_data[ , 1] * cos(phase_FT_data[ , 1])
# Imaginary2 = mag_FT_data[ , 1] * sin(phase_FT_data[ , 1])
# man_hat_X2 = Re(fft(Real2 + 1i*Imaginary2, inverse = T)/length(X2))
# ifelse(abs(man_hat_X2[5] - data[5, 1]) < 0.001, "Perfect Syntesis", "Problems!!!")
#########
if (inverse == FALSE | is.null(reconPhases)) {
return(list("magnitudes"=mag_FT_data, "phases"=phase_FT_data))
# Use kSpaceTransform$magnitudes & kSpaceTransform$phases to retrieve teh Mags and Phases
}
else {
return(IFT_reconPhases_data)
# Use Re(kSpaceTransform) to extract spacetime Real-valued reconstructed data
}
}
Examine the Kime-direction Distributions of the Phases for all Belgium features (predictors + outcome). Define a generic function that plots the Phase distributions.
library(tidyr)
library(ggplot2)
plotPhaseDistributions <- function (dataFT, dataColnames, size=8, ...) {
df.phase <- as.data.frame(Re(dataFT$phases))
df.phase %>% gather() %>% head()
colnames(df.phase) <- dataColnames
phaseDistributions <- gather(df.phase)
colnames(phaseDistributions) <- c("Feature", "Phase")
if (is.null(size)) size=10
# map the value as our x variable, and use facet_wrap to separate by the key column:
ggplot(phaseDistributions, aes(Phase)) +
# geom_histogram(bins = 10) +
geom_histogram(aes(y=..density..), bins = 10) +
facet_wrap( ~Feature, scales = 'free_x') +
xlim(-pi, pi) +
theme(strip.text.x = element_text(size = size, colour = "black", angle = 0))
}
# homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
X_Belgium <- homoFeat$X_Country1; dim(X_Belgium)
## [1] 360 131
## [1] 360 1
FT_Belgium <- kSpaceTransform(cbind(X_Belgium, Y_Belgium), FALSE, NULL)
dataColnames <- c(colnames(X_Belgium), "Y_GDP_Belgium")
plotPhaseDistributions(FT_Belgium, dataColnames)
Phase Distribution of Belgium
Perform Nil-Phase reconstruction - IFT_NilPhase_FT_Belgium
- and then re-fir the ARIMAX model
# 1. Nil-Phase data synthesys (reconstruction)
temp_Data <- cbind(X_Belgium, Y_Belgium)
nilPhase_FT_data <- array(complex(real=0, imaginary=0), c(dim(temp_Data)[1], dim(temp_Data)[2]))
dim(nilPhase_FT_data) # ; head(nilPhase_FT_data)
# [1] 360 132
IFT_NilPhase_FT_Belgium <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))
# Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal with nil-phase
IFT_NilPhase_FT_Belgium <- Re(kSpaceTransform(FT_Belgium$magnitudes, TRUE, nilPhase_FT_data))
colnames(IFT_NilPhase_FT_Belgium) <- c(colnames(X_Belgium), "Y_GDP_Belgium")
dim(IFT_NilPhase_FT_Belgium); dim(FT_Belgium$magnitudes)
colnames(IFT_NilPhase_FT_Belgium); head(IFT_NilPhase_FT_Belgium); # head(temp_Data)
# 2. Perform ARIMAX modeling on IFT_NilPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC
# library(forecast)
IFT_NilPhase_FT_Belgium_Y_train <- IFT_NilPhase_FT_Belgium[1:300, 132]; length(IFT_NilPhase_FT_Belgium_Y_train)
IFT_NilPhase_FT_Belgium_Y_test <- IFT_NilPhase_FT_Belgium[301:360]; length(IFT_NilPhase_FT_Belgium_Y_test)
# Training and Testing Data Covariates explaining the longitudinal outcome (Y)
IFT_NilPhase_FT_Belgium_X_train <- as.data.frame(IFT_NilPhase_FT_Belgium)[1:300, 1:131]; dim(IFT_NilPhase_FT_Belgium_X_train)
IFT_NilPhase_FT_Belgium_X_test <- as.data.frame(IFT_NilPhase_FT_Belgium)[301:360, 1:131]; dim(IFT_NilPhase_FT_Belgium_X_test)
# Outcome Variable to be ARIMAX-modelled, as a timeseries
ts_IFT_NilPhase_FT_Belgium_Y_train <-
ts(IFT_NilPhase_FT_Belgium_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20)
# Find ARIMAX model: 2 1 2 0 20 0 0
set.seed(1234)
modArima_IFT_NilPhase_FT_Belgium_Y_train <-
auto.arima(ts_IFT_NilPhase_FT_Belgium_Y_train, xreg=IFT_NilPhase_FT_Belgium_X_train)
modArima_IFT_NilPhase_FT_Belgium_Y_train$arma
# Regression with ARIMA(2,0,1)(2,0,0)[20] errors
# Coefficients:
# ar1 ar2 ma1 sar1 sar2 Acquisitions less disposals of non-financial non-produced assets
# -1.4232 -0.8592 -0.987 0.0941 -0.3451 0.0123
#s.e. 0.0341 0.0311 NaN 0.0688 0.0742 0.0007
# sigma^2 estimated as 0.8622: log likelihood=-320.39 AIC=914.79 AICc=1148.19 BIC=1422.2
pred_arimax_2_0_1_Nil <- forecast(modArima_IFT_NilPhase_FT_Belgium_Y_train, xreg = IFT_NilPhase_FT_Belgium_X_test)
pred_arimax_2_0_1_Nil_2015_2017 <-
ts(pred_arimax_2_0_1_Nil$mean, frequency=20, start=c(2015,1), end=c(2017,20))
## Time Series:
## Start = c(2015, 1)
## End = c(2017, 20)
## Frequency = 20
## 301 302 303 304 305 306 307 308
## 85.02364 101.33786 95.24676 101.49982 93.98976 94.38776 97.51510 96.05219
## 309 310 311 312 313 314 315 316
## 85.63166 113.27167 96.39145 97.82059 98.63777 93.78477 91.20510 102.88110
## 317 318 319 320 321 322 323 324
## 95.81967 103.30420 89.45534 103.39560 108.05981 96.02793 105.99804 88.89360
## 325 326 327 328 329 330 331 332
## 105.99806 101.23141 103.42049 102.00218 99.90036 100.80956 96.56583 103.71199
## 333 334 335 336 337 338 339 340
## 107.28897 94.43721 111.65774 99.79332 108.91354 109.66776 103.23192 105.11549
## 341 342 343 344 345 346 347 348
## 108.12944 99.35360 110.90110 112.87704 109.90520 108.49580 107.15445 107.12296
## 349 350 351 352 353 354 355 356
## 128.99135 112.62487 111.40590 117.22144 102.33043 115.79956 115.48040 114.26040
## 357 358 359 360
## 114.31062 117.33343 105.32412 118.13403
# alternatively:
# pred_arimax_1_0_1_2015_2017 <- predict(modArima_IFT_NilPhase_FT_Belgium_Y_train,
# n.ahead = 3*20, newxreg = IFT_NilPhase_FT_Belgium_X_test)$pred
sort(modArima_IFT_NilPhase_FT_Belgium_Y_train$coef)[1:10]
## Labor cost for LCI (compensation of employees plus taxes minus subsidies)
## -1.5972295
## ar1
## -1.4231617
## Labor cost other than wages and salaries
## -1.2213214
## ma1
## -0.9869571
## ar2
## -0.8591937
## Unemployment , Females, From 15-64 years, From 12 to 17 months
## -0.7075454
## Unemployment , Total, From 15-64 years, From 18 to 23 months
## -0.5797656
## Unemployment , Males, From 15-64 years, from 3 to 5 months
## -0.5026139
## sar2
## -0.3450866
## Unemployment , Males, From 15-64 years, from 24 to 47 months
## -0.2965540
# Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-1.5972295
# ar1, effect=-1.4231617
# Labor cost other than wages and salaries, effect=-1.2213214
# ma1, effect=-0.9869571
# ar2, effect=-0.8591937
# Unemployment , Females, From 15-64 years, From 12 to 17 months, effect=-0.7075454
# Unemployment , Total, From 15-64 years, From 18 to 23 months, effect=-0.5797656
# Unemployment , Males, From 15-64 years, from 3 to 5 months, effect=-0.5026139
# sar2, effect=-0.3450866
# Unemployment , Males, From 15-64 years, from 24 to 47 months, effect=-0.2965540
cor(pred_arimax_2_0_1_Nil$mean, ts_Y_Belgium_test) # 0.14
## [1] 0.1420164
## [1] 103.7756
Space-time reconstructions by inverting back the mag_FT
for each feature signal after swapping the feature-phases. In other words, we randomly shuffle the columns
of the Phases-matrix (Training & Testing XReg Data) and use these swapped phases to synthesize the design covariate matrix (\(Xreg\)).
# 1. Swap Feature Phases and then synthesize the data (reconstruction)
# temp_Data <- cbind(X_Belgium, Y_Belgium)
swapped_phase_FT_Belgium_data <- FT_Belgium$phases
colnames(swapped_phase_FT_Belgium_data) <- c(colnames(X_Belgium), "Y_GDP_Belgium")
swapped_phase_FT_Belgium_data1 <- swapped_phase_FT_Belgium_data
dim(swapped_phase_FT_Belgium_data) # ; head(swappedPhase_FT_data)
# [1] 360 132
IFT_SwappedPhase_FT_Belgium <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))
set.seed(12345) # sample randomly Phase-columns for each of the 131 covariates (X)
#swap_phase_FT_Belgium_indices <- sample(ncol(swapped_phase_FT_Belgium_data)-1)
# for (j in 1:131) { # for all coluns of the design Xreg matrix, excluding Y, randomly swap columns phases
# swapped_phase_FT_Belgium_data1[ , j] <- swapped_phase_FT_Belgium_data[, swap_phase_FT_Belgium_indices[j]]
#}
swapped_phase_FT_Belgium_data1 <- as.data.frame(cbind(
swapped_phase_FT_Belgium_data[ , sample(ncol(swapped_phase_FT_Belgium_data[ , 1:131]))],
swapped_phase_FT_Belgium_data[ , 132]))
swapped_phase_FT_Belgium_data <- swapped_phase_FT_Belgium_data1
colnames(swapped_phase_FT_Belgium_data)[132] <- "Y_GDP_Belgium"
colnames(swapped_phase_FT_Belgium_data)
dim(swapped_phase_FT_Belgium_data); dim(FT_Belgium$phases)
# Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal using the feature swapped phases
IFT_SwappedPhase_FT_Belgium <- Re(kSpaceTransform(FT_Belgium$magnitudes, TRUE, swapped_phase_FT_Belgium_data))
colnames(IFT_SwappedPhase_FT_Belgium) <- c(colnames(X_Belgium), "Y_GDP_Belgium")
dim(IFT_SwappedPhase_FT_Belgium); dim(FT_Belgium$magnitudes)
colnames(IFT_SwappedPhase_FT_Belgium); tail(IFT_SwappedPhase_FT_Belgium); # tail(temp_Data)
# 2. Perform ARIMAX modeling on IFT_SwappedPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC
# library(forecast)
IFT_SwappedPhase_FT_Belgium_Y_train <- IFT_SwappedPhase_FT_Belgium[1:300, 132]; length(IFT_SwappedPhase_FT_Belgium_Y_train)
IFT_SwappedPhase_FT_Belgium_Y_test <- IFT_SwappedPhase_FT_Belgium[301:360]; length(IFT_SwappedPhase_FT_Belgium_Y_test)
# Training and Testing Data Covariates explaining the longitudinal outcome (Y)
IFT_SwappedPhase_FT_Belgium_X_train <- as.data.frame(IFT_SwappedPhase_FT_Belgium)[1:300, 1:131]
dim(IFT_SwappedPhase_FT_Belgium_X_train)
IFT_SwappedPhase_FT_Belgium_X_test <- as.data.frame(IFT_SwappedPhase_FT_Belgium)[301:360, 1:131]
dim(IFT_SwappedPhase_FT_Belgium_X_test)
# Outcome Variable to be ARIMAX-modelled, as a timeseries
ts_IFT_SwappedPhase_FT_Belgium_Y_train <-
ts(IFT_SwappedPhase_FT_Belgium_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20)
# Find ARIMAX model: 1 0 2 0 20 0 0
set.seed(1234)
modArima_IFT_SwappedPhase_FT_Belgium_Y_train <-
auto.arima(ts_IFT_SwappedPhase_FT_Belgium_Y_train, xreg=IFT_SwappedPhase_FT_Belgium_X_train)
modArima_IFT_SwappedPhase_FT_Belgium_Y_train$arma
# Regression with ARIMA(1,0,0)(2,0,0)[20] errors
# Coefficients:
# ar1 sar1 sar2 intercept Acquisitions less disposals of non-financial non-produced assets
# -0.3837 -0.2196 0.2827 46.1704 0.0063
#s.e. 0.1113 0.0903 0.0779 36.8492 0.0080
# sigma^2 estimated as 70: log likelihood=-976.01 AIC=2224.02 AICc=2452.63 BIC=2727.73
pred_arimax_1_0_0_Swapped <- forecast(modArima_IFT_SwappedPhase_FT_Belgium_Y_train, xreg = IFT_SwappedPhase_FT_Belgium_X_test)
pred_arimax_1_0_0_Swapped_2015_2017 <-
ts(pred_arimax_1_0_0_Swapped$mean, frequency=20, start=c(2015,1), end=c(2017,20))
## Time Series:
## Start = c(2015, 1)
## End = c(2017, 20)
## Frequency = 20
## 301 302 303 304 305 306 307 308
## 99.10741 110.77632 91.58138 98.50625 122.93774 105.60752 104.87176 113.06541
## 309 310 311 312 313 314 315 316
## 113.61298 106.57517 104.94313 92.11965 102.44793 117.49381 111.47798 128.57506
## 317 318 319 320 321 322 323 324
## 106.60489 98.64213 105.63698 114.32857 121.77287 95.20252 108.93503 104.96129
## 325 326 327 328 329 330 331 332
## 106.67301 106.67622 107.40082 105.75911 102.10965 113.08226 103.28406 97.29026
## 333 334 335 336 337 338 339 340
## 114.32844 89.02971 104.32914 94.74108 112.06142 112.33537 101.01485 113.38591
## 341 342 343 344 345 346 347 348
## 111.82291 110.66114 101.03265 107.66914 113.26743 105.83638 103.92252 103.19822
## 349 350 351 352 353 354 355 356
## 104.23278 90.16403 102.19771 104.50565 101.01555 103.87907 99.85848 99.51574
## 357 358 359 360
## 93.93449 100.37559 99.90370 92.28034
# alternatively:
# pred_arimax_1_0_0_Swapped_2015_2017 <- predict(modArima_IFT_SwappedPhase_FT_Belgium_Y_train,
# n.ahead = 3*20, newxreg = IFT_SwappedPhase_FT_Belgium_X_test)$pred
sort(modArima_IFT_SwappedPhase_FT_Belgium_Y_train$coef)[1:10]
## ar1
## -0.38372043
## Unemployment , Females, From 15-64 years, From 18 to 23 months
## -0.31137514
## Labor cost other than wages and salaries
## -0.31094561
## sar1
## -0.21964957
## Unemployment , Females, From 15-64 years, From 6 to 11 months
## -0.20878853
## Labor cost for LCI (compensation of employees plus taxes minus subsidies)
## -0.12497311
## Unemployment , Males, From 15-64 years, Less than 1 month
## -0.10849013
## Unemployment , Total, From 15-64 years, 48 months or over
## -0.09066684
## Unemployment , Total, From 15-64 years, From 1 to 2 months
## -0.05852382
## Unemployment , Females, From 15-64 years, 48 months or over
## -0.05695172
# ar1, effect=-0.38372043
# Unemployment , Females, From 15-64 years, From 18 to 23 months, effect=-0.31137514
# Labor cost other than wages and salaries, effect=-0.31094561
# sar1, effect=-0.21964957
# Unemployment , Females, From 15-64 years, From 6 to 11 months, effect=-0.20878853
#Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-0.12497311
# Unemployment , Males, From 15-64 years, Less than 1 month, effect=-0.10849013
# Unemployment , Total, From 15-64 years, 48 months or over, effect=-0.09066684
# Unemployment , Total, From 15-64 years, From 1 to 2 months, effect=-0.05852382
# Unemployment , Females, From 15-64 years, 48 months or over, effect=-0.05695172
cor(pred_arimax_1_0_0_Swapped$mean, ts_Y_Belgium_test) # 0.0
## [1] -0.007886214
## [1] 105.2093
Perform Random-Phase reconstruction - IFT_RandPhase_FT_Belgium
- by randomly sampling from the phase distributions for each feature and then re-fitting the ARIMAX model
# 1. Random-Phase data synthesys (reconstruction)
# temp_Data <- cbind(X_Belgium, Y_Belgium)
randPhase_FT_data <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))
dim(randPhase_FT_data) # ; head(randPhase_FT_data)
# [1] 360 132
IFT_RandPhase_FT_Belgium <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))
randPhase_FT_data <- FT_Belgium$phases
for (i in 1:(dim(randPhase_FT_data)[2] -1)) {
if (i < dim(randPhase_FT_data)[2]) {
set.seed(12345) # sample randomly Phases for each of the 131 predictors covariates (X)
randPhase_FT_data[ , i] <- FT_Belgium$phases[sample(nrow(FT_Belgium$phases)), i]
} else { } # for the Y outcome (Last Column) - do not change the phases of the Y
}
# Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal with avg-phase
IFT_RandPhase_FT_Belgium <- Re(kSpaceTransform(FT_Belgium$magnitudes, TRUE, randPhase_FT_data))
colnames(IFT_RandPhase_FT_Belgium) <- c(colnames(X_Belgium), "Y_GDP_Belgium")
dim(IFT_RandPhase_FT_Belgium); dim(FT_Belgium$magnitudes)
colnames(IFT_RandPhase_FT_Belgium); tail(IFT_RandPhase_FT_Belgium); # tail(temp_Data)
dim(IFT_RandPhase_FT_Belgium); head(Re(IFT_RandPhase_FT_Belgium)); tail(Re(IFT_RandPhase_FT_Belgium))
# 2. Perform ARIMAX modeling on IFT_RandPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC
# library(forecast)
IFT_RandPhase_FT_Belgium_Y_train <- IFT_RandPhase_FT_Belgium[1:300, 132]; length(IFT_RandPhase_FT_Belgium_Y_train)
IFT_RandPhase_FT_Belgium_Y_test <- IFT_RandPhase_FT_Belgium[301:360]; length(IFT_RandPhase_FT_Belgium_Y_test)
# Training and Testing Data Covariates explaining the longitudinal outcome (Y)
IFT_RandPhase_FT_Belgium_X_train <- as.data.frame(IFT_RandPhase_FT_Belgium)[1:300, 1:131]; dim(IFT_RandPhase_FT_Belgium_X_train)
IFT_RandPhase_FT_Belgium_X_test <- as.data.frame(IFT_RandPhase_FT_Belgium)[301:360, 1:131]; dim(IFT_RandPhase_FT_Belgium_X_test)
# Outcome Variable to be ARIMAX-modelled, as a timeseries
ts_IFT_RandPhase_FT_Belgium_Y_train <-
ts(IFT_RandPhase_FT_Belgium_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20)
# Find ARIMAX model: 0 0 2 0 20 0 0
set.seed(1234)
modArima_IFT_RandPhase_FT_Belgium_Y_train <-
auto.arima(ts_IFT_RandPhase_FT_Belgium_Y_train, xreg=IFT_RandPhase_FT_Belgium_X_train)
modArima_IFT_RandPhase_FT_Belgium_Y_train$arma
# Regression with ARIMA(0,0,0)(2,0,0)[20] errors
# Coefficients:
# sar1 sar2 Acquisitions less disposals of non-financial non-produced assets
# -0.0743 0.5766 0.0162
#s.e. 0.0625 0.0752 0.0100
#sigma^2 estimated as 72.17: log likelihood=-988.06 AIC=2244.12 AICc=2463.4 BIC=2740.43
pred_arimax_0_0_0_Rand <- forecast(modArima_IFT_RandPhase_FT_Belgium_Y_train, xreg = IFT_RandPhase_FT_Belgium_X_test)
pred_arimax_0_0_0_Rand_2015_2017 <-
ts(pred_arimax_0_0_0_Rand$mean, frequency=20, start=c(2015,1), end=c(2017,20))
## Time Series:
## Start = c(2015, 1)
## End = c(2017, 20)
## Frequency = 20
## 301 302 303 304 305 306 307 308
## 92.80231 104.99710 87.45035 75.08463 103.09887 93.28688 90.83812 88.43779
## 309 310 311 312 313 314 315 316
## 103.67438 86.28912 107.24861 87.24921 114.39761 94.46072 89.95292 108.18454
## 317 318 319 320 321 322 323 324
## 96.28926 94.71985 99.80931 81.39295 76.55054 79.73523 96.72882 94.78202
## 325 326 327 328 329 330 331 332
## 77.65464 82.44926 92.59295 70.27627 85.31992 92.93975 80.25941 82.43967
## 333 334 335 336 337 338 339 340
## 82.12752 84.24610 100.37597 76.09647 79.43232 85.82942 82.38588 88.82293
## 341 342 343 344 345 346 347 348
## 80.76724 80.57109 90.10177 75.38288 94.01425 88.78526 92.36941 88.43295
## 349 350 351 352 353 354 355 356
## 84.34626 72.68033 88.87970 73.72520 74.90564 90.76582 84.32506 82.03129
## 357 358 359 360
## 76.89846 92.09809 75.23212 87.49829
# alternatively:
# pred_arimax_1_0_1_Rand_2015_2017 <- predict(modArima_IFT_RandPhase_FT_Belgium_Y_train,
# n.ahead = 3*20, newxreg = IFT_RandPhase_FT_Belgium_X_test)$pred
sort(modArima_IFT_RandPhase_FT_Belgium_Y_train$coef)[1:10]
## Labor cost for LCI (compensation of employees plus taxes minus subsidies)
## -0.71989958
## Unemployment , Females, From 15-64 years, From 18 to 23 months
## -0.54541627
## Unemployment , Females, From 15-64 years, 48 months or over
## -0.44230677
## Labor cost other than wages and salaries
## -0.32854422
## Unemployment , Males, From 15-64 years, from 6 to 11 months
## -0.24511374
## Unemployment , Total, From 15-64 years, From 24 to 47 months
## -0.19283037
## Agriculture, forestry and fishing - Employers' social contributions
## -0.11994897
## Unemployment , Females, From 15-64 years, From 24 to 47 months
## -0.10835175
## Unemployment , Females, From 15-64 years, From 1 to 2 months
## -0.09093252
## Unemployment , Total, From 15-64 years, From 18 to 23 months
## -0.07427297
# Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-0.71989958
# Unemployment , Females, From 15-64 years, From 18 to 23 months, effect=-0.54541627
# Unemployment , Females, From 15-64 years, 48 months or over, effect=-0.44230677
# Labor cost other than wages and salaries, effect=-0.32854422
# Unemployment , Males, From 15-64 years, from 6 to 11 months, effect=-0.24511374
# Unemployment , Total, From 15-64 years, From 24 to 47 months, effect=-0.19283037
# Agriculture, forestry and fishing - Employers' social contributions, effect=-0.11994897
# Unemployment , Females, From 15-64 years, From 24 to 47 months, effect=-0.10835175
# Unemployment , Females, From 15-64 years, From 1 to 2 months, effect=-0.09093252
# Unemployment , Total, From 15-64 years, From 18 to 23 months, effect=-0.07427297
cor(pred_arimax_0_0_0_Rand$mean, ts_Y_Belgium_test) # -0.15
## [1] -0.1532414
## [1] 87.74201
## Plot the results of the model ARIMAX fitting
ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ],
start=c(2015,1), end=c(2017, 20), frequency = 20)
length(ts_Y_Belgium_test)
## [1] 60
# windows(width=14, height=10)
plot(forecast(BelgiumARIMA, xreg = data.matrix(X_Belgium_test)), # ARIMA forecast
include=60, lwd=4, lty=3, xlab="Time", ylab="GPD Purchasing Power Standards (PPS)",
ylim=c(60, 160),cex.main=0.8,
main = "Spacekime ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n
based on fitting ARIMAX Models on spline interpolated & kime-transformed Belgium data")
lines(pred_arimax_2_0_1_Nil_2015_2017, col = "green", lwd = 4, lty=2) # Belgium Xreg Nil-Phase Reconstructions
lines(pred_arimax_1_0_0_Swapped_2015_2017, col = "purple", lwd = 4, lty=1) # Belgium Xreg Swapped-Phase Reconstructions
lines(pred_arimax_0_0_0_Rand_2015_2017, col = "orange", lwd = 4, lty=1) # Belgium Xreg Random-Phase Reconstructions
lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1) # Observed Y_Test timeseries
legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)",
"ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)",
"ARIMAX(2,0,1) Belgium Xreg Nil-Phase Reconstruction (2015-2017)",
"ARIMAX(1,0,0) Belgium Xreg Swapped-Phase Reconstructions (2015-2017)",
"ARIMAX(0,0,0)(2,0,0)[20] Belgium Xreg Random-Phase Reconstructions (2015-2017)",
"Belgium Official Reported GDP (2015-2017)"),
col=c("black", "blue", "green", "purple", "orange", "red"),
lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7)
text(2013.5, 65, expression(atop(paste("Training Region (2000-2014)"),
paste(Model(GDP) %->% "ARIMAX(p, q, r) ; ",
XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
text(2016.5, 65, expression(atop(paste("Validation Region (2015-2017)"),
paste(hat(GDP) %<-% "ARIMAX(., ., .); ",
XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)