SOCR ≫ | TCIU Website ≫ | TCIU GitHub ≫ |
library(dplyr)
library(arm)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(plot3D)
library(scatterplot3d)
library(plotly)
library(fastDummies)
library(forecast)
library(glmnet)
library(knitr)
library(doParallel)
library(ggrepel)
library(roxygen2)
library(devtools)
library(DT)
library(parallel)
registerDoParallel(10,cores = detectCores()-2)
Read the Rdata file
load("Fig5.15_to_17.RData")
Put in all functions
#' Reshape the title
#'
#' @param titl vector contains titles that need to be reshaped
changetitle <- function(titl) {
newtitle<-rep(NA,length(titl))
for (i in 1:length(titl)) {
tempchar<-substring(titl[i],1:nchar(titl[i]),1:nchar(titl[i]))
for (j in 1:8) {
k=25*j
tempchar[which(tempchar==" ")[which(tempchar==" ")>k][1]]<-"\n"
}
tempchar<-paste(tempchar,collapse = "")
newtitle[i]<-tempchar
}
return(newtitle)
}
#' Creating plot result for K-space transformation
#'
#' This function can choose to generate all plots related to the K-space transformation. Or we could choose features code to select features we like to see to generate K-space transformation result.
#'
#'@param dataFT data inherited from function "kSpaceTransform", K-space transformed dataset.
#'@param dataColnames feature names for the transformed dataset
#'@param size size of the text inside the plot
#'@param option two options avaliable: "All": to plot for all features, "Select": to select several features you wish to show
#'@param select_feature if "option" is setted as "Select", then put inside a sequence to indicate the features you wish to show with this function.
plotPhaseDistributions <- function (dataFT, dataColnames, size=10, option="ALL",select_feature=NULL,...) {
df.phase <- as.data.frame(Re(dataFT$phases))
df.phase %>% gather() %>% head()
colnames(df.phase) <- dataColnames
phaseDistributions <- gather(df.phase)
colnames(phaseDistributions) <- c("Feature", "Phase")
if (is.null(size)) size=10
# map the value as our x variable, and use facet_wrap to separate by the key column:
if(option=="ALL"){
ggplot(phaseDistributions, aes(Phase)) +
# geom_histogram(bins = 10) +
geom_histogram(aes(y=..density..), bins = 10) +
facet_wrap( ~Feature, scales = 'free_x') +
xlim(-pi, pi) +
theme(strip.text.x = element_text(size = size, colour = "black", angle = 0))}
else if(option=="Select"){
choosefeat<-dataColnames[select_feature]
NphaseDistributions<-phaseDistributions[which(phaseDistributions$Feature %in% choosefeat),]
ggplot(NphaseDistributions, aes(Phase)) +
# geom_histogram(bins = 10) +
geom_histogram(aes(y=..density..), bins = 10) +
facet_wrap( ~Feature, scales = 'free_x') +
xlim(-pi, pi) +
theme(strip.text.x = element_text(size = size, colour = "black", angle = 0))}
else{
return("Wrong input on option, Please try again")
}
}
#'Conduct three different phase synthesis methods and create ARIMAX model based on different method.
#'
#'This function can reconstruct the dataset based on three phase synthesis methods: Nil-Phase Synthesis, Swapped-Phase Synthesis and Random-Phase Synthesis. Also it can construct different ARIMAX model based on the phase synthesis method chosen.
#'
#'@param X inherited from function "homologousX_features", we can get parameter X by "homologousX_features(country1,country2)$$X_Country1"
#'@param Y inherited from function "preprocess_ARIMA", we can get parameter Y by "preprocess_ARIMA$Y"
#'@param ts_Y_test time series vector, must be set from function "ts" and must be created based on testing set
#'@param option choose the phase synthesis method used in the reconstruction or ARIMAX model. Three options are valid: "Nil-Phase", "Swapped-Phase", "Random-Phase"
#'@param result choose the result shown by this function. Two options are valid: "ARIMAX": this will show the ARIMAX model created by the chosen phase synthesis method. "reconstruct": this will show the reconstruction procedure based on the chosen phase synthesis method, but the ARIMAX model won't be built.
#'@param rename_Y rename the return result of Y
#'
#'@return if the "result" parameter is setted as "reconstruct", then the reconstruction result will be shown. if setted as "ARIMAX", then the detailed ARIMAX model will be built.
PS_ARIMAX_remodel <- function(X,Y,ts_Y_test,option="Nil-Phase",result="ARIMAX",rename_Y="Y_GDP_Belgium") {
temp_data<-cbind(X,Y)
FT<-kSpaceTransform(temp_data, FALSE, NULL)
if(option=="Nil-Phase"){
nilPhase_FT_data <- array(complex(real=0, imaginary=0), c(dim(temp_data)[1], dim(temp_data)[2]))
##IFT_NilPhase_FT <- array(complex(), c(dim(temp_data)[1], dim(temp_data)[2]))
# Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal with nil-phase
IFT_NilPhase_FT<- Re(kSpaceTransform(FT$magnitudes, TRUE, nilPhase_FT_data))
colnames(IFT_NilPhase_FT) <- c(colnames(X), rename_Y)
#result
if(result=="reconstruct"){
return(list(
dim(nilPhase_FT_data),dim(IFT_NilPhase_FT), dim(FT$magnitudes),
colnames(IFT_NilPhase_FT), head(IFT_NilPhase_FT),IFT_NilPhase_FT)) # head(temp_Data)
}
#rename
IFT_FT<-IFT_NilPhase_FT
}
else if(option=="Swapped-Phase"){
swapped_phase_FT_data <- FT$phases
colnames(swapped_phase_FT_data) <- c(colnames(X), rename_Y)
swapped_phase_FT_data1 <- swapped_phase_FT_data
IFT_SwappedPhase_FT <- array(complex(), c(dim(temp_data)[1], dim(temp_data)[2]))
set.seed(12345) # sample randomly Phase-columns for each of the 131 covariates (X)
#swap_phase_FT_Belgium_indices <- sample(ncol(swapped_phase_FT_Belgium_data)-1)
# for (j in 1:131) { # for all coluns of the design Xreg matrix, excluding Y, randomly swap columns phases
# swapped_phase_FT_Belgium_data1[ , j] <- swapped_phase_FT_Belgium_data[, swap_phase_FT_Belgium_indices[j]]
#}
swapped_phase_FT_data1 <- as.data.frame(cbind(
swapped_phase_FT_data[ , sample(ncol(swapped_phase_FT_data[ , 1:(ncol(swapped_phase_FT_data)-1)]))],
swapped_phase_FT_data[ , ncol(swapped_phase_FT_data)]))
swapped_phase_FT_data <- swapped_phase_FT_data1
colnames(swapped_phase_FT_data)[ncol(swapped_phase_FT_data)] <- rename_Y
colnames(swapped_phase_FT_data)
# Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal using the feature swapped phases
IFT_SwappedPhase_FT <- Re(kSpaceTransform(FT$magnitudes, TRUE, swapped_phase_FT_data))
colnames(IFT_SwappedPhase_FT) <- c(colnames(X), rename_Y)
#result
if(result=="reconstruct"){
return(list(
dim(swapped_phase_FT_data) ,
# [1] 360 132
dim(swapped_phase_FT_data), dim(FT$phases),
dim(IFT_SwappedPhase_FT), dim(FT$magnitudes),
colnames(IFT_SwappedPhase_FT), tail(IFT_SwappedPhase_FT),IFT_SwappedPhase_FT)) # tail(temp_Data)
}
#rename
IFT_FT<-IFT_SwappedPhase_FT
}
else if(option=="Random-Phase"){
#randPhase_FT_data <- array(complex(), c(dim(temp_data)[1], dim(temp_data)[2]))
IFT_RandPhase_FT <- array(complex(), c(dim(temp_data)[1], dim(temp_data)[2]))
randPhase_FT_data <- FT$phases
for (i in 1:(dim(randPhase_FT_data)[2] -1)) {
if (i < dim(randPhase_FT_data)[2]) {
set.seed(12345) # sample randomly Phases for each of the 131 predictors covariates (X)
randPhase_FT_data[ , i] <- FT$phases[sample(nrow(FT$phases)), i]
} else { } # for the Y outcome (Last Column) - do not change the phases of the Y
}
# Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal with avg-phase
IFT_RandPhase_FT <- Re(kSpaceTransform(FT$magnitudes, TRUE, randPhase_FT_data))
colnames(IFT_RandPhase_FT) <- c(colnames(X), rename_Y)
#result
if(result=="reconstruct"){
return( list(
dim(randPhase_FT_data), # ; head(randPhase_FT_data)
# [1] 360 132
dim(IFT_RandPhase_FT), dim(FT$magnitudes) ,
colnames(IFT_RandPhase_FT), tail(IFT_RandPhase_FT), # tail(temp_Data)
dim(IFT_RandPhase_FT), head(Re(IFT_RandPhase_FT)), tail(Re(IFT_RandPhase_FT))))
}
#rename
IFT_FT<-IFT_RandPhase_FT
}
if(result=="ARIMAX"){
#construction of time-series analysis
# Perform ARIMAX modeling on IFT_NilPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC
# library(forecast)
IFT_FT_Y_train <- IFT_FT[1:300, ncol(IFT_FT)]
IFT_FT_Y_test <- IFT_FT[301:360]
# Training and Testing Data Covariates explaining the longitudinal outcome (Y)
IFT_FT_X_train <- as.data.frame(IFT_FT)[1:300, 1:(ncol(IFT_FT)-1)]; dim(IFT_FT_X_train)
IFT_FT_X_test <- as.data.frame(IFT_FT)[301:360, 1:(ncol(IFT_FT)-1)]; dim(IFT_FT_X_test)
# Outcome Variable to be ARIMAX-modelled, as a timeseries
ts_IFT_FT_Y_train <-
ts(IFT_FT_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20)
set.seed(1234)
modArima_IFT_FT_Y_train <-
auto.arima(ts_IFT_FT_Y_train, xreg=as.matrix(IFT_FT_X_train))
pred_arimax <- forecast(modArima_IFT_FT_Y_train, xreg = as.matrix(IFT_FT_X_test))
pred_arimax_2015_2017 <-
ts(pred_arimax$mean, frequency=20, start=c(2015,1), end=c(2017,20))
# alternatively:
# pred_arimax_1_0_1_2015_2017 <- predict(modArima_IFT_NilPhase_FT_Belgium_Y_train,
# n.ahead = 3*20, newxreg = IFT_NilPhase_FT_Belgium_X_test)$pred
sort(modArima_IFT_FT_Y_train$coef)[1:10]
# Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-1.5972295
# ar1, effect=-1.4231617
# Labor cost other than wages and salaries, effect=-1.2213214
# ma1, effect=-0.9869571
# ar2, effect=-0.8591937
# Unemployment , Females, From 15-64 years, From 12 to 17 months, effect=-0.7075454
# Unemployment , Total, From 15-64 years, From 18 to 23 months, effect=-0.5797656
# Unemployment , Males, From 15-64 years, from 3 to 5 months, effect=-0.5026139
# sar2, effect=-0.3450866
# Unemployment , Males, From 15-64 years, from 24 to 47 months, effect=-0.2965540
return(list(
training_set_length=length(IFT_FT_Y_train),
testing_set_length=length(IFT_FT_Y_test),
AR_MA=modArima_IFT_FT_Y_train$arma,
ARIMAX_model=pred_arimax_2015_2017,
feature_effect=sort(modArima_IFT_FT_Y_train$coef)[1:10],
Correlation=cor(pred_arimax$mean, ts_Y_test),
Mean=mean(pred_arimax_2015_2017)))}
else{
print(paste0("Wrong input for option or result, please try again.") )
return(NULL)
}
}
Extract data for Belgium.
# Choose Belgium and fit the arima
belgium =
time_series %>%
filter(country == "Belgium") %>%
# delete the feature that is missing for all the time
select_if(function(col) sum(is.na(col)) != length(col) ) %>%
select(-time,-country)
The dimension of dataset related to Belgium are 72, 170.
Manually clean (preprocess) the Belgium data and fit ARIMAX model \[Y = BelguimSuperSample\$'Unemployment,\ Females,\ From\ 15-64\ years,\ Total',\]
\[X = XReg\ (all\ other\ covariates\ -starts\_with('Unemployment')).\]
# Reformate the belgium matrix
BelguimSuperSample <-
belgium %>%
as.matrix() %>%
cleardata() %>%
splinecreate() %>%
as.data.frame()
#############################
# Outcome Features to examine, predict, forecast, spacekime-analyze
# "Gross domestic product at market prices"
# "Unemployment , Females, From 15-64 years, Total"
# "Capital transfers, payable"
# "Capital transfers, receivable"
# "Debt securities"
# "Government consolidated gross debt"
# ... View(colnames(BelguimMatrix))
#############################
feature: “Unemployment , Females, From 15-64 years, Total”
################################################
# "Unemployment , Females, From 15-64 years, Total"
Y = select(BelguimSuperSample, "Unemployment , Females, From 15-64 years, Total")
X = select(BelguimSuperSample, -starts_with("Unemployment"))
X = as.matrix(X)
fitArimaX = auto.arima(Y, xreg=X[,-112])
dimension of this feature: Y: 360, 1, X:360, 143
ARIMAX model AR and MA coefficients: 0, 0, 0, 0, 1, 0, 0
################################################
# "Government consolidated gross debt"
#Y = select(BelguimSuperSample, "Government consolidated gross debt"); dim(Y)
#X = select(BelguimSuperSample, -matches("debt|Debt")); dim(X)
# X_scale <- scale(X); Matrix::rankMatrix(X_scale); any(is.na(X_scale)); any(is.infinite(X_scale))
# remove columns with infinite or missing value
# X_scale_1 <- X_scale[ , !is.infinite(colSums(X_scale)) & !is.na(colSums(X_scale))]
#X_1 <- X[ , !is.infinite(colSums(X)) & !is.na(colSums(X))]; dim(X_1); Matrix::rankMatrix(X_1)
#X_2 <- X_1[, qr(X_1)$pivot[seq_len(qr(X_1)$rank)]]; dim(X_2)
#
#fitArimaX = auto.arima(Y, xreg=X_2, method = "CSS", # "SANN", method = "CSS", optim.method = "BFGS"
# optim.control=list(maxit = 20000, temp = 20), optim.method = "BFGS")
#fitArimaX; View(sort(fitArimaX$coef)[1:10]); fitArimaX$arma
################################################
# "Gross domestic product at market prices"
Y = select(BelguimSuperSample, "Gross domestic product at market prices")
X = select(BelguimSuperSample, -matches("debt|Debt")) # 360 167
X <- X[, qr(X)$pivot[seq_len(qr(X)$rank)]]
X = as.matrix(X)
ts_Y <- ts(Y, start=c(2000,1), end=c(2017, 20), frequency = 20); length(ts_Y)
## [1] 360
set.seed(1234)
fitArimaX = auto.arima(ts_Y, xreg=X[ , -c(50:60)])
# 5 0 2 0 20 0 0
# sigma^2 estimated as 57.04: log likelihood=-1132.78 AIC=2593.55 AICc=2871.09 BIC=3230.88
dimension of this feature: Y: 360, 1, X:360, 167 ARIMAX model AR and MA coefficients: 5, 0, 2, 0, 20, 0, 0
pred_arimaX_5_0_2_Y_Belgium_train300_Belgium_test60 <-
predict(fitArimaX, n.ahead = 60, newxreg = X[301:360 , -c(50:60)])$pred
#Output here:
plot(forecast(fitArimaX, xreg = X[301:360 , -c(50:60)]), # ARIMA forecast
include=120, lwd=4, lty=3, xlab="Time", ylab="GDP", ylim=c(50, 150),
main = "ARIMAX Analytics (Train: 2000-2017; Test: 2018-2020) GDP Forecasting\n
based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
lines(pred_arimaX_5_0_2_Y_Belgium_train300_Belgium_test60, col = "red", lwd = 4, lty=3)
legend("topleft", bty="n",
legend=c("Belgium Training Data (2000-2017)",
"ARIMAX(5,0,2)-model GDP Forecasting (2018-2020)",
"ARIMAX(5,0,2)-model GDP Forecasting (2018-2020)"),
col = c("black", "blue", "red"),
lty = c(3,3,3), lwd=c(4,4,4), cex=1.2, x.intersp=1.5, y.intersp=0.6)
text(2015, 60, expression(atop(paste("Training Region (2000-2017)"),
paste(Model(Unempl) %->% "ARIMAX(p, q, r) ; ",
XReg %==% X[i], " ", i %in% {1 : 167}))), cex=1.5)
text(2019.5, 60, expression(atop(paste("Validation Region (2018-2020)"),
paste(hat(Unempl) %<-% "ARIMAX(5,0 ,2); ",
XReg %==% X[i], " ", i %in% {1 : 167}))), cex=1.5)
GDP unit of measure represents the Current prices, euro per capita. The volume index of GDP per capita in Purchasing Power Standards (PPS) is intended for cross-country comparisons rather than for temporal comparisons. GDP per capita when expressed in PPS eliminates the differences in price levels between countries allowing meaningful volume comparisons of GDP between countries. Expressed in relation to the European Union (EU27 GDP = 100), a country with an index that is higher than \(100\) or lower than \(100\) correspond to that this country’s level of GDP per head being higher or lower than the EU average, respectively.
Define a new function that (1) cleans the data, and (2) fits the ARIMA model estimating the seasonal and non-seasonal time-series parameters \((p,d,q)\) and the effect-sizes (\(\beta\)’s) for the exogenous regression features (\(X\)).
Bulgaria = filter(time_series,country == "Bulgaria")
BulgariaARIMA =
Fit_ARIMA(country ='Bulgaria',
start=2000, end=2017, frequency=20,
feature="Gross domestic product at market prices")
## [1] "dim(X)=(360,168); dim(Y)=(360,1) ..."
# Outcome Variable to be modelled, as a timeseries: 2000 - 2017 (18 years, Quarterly measures)
# Spline interpolation *5; 2000-01 - 2014-20 (300 observations for training): 60 observations (2015-2017) for Testing
Belgium <- filter(time_series, country == "Belgium")
Bulgaria <- filter(time_series, country == "Bulgaria")
Netherlands <- filter(time_series, country == "Netherlands")
# Test preprocess_ARIMA()
#preprocess_Belgium <- preprocess_ARIMA(country = 'belgium', start=2000, end=2014,
# frequency=20, feature="Gross domestic product at market prices")
#preprocess_Bulgaria <- preprocess_ARIMA(country = 'bulgaria', start=2000, end=2014,
# frequency=20, feature="Gross domestic product at market prices")
# Test homologousX_features
# homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
# X_Belgium <- homoFeat$X_Country1
# X_Bulgaria <- homoFeat$X_Country2
# Belgium = filter(time_series,country == "Belgium")
BelgiumARIMA =
fit_ARIMA(country1 = 'Belgium', country2 = 'Bulgaria', # country = Netherlands,
start=2000, end=2014, frequency=20,
feature="Gross domestic product at market prices" )
## [1] "Processing feature: ...Gross domestic product at market prices... "
## [1] "dim(X)[1]=300"
## [1] "dim(X)=(300,136); dim(Y)=(300,1) ..."
## [1] "Processing feature: ...Gross domestic product at market prices... "
## [1] "dim(X)[1]=300"
## [1] "dim(X)=(300,137); dim(Y)=(300,1) ..."
## [1] "dim(X1)=(300,131); dim(X2)=(300,131)!"
# sigma^2 estimated as 45.99: log likelihood=-919.13 AIC=2116.26 AICc=2359.51 BIC=2631.09
# Outcome Variable to be modelled, as a timeseries: 2000 - 2017 (18 years, Quarterly measures)
# Spline interpolation x5; 2000-01 - 2014-20 (300 observations for training)
# ts_Y_Belgium_train <- ts(Y_Belgium_train, start=c(2000,1), end=c(2014, 20), frequency=20)
# length(ts_Y_Belgium_train)
# ts_Y_Belgium_test <- ts(Y_Belgium_test, start=c(2015,1), end=c(2017, 20), frequency = 20)
#length(ts_Y_Belgium_test)
# Find ARIMAX model
# arimaX_Belgium_train <- auto.arima(ts_Y_Belgium_train, xreg=X_Belgium_train); # arimaX_Belgium_train$arma
ARIMA model created on Bulgaria: 0, 0, 0, 0, 20, 0, 0
ARIMA model created on Belgium with comparison to Bulgaria 5, 0, 2, 0, 20, 0, 0
Predict Y={Gross domestic product at market prices}
, using all other features not directly related to *GDP. Recall the core data organization:
Xreg
Predictors and 1 Outcome Variable to be modeled as a timeseries: 2000 - 2014 (15 years, Quarterly measures, 5-fold spline interpolation, \(15*4*5=300\));training
). The remaining 60 timepoints used for testing
(2015-01 to 2017-20).Report ARIMA(p,d,q)
params and quality metrics AIC/BIC.
The rank order of the top 10 indicators driving up the (relative) GDP-PPS along with their effect-sizes:(BOOK: VI-173)
# Previously, we already extracted the Belgium and Bulgaria Data, preprocess it,
# and fit ARIMAX (Belgium-training) model. Now, we will assess the model on Bulgaria_testing sets
# BelgiumARIMA = fit_ARIMA(country1Data=Belgium, country2Data=Bulgaria,
# start=2000, end=2014, frequency=20,
# feature="Gross domestic product at market prices")
# BelgiumARIMA$arma # [1] 4 0 2 0 20 0 0
# View rank-ordered ARIMAX effects:
# View(BelgiumARIMA$coef[order(BelgiumARIMA$coef)])
datatable(data.frame(Value=sort(BelgiumARIMA$coef)[1:10]),fillContainer = TRUE)
# Unemployment , Females, From 15-64 years, From 18 to 23 months
# -1.5203281
# ar2
# -1.1808472
# Labor cost other than wages and salaries
# -0.8380554
# Unemployment , Females, From 15-64 years, From 12 to 17 months
# -0.7037336
# ar4
# -0.6360880
# sar2
# -0.6260002
# Unemployment , Females, From 15-64 years, From 3 to 5 months
# -0.5262376
# Labor cost for LCI (compensation of employees plus taxes minus subsidies)
# -0.2885038
# Unemployment , Females, From 15-64 years, 48 months or over
# -0.2773650
# Unemployment , Males, From 15-64 years, from 3 to 5 months
# -0.2567934
Doing predictions:
#Get the Prospective Xreg=X design matrices ready (2015-2017, 60 rows)
preprocess_Belgium <- preprocess_ARIMA( country = 'Belgium',
start=2000, end=2017, frequency=20,
feature="Gross domestic product at market prices")
## [1] "Processing feature: ...Gross domestic product at market prices... "
## [1] "dim(X)[1]=360"
## [1] "dim(X)=(360,136); dim(Y)=(360,1) ..."
preprocess_Bulgaria <- preprocess_ARIMA(country = 'Bulgaria', #Netherlands
start=2000, end=2017, frequency=20,
feature="Gross domestic product at market prices")
## [1] "Processing feature: ...Gross domestic product at market prices... "
## [1] "dim(X)[1]=360"
## [1] "dim(X)=(360,137); dim(Y)=(360,1) ..."
homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
## [1] "dim(X1)=(360,131); dim(X2)=(360,131)!"
X_Belgium_test <- as.matrix(homoFeat$X_Country1[301:360, ])
X_Bulgaria_test <- as.matrix(homoFeat$X_Country2[301:360, ])
# Get Predictions
pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60 <-
forecast(BelgiumARIMA, xreg = X_Bulgaria_test)$mean
pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60 <-
forecast(BelgiumARIMA, xreg = X_Belgium_test)$mean
pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60 <-
forecast(BelgiumARIMA, xreg = X_Bulgaria_test)$mean +
mean(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60) -
mean(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60)
cor(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, ts_Y_Belgium_test) # 0.11
## [1] 0.1526309
mean(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60) # [1] 118
## [1] 110.9725
# Alternative predictions:
# X_Country1 <- X_Belgium_test; X_Country2 <- X_Bulgaria_test
# pred_arimaX_1_0_2_Y_Belgium_train300_Bulgaria_test60 <- predict(BelgiumARIMA, n.ahead = 60, newxreg = X_Bulgaria_test)$pred
# pred_arimaX_1_0_2_Y_Belgium_train300_Belgium_test60 <- predict(BelgiumARIMA, n.ahead = 60, newxreg = X_Belgium_test)$pred
Belgium test set dimensions: 60, 131
Bulgaria test set dimensions: 60, 131
Correlation: 0.1526309
Mean: 110.9725004
Plot only the last 5-years of training (2010-2014), \(100TimePoints=5Years\times 4Quarters\times 5SuperSample\) and the 3-year prospective forecasting \(60TimePoints=3Years\times 4Quarters\times 5SuperSample\).
ts_Y_Belgium_test <-
ts(preprocess_Belgium$Y[301:360, ],
start=c(2015,1), end=c(2017, 20), frequency = 20)
length(ts_Y_Belgium_test)
## [1] 60
## png
## 2
#Output here:
plot(forecast(BelgiumARIMA, xreg = X_Belgium_test), # ARIMA forecast
include=100, lwd=4, lty=3,
xlab="Time", ylab="GPD Purchasing Power Standards (PPS)",
ylim=c(0, 150),
main = "ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n
based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60,
col = "green", lwd = 4, lty=2) # Belgium train+test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60,
col = "purple", lwd = 4, lty=1) # Belgium train+Bulgaria test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60,
col = "orange", lwd = 4, lty=1)
# Belgium train+ Offset Bulgaria test: 188.3753 - 416.5375
lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1) # Observed Y_Test timeseries
legend("topleft", bty="n",
legend =
c("Belgium Training Data (2000-2014)",
"ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)",
"ARIMAX(4,0,2) Belgium train + XReg=Belgium test (2015-2017)",
"ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
"Offset ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
"Belgium Official Reported GDP (2015-2017)"),
col=c("black", "blue", "green", "purple", "orange", "red"),
lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7)
text(2012.5, 0, expression(atop(paste("Training Region (2000-2014)"),
paste(Model(GDP) %->% "ARIMAX(p, q, r) ; ",
XReg %==% as.matrix( X[i] ), " ", i %in% {1 : 131}))), cex=1.2)
text(2016.5, 0, expression(atop(paste("Validation Region (2015-2017)"),
paste(hat(GDP) %<-% "ARIMAX(4, 0, 2); ",
XReg %==% as.matrix(X[i]), " ", i %in% {1 : 131}))), cex=1.2)
Plot entire 15-year training time-span (2000-2014), \(300TimePoints=15Years\times 4Quarters\times 5SuperSample\) and the 3-year prospective forecasting \(60TimePoints=3Years\times 4Quarters\times 5SuperSample\).
Plot showing in BOOK VI-172-173 Figure 5.15:
ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ],
start=c(2015,1), end=c(2017, 20), frequency = 20)
length(ts_Y_Belgium_test)
## [1] 60
#Output here
plot(forecast(BelgiumARIMA, xreg = X_Belgium_test), # ARIMA forecast
lwd=4, lty=3, xlab="Time", ylab="GPD Purchasing Power Standards (PPS)",
ylim=c(0, 150),
main = "ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n
based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60,
col = "green", lwd = 4, lty=2) # Belgium train+test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60,
col = "purple", lwd = 4, lty=1) # Belgium train+Bulgaria test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60,
col = "orange", lwd = 4, lty=1)
# Belgium train+ Offset Bulgaria test: 188.3753 - 416.5375
lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1) # Observed Y_Test timeseries
legend("topleft", bty = "n",
legend =
c("Belgium Training Data (2000-2014)",
"ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)",
"ARIMAX(4,0,2) Belgium train + XReg=Belgium test (2015-2017)",
"ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
"Offset ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
"Belgium Official Reported GDP (2015-2017)"),
col=c("black", "blue", "green", "purple", "orange", "red"),
lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2,
x.intersp=1.5, y.intersp=0.7)
text(2005, 0,
expression(atop(paste("Training Region (2000-2014)"),
paste(Model(GDP) %->% "ARIMAX(p, q, r) ; ",
XReg %==% as.matrix( X[i] ), " ", i %in% {1 : 131}))),
cex=1.2)
text(2015, 0, expression(atop(paste("Validation Region (2015-2017)"),
paste(hat(GDP) %<-% "ARIMAX(4, 0, 2); ",
XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
Let’s start by defining the generic k-space transformation.
See the procedure code data for detailed information about transformation functions
Examine the Kime-direction Distributions of the Phases for all Belgium features (predictors + outcome). Define a generic function that plots the Phase distributions.
BOOK VI-174 Figure 5.16
# homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
X_Belgium <- homoFeat$X_Country1; dim(X_Belgium)
## [1] 360 131
Y_Belgium <- preprocess_Belgium$Y; dim(Y_Belgium)
## [1] 360 1
FT_Belgium <- kSpaceTransform(cbind(X_Belgium, Y_Belgium), FALSE, NULL)
dataColnames <- c(colnames(X_Belgium), "Y_GDP_Belgium")
dataColnames <- changetitle(dataColnames)
## png
## 2
plotPhaseDistributions(FT_Belgium, dataColnames)
#Take the first 6 features as an example
plotPhaseDistributions(FT_Belgium, dataColnames,option = "Select",size = 10,select_feature = c(1,2,3,4,5,6))
IFT_FT_Belgium <- kSpaceTransform(FT_Belgium$magnitudes, TRUE, FT_Belgium$phases)
# Check IFT(FT) == I:
# ifelse(abs(cbind(X_Belgium, Y_Belgium)[5,4] - Re(IFT_FT_Belgium[5,4])) < 0.001, "Perfect Syntesis", "Problems!!!")
Three different Phase Synthesis methods are introduced here:
1.Nil-Phase Synthesis
2.Swapped-Phase Synthesis
Space-time reconstructions by inverting back the mag_FT
for each feature signal after swapping the feature-phases. In other words, we randomly shuffle the columns
of the Phases-matrix (Training & Testing XReg Data) and use these swapped phases to synthesize the design covariate matrix (\(Xreg\)).
3.Random-Phase Synthesis
Perform Random-Phase reconstruction - IFT_RandPhase_FT_Belgium
- by randomly sampling from the phase distributions for each feature and then re-fitting the ARIMAX model.
BOOK VI-175-176 Table 5.1 are based on the following result
#Nil-Phase Synthesis
##Transformation
Nil_reconstruct<-PS_ARIMAX_remodel(X_Belgium,Y_Belgium,ts_Y_Belgium_test,result = "reconstruct")
#Dimension
Nil_reconstruct[[1]]
## [1] 360 132
Nil_reconstruct[[2]]
## [1] 360 132
Nil_reconstruct[[3]]
## [1] 360 132
#Feature names
datatable(data.frame(feature_name=Nil_reconstruct[[4]]),fillContainer = TRUE)
#Reconstruct result(part)
datatable(data.frame(Nil_reconstruct[[6]]),fillContainer = TRUE)
##ARIMAX re-modeling
Nil_ARIMAX<-PS_ARIMAX_remodel(X_Belgium,Y_Belgium,ts_Y_Belgium_test,result = "ARIMAX")
Nil_ARIMAX$training_set_length
## [1] 300
Nil_ARIMAX$testing_set_length
## [1] 60
# Find ARIMAX model: 2 1 2 0 20 0 0
Nil_ARIMAX$AR_MA
## [1] 2 0 0 0 20 0 0
# Regression with ARIMA(2,0,1)(2,0,0)[20] errors
# Coefficients:
# ar1 ar2 ma1 sar1 sar2 Acquisitions less disposals of non-financial non-produced assets
# -1.4232 -0.8592 -0.987 0.0941 -0.3451 0.0123
#s.e. 0.0341 0.0311 NaN 0.0688 0.0742 0.0007
# sigma^2 estimated as 0.8622: log likelihood=-320.39 AIC=914.79 AICc=1148.19 BIC=1422.2
Nil_ARIMAX$ARIMAX_model
## Time Series:
## Start = c(2015, 1)
## End = c(2017, 20)
## Frequency = 20
## 301 302 303 304 305 306 307
## 100.23740 103.03985 103.93131 94.78863 97.89201 98.00352 99.62775
## 308 309 310 311 312 313 314
## 104.01148 102.28614 104.55994 98.54172 98.41790 96.14105 107.16384
## 315 316 317 318 319 320 321
## 99.45335 96.44437 102.55336 96.88055 97.04026 101.91233 98.96258
## 322 323 324 325 326 327 328
## 89.24350 89.29086 94.41691 102.15558 101.44441 109.56905 108.06485
## 329 330 331 332 333 334 335
## 111.85496 114.01763 105.41725 111.13032 110.12992 103.82299 104.85538
## 336 337 338 339 340 341 342
## 105.07924 115.53392 120.60091 109.96504 110.97310 104.22759 105.82676
## 343 344 345 346 347 348 349
## 105.42527 112.88059 108.56014 112.43295 115.59193 110.84576 121.79527
## 350 351 352 353 354 355 356
## 123.17139 120.00412 115.81981 119.78872 109.28916 108.79465 118.52640
## 357 358 359 360
## 107.65560 117.75657 118.52997 121.46124
# alternatively:
# pred_arimax_1_0_1_2015_2017 <- predict(modArima_IFT_NilPhase_FT_Belgium_Y_train,
# n.ahead = 3*20, newxreg = IFT_NilPhase_FT_Belgium_X_test)$pred
Nil_ARIMAX$feature_effect
## Labour cost for LCI
## -2.3144505
## Unemployment , Females, From 15-64 years, From 18 to 23 months
## -1.1741540
## ar2
## -0.8530808
## Unemployment , Males, From 15-64 years, Less than 1 month
## -0.7046778
## Unemployment , Males, From 15-64 years, 48 months or over
## -0.5118510
## Unemployment , Females, From 15-64 years, From 1 to 2 months
## -0.3724768
## Unemployment , Females, From 15-64 years, From 3 to 5 months
## -0.3524810
## Labor cost for LCI (compensation of employees plus taxes minus subsidies)
## -0.3235822
## Unemployment , Females, From 15-64 years, From 6 to 11 months
## -0.2232634
## Unemployment , Males, From 15-64 years, from 1 to 2 months
## -0.2020433
# Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-1.5972295
# ar1, effect=-1.4231617
# Labor cost other than wages and salaries, effect=-1.2213214
# ma1, effect=-0.9869571
# ar2, effect=-0.8591937
# Unemployment , Females, From 15-64 years, From 12 to 17 months, effect=-0.7075454
# Unemployment , Total, From 15-64 years, From 18 to 23 months, effect=-0.5797656
# Unemployment , Males, From 15-64 years, from 3 to 5 months, effect=-0.5026139
# sar2, effect=-0.3450866
# Unemployment , Males, From 15-64 years, from 24 to 47 months, effect=-0.2965540
Nil_ARIMAX$Correlation# 0.14
## [1] 0.1300407
Nil_ARIMAX$Mean# [1] 105
## [1] 106.6307
#Swapped-Phase Synthesis
##Transformation
Swapped_reconstruct<-PS_ARIMAX_remodel(X_Belgium,Y_Belgium,ts_Y_Belgium_test,option = "Swapped-Phase",result = "reconstruct")
#Dimension
Swapped_reconstruct[[1]]
## [1] 360 132
Swapped_reconstruct[[2]]
## [1] 360 132
Swapped_reconstruct[[3]]
## [1] 360 132
Swapped_reconstruct[[4]]
## [1] 360 132
Swapped_reconstruct[[5]]
## [1] 360 132
#Feature names
datatable(data.frame(Swapped_reconstruct[[6]]),fillContainer = TRUE)
#Reconstruct result(part)
datatable(data.frame(Swapped_reconstruct[[7]]),fillContainer = TRUE)
##ARIMAX re-modeling
Swapped_ARIMAX<-PS_ARIMAX_remodel(X_Belgium,Y_Belgium,ts_Y_Belgium_test,option = "Swapped-Phase",result = "ARIMAX")
Swapped_ARIMAX$training_set_length
## [1] 300
Swapped_ARIMAX$testing_set_length
## [1] 60
# Find ARIMAX model: 1 0 2 0 20 0 0
Swapped_ARIMAX$AR_MA
## [1] 0 0 2 0 20 0 0
# Regression with ARIMA(1,0,0)(2,0,0)[20] errors
# Coefficients:
# ar1 sar1 sar2 intercept Acquisitions less disposals of non-financial non-produced assets
# -0.3837 -0.2196 0.2827 46.1704 0.0063
#s.e. 0.1113 0.0903 0.0779 36.8492 0.0080
# sigma^2 estimated as 70: log likelihood=-976.01 AIC=2224.02 AICc=2452.63 BIC=2727.73
Swapped_ARIMAX$ARIMAX_model
## Time Series:
## Start = c(2015, 1)
## End = c(2017, 20)
## Frequency = 20
## 301 302 303 304 305 306 307
## 128.73276 119.68651 92.70275 91.27189 110.67953 106.75674 108.81414
## 308 309 310 311 312 313 314
## 104.60039 100.04283 112.43549 90.00036 102.19302 96.85436 105.43883
## 315 316 317 318 319 320 321
## 106.21309 94.74615 112.35201 101.69583 95.49132 99.50060 95.25607
## 322 323 324 325 326 327 328
## 119.42242 106.98760 108.82758 82.02274 101.96243 89.32122 95.64222
## 329 330 331 332 333 334 335
## 119.25223 103.29745 102.74379 109.71061 94.96187 109.35137 117.33802
## 336 337 338 339 340 341 342
## 117.44036 115.15772 98.39985 116.70175 104.96118 104.69955 112.54130
## 343 344 345 346 347 348 349
## 94.52727 95.46678 119.55027 101.55243 110.89774 123.99589 102.00219
## 350 351 352 353 354 355 356
## 100.91378 111.80278 105.35938 108.49608 107.30856 115.87760 121.83518
## 357 358 359 360
## 110.72181 92.68570 111.81961 99.47224
# alternatively:
# pred_arimax_1_0_0_Swapped_2015_2017 <- predict(modArima_IFT_SwappedPhase_FT_Belgium_Y_train,
# n.ahead = 3*20, newxreg = IFT_SwappedPhase_FT_Belgium_X_test)$pred
Swapped_ARIMAX$feature_effect
## sar2
## -0.45539809
## sar1
## -0.38177236
## Unemployment , Total, From 15-64 years, From 18 to 23 months
## -0.25821721
## Unemployment , Males, From 15-64 years, from 6 to 11 months
## -0.24583149
## Unemployment , Males, From 15-64 years, 48 months or over
## -0.19147436
## Unemployment , Total, From 15-64 years, From 6 to 11 months
## -0.12705360
## Unemployment , Males, From 15-64 years, from 24 to 47 months
## -0.08080671
## Unemployment , Females, From 15-64 years, From 6 to 11 months
## -0.04450060
## Unemployment , Total, From 15-64 years, From 24 to 47 months
## -0.01975551
## ISCED11 Upper secondary and post-secondary non-tertiary education (levels 3 and 4), Males
## -0.01686624
# ar1, effect=-0.38372043
# Unemployment , Females, From 15-64 years, From 18 to 23 months, effect=-0.31137514
# Labor cost other than wages and salaries, effect=-0.31094561
# sar1, effect=-0.21964957
# Unemployment , Females, From 15-64 years, From 6 to 11 months, effect=-0.20878853
#Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-0.12497311
# Unemployment , Males, From 15-64 years, Less than 1 month, effect=-0.10849013
# Unemployment , Total, From 15-64 years, 48 months or over, effect=-0.09066684
# Unemployment , Total, From 15-64 years, From 1 to 2 months, effect=-0.05852382
# Unemployment , Females, From 15-64 years, 48 months or over, effect=-0.05695172
Swapped_ARIMAX$Correlation # 0.0
## [1] 0.1859233
Swapped_ARIMAX$Mean# [1] 105
## [1] 105.6749
#Random-Phase Synthesis
##Transformation
Random_reconstruct<-PS_ARIMAX_remodel(X_Belgium,Y_Belgium,ts_Y_Belgium_test,option = "Random-Phase",result = "reconstruct")
#Dimension
Random_reconstruct[[1]]
## [1] 360 132
Random_reconstruct[[2]]
## [1] 360 132
Random_reconstruct[[3]]
## [1] 360 132
Random_reconstruct[[6]]
## [1] 360 132
#Feature names
datatable(data.frame(Random_reconstruct[[4]]),fillContainer = TRUE)
#Reconstruct result(part)
datatable(data.frame(Random_reconstruct[[7]]),fillContainer = TRUE)
##ARIMAX re-modeling
Random_ARIMAX<-PS_ARIMAX_remodel(X_Belgium,Y_Belgium,ts_Y_Belgium_test,option = "Random-Phase",result = "ARIMAX")
Random_ARIMAX$training_set_length
## [1] 300
Random_ARIMAX$testing_set_length
## [1] 60
# Find ARIMAX model: 0 0 2 0 20 0 0
Random_ARIMAX$AR_MA
## [1] 0 0 2 0 20 0 0
# Regression with ARIMA(0,0,0)(2,0,0)[20] errors
# Coefficients:
# sar1 sar2 Acquisitions less disposals of non-financial non-produced assets
# -0.0743 0.5766 0.0162
#s.e. 0.0625 0.0752 0.0100
#sigma^2 estimated as 72.17: log likelihood=-988.06 AIC=2244.12 AICc=2463.4 BIC=2740.43
Random_ARIMAX$ARIMAX_model
## Time Series:
## Start = c(2015, 1)
## End = c(2017, 20)
## Frequency = 20
## 301 302 303 304 305 306 307
## 109.21384 97.90635 115.30991 87.18544 111.61506 108.93967 105.63176
## 308 309 310 311 312 313 314
## 99.81729 98.93481 108.10909 92.43841 88.10366 102.05895 98.15395
## 315 316 317 318 319 320 321
## 95.73793 107.01333 95.30001 97.26324 99.68047 97.51727 92.67848
## 322 323 324 325 326 327 328
## 96.11143 88.49441 94.17550 88.71985 89.40652 94.39787 84.99996
## 329 330 331 332 333 334 335
## 96.48612 99.88463 97.48044 92.94693 98.11024 90.76454 89.06357
## 336 337 338 339 340 341 342
## 103.49729 100.12285 99.48860 90.90774 94.74558 85.95599 82.79387
## 343 344 345 346 347 348 349
## 96.00114 86.73778 94.34068 90.33366 88.95980 92.55541 96.28621
## 350 351 352 353 354 355 356
## 80.73906 87.10278 97.93606 82.48071 85.83396 92.57338 88.57348
## 357 358 359 360
## 74.11565 90.74335 77.76062 92.29155
# alternatively:
# pred_arimax_1_0_1_Rand_2015_2017 <- predict(modArima_IFT_RandPhase_FT_Belgium_Y_train,
# n.ahead = 3*20, newxreg = IFT_RandPhase_FT_Belgium_X_test)$pred
Random_ARIMAX$feature_effect
## Labor cost for LCI (compensation of employees plus taxes minus subsidies)
## -0.29663870
## Unemployment , Males, From 15-64 years, Less than 1 month
## -0.27771467
## Unemployment , Total, From 15-64 years, From 12 to 17 months
## -0.18093817
## Unemployment , Males, From 15-64 years, from 12 to 17 months
## -0.17239828
## Unemployment , Total, From 15-64 years, From 6 to 11 months
## -0.14251591
## Unemployment , Females, From 15-64 years, From 3 to 5 months
## -0.13925477
## Agriculture, forestry and fishing - Employers' social contributions
## -0.13588769
## Unemployment , Females, From 15-64 years, From 24 to 47 months
## -0.11176784
## Unemployment , Males, From 15-64 years, from 18 to 23 months
## -0.09993668
## Unemployment , Total, From 15-64 years, From 24 to 47 months
## -0.07412022
# Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-0.71989958
# Unemployment , Females, From 15-64 years, From 18 to 23 months, effect=-0.54541627
# Unemployment , Females, From 15-64 years, 48 months or over, effect=-0.44230677
# Labor cost other than wages and salaries, effect=-0.32854422
# Unemployment , Males, From 15-64 years, from 6 to 11 months, effect=-0.24511374
# Unemployment , Total, From 15-64 years, From 24 to 47 months, effect=-0.19283037
# Agriculture, forestry and fishing - Employers' social contributions, effect=-0.11994897
# Unemployment , Females, From 15-64 years, From 24 to 47 months, effect=-0.10835175
# Unemployment , Females, From 15-64 years, From 1 to 2 months, effect=-0.09093252
# Unemployment , Total, From 15-64 years, From 18 to 23 months, effect=-0.07427297
Random_ARIMAX$Correlation # -0.15
## [1] -0.3181751
Random_ARIMAX$Mean # [1] 87.74201
## [1] 94.37547
BOOK VI-175 Figure 5.17
## Plot the results of the model ARIMAX fitting
ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ],
start=c(2015,1), end=c(2017, 20), frequency = 20)
length(ts_Y_Belgium_test)
## [1] 60
## png
## 2
plot(forecast(BelgiumARIMA, xreg = X_Belgium_test), # ARIMA forecast
include=60, lwd=4, lty=3, xlab="Time", ylab="GPD Purchasing Power Standards (PPS)",
ylim=c(60, 160),
main = "Spacekime ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n
based on fitting ARIMAX Models on spline interpolated & kime-transformed Belgium data")
lines(pred_arimax_2_0_1_Nil_2015_2017, col = "green", lwd = 4, lty=2) # Belgium Xreg Nil-Phase Reconstructions
lines(pred_arimax_1_0_0_Swapped_2015_2017, col = "purple", lwd = 4, lty=1) # Belgium Xreg Swapped-Phase Reconstructions
lines(pred_arimax_0_0_0_Rand_2015_2017, col = "orange", lwd = 4, lty=1) # Belgium Xreg Random-Phase Reconstructions
lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1) # Observed Y_Test timeseries
legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)",
"ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)",
"ARIMAX(2,0,1) Belgium Xreg Nil-Phase Reconstruction (2015-2017)",
"ARIMAX(1,0,0) Belgium Xreg Swapped-Phase Reconstructions (2015-2017)",
"ARIMAX(0,0,0)(2,0,0)[20] Belgium Xreg Random-Phase Reconstructions (2015-2017)",
"Belgium Official Reported GDP (2015-2017)"),
col=c("black", "blue", "green", "purple", "orange", "red"),
lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7)
text(2013.5, 65, expression(atop(paste("Training Region (2000-2014)"),
paste(Model(GDP) %->% "ARIMAX(p, q, r) ; ",
XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
text(2016.5, 65, expression(atop(paste("Validation Region (2015-2017)"),
paste(hat(GDP) %<-% "ARIMAX(., ., .); ",
XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
All functions used in this part
#' Impute missing values inside a dataset
#'
#' This function is a preliminary way of dealing with NA inside the dataset. This function fills the NA's with column mean and sd
#'@param mat matrix with missing value(NA) inside
cleardata <- function(mat){
mat =
apply(mat, 2, function(column)
sapply(column, function(x)
ifelse(is.na(x),
mean(column,na.rm = T) +
rnorm( sum(is.na(column) ), sd = sd(column, na.rm = T) ),
x) ) )
return(mat)
}
#'use spline regression model to expand the dataset
#'
#'This function is mainly focused on time series dataset. Especially in this case, we apply spline regression model to expand the time series data from seasonal period to monthly, even weekly period. Notice that the length.out parameter in this function is fixed to 360, which will expand the time series to a weekly period. But we can change this parameter to get the time period that we desire.
#'@param mat matrix which contains a time series data. In this function we desire a time series data which has a seasonal period time data.
splinecreate <- function(mat){
res<-NULL
for (i in 1:ncol(mat)) {
sp<-smooth.spline(seq(1:72),mat[,i])
spresult<-predict(sp,seq(from=0,by=1/5,length.out = 360))
spfeat<-spresult$y+rnorm(360,sd=sd(mat[,i]))
res<-cbind(res,spfeat)
}
colnames(res)<-colnames(mat)
return(res)
}
#'Remove features with all missing values
#'
#' This function deletes features that are missing for all time points
#' associated with a certain country
#'@param countryName give a country name that is shown on the original dataset
rmv_miss_ftr = function(countryName = "Bulgaria"){
DataSuperSample =
time_series %>%
filter(country == countryName) %>%
select_if( function(col) sum(is.na(col)) != length(col) ) %>%
select(-time, -country) %>%
as.matrix() %>%
cleardata() %>%
as.data.frame()%>%
# remove feature that only has one value
select_if(function(col) sum(is.na(col)) == 0 ) %>%
# remove feature that all the values are 0
select_if(function(col) sum(col) != 0 ) %>%
splinecreate() %>%
as.data.frame()
return(DataSuperSample)
}
##Concern: The name of country "German" has changed inside the dataset once.
#'Clean and fit ARIMA model
#'
#' This function cleans the data and fits the ARIMA model
#'@param country Give a country name that is shown on the original dataset
#'@param start Select the start year to create the ARIMA model
#'@param end Select the end year to create the ARIMA model
#'@param frequency The number of observations per unit of time. The same in function "ts"
#'@param feature Choose one feature to create the ARIMA model on
Fit_ARIMA <- function(country = 'Belgium',
start = 2000, end = 2017, frequency = 20,
feature="Unemployment , Females, From 15-64 years, Total")
{
# delete features that are missing for all time points
DataSuperSample = rmv_miss_ftr(countryName = country)
if (feature=="Unemployment , Females, From 15-64 years, Total") {
Y = select(DataSuperSample, "Unemployment , Females, From 15-64 years, Total")
X = select(DataSuperSample, -starts_with("Unemployment"))
} else if (feature=="Gross domestic product at market prices") {
Y = select(DataSuperSample, "Gross domestic product at market prices"); dim(Y)
X = select(DataSuperSample, -matches("debt|Debt")); dim(X) # 360 167
print(paste0("dim(X)=(", dim(X)[1], ",", dim(X)[2], "); ",
" dim(Y)=(", dim(Y)[1], ",", dim(Y)[2], ") ..."))
# ensure full-rank design matrix, X
X <- X[, qr(X)$pivot[seq_len(qr(X)$rank)]]; dim(X)
}
else {
print(paste0("This feature ", feature, " is not imlemented yet! Exiting Fit_ARIMA() method ..."))
return(NULL)
}
ts_Y <- ts(Y, start=c(start, 1), end=c(end, frequency),
frequency = frequency); length(ts_Y)
set.seed(1234)
fitArimaX = auto.arima(ts_Y, xreg=as.matrix(X) )
return(fitArimaX)
}
#'Preprocess dataset of given countries to ensure full rank
#'
#'Extend the Fit-ARIMA method to ensure testing-training
#'modeling/assessment for 2 countries works
#'
#'@param country Give a country name that is shown on the original dataset
#'@param start Select the start year to create the ARIMA model
#'@param end Select the end year to create the ARIMA model
#'@param frequency The number of observations per unit of time. The same in function "ts"
#'@param feature Choose one feature to create the ARIMA model on
preprocess_ARIMA <-
function(country='Belgium',start=2000, end=2017, frequency=20,
feature="Unemployment , Females, From 15-64 years, Total")
{
# delete features that are missing for all time points
DataSuperSample = rmv_miss_ftr(countryName = country)
print(paste0("Processing feature: ...", feature, "... "))
if (feature == "Unemployment , Females, From 15-64 years, Total") {
Y = select(DataSuperSample,
"Unemployment , Females, From 15-64 years, Total")
X = select(DataSuperSample,
-starts_with("Unemployment"))
} else if (feature == "Gross domestic product at market prices") {
Y = select(DataSuperSample,
"Gross domestic product at market prices"); dim(Y)
X = select(DataSuperSample,
-matches("debt|Debt") )
X <- X [, -c(50:80)]; dim(X) # 360 167
} else {
print(paste0("This feature: ...",
feature,
"... is not imlemented yet! Exiting preprocess_ARIMA() method ..."))
return(NULL)
}
# reduce the number of observations (matrix rows) to specified timerange
len_1 <- (end + 1 - start) * frequency; print(paste0("dim(X)[1]=", len_1))
X <- X[1:len_1 , qr(X[1:len_1 , ])$pivot[seq_len(qr(X[1:len_1 , ])$rank)]]; dim(X)
# ensure full-rank design matrix, X
Y <- as.data.frame(Y[1:len_1 , ])
print(paste0("dim(X)=(", dim(X)[1], ",", dim(X)[2], "); ", # 300 136
" dim(Y)=(", dim(Y)[1], ",", dim(Y)[2], ") ...")) # 300 1
return( list("X"=X, "Y"=Y) )
}
#' General function that ensures the XReg predictors for 2 countries are homologous
#'
#'@param X_Country1 value inhereted from previous function "preprocess_ARIMA"
#'@param X_Country2 value inhereted from previous function "preprocess_ARIMA"
homologousX_features <- function (X_Country1, X_Country2){
# Check if Xreg for Belgium and Bulgaria are homologous (same feature columns)
common_cols <- intersect(colnames(X_Country1), colnames(X_Country2))
X_Country1 <- subset(X_Country1, select = common_cols)
X_Country2 <- subset(X_Country2, select = common_cols)
print(paste0("dim(X1)=(", dim(X_Country1)[1], ",", dim(X_Country1)[2], "); ", # 300 131
" dim(X2)=(", dim(X_Country2)[1], ",", dim(X_Country2)[2], ")!")) # 300 131
return(list("X_Country1"=X_Country1, "X_Country2"=X_Country2))
}
#'Set up ARIMAX model with homologous features
#'
#'This function is based on two previous functions "preprocess_ARIMA" and "homologousX_features". It gets the homologous features between two countries and create ARIMAX model on them.
#'
#'@param country1 Give a country name that is shown on the original dataset, ARIMAX model will be created based on the data of this country
#'@param country2 Give a country name that is shown on the original dataset, this country data will only be used to get the homologous features with country 1
#'@param start Select the start year to create the ARIMA model
#'@param end Select the end year to create the ARIMA model
#'@param frequency The number of observations per unit of time. The same in function "ts"
#'@param feature Choose one feature to create the ARIMA model on
fit_ARIMA <-
function(country1='Belgium', country2= 'Bulgaria',
start=2000, end=2014, frequency=20,
feature="Gross domestic product at market prices"){
# This function
preprocess_Country1 <- preprocess_ARIMA(country = country1,
start=start, end=end, frequency=frequency, feature=feature)
preprocess_Country2 <- preprocess_ARIMA(country = country2,
start=start, end=end, frequency=frequency, feature=feature)
ts_Y_Country1 <- ts(preprocess_Country1$Y, start=c(start, 1),
end=c(end, frequency), frequency = frequency); length(ts_Y_Country1)
homoFeat <- homologousX_features(preprocess_Country1$X, preprocess_Country2$X)
X_Country1 <- homoFeat$X_Country1
X_Country2 <- homoFeat$X_Country2
set.seed(1234)
fitArimaX_Country1 = auto.arima(ts_Y_Country1, xreg=as.matrix(X_Country1))
return(fitArimaX_Country1)
}
#'1D timeseries FFT SHIFT
fftshift1D <- function(img_ff) {
rows <- length(img_ff)
rows_half <- ceiling(rows/2)
return(append(img_ff[(rows_half+1):rows], img_ff[1:rows_half]))
}
##This function isn't really being used in our report
#'K-space transformation
#'
#' Generic function to Transform Data ={all predictors (X) and outcome (Y)} to k-space (Fourier domain)
#' For ForwardFT, set parameters as (rawData, FALSE, NULL)
#' For InverseFT, there are two parameters setting: (magnitudes, TRUE, reconPhasesToUse) or (FT_data, TRUE, NULL)
#' @param data dataset that needs K-space transformation
kSpaceTransform <- function(data, inverse = FALSE, reconPhases = NULL) {
# ForwardFT (rawData, FALSE, NULL)
# InverseFT(magnitudes, TRUE, reconPhasesToUse) or InverseFT(FT_data, TRUE, NULL)
FT_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
mag_FT_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
phase_FT_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
IFT_reconPhases_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
for (i in 1:dim(data)[2]) {
if (inverse == FALSE | is.null(reconPhases)) {
FT_data[ , i] <- fft(data[ , i], inverse)
X2 <- FT_data[ , i]
# plot(fftshift1D(log(Re(X2)+2)), main = "log(fftshift1D(Re(FFT(timeseries))))")
mag_FT_data[ , i] <- sqrt(Re(X2)^2+Im(X2)^2);
# plot(log(fftshift1D(Re(mag_FT_MCSI_data))), main = "log(Magnitude(FFT(timeseries)))")
phase_FT_data[ , i] <- atan2(Im(X2), Re(X2));
# plot(Re(fftshift1D(phase_FT_MCSI_data[ , 1])), main = "Shift(Phase(FFT(timeseries)))")
}
else { # for IFT synthesis using user-provided Phases, typically from kime-phase aggregators
Real <- data[ , i] * cos(reconPhases[ , i])
Imaginary <- data[ , i] * sin(reconPhases[ , i])
IFT_reconPhases_data[ ,i] <-
Re(fft(Real+1i*Imaginary, inverse = TRUE)/length(data[ , i]))
}
}
######### Test the FT-IFT analysis-synthesis back-and-forth transform process
# to confirm calculations
# X2 <- FT_data[ , 1]; mag_FT_data[ , 1] <- sqrt(Re(X2)^2+Im(X2)^2);
# phase_FT_data[ , 1] <- atan2(Im(X2), Re(X2));
# Real2 = mag_FT_data[ , 1] * cos(phase_FT_data[ , 1])
# Imaginary2 = mag_FT_data[ , 1] * sin(phase_FT_data[ , 1])
# man_hat_X2 = Re(fft(Real2 + 1i*Imaginary2, inverse = T)/length(X2))
# ifelse(abs(man_hat_X2[5] - data[5, 1]) < 0.001, "Perfect Syntesis", "Problems!!!")
#########
if (inverse == FALSE | is.null(reconPhases)) {
return(list("magnitudes"=mag_FT_data, "phases"=phase_FT_data))
# Use kSpaceTransform$magnitudes & kSpaceTransform$phases to retrieve teh Mags and Phases
}
else {
return(IFT_reconPhases_data)
# Use Re(kSpaceTransform) to extract spacetime Real-valued reconstructed data
}
}
#' Reshape the title
#'
#' @param titl vector contains titles that need to be reshaped
changetitle <- function(titl) {
newtitle<-rep(NA,length(titl))
for (i in 1:length(titl)) {
tempchar<-substring(titl[i],1:nchar(titl[i]),1:nchar(titl[i]))
for (j in 1:8) {
k=25*j
tempchar[which(tempchar==" ")[which(tempchar==" ")>k][1]]<-"\n"
}
tempchar<-paste(tempchar,collapse = "")
newtitle[i]<-tempchar
}
return(newtitle)
}
#' Creating plot result for K-space transformation
#'
#' This function can choose to generate all plots related to the K-space transformation. Or we could choose features code to select features we like to see to generate K-space transformation result.
#'
#'@param dataFT data inherited from function "kSpaceTransform", K-space transformed dataset.
#'@param dataColnames feature names for the transformed dataset
#'@param size size of the text inside the plot
#'@param option two options avaliable: "All": to plot for all features, "Select": to select several features you wish to show
#'@param select_feature if "option" is setted as "Select", then put inside a sequence to indicate the features you wish to show with this function.
plotPhaseDistributions <- function (dataFT, dataColnames, size=10, option="ALL",select_feature=NULL,...) {
df.phase <- as.data.frame(Re(dataFT$phases))
df.phase %>% gather() %>% head()
colnames(df.phase) <- dataColnames
phaseDistributions <- gather(df.phase)
colnames(phaseDistributions) <- c("Feature", "Phase")
if (is.null(size)) size=10
# map the value as our x variable, and use facet_wrap to separate by the key column:
if(option=="ALL"){
ggplot(phaseDistributions, aes(Phase)) +
# geom_histogram(bins = 10) +
geom_histogram(aes(y=..density..), bins = 10) +
facet_wrap( ~Feature, scales = 'free_x') +
xlim(-pi, pi) +
theme(strip.text.x = element_text(size = size, colour = "black", angle = 0))}
else if(option=="Select"){
choosefeat<-dataColnames[select_feature]
NphaseDistributions<-phaseDistributions[which(phaseDistributions$Feature %in% choosefeat),]
ggplot(NphaseDistributions, aes(Phase)) +
# geom_histogram(bins = 10) +
geom_histogram(aes(y=..density..), bins = 10) +
facet_wrap( ~Feature, scales = 'free_x') +
xlim(-pi, pi) +
theme(strip.text.x = element_text(size = size, colour = "black", angle = 0))}
else{
return("Wrong input on option, Please try again")
}
}
#'Conduct three different phase synthesis methods and create ARIMAX model based on different method.
#'
#'This function can reconstruct the dataset based on three phase synthesis methods: Nil-Phase Synthesis, Swapped-Phase Synthesis and Random-Phase Synthesis. Also it can construct different ARIMAX model based on the phase synthesis method chosen.
#'
#'@param X inherited from function "homologousX_features", we can get parameter X by "homologousX_features(country1,country2)$$X_Country1"
#'@param Y inherited from function "preprocess_ARIMA", we can get parameter Y by "preprocess_ARIMA$Y"
#'@param ts_Y_test time series vector, must be set from function "ts" and must be created based on testing set
#'@param option choose the phase synthesis method used in the reconstruction or ARIMAX model. Three options are valid: "Nil-Phase", "Swapped-Phase", "Random-Phase"
#'@param result choose the result shown by this function. Two options are valid: "ARIMAX": this will show the ARIMAX model created by the chosen phase synthesis method. "reconstruct": this will show the reconstruction procedure based on the chosen phase synthesis method, but the ARIMAX model won't be built.
#'@param rename_Y rename the return result of Y
#'
#'@return if the "result" parameter is setted as "reconstruct", then the reconstruction result will be shown. if setted as "ARIMAX", then the detailed ARIMAX model will be built.
PS_ARIMAX_remodel <- function(X,Y,ts_Y_test,option="Nil-Phase",result="ARIMAX",rename_Y="Y_GDP_Belgium") {
temp_data<-cbind(X,Y)
FT<-kSpaceTransform(temp_data, FALSE, NULL)
if(option=="Nil-Phase"){
nilPhase_FT_data <- array(complex(real=0, imaginary=0), c(dim(temp_data)[1], dim(temp_data)[2]))
##IFT_NilPhase_FT <- array(complex(), c(dim(temp_data)[1], dim(temp_data)[2]))
# Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal with nil-phase
IFT_NilPhase_FT<- Re(kSpaceTransform(FT$magnitudes, TRUE, nilPhase_FT_data))
colnames(IFT_NilPhase_FT) <- c(colnames(X), rename_Y)
#result
if(result=="reconstruct"){
return(list(
dim(nilPhase_FT_data),dim(IFT_NilPhase_FT), dim(FT$magnitudes),
colnames(IFT_NilPhase_FT), head(IFT_NilPhase_FT),IFT_NilPhase_FT)) # head(temp_Data)
}
#rename
IFT_FT<-IFT_NilPhase_FT
}
else if(option=="Swapped-Phase"){
swapped_phase_FT_data <- FT$phases
colnames(swapped_phase_FT_data) <- c(colnames(X), rename_Y)
swapped_phase_FT_data1 <- swapped_phase_FT_data
IFT_SwappedPhase_FT <- array(complex(), c(dim(temp_data)[1], dim(temp_data)[2]))
set.seed(12345) # sample randomly Phase-columns for each of the 131 covariates (X)
#swap_phase_FT_Belgium_indices <- sample(ncol(swapped_phase_FT_Belgium_data)-1)
# for (j in 1:131) { # for all coluns of the design Xreg matrix, excluding Y, randomly swap columns phases
# swapped_phase_FT_Belgium_data1[ , j] <- swapped_phase_FT_Belgium_data[, swap_phase_FT_Belgium_indices[j]]
#}
swapped_phase_FT_data1 <- as.data.frame(cbind(
swapped_phase_FT_data[ , sample(ncol(swapped_phase_FT_data[ , 1:(ncol(swapped_phase_FT_data)-1)]))],
swapped_phase_FT_data[ , ncol(swapped_phase_FT_data)]))
swapped_phase_FT_data <- swapped_phase_FT_data1
colnames(swapped_phase_FT_data)[ncol(swapped_phase_FT_data)] <- rename_Y
colnames(swapped_phase_FT_data)
# Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal using the feature swapped phases
IFT_SwappedPhase_FT <- Re(kSpaceTransform(FT$magnitudes, TRUE, swapped_phase_FT_data))
colnames(IFT_SwappedPhase_FT) <- c(colnames(X), rename_Y)
#result
if(result=="reconstruct"){
return(list(
dim(swapped_phase_FT_data) ,
# [1] 360 132
dim(swapped_phase_FT_data), dim(FT$phases),
dim(IFT_SwappedPhase_FT), dim(FT$magnitudes),
colnames(IFT_SwappedPhase_FT), tail(IFT_SwappedPhase_FT),IFT_SwappedPhase_FT)) # tail(temp_Data)
}
#rename
IFT_FT<-IFT_SwappedPhase_FT
}
else if(option=="Random-Phase"){
#randPhase_FT_data <- array(complex(), c(dim(temp_data)[1], dim(temp_data)[2]))
IFT_RandPhase_FT <- array(complex(), c(dim(temp_data)[1], dim(temp_data)[2]))
randPhase_FT_data <- FT$phases
for (i in 1:(dim(randPhase_FT_data)[2] -1)) {
if (i < dim(randPhase_FT_data)[2]) {
set.seed(12345) # sample randomly Phases for each of the 131 predictors covariates (X)
randPhase_FT_data[ , i] <- FT$phases[sample(nrow(FT$phases)), i]
} else { } # for the Y outcome (Last Column) - do not change the phases of the Y
}
# Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal with avg-phase
IFT_RandPhase_FT <- Re(kSpaceTransform(FT$magnitudes, TRUE, randPhase_FT_data))
colnames(IFT_RandPhase_FT) <- c(colnames(X), rename_Y)
#result
if(result=="reconstruct"){
return( list(
dim(randPhase_FT_data), # ; head(randPhase_FT_data)
# [1] 360 132
dim(IFT_RandPhase_FT), dim(FT$magnitudes) ,
colnames(IFT_RandPhase_FT), tail(IFT_RandPhase_FT), # tail(temp_Data)
dim(IFT_RandPhase_FT), head(Re(IFT_RandPhase_FT)), tail(Re(IFT_RandPhase_FT))))
}
#rename
IFT_FT<-IFT_RandPhase_FT
}
if(result=="ARIMAX"){
#construction of time-series analysis
# Perform ARIMAX modeling on IFT_NilPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC
# library(forecast)
IFT_FT_Y_train <- IFT_FT[1:300, ncol(IFT_FT)]
IFT_FT_Y_test <- IFT_FT[301:360]
# Training and Testing Data Covariates explaining the longitudinal outcome (Y)
IFT_FT_X_train <- as.data.frame(IFT_FT)[1:300, 1:(ncol(IFT_FT)-1)]; dim(IFT_FT_X_train)
IFT_FT_X_test <- as.data.frame(IFT_FT)[301:360, 1:(ncol(IFT_FT)-1)]; dim(IFT_FT_X_test)
# Outcome Variable to be ARIMAX-modelled, as a timeseries
ts_IFT_FT_Y_train <-
ts(IFT_FT_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20)
set.seed(1234)
modArima_IFT_FT_Y_train <-
auto.arima(ts_IFT_FT_Y_train, xreg=as.matrix(IFT_FT_X_train))
pred_arimax <- forecast(modArima_IFT_FT_Y_train, xreg = as.matrix(IFT_FT_X_test))
pred_arimax_2015_2017 <-
ts(pred_arimax$mean, frequency=20, start=c(2015,1), end=c(2017,20))
# alternatively:
# pred_arimax_1_0_1_2015_2017 <- predict(modArima_IFT_NilPhase_FT_Belgium_Y_train,
# n.ahead = 3*20, newxreg = IFT_NilPhase_FT_Belgium_X_test)$pred
sort(modArima_IFT_FT_Y_train$coef)[1:10]
# Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-1.5972295
# ar1, effect=-1.4231617
# Labor cost other than wages and salaries, effect=-1.2213214
# ma1, effect=-0.9869571
# ar2, effect=-0.8591937
# Unemployment , Females, From 15-64 years, From 12 to 17 months, effect=-0.7075454
# Unemployment , Total, From 15-64 years, From 18 to 23 months, effect=-0.5797656
# Unemployment , Males, From 15-64 years, from 3 to 5 months, effect=-0.5026139
# sar2, effect=-0.3450866
# Unemployment , Males, From 15-64 years, from 24 to 47 months, effect=-0.2965540
return(list(
training_set_length=length(IFT_FT_Y_train),
testing_set_length=length(IFT_FT_Y_test),
AR_MA=modArima_IFT_FT_Y_train$arma,
ARIMAX_model=pred_arimax_2015_2017,
feature_effect=sort(modArima_IFT_FT_Y_train$coef)[1:10],
Correlation=cor(pred_arimax$mean, ts_Y_test),
Mean=mean(pred_arimax_2015_2017)))}
else{
print(paste0("Wrong input for option or result, please try again.") )
return(NULL)
}
}