load("EU_Econ_SpaceKime.RData")

1 Data Preprocessing

library(dplyr)
library(arm)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(plot3D)
library(scatterplot3d)
library(plotly)
library(fastDummies)
library(forecast)

Load the data and preprocess the data.

#setwd("E:/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics")
eu <- read.csv("Master_Aggregate_EU_Econ_Data_11_29_2018_TimeTransform.csv", stringsAsFactors = F)[,-5]
colnames(eu) <- c("country","time","feature","value")
eu <- filter(eu,!country %in% c("European Union (25 countries)","D1_Country",""))
eu$value <- sapply(c(1:nrow(eu)),function(x) as.numeric(gsub(":|,","",eu$value[x])))
eu <- filter(eu, feature != "")
dim(eu)

2 Reformat the data into a 3D array (country * feature * time)

unq_country <- sort(unique(eu$country))
unq_time <- sort(unique(eu$time))
unq_fea <- sort(unique(eu$feature))
num_country <- length(unq_country)
num_time <- length(unq_time)
num_fea <- length(unq_fea)
eu <- arrange(eu,country,time,feature)
eu_3d_array <- array(NA,dim = c(num_country,num_time,num_fea),dimnames = list(unq_country,unq_time,unq_fea))
for (i in 1:num_country){
  for (j in 1:num_time){
    for (k in 1:num_fea){
      eu_3d_array[i,j,k] = eu$value[(i-1)*num_time*num_fea + (j-1)*num_fea + k]
    }
  }
}
eu_3d_array[1:10,1:10,1]

#Data visualization

eu <- arrange(eu,time,feature,country)
eu_visualization <- select(eu,time,feature,country,value)
eu_visualization$time <- sapply(c(1:nrow(eu_visualization)),function(x) as.numeric(gsub("Q",".",eu_visualization$time[x])))
eu_visualization$feature <- as.factor(eu_visualization$feature)
eu_visualization$country <- as.factor(eu_visualization$country)
eu_visualization$value <- as.numeric(eu_visualization$value)
eu_visualization$feature <- sapply(c(1:nrow(eu_visualization)),function(x) substr(eu_visualization$feature[x],1,20))
plot_ly(eu_visualization, x = ~time, y = ~country, z = ~value, color = ~feature,split = ~ country,type = 'scatter3d', mode = 'lines')
plot_ly(eu_visualization, x = ~time, y = ~country, z = ~value, color = ~feature,split = ~ country,type = 'scatter3d', mode = 'lines')

3 Time series format

#Find the duplicates
eu_time_series <- na.omit(eu)
allFeatures = as.character(unique(eu_time_series$feature))
allTime = unique(eu_time_series$time)
allCountry = as.character(unique(eu_time_series$country))
allCombination = length(allFeatures)*length(allTime)*length(allCountry)
dup = c()
for (i in 1:length(allFeatures)){
  for (j in 1:length(allCountry)){
    for (k in 1:length(allTime)){
      if (nrow(filter(eu_time_series,country == allCountry[j] & feature == allFeatures[i] & time == allTime[k]))>1){
        dup = c(dup,as.character(allFeatures[i]))
        break
      }
    }
    break
  }
}
dup #These features have mutiple observations at the same time point

Remove duplicates.

removeDup = filter(eu_time_series, feature != "Employment by sex, age and educational attainment level, Total, From 15 to 64 years, All ISCED 2011 levels" &
                     feature != "Labor cost for LCI excluding bonuses" &
                     feature != "Labor costs other than wages or salaries" &
                     feature != "Labour cost for LCI (compensation of employees plus taxes minus subsidies)" &
                     feature != "Labour cost for LCI excluding bonuses" &
                     feature != "Labour costs other than wages and salaries" &
                     feature != "Wages and salaries (total)")
time_series = spread(removeDup,feature,value)
dim(time_series)

4 Fit the ARIMA model for each country

4.1 Belgium

Extract Belgium data.

#Chose Belgium and fit the arima
belgium = filter(time_series,country == "Belgium")
belgium = belgium[, colSums(is.na(belgium)) != nrow(belgium)] #delete the feature that is missing at all the time point
belgium = select(belgium,-time,-country)
dim(belgium)

Manually clean (preprocess) the Belgium data and fit ARIMAX model \[Y = BelguimSuperSample\$'Unemployment,\ Females,\ From\ 15-64\ years,\ Total',\]

\[X = XReg\ (all\ other\ covariates\ -starts\_with('Unemployment')).\]

#super sample the dataset
cleardata <- function(mat) {
  for (i in 1:ncol(mat)) {
    mat[is.na(mat[,i]),i]<-mean(mat[,i],na.rm = T) + rnorm(sum(is.na(mat[,i])),sd = sd(mat[,i],na.rm = T))
  }
  return(mat)
}

#use spline regression model to expand the dataset
splinecreate <- function(mat) {
  res<-NULL
  for (i in 1:ncol(mat)) {
    sp<-smooth.spline(seq(1:72),mat[,i])
    spresult<-predict(sp,seq(from=0,by=1/5,length.out = 360))
    spfeat<-spresult$y+rnorm(360,sd=sd(mat[,i]))
    res<-cbind(res,spfeat)
  }
  colnames(res)<-colnames(mat)
  return(res)
}
BelguimMatrix = as.matrix(belgium)
BelguimMatrix = cleardata(BelguimMatrix)
BelguimMatrix = splinecreate(BelguimMatrix)
BelguimSuperSample = as.data.frame(BelguimMatrix)

#############################
# Outcome Features to examine, predict, forecast, spacekime-analyze
#   "Gross domestic product at market prices"
#   "Unemployment , Females, From 15-64 years, Total"
#   "Capital transfers, payable"       
#   "Capital transfers, receivable"
#   "Debt securities"
#   "Government consolidated gross debt"
#   ... View(colnames(BelguimMatrix))
############################# 

################################################
#    "Unemployment , Females, From 15-64 years, Total"
Y = dplyr::select(BelguimSuperSample, "Unemployment , Females, From 15-64 years, Total"); dim(Y)
X = dplyr::select(BelguimSuperSample, -starts_with("Unemployment")); dim(X)
fitArimaX = auto.arima(Y, xreg=data.matrix(X[,-112])); fitArimaX$arma

################################################
#    "Government consolidated gross debt"
#Y = select(BelguimSuperSample, "Government consolidated gross debt"); dim(Y)
#X = select(BelguimSuperSample, -matches("debt|Debt")); dim(X)
# X_scale <- scale(X); Matrix::rankMatrix(X_scale); any(is.na(X_scale)); any(is.infinite(X_scale))
#     remove columns with infinite or missing value  
# X_scale_1 <- X_scale[ , !is.infinite(colSums(X_scale)) & !is.na(colSums(X_scale))]
#X_1 <- X[ , !is.infinite(colSums(X)) & !is.na(colSums(X))]; dim(X_1); Matrix::rankMatrix(X_1)
#X_2 <- X_1[, qr(X_1)$pivot[seq_len(qr(X_1)$rank)]]; dim(X_2)
# 
#fitArimaX = auto.arima(Y, xreg=X_2, method = "CSS", # "SANN", method = "CSS", optim.method = "BFGS"
#          optim.control=list(maxit = 20000, temp = 20), optim.method = "BFGS")
#fitArimaX; View(sort(fitArimaX$coef)[1:10]); fitArimaX$arma

################################################
#   "Gross domestic product at market prices"
Y = select(BelguimSuperSample, "Gross domestic product at market prices"); dim(Y)
X = select(BelguimSuperSample, -matches("debt|Debt")); dim(X)  # 360 167
X <- X[, qr(X)$pivot[seq_len(qr(X)$rank)]]; dim(X)
ts_Y <- ts(Y, start=c(2000,1), end=c(2017, 20), frequency = 20); length(ts_Y)
set.seed(1234)
fitArimaX = auto.arima(ts_Y, xreg=data.matrix(X[ , -c(50:60)])); fitArimaX$arma
# 5  0  2  0 20  0  0
# sigma^2 estimated as 57.04:  log likelihood=-1132.78 AIC=2593.55   AICc=2871.09   BIC=3230.88
pred_arimaX_5_0_2_Y_Belgium_train300_Belgium_test60 <- 
predict(fitArimaX, n.ahead = 60, newxreg = data.matrix(X[301:360 , -c(50:60)]))$pred

plot(forecast(fitArimaX, xreg = data.matrix(X[301:360 , -c(50:60)])),         # ARIMA forecast
     include=120, lwd=4, lty=3, xlab="Time", ylab="GDP", ylim=c(50, 150),
     main = "ARIMAX Analytics (Train: 2000-2017; Test: 2018-2020) GDP Forecasting\n
      based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
 lines(pred_arimaX_5_0_2_Y_Belgium_train300_Belgium_test60, col = "red", lwd = 4, lty=3) 
 legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2017)", 
                        "ARIMAX(5,0,2)-model GDP Forecasting (2018-2020)",
                        "ARIMAX(5,0,2)-model GDP Forecasting (2018-2020)"),
       col=c("black", "blue", "red"), 
       lty=c(3,3,3), lwd=c(4,4,4), cex=1.2, x.intersp=1.5, y.intersp=0.6)
text(2015, 60, expression(atop(paste("Training Region (2000-2017)"), 
                paste(Model(Unempl) %->% "ARIMAX(p, q, r) ;  ", 
                      XReg %==% X[i], " ", i %in% {1 : 167}))), cex=1.5)
text(2019.5, 60, expression(atop(paste("Validation Region (2018-2020)"), 
        paste(hat(Unempl) %<-% "ARIMAX(5,0 ,2); ", 
              XReg %==% X[i], " ", i %in% {1 : 167}))), cex=1.5)

GDP Description: GDP (gross domestic product) is an indicator for a nation?s economic situation. It reflects the total value of all goods and services produced less the value of goods and services used for intermediate consumption in their production. Expressing GDP in PPS (purchasing power standards) eliminates differences in price levels between countries, and calculations on a per head basis allows for the comparison of economies significantly different in absolute size.

GDP unit of measure represents the Current prices, euro per capita. The volume index of GDP per capita in Purchasing Power Standards (PPS) is intended for cross-country comparisons rather than for temporal comparisons. GDP per capita when expressed in PPS eliminates the differences in price levels between countries allowing meaningful volume comparisons of GDP between countries. Expressed in relation to the European Union (EU27 GDP = 100), a country with an index that is higher than $100$ or lower than $100$ correspond to that this country’s level of GDP per head being higher or lower than the EU average, respectively.

4.2 Bulgaria

Define a new function that (1) cleans the data, and (2) fits the ARIMA model estimating the seasonal and non-seasonal time-series parameters $(p,d,q)$ and the effect-sizes ($\beta$’s) for the exogenous regression features ($X$).

#write a function of clean the data and fit the ARIMA model
Fit_ARIMA <- function(countryData=Belgium, start=2000, end=2017, frequency=20,
                     feature="Unemployment , Females, From 15-64 years, Total")
{
  #delete features that are missing at all time points
  countryData = countryData[, colSums(is.na(countryData)) != nrow(countryData)]
  countryData = select(countryData, -time, -country)
  DataMatrix = as.matrix(countryData)
  DataMatrix = cleardata(DataMatrix)
  DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove feature that only has one value
  DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove feature that all the values are 0
  DataMatrix = splinecreate(DataMatrix)
  DataSuperSample = as.data.frame(DataMatrix)
  if (feature=="Unemployment , Females, From 15-64 years, Total") {
      Y = select(DataSuperSample, "Unemployment , Females, From 15-64 years, Total")
      X = select(DataSuperSample, -starts_with("Unemployment"))
  } else if (feature=="Gross domestic product at market prices") {
    Y = select(DataSuperSample, "Gross domestic product at market prices"); dim(Y)
    X = select(DataSuperSample, -matches("debt|Debt")); dim(X)  # 360 167
    print(paste0("dim(X)=(", dim(X)[1], ",", dim(X)[2], ");  ",
                 " dim(Y)=(", dim(Y)[1], ",", dim(Y)[2], ") ..."))
    X <- X[, qr(X)$pivot[seq_len(qr(X)$rank)]]; dim(X)  # ensure full-rank design matrix, X
  }
  else {
    print(paste0("This feature ", feature, " is not imlemented yet! Exiting Fit_ARIMA() method ...")) 
    return(NULL)
  }
  ts_Y <- ts(Y, start=c(start, 1), end=c(end, frequency), frequency = frequency); length(ts_Y)
  set.seed(1234)
  fitArimaX = auto.arima(ts_Y, xreg=data.matrix(X))
  return(fitArimaX)
}
Bulgaria = filter(time_series,country == "Bulgaria")
BulgariaARIMA = Fit_ARIMA(countryData=Bulgaria, start=2000, end=2017, frequency=20, 
                     feature="Gross domestic product at market prices")
BulgariaARIMA$arma

# Extend the Fit-ARIMA method to ensure testing-training modeling/assessment for 2 countries works
preprocess_ARIMA <- function(countryData=Belgium, start=2000, end=2017, frequency=20,
                     feature="Unemployment , Females, From 15-64 years, Total")
{
  #delete features that are missing at all time points
  countryData = countryData[, colSums(is.na(countryData)) != nrow(countryData)]
  countryData = select(countryData, -time, -country)
  DataMatrix = as.matrix(countryData)
  DataMatrix = cleardata(DataMatrix)
  DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove features with only 1 value
  DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove features with all values=0
  DataMatrix = splinecreate(DataMatrix)
  DataSuperSample = as.data.frame(DataMatrix) # super-Sample the data
  print(paste0("Processing feature: ...", feature, "... "))
      
  if (feature=="Unemployment , Females, From 15-64 years, Total") {
      Y = select(DataSuperSample, "Unemployment , Females, From 15-64 years, Total")
      X = select(DataSuperSample, -starts_with("Unemployment"))
  } else if (feature=="Gross domestic product at market prices") {
      Y = select(DataSuperSample, "Gross domestic product at market prices"); dim(Y)
      X = select(DataSuperSample, -matches("debt|Debt")); 
      X <- X [, -c(50:80)]; dim(X)  # 360 167
  } else {
    print(paste0("This feature: ...", feature, "... is not imlemented yet! Exiting preprocess_ARIMA() method ...")) 
    return(NULL)
  }
  
  # reduce the number of observations (matrix rows) to specified timerange
  len_1 <- (end + 1 - start) * frequency; print(paste0("dim(X)[1]=", len_1))
  X <- X[1:len_1 , qr(X[1:len_1 , ])$pivot[seq_len(qr(X[1:len_1 , ])$rank)]]; dim(X)  
  # ensure full-rank design matrix, X
  Y <- as.data.frame(Y[1:len_1 , ])
  print(paste0("dim(X)=(", dim(X)[1], ",", dim(X)[2], ");  ",         # 300 136
                 " dim(Y)=(", dim(Y)[1], ",", dim(Y)[2], ") ..."))    # 300 1
  return(list("X"=X, "Y"=Y))
}

# Outcome Variable to be modelled, as a timeseries: 2000 - 2017 (18 years, Quarterly measures)
# Spline interpolation *5;   2000-01 - 2014-20 (300 observations for training): 60 observations (2015-2017) for Testing

Belgium  <- filter(time_series, country == "Belgium")
Bulgaria <- filter(time_series, country == "Bulgaria")
Netherlands <- filter(time_series, country == "Netherlands")
# Test preprocess_ARIMA()
#preprocess_Belgium <- preprocess_ARIMA(countryData=Belgium, start=2000, end=2014,
#                frequency=20, feature="Gross domestic product at market prices")
#preprocess_Bulgaria <- preprocess_ARIMA(countryData=Bulgaria, start=2000, end=2014,
#                frequency=20, feature="Gross domestic product at market prices")


# General function that ensures the XReg predictors for 2 countries are homologous
homologousX_features <- function (X_Country1, X_Country2){
  # Check if the Belgium and Bilgaria Xreg are homologous (same feature columns)
  common_cols <- intersect(colnames(X_Country1), colnames(X_Country2))
  X_Country1 <- subset(X_Country1, select = common_cols)
  X_Country2 <- subset(X_Country2, select = common_cols)
  print(paste0("dim(X1)=(", dim(X_Country1)[1], ",", dim(X_Country1)[2], ");  ", # 300 131
              " dim(X2)=(", dim(X_Country2)[1], ",", dim(X_Country2)[2], ")!"))  # 300 131
  return(list("X_Country1"=X_Country1, "X_Country2"=X_Country2))
}
# Test homologousX_features
# homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
# X_Belgium  <- homoFeat$X_Country1
# X_Bulgaria <- homoFeat$X_Country2

fit_ARIMA <- function(country1Data=Belgium, country2Data=Bulgaria, 
                     start=2000, end=2014, frequency=20,
                     feature="Gross domestic product at market prices") {
  preprocess_Country1 <- preprocess_ARIMA(countryData=country1Data, 
                    start=start, end=end, frequency=frequency, feature=feature)
  preprocess_Country2 <- preprocess_ARIMA(countryData=country2Data, 
                    start=start, end=end, frequency=frequency, feature=feature)
  ts_Y_Country1 <- ts(preprocess_Country1$Y, start=c(start, 1), 
            end=c(end, frequency), frequency = frequency); length(ts_Y_Country1)
  
  homoFeat <- homologousX_features(preprocess_Country1$X, preprocess_Country2$X)
  X_Country1  <- homoFeat$X_Country1
  X_Country2 <- homoFeat$X_Country2
  
  set.seed(1234)
  fitArimaX_Country1 = auto.arima(ts_Y_Country1, xreg=data.matrix(X_Country1))
  return(fitArimaX_Country1)
}

# Belgium = filter(time_series,country == "Belgium")
BelgiumARIMA = fit_ARIMA(country1Data=Belgium, 
                    country2Data=Bulgaria, # country2Data=Netherlands,
                    start=2000, end=2014, frequency=20,
                    feature="Gross domestic product at market prices")
BelgiumARIMA$arma    # [1]  4  0  2  0 20  0  0

# sigma^2 estimated as 45.99:  log likelihood=-919.13 AIC=2116.26 AICc=2359.51 BIC=2631.09

# Outcome Variable to be modelled, as a timeseries: 2000 - 2017 (18 years, Quarterly measures)
# Spline interpolation x5;   2000-01 - 2014-20 (300 observations for training)
# ts_Y_Belgium_train <- ts(Y_Belgium_train, start=c(2000,1), end=c(2014, 20), frequency=20)
# length(ts_Y_Belgium_train)
# ts_Y_Belgium_test <- ts(Y_Belgium_test, start=c(2015,1), end=c(2017, 20), frequency = 20)
#length(ts_Y_Belgium_test)
# Find ARIMAX model
# arimaX_Belgium_train <- auto.arima(ts_Y_Belgium_train, xreg=X_Belgium_train); # arimaX_Belgium_train$arma

5 TS Forecasting

5.1 Perform ARIMAX modeling

Predict Y={Gross domestic product at market prices}, using all other features not directly related to *GDP. Recall the core data organization:

131 (common BE + BG) Xreg Predictors and 1 Outcome Variable to be modeled as a timeseries: 2000 - 2014 (15 years, Quarterly measures, 5-fold spline interpolation, $15*4*5=300$);
Spline interpolation x5 (freq=20 observations per year); 2000-01 to 2014-20 (300 timepoint observations over 15 years, for training). The remaining 60 timepoints used for testing (2015-01 to 2017-20).

Report ARIMA(p,d,q) params and quality metrics AIC/BIC.

# Previously, we already extracted the Belgium and Bulgaria Data, preprocess it, 
# and fit ARIMAX (Belgium-training) model. Now, we will assess the model on Bulgaria_testing sets
# BelgiumARIMA = fit_ARIMA(country1Data=Belgium, country2Data=Bulgaria,
#                    start=2000, end=2014, frequency=20,
#                    feature="Gross domestic product at market prices")
# BelgiumARIMA$arma    # [1]  4  0  2  0 20  0  0

# View rank-ordered ARIMAX effects:
# View(BelgiumARIMA$coef[order(BelgiumARIMA$coef)])
sort(BelgiumARIMA$coef)[1:10]
#           Unemployment , Females, From 15-64 years, From 18 to 23 months 
#                                                               -1.5203281 
#                                                                      ar2 
#                                                               -1.1808472 
#                                 Labor cost other than wages and salaries 
#                                                               -0.8380554 
#           Unemployment , Females, From 15-64 years, From 12 to 17 months 
#                                                               -0.7037336 
#                                                                      ar4 
#                                                               -0.6360880 
#                                                                     sar2 
#                                                               -0.6260002 
#             Unemployment , Females, From 15-64 years, From 3 to 5 months 
#                                                               -0.5262376 
#  Labor cost for LCI (compensation of employees plus taxes minus subsidies) 
#                                                               -0.2885038 
#              Unemployment , Females, From 15-64 years, 48 months or over 
#                                                               -0.2773650 
#               Unemployment , Males, From 15-64 years, from 3 to 5 months 
#                                                               -0.2567934 

#Get the Prospective Xreg=X design matrices ready (2015-2017, 60 rows)
preprocess_Belgium <- preprocess_ARIMA(countryData=Belgium, 
                    start=2000, end=2017, frequency=20, 
                    feature="Gross domestic product at market prices")
preprocess_Bulgaria <- preprocess_ARIMA(countryData=Bulgaria,  #Netherlands
                    start=2000, end=2017, frequency=20, 
                    feature="Gross domestic product at market prices")
homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
X_Belgium_test  <- homoFeat$X_Country1[301:360, ]; dim(X_Belgium_test)
X_Bulgaria_test <- homoFeat$X_Country2[301:360, ]; dim(X_Bulgaria_test)
 
# Get Predictions
pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60 <- 
      forecast(BelgiumARIMA, xreg = X_Bulgaria_test)$mean
pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60 <- 
      forecast(BelgiumARIMA, xreg = X_Belgium_test)$mean
pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60 <-
  forecast(BelgiumARIMA, xreg = X_Bulgaria_test)$mean +
  mean(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60) -
  mean(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60)

cor(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, ts_Y_Belgium_test)  # 0.11
mean(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60) # [1] 118

# Alternative predictions:
# X_Country1 <- X_Belgium_test; X_Country2 <- X_Bulgaria_test
# pred_arimaX_1_0_2_Y_Belgium_train300_Bulgaria_test60 <- predict(BelgiumARIMA, n.ahead = 60, newxreg = X_Bulgaria_test)$pred
# pred_arimaX_1_0_2_Y_Belgium_train300_Belgium_test60 <- predict(BelgiumARIMA, n.ahead = 60, newxreg = X_Belgium_test)$pred

5.2 Display the alternative (spacetime) analytical models (ARIMAX)

Plot only the last 5-years of training (2010-2014), $100TimePoints=5Years\times 4Quarters\times 5SuperSample$ and the 3-year prospective forecasting $60TimePoints=3Years\times 4Quarters\times 5SuperSample$.

ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ], 
                        start=c(2015,1), end=c(2017, 20), frequency = 20)
length(ts_Y_Belgium_test)

## [1] 60

# windows(width=14, height=10)
plot(forecast(BelgiumARIMA, xreg = data.matrix(X_Belgium_test)),         # ARIMA forecast
     include=100, lwd=4, lty=3, xlab="Time", ylab="GPD Purchasing Power Standards (PPS)",
     ylim=c(25, 150),cex.main=0.8,
     main = "ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n
      based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, col = "green", lwd = 4, lty=2)   # Belgium train+test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60, col = "purple", lwd = 4, lty=1) # Belgium train+Bulgaria test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60, col = "orange", lwd = 4, lty=1) 
# Belgium train+ Offset Bulgaria test: 188.3753 - 416.5375
lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1)       # Observed Y_Test timeseries
legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)", 
                        "ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)",
                        "ARIMAX(4,0,2) Belgium train + XReg=Belgium test (2015-2017)",
                        "ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
                        "Offset ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
                        "Belgium Official Reported GDP (2015-2017)"),
       col=c("black", "blue", "green", "purple", "orange", "red"), 
       lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7)
text(2012.5, 30, expression(atop(paste("Training Region (2000-2014)"), 
                paste(Model(GDP) %->% "ARIMAX(p, q, r) ;  ", 
                      XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
text(2016.5, 30, expression(atop(paste("Validation Region (2015-2017)"), 
        paste(hat(GDP) %<-% "ARIMAX(4, 0, 2); ", 
              XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)

Plot entire 15-year training time-span (2000-2014), $300TimePoints=15Years\times 4Quarters\times 5SuperSample$ and the 3-year prospective forecasting $60TimePoints=3Years\times 4Quarters\times 5SuperSample$.

ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ], 
                        start=c(2015,1), end=c(2017, 20), frequency = 20)
length(ts_Y_Belgium_test)

## [1] 60

# windows(width=14, height=10)
plot(forecast(BelgiumARIMA, xreg = data.matrix(X_Belgium_test)),         # ARIMA forecast
     lwd=4, lty=3, xlab="Time", ylab="GPD Purchasing Power Standards (PPS)",
     ylim=c(25, 150),cex.main=0.8,
     main = "ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n
      based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, col = "green", lwd = 4, lty=2)   # Belgium train+test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60, col = "purple", lwd = 4, lty=1) # Belgium train+Bulgaria test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60, col = "orange", lwd = 4, lty=1) 
# Belgium train+ Offset Bulgaria test: 188.3753 - 416.5375
lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1)       # Observed Y_Test timeseries
legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)", 
                        "ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)",
                        "ARIMAX(4,0,2) Belgium train + XReg=Belgium test (2015-2017)",
                        "ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
                        "Offset ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
                        "Belgium Official Reported GDP (2015-2017)"),
       col=c("black", "blue", "green", "purple", "orange", "red"), 
       lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7)
text(2005, 30, expression(atop(paste("Training Region (2000-2014)"), 
                paste(Model(GDP) %->% "ARIMAX(p, q, r) ;  ", 
                      XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
text(2015, 30, expression(atop(paste("Validation Region (2015-2017)"), 
        paste(hat(GDP) %<-% "ARIMAX(4, 0, 2); ", 
              XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)

6 Spacekime analytics

6.1 Generic K-Space transformations (FT/IFT)

Let’s start by defining the generic k-space transformation.

# FT/Spacekime Analytics
# 1D timeseries FFT SHIFT
fftshift1D <- function(img_ff) {
  rows <- length(img_ff)   
  rows_half <- ceiling(rows/2)
  return(append(img_ff[(rows_half+1):rows], img_ff[1:rows_half]))
}

# Generic function to Transform Data ={all predictors (X) and outcome (Y)} to k-space (Fourier domain)
kSpaceTransform <- function(data, inverse = FALSE, reconPhases = NULL) {
  # ForwardFT (rawData, FALSE, NULL)
  # InverseFT(magnitudes, TRUE, reconPhasesToUse) or InverseFT(FT_data, TRUE, NULL)
  FT_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
  mag_FT_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
  phase_FT_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
  IFT_reconPhases_data <- array(complex(), c(dim(data)[1], dim(data)[2]))

  for (i in 1:dim(data)[2]) {
    if (inverse == FALSE | is.null(reconPhases)) {
      FT_data[ , i] <- fft(data[ , i], inverse)
      X2 <- FT_data[ , i]
      # plot(fftshift1D(log(Re(X2)+2)), main = "log(fftshift1D(Re(FFT(timeseries))))") 
      mag_FT_data[ , i] <- sqrt(Re(X2)^2+Im(X2)^2); 
      # plot(log(fftshift1D(Re(mag_FT_MCSI_data))), main = "log(Magnitude(FFT(timeseries)))") 
      phase_FT_data[ , i] <- atan2(Im(X2), Re(X2)); 
      # plot(Re(fftshift1D(phase_FT_MCSI_data[ , 1])), main = "Shift(Phase(FFT(timeseries)))")
    }
    else {  # for IFT synthesis using user-provided Phases, typically from kime-phase aggregators
      Real <- data[ , i] * cos(reconPhases[ , i])  
      Imaginary <- data[ , i] * sin(reconPhases[ , i]) 
      IFT_reconPhases_data[ ,i] <- 
          Re(fft(Real+1i*Imaginary, inverse = TRUE)/length(data[ , i]))
    }
  }
    ######### Test the FT-IFT analysis-synthesis back-and-forth transform process 
    #         to confirm calculations
    # X2 <- FT_data[ , 1]; mag_FT_data[ , 1] <- sqrt(Re(X2)^2+Im(X2)^2); 
    # phase_FT_data[ , 1] <- atan2(Im(X2), Re(X2)); 
    # Real2 = mag_FT_data[ , 1] * cos(phase_FT_data[ , 1])
    # Imaginary2 = mag_FT_data[ , 1] * sin(phase_FT_data[ , 1])
    # man_hat_X2 = Re(fft(Real2 + 1i*Imaginary2, inverse = T)/length(X2))
    # ifelse(abs(man_hat_X2[5] - data[5, 1]) < 0.001, "Perfect Syntesis", "Problems!!!")
    #########
  
    if (inverse == FALSE | is.null(reconPhases)) {
      return(list("magnitudes"=mag_FT_data, "phases"=phase_FT_data))
      # Use kSpaceTransform$magnitudes & kSpaceTransform$phases to retrieve teh Mags and Phases
    }
    else {
      return(IFT_reconPhases_data)
      # Use Re(kSpaceTransform) to extract spacetime Real-valued reconstructed data
    }
}

6.2 Kime-Phase Distributions

Examine the Kime-direction Distributions of the Phases for all Belgium features (predictors + outcome). Define a generic function that plots the Phase distributions.

library(tidyr)
library(ggplot2)

plotPhaseDistributions <- function (dataFT, dataColnames, size=8, ...) {
  df.phase <- as.data.frame(Re(dataFT$phases))
  df.phase %>% gather() %>% head()
  colnames(df.phase) <- dataColnames
  phaseDistributions <- gather(df.phase)
  colnames(phaseDistributions) <- c("Feature", "Phase")
  if (is.null(size)) size=10
  
  # map the value as our x variable, and use facet_wrap to separate by the key column:
  ggplot(phaseDistributions, aes(Phase)) + 
    # geom_histogram(bins = 10) + 
    geom_histogram(aes(y=..density..), bins = 10) + 
    facet_wrap( ~Feature, scales = 'free_x') +
    xlim(-pi, pi) + 
    theme(strip.text.x = element_text(size = size, colour = "black", angle = 0))
}

# homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
X_Belgium <- homoFeat$X_Country1; dim(X_Belgium)

## [1] 360 131

Y_Belgium <- preprocess_Belgium$Y; dim(Y_Belgium)

## [1] 360   1

FT_Belgium <- kSpaceTransform(cbind(X_Belgium, Y_Belgium), FALSE, NULL)
dataColnames <- c(colnames(X_Belgium), "Y_GDP_Belgium")
plotPhaseDistributions(FT_Belgium, dataColnames)

Phase Distribution of Belgium

IFT_FT_Belgium <- kSpaceTransform(FT_Belgium$magnitudes, TRUE, FT_Belgium$phases)
# Check IFT(FT) == I: 
# ifelse(abs(cbind(X_Belgium, Y_Belgium)[5,4] - Re(IFT_FT_Belgium[5,4])) < 0.001, "Perfect Syntesis", "Problems!!!")

6.3 Nil-Phase Synthesis and ARIMAX re-modeling

Perform Nil-Phase reconstruction - IFT_NilPhase_FT_Belgium - and then re-fir the ARIMAX model

# 1. Nil-Phase data synthesys (reconstruction)
temp_Data <- cbind(X_Belgium, Y_Belgium)
nilPhase_FT_data <- array(complex(real=0, imaginary=0), c(dim(temp_Data)[1], dim(temp_Data)[2]))
dim(nilPhase_FT_data)    # ;  head(nilPhase_FT_data)
# [1] 360 132
IFT_NilPhase_FT_Belgium <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

#       Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal with nil-phase
IFT_NilPhase_FT_Belgium <- Re(kSpaceTransform(FT_Belgium$magnitudes, TRUE, nilPhase_FT_data))

colnames(IFT_NilPhase_FT_Belgium) <- c(colnames(X_Belgium), "Y_GDP_Belgium")
dim(IFT_NilPhase_FT_Belgium); dim(FT_Belgium$magnitudes)
colnames(IFT_NilPhase_FT_Belgium); head(IFT_NilPhase_FT_Belgium); # head(temp_Data)

# 2. Perform ARIMAX modeling on IFT_NilPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC
# library(forecast)
IFT_NilPhase_FT_Belgium_Y_train <- IFT_NilPhase_FT_Belgium[1:300, 132]; length(IFT_NilPhase_FT_Belgium_Y_train)
IFT_NilPhase_FT_Belgium_Y_test <- IFT_NilPhase_FT_Belgium[301:360]; length(IFT_NilPhase_FT_Belgium_Y_test)

# Training and Testing Data Covariates explaining the longitudinal outcome (Y)
IFT_NilPhase_FT_Belgium_X_train <- as.data.frame(IFT_NilPhase_FT_Belgium)[1:300, 1:131]; dim(IFT_NilPhase_FT_Belgium_X_train)
IFT_NilPhase_FT_Belgium_X_test <- as.data.frame(IFT_NilPhase_FT_Belgium)[301:360, 1:131]; dim(IFT_NilPhase_FT_Belgium_X_test)

# Outcome Variable to be ARIMAX-modelled, as a timeseries
ts_IFT_NilPhase_FT_Belgium_Y_train <- 
         ts(IFT_NilPhase_FT_Belgium_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20)

# Find ARIMAX model: 2  1  2  0 20  0  0
set.seed(1234)
modArima_IFT_NilPhase_FT_Belgium_Y_train <- 
        auto.arima(ts_IFT_NilPhase_FT_Belgium_Y_train, xreg=IFT_NilPhase_FT_Belgium_X_train)

modArima_IFT_NilPhase_FT_Belgium_Y_train$arma

# Regression with ARIMA(2,0,1)(2,0,0)[20] errors 
# Coefficients:
#          ar1      ar2     ma1    sar1     sar2  Acquisitions less disposals of non-financial non-produced assets
#      -1.4232  -0.8592  -0.987  0.0941  -0.3451                                                            0.0123
#s.e.   0.0341   0.0311     NaN  0.0688   0.0742                                                            0.0007
# sigma^2 estimated as 0.8622:  log likelihood=-320.39 AIC=914.79   AICc=1148.19   BIC=1422.2
pred_arimax_2_0_1_Nil <- forecast(modArima_IFT_NilPhase_FT_Belgium_Y_train, xreg = IFT_NilPhase_FT_Belgium_X_test)
pred_arimax_2_0_1_Nil_2015_2017 <- 
  ts(pred_arimax_2_0_1_Nil$mean, frequency=20, start=c(2015,1), end=c(2017,20))

pred_arimax_2_0_1_Nil_2015_2017

## Time Series:
## Start = c(2015, 1) 
## End = c(2017, 20) 
## Frequency = 20 
##       301       302       303       304       305       306       307       308 
##  85.02364 101.33786  95.24676 101.49982  93.98976  94.38776  97.51510  96.05219 
##       309       310       311       312       313       314       315       316 
##  85.63166 113.27167  96.39145  97.82059  98.63777  93.78477  91.20510 102.88110 
##       317       318       319       320       321       322       323       324 
##  95.81967 103.30420  89.45534 103.39560 108.05981  96.02793 105.99804  88.89360 
##       325       326       327       328       329       330       331       332 
## 105.99806 101.23141 103.42049 102.00218  99.90036 100.80956  96.56583 103.71199 
##       333       334       335       336       337       338       339       340 
## 107.28897  94.43721 111.65774  99.79332 108.91354 109.66776 103.23192 105.11549 
##       341       342       343       344       345       346       347       348 
## 108.12944  99.35360 110.90110 112.87704 109.90520 108.49580 107.15445 107.12296 
##       349       350       351       352       353       354       355       356 
## 128.99135 112.62487 111.40590 117.22144 102.33043 115.79956 115.48040 114.26040 
##       357       358       359       360 
## 114.31062 117.33343 105.32412 118.13403

# alternatively:
# pred_arimax_1_0_1_2015_2017 <- predict(modArima_IFT_NilPhase_FT_Belgium_Y_train, 
#                                              n.ahead = 3*20, newxreg = IFT_NilPhase_FT_Belgium_X_test)$pred
sort(modArima_IFT_NilPhase_FT_Belgium_Y_train$coef)[1:10]

## Labor cost for LCI (compensation of employees plus taxes minus subsidies) 
##                                                                -1.5972295 
##                                                                       ar1 
##                                                                -1.4231617 
##                                  Labor cost other than wages and salaries 
##                                                                -1.2213214 
##                                                                       ma1 
##                                                                -0.9869571 
##                                                                       ar2 
##                                                                -0.8591937 
##            Unemployment , Females, From 15-64 years, From 12 to 17 months 
##                                                                -0.7075454 
##              Unemployment , Total, From 15-64 years, From 18 to 23 months 
##                                                                -0.5797656 
##                Unemployment , Males, From 15-64 years, from 3 to 5 months 
##                                                                -0.5026139 
##                                                                      sar2 
##                                                                -0.3450866 
##              Unemployment , Males, From 15-64 years, from 24 to 47 months 
##                                                                -0.2965540

# Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-1.5972295 
#                                                                      ar1, effect=-1.4231617 
#                                 Labor cost other than wages and salaries, effect=-1.2213214 
#                                                                      ma1, effect=-0.9869571 
#                                                                      ar2, effect=-0.8591937 
#           Unemployment , Females, From 15-64 years, From 12 to 17 months, effect=-0.7075454 
#             Unemployment , Total, From 15-64 years, From 18 to 23 months, effect=-0.5797656 
#               Unemployment , Males, From 15-64 years, from 3 to 5 months, effect=-0.5026139 
#                                                                     sar2, effect=-0.3450866 
#             Unemployment , Males, From 15-64 years, from 24 to 47 months, effect=-0.2965540 
cor(pred_arimax_2_0_1_Nil$mean, ts_Y_Belgium_test)  # 0.14

## [1] 0.1420164

mean(pred_arimax_2_0_1_Nil_2015_2017) # [1] 105

## [1] 103.7756

6.4 Swapped-Phase Synthesis and ARIMAX Modeling

Space-time reconstructions by inverting back the mag_FT for each feature signal after swapping the feature-phases. In other words, we randomly shuffle the columns of the Phases-matrix (Training & Testing XReg Data) and use these swapped phases to synthesize the design covariate matrix ($Xreg$).

# 1. Swap Feature Phases and then synthesize the data (reconstruction)
# temp_Data <- cbind(X_Belgium, Y_Belgium)
swapped_phase_FT_Belgium_data <- FT_Belgium$phases
colnames(swapped_phase_FT_Belgium_data) <- c(colnames(X_Belgium), "Y_GDP_Belgium")
swapped_phase_FT_Belgium_data1 <- swapped_phase_FT_Belgium_data
dim(swapped_phase_FT_Belgium_data)    # ;  head(swappedPhase_FT_data)
# [1] 360 132
IFT_SwappedPhase_FT_Belgium <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

set.seed(12345)   # sample randomly Phase-columns for each of the 131 covariates (X)
#swap_phase_FT_Belgium_indices <- sample(ncol(swapped_phase_FT_Belgium_data)-1)
# for (j in 1:131) {  # for all coluns of the design Xreg matrix, excluding Y, randomly swap columns phases
#  swapped_phase_FT_Belgium_data1[ , j] <- swapped_phase_FT_Belgium_data[, swap_phase_FT_Belgium_indices[j]]
#}
swapped_phase_FT_Belgium_data1 <- as.data.frame(cbind(
  swapped_phase_FT_Belgium_data[ , sample(ncol(swapped_phase_FT_Belgium_data[ , 1:131]))], 
  swapped_phase_FT_Belgium_data[ , 132]))
swapped_phase_FT_Belgium_data <- swapped_phase_FT_Belgium_data1
colnames(swapped_phase_FT_Belgium_data)[132] <- "Y_GDP_Belgium"
colnames(swapped_phase_FT_Belgium_data)
dim(swapped_phase_FT_Belgium_data); dim(FT_Belgium$phases)

# Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal using the feature swapped phases
IFT_SwappedPhase_FT_Belgium <- Re(kSpaceTransform(FT_Belgium$magnitudes, TRUE, swapped_phase_FT_Belgium_data))

colnames(IFT_SwappedPhase_FT_Belgium) <- c(colnames(X_Belgium), "Y_GDP_Belgium")
dim(IFT_SwappedPhase_FT_Belgium); dim(FT_Belgium$magnitudes)
colnames(IFT_SwappedPhase_FT_Belgium); tail(IFT_SwappedPhase_FT_Belgium); # tail(temp_Data)

# 2. Perform ARIMAX modeling on IFT_SwappedPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC
# library(forecast)
IFT_SwappedPhase_FT_Belgium_Y_train <- IFT_SwappedPhase_FT_Belgium[1:300, 132]; length(IFT_SwappedPhase_FT_Belgium_Y_train)
IFT_SwappedPhase_FT_Belgium_Y_test <- IFT_SwappedPhase_FT_Belgium[301:360]; length(IFT_SwappedPhase_FT_Belgium_Y_test)

# Training and Testing Data Covariates explaining the longitudinal outcome (Y)
IFT_SwappedPhase_FT_Belgium_X_train <- as.data.frame(IFT_SwappedPhase_FT_Belgium)[1:300, 1:131]
dim(IFT_SwappedPhase_FT_Belgium_X_train)
IFT_SwappedPhase_FT_Belgium_X_test <- as.data.frame(IFT_SwappedPhase_FT_Belgium)[301:360, 1:131]
dim(IFT_SwappedPhase_FT_Belgium_X_test)

# Outcome Variable to be ARIMAX-modelled, as a timeseries
ts_IFT_SwappedPhase_FT_Belgium_Y_train <- 
         ts(IFT_SwappedPhase_FT_Belgium_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20)

# Find ARIMAX model: 1  0  2  0 20  0  0
set.seed(1234)
modArima_IFT_SwappedPhase_FT_Belgium_Y_train <- 
        auto.arima(ts_IFT_SwappedPhase_FT_Belgium_Y_train, xreg=IFT_SwappedPhase_FT_Belgium_X_train)
modArima_IFT_SwappedPhase_FT_Belgium_Y_train$arma

# Regression with ARIMA(1,0,0)(2,0,0)[20] errors 
# Coefficients:
#          ar1     sar1    sar2  intercept  Acquisitions less disposals of non-financial non-produced assets
#      -0.3837  -0.2196  0.2827    46.1704                                                            0.0063
#s.e.   0.1113   0.0903  0.0779    36.8492                                                            0.0080
# sigma^2 estimated as 70:  log likelihood=-976.01 AIC=2224.02   AICc=2452.63   BIC=2727.73

pred_arimax_1_0_0_Swapped <- forecast(modArima_IFT_SwappedPhase_FT_Belgium_Y_train, xreg = IFT_SwappedPhase_FT_Belgium_X_test)
pred_arimax_1_0_0_Swapped_2015_2017 <- 
  ts(pred_arimax_1_0_0_Swapped$mean, frequency=20, start=c(2015,1), end=c(2017,20))

pred_arimax_1_0_0_Swapped_2015_2017

## Time Series:
## Start = c(2015, 1) 
## End = c(2017, 20) 
## Frequency = 20 
##       301       302       303       304       305       306       307       308 
##  99.10741 110.77632  91.58138  98.50625 122.93774 105.60752 104.87176 113.06541 
##       309       310       311       312       313       314       315       316 
## 113.61298 106.57517 104.94313  92.11965 102.44793 117.49381 111.47798 128.57506 
##       317       318       319       320       321       322       323       324 
## 106.60489  98.64213 105.63698 114.32857 121.77287  95.20252 108.93503 104.96129 
##       325       326       327       328       329       330       331       332 
## 106.67301 106.67622 107.40082 105.75911 102.10965 113.08226 103.28406  97.29026 
##       333       334       335       336       337       338       339       340 
## 114.32844  89.02971 104.32914  94.74108 112.06142 112.33537 101.01485 113.38591 
##       341       342       343       344       345       346       347       348 
## 111.82291 110.66114 101.03265 107.66914 113.26743 105.83638 103.92252 103.19822 
##       349       350       351       352       353       354       355       356 
## 104.23278  90.16403 102.19771 104.50565 101.01555 103.87907  99.85848  99.51574 
##       357       358       359       360 
##  93.93449 100.37559  99.90370  92.28034

# alternatively:
# pred_arimax_1_0_0_Swapped_2015_2017 <- predict(modArima_IFT_SwappedPhase_FT_Belgium_Y_train, 
#                                              n.ahead = 3*20, newxreg = IFT_SwappedPhase_FT_Belgium_X_test)$pred
sort(modArima_IFT_SwappedPhase_FT_Belgium_Y_train$coef)[1:10]

##                                                                       ar1 
##                                                               -0.38372043 
##            Unemployment , Females, From 15-64 years, From 18 to 23 months 
##                                                               -0.31137514 
##                                  Labor cost other than wages and salaries 
##                                                               -0.31094561 
##                                                                      sar1 
##                                                               -0.21964957 
##             Unemployment , Females, From 15-64 years, From 6 to 11 months 
##                                                               -0.20878853 
## Labor cost for LCI (compensation of employees plus taxes minus subsidies) 
##                                                               -0.12497311 
##                 Unemployment , Males, From 15-64 years, Less than 1 month 
##                                                               -0.10849013 
##                 Unemployment , Total, From 15-64 years, 48 months or over 
##                                                               -0.09066684 
##                Unemployment , Total, From 15-64 years, From 1 to 2 months 
##                                                               -0.05852382 
##               Unemployment , Females, From 15-64 years, 48 months or over 
##                                                               -0.05695172

#                                                                     ar1, effect=-0.38372043 
#           Unemployment , Females, From 15-64 years, From 18 to 23 months, effect=-0.31137514 
#                                 Labor cost other than wages and salaries, effect=-0.31094561 
#                                                                     sar1, effect=-0.21964957 
#            Unemployment , Females, From 15-64 years, From 6 to 11 months, effect=-0.20878853 
#Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-0.12497311 
#                Unemployment , Males, From 15-64 years, Less than 1 month, effect=-0.10849013 
#                Unemployment , Total, From 15-64 years, 48 months or over, effect=-0.09066684 
#               Unemployment , Total, From 15-64 years, From 1 to 2 months, effect=-0.05852382 
#              Unemployment , Females, From 15-64 years, 48 months or over, effect=-0.05695172 
cor(pred_arimax_1_0_0_Swapped$mean, ts_Y_Belgium_test)  # 0.0

## [1] -0.007886214

mean(pred_arimax_1_0_0_Swapped_2015_2017) # [1] 105

## [1] 105.2093

6.5 Random-Phase Synthesis and ARIMAX re-modeling

Perform Random-Phase reconstruction - IFT_RandPhase_FT_Belgium - by randomly sampling from the phase distributions for each feature and then re-fitting the ARIMAX model

# 1. Random-Phase data synthesys (reconstruction)
# temp_Data <- cbind(X_Belgium, Y_Belgium)
randPhase_FT_data <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))
dim(randPhase_FT_data)    # ;  head(randPhase_FT_data)
# [1] 360 132
IFT_RandPhase_FT_Belgium <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

randPhase_FT_data <- FT_Belgium$phases
for (i in 1:(dim(randPhase_FT_data)[2] -1)) {
  if (i < dim(randPhase_FT_data)[2]) {
    set.seed(12345)   # sample randomly Phases for each of the 131 predictors covariates (X)
    randPhase_FT_data[ , i] <- FT_Belgium$phases[sample(nrow(FT_Belgium$phases)), i]
  } else {   } # for the Y outcome (Last Column) - do not change the phases of the Y
}
#       Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal with avg-phase
IFT_RandPhase_FT_Belgium <- Re(kSpaceTransform(FT_Belgium$magnitudes, TRUE, randPhase_FT_data))
colnames(IFT_RandPhase_FT_Belgium) <- c(colnames(X_Belgium), "Y_GDP_Belgium")
dim(IFT_RandPhase_FT_Belgium); dim(FT_Belgium$magnitudes)
colnames(IFT_RandPhase_FT_Belgium); tail(IFT_RandPhase_FT_Belgium); # tail(temp_Data)

dim(IFT_RandPhase_FT_Belgium); head(Re(IFT_RandPhase_FT_Belgium)); tail(Re(IFT_RandPhase_FT_Belgium))


# 2. Perform ARIMAX modeling on IFT_RandPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC
# library(forecast)
IFT_RandPhase_FT_Belgium_Y_train <- IFT_RandPhase_FT_Belgium[1:300, 132]; length(IFT_RandPhase_FT_Belgium_Y_train)
IFT_RandPhase_FT_Belgium_Y_test <- IFT_RandPhase_FT_Belgium[301:360]; length(IFT_RandPhase_FT_Belgium_Y_test)

# Training and Testing Data Covariates explaining the longitudinal outcome (Y)
IFT_RandPhase_FT_Belgium_X_train <- as.data.frame(IFT_RandPhase_FT_Belgium)[1:300, 1:131]; dim(IFT_RandPhase_FT_Belgium_X_train)
IFT_RandPhase_FT_Belgium_X_test <- as.data.frame(IFT_RandPhase_FT_Belgium)[301:360, 1:131]; dim(IFT_RandPhase_FT_Belgium_X_test)

# Outcome Variable to be ARIMAX-modelled, as a timeseries
ts_IFT_RandPhase_FT_Belgium_Y_train <- 
         ts(IFT_RandPhase_FT_Belgium_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20)

# Find ARIMAX model: 0  0  2  0 20  0  0
set.seed(1234)
modArima_IFT_RandPhase_FT_Belgium_Y_train <- 
        auto.arima(ts_IFT_RandPhase_FT_Belgium_Y_train, xreg=IFT_RandPhase_FT_Belgium_X_train)
modArima_IFT_RandPhase_FT_Belgium_Y_train$arma

# Regression with ARIMA(0,0,0)(2,0,0)[20] errors 
# Coefficients:
#         sar1    sar2  Acquisitions less disposals of non-financial non-produced assets
#      -0.0743  0.5766                                                            0.0162
#s.e.   0.0625  0.0752                                                            0.0100 
#sigma^2 estimated as 72.17:  log likelihood=-988.06 AIC=2244.12   AICc=2463.4   BIC=2740.43

pred_arimax_0_0_0_Rand <- forecast(modArima_IFT_RandPhase_FT_Belgium_Y_train, xreg = IFT_RandPhase_FT_Belgium_X_test)
pred_arimax_0_0_0_Rand_2015_2017 <- 
  ts(pred_arimax_0_0_0_Rand$mean, frequency=20, start=c(2015,1), end=c(2017,20))

pred_arimax_0_0_0_Rand_2015_2017

## Time Series:
## Start = c(2015, 1) 
## End = c(2017, 20) 
## Frequency = 20 
##       301       302       303       304       305       306       307       308 
##  92.80231 104.99710  87.45035  75.08463 103.09887  93.28688  90.83812  88.43779 
##       309       310       311       312       313       314       315       316 
## 103.67438  86.28912 107.24861  87.24921 114.39761  94.46072  89.95292 108.18454 
##       317       318       319       320       321       322       323       324 
##  96.28926  94.71985  99.80931  81.39295  76.55054  79.73523  96.72882  94.78202 
##       325       326       327       328       329       330       331       332 
##  77.65464  82.44926  92.59295  70.27627  85.31992  92.93975  80.25941  82.43967 
##       333       334       335       336       337       338       339       340 
##  82.12752  84.24610 100.37597  76.09647  79.43232  85.82942  82.38588  88.82293 
##       341       342       343       344       345       346       347       348 
##  80.76724  80.57109  90.10177  75.38288  94.01425  88.78526  92.36941  88.43295 
##       349       350       351       352       353       354       355       356 
##  84.34626  72.68033  88.87970  73.72520  74.90564  90.76582  84.32506  82.03129 
##       357       358       359       360 
##  76.89846  92.09809  75.23212  87.49829

# alternatively:
# pred_arimax_1_0_1_Rand_2015_2017 <- predict(modArima_IFT_RandPhase_FT_Belgium_Y_train, 
#                                              n.ahead = 3*20, newxreg = IFT_RandPhase_FT_Belgium_X_test)$pred
sort(modArima_IFT_RandPhase_FT_Belgium_Y_train$coef)[1:10]

## Labor cost for LCI (compensation of employees plus taxes minus subsidies) 
##                                                               -0.71989958 
##            Unemployment , Females, From 15-64 years, From 18 to 23 months 
##                                                               -0.54541627 
##               Unemployment , Females, From 15-64 years, 48 months or over 
##                                                               -0.44230677 
##                                  Labor cost other than wages and salaries 
##                                                               -0.32854422 
##               Unemployment , Males, From 15-64 years, from 6 to 11 months 
##                                                               -0.24511374 
##              Unemployment , Total, From 15-64 years, From 24 to 47 months 
##                                                               -0.19283037 
##       Agriculture, forestry and fishing - Employers' social contributions 
##                                                               -0.11994897 
##            Unemployment , Females, From 15-64 years, From 24 to 47 months 
##                                                               -0.10835175 
##              Unemployment , Females, From 15-64 years, From 1 to 2 months 
##                                                               -0.09093252 
##              Unemployment , Total, From 15-64 years, From 18 to 23 months 
##                                                               -0.07427297

# Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-0.71989958 
#           Unemployment , Females, From 15-64 years, From 18 to 23 months, effect=-0.54541627 
#              Unemployment , Females, From 15-64 years, 48 months or over, effect=-0.44230677 
#                                 Labor cost other than wages and salaries, effect=-0.32854422 
#              Unemployment , Males, From 15-64 years, from 6 to 11 months, effect=-0.24511374 
#             Unemployment , Total, From 15-64 years, From 24 to 47 months, effect=-0.19283037 
#      Agriculture, forestry and fishing - Employers' social contributions, effect=-0.11994897 
#           Unemployment , Females, From 15-64 years, From 24 to 47 months, effect=-0.10835175 
#             Unemployment , Females, From 15-64 years, From 1 to 2 months, effect=-0.09093252 
#             Unemployment , Total, From 15-64 years, From 18 to 23 months, effect=-0.07427297 
cor(pred_arimax_0_0_0_Rand$mean, ts_Y_Belgium_test)  # -0.15

## [1] -0.1532414

mean(pred_arimax_0_0_0_Rand_2015_2017) # [1] 87.74201

## [1] 87.74201

6.6 Result Visualization

## Plot the results of the model ARIMAX fitting
ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ], 
                        start=c(2015,1), end=c(2017, 20), frequency = 20)
length(ts_Y_Belgium_test)

## [1] 60

# windows(width=14, height=10)
plot(forecast(BelgiumARIMA, xreg = data.matrix(X_Belgium_test)),         # ARIMA forecast
     include=60, lwd=4, lty=3, xlab="Time", ylab="GPD Purchasing Power Standards (PPS)",
     ylim=c(60, 160),cex.main=0.8,
     main = "Spacekime ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n
      based on fitting ARIMAX Models on spline interpolated & kime-transformed Belgium data")
lines(pred_arimax_2_0_1_Nil_2015_2017, col = "green", lwd = 4, lty=2)   # Belgium Xreg Nil-Phase Reconstructions
lines(pred_arimax_1_0_0_Swapped_2015_2017, col = "purple", lwd = 4, lty=1) # Belgium Xreg Swapped-Phase Reconstructions
lines(pred_arimax_0_0_0_Rand_2015_2017, col = "orange", lwd = 4, lty=1) # Belgium Xreg Random-Phase Reconstructions
lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1)       # Observed Y_Test timeseries
legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)", 
                        "ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)",
                        "ARIMAX(2,0,1) Belgium Xreg Nil-Phase Reconstruction (2015-2017)",
                        "ARIMAX(1,0,0) Belgium Xreg Swapped-Phase Reconstructions (2015-2017)",
                        "ARIMAX(0,0,0)(2,0,0)[20]  Belgium Xreg Random-Phase Reconstructions (2015-2017)",
                        "Belgium Official Reported GDP (2015-2017)"),
       col=c("black", "blue", "green", "purple", "orange", "red"), 
       lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7)
text(2013.5, 65, expression(atop(paste("Training Region (2000-2014)"), 
                paste(Model(GDP) %->% "ARIMAX(p, q, r) ;  ", 
                      XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
text(2016.5, 65, expression(atop(paste("Validation Region (2015-2017)"), 
        paste(hat(GDP) %<-% "ARIMAX(., ., .); ", 
              XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)

European Economic SpcaceKime Analysis Part 1 – Longitudinal Modeling

SOCR Team

10/20/2020