---
title: "Spacekime Analytics (Time Complexity and Inferential Uncertainty)"
subtitle: "European Economic Spacekime Analysis Part 1 – Longitudinal Modeling"
author: "SOCR Team "
date: "`r format(Sys.time(),'%m/%d/%Y')`"
output: 
  html_document:
    theme: spacelab
    highlight: tango
    includes:
      before_body: TCIU_header.html
    toc: true
    number_sections: true
    toc_depth: 2
    toc_float:
      collapsed: false
      smooth_scroll: true
    code_folding: hide
---

```{r error=F, message=F, warning=F}
library(dplyr)
library(plotly)
library(pracma)
library(ggplot2)
require(magrittr)
library(gridExtra)
```

# Data Preprocessing {.tabset .tabset-fade .tabset-dropdown}

 - Load the data
 - Find all "Common" features (highly-observed and congruent Econ indicators)
 - Establish homologies between the XReg predictors for ALL 31 EU countries.
 
```{r}
load("EU_Econ_rawdata.RData")
```

```{r,echo = T, results = 'hide', message=F, warning=F}
#super sample the dataset
cleardata <- function(mat) {
  for (i in 1:ncol(mat)) {
    mat[is.na(mat[,i]),i]<-mean(mat[,i],na.rm = T) + rnorm(sum(is.na(mat[,i])),sd = sd(mat[,i],na.rm = T))
  }
  return(mat)
}
# 1. Find all "Common" features (highly-observed and congruent Econ indicators)
countryNames <- unique(time_series$country); length(countryNames); # countryNames
# initialize 3D array of DF's that will store the data for each of the countries into a 2D frame
countryData <- list()  # countryData[[listID==Country]][1-time-72, 1-feature-197]

for (i in 1:length(countryNames)) {
  countryData[[i]] <- filter(time_series, country == countryNames[i])
}
# Check countryData[[2]][2, 3] == Belgium[2,3]

list_of_dfs_CommonFeatures <- list()  # list of data for supersampled countries 360 * 197

# 2. General function that ensures the XReg predictors for ALL 31 EU countries are homologous
completeHomologousX_features <- function (list_of_dfs) {
  # delete features that are missing at all time points
  for (j in 1:length(list_of_dfs)) {
    print(paste0("Pre-processing Country: ...", countryNames[j], "... "))
    data = list_of_dfs[[j]]
    data = data[ , colSums(is.na(data)) != nrow(data)]
    data = dplyr::select(data, !any_of(c("time", "country")))
    DataMatrix = as.matrix(data)
    DataMatrix = cleardata(DataMatrix)
    DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove features with only 1 value
    DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove features with all values=0
    # Supersample 72 --*5--> 360 timepoints 
    #DataMatrix = splinecreate(DataMatrix)
    DataSuperSample = as.data.frame(DataMatrix) # super-Sample the data
    # remove some of features  
    #DataSuperSample = DataSuperSample[, -c(50:80)]; dim(X)  # 360 167
    # ensure full-rank design matrix, DataSuperSample
    DataSuperSample <- 
      DataSuperSample[ , qr(DataSuperSample)$pivot[seq_len(qr(DataSuperSample)$rank)]]
    print(paste0("dim()=(", dim(DataSuperSample)[1], ",", dim(DataSuperSample)[2], ") ..."))
    # update the current DF/Country
    list_of_dfs_CommonFeatures[[j]] <- DataSuperSample
  }

  # Identify All Xreg features that are homologous (same feature columns) across All 31 countries
  # Identify Common Columns (features)
  comCol <- Reduce(intersect, lapply(list_of_dfs_CommonFeatures, colnames))
  list_of_dfs_CommonFeatures <- lapply(list_of_dfs_CommonFeatures, function(x) x[comCol])

  for (j in 1:length(list_of_dfs_CommonFeatures)) {
    list_of_dfs_CommonFeatures[[j]] <- subset(list_of_dfs_CommonFeatures[[j]], select = comCol)
    print(paste0("dim(", countryNames[j], ")=(", dim(list_of_dfs_CommonFeatures[[j]])[1], 
                 ",", dim(list_of_dfs_CommonFeatures[[j]])[2], ")!"))  # 72 * 197
  }
  return(list_of_dfs_CommonFeatures)
}
# Test completeHomologousX_features: dim(AllCountries)=(360,42)!
list_of_dfs_CommonFeatures <- completeHomologousX_features(countryData); 
length(list_of_dfs_CommonFeatures); dim(list_of_dfs_CommonFeatures[[1]]) # Austria data matrix 360*42
```

# Laplace Transform 

Define the *forward* and *reverse* *Laplace Transforms* (LT/ILT).

```{r}
x2 = seq(from = 1, to = 11, length.out = 50)
# drop the first row to avoid real part value of 0
y2 = seq(from = -5, to = 5, length.out = 50)
# drop the first column to avoid imaginary part value of 0
XY = expand.grid(X=x2, Y=y2)       
complex_xy = mapply(complex, real=XY$X,imaginary=XY$Y)
X<-1:72
time_points <- seq(0+0.01, 2*pi, length.out = 72)
```

```{r}
# create the LT
NuLT = function(datax, datay, inputz, k = 3, fitwarning = FALSE, mirror = FALSE, range = 2*pi) {
  
  datax = as.numeric(datax)
  datay = as.numeric(datay)

  n = length(datax)
  x1 = n/(n+0.5)*((datax-min(datax))/(max(datax)-min(datax)))*range
  
  if(mirror){
    x1 = c(x1,rev(2*range - x1))/2
    n = 2*n
    datay = c(datay, rev(datay))
    #plot(x1, datay)
  }
  
  #generate the coefficients in indefinite integral of t^n*exp(-zt)
  coef = 1;
  coefm = as.matrix(coef)
  for(i in 1:k){
    coefm = cbind(coefm,0)
    coef = c(coef*i,1)
    coefm = rbind(coefm,coef)
  }
  # these coefficients ordered by ^0, ^1, ^2, ... in column format
  
  # compute 1, z, z^2...,z^k
  zz = cbind(1,inputz)
  zt = inputz
  for (i in 2:k){
    zt = zt*inputz
    zz = cbind(zz,zt)
  }
  zd = zt*inputz
  
  # compute 1, x, x^2...,x^k
  tx = x1;
  xm = cbind(1,x1)
  for (i in 2:k){
    tx = tx*x1
    xm = cbind(xm,tx)
  }
  
  
  # sum over intervals
  result = 0*inputz
  ii = 1
  while(ii+k<=n)
  {
    A = xm[seq(ii,ii+k),c(0:k+1)]
    b = datay[seq(ii,ii+k)]
    # polyfit might be faster when using polynomial basis, while matrix inverse, `solve()`,
    # is the more general approach that works for any function basis
    polyc = as.numeric(solve(A,b))

  
    #ordered by ^0, ^1, ^2, ... in column format
    
    # Enter a new function variable qualityCheck=FALSE
    # check fit quality; this step can be skipped for speed/efficiency
    # if (qualityCheck) { .... }
    
    if (fitwarning){
      xx = seq(A[1,2],A[k+1,2],length.out = 100);
      yy = polyval(rev(polyc),xx)
      if(max(abs(yy-mean(b)))>2*max(abs(b-mean(b)))){
        print(c("Warning: Poor Fit at ",ii,", Largest Deviation is",max(abs(yy-mean(b)))))
        print(c("Spline Polynomial is", polyc),3);
        #print(c(polyval(rev(polyc),A[,2]),b))
        plot(xx, yy, main="Polynomial fit", ylab="", type="l", col="blue")
        lines(A[,2],b, col="red")
        legend("topleft",c("fit","data"),fill=c("blue","red"))
        print(" ")
      }
    }
    
    # Use vector/matrix operations to avoid looping, 
    # some of the expressions look weird
    # May need to actually compare the efficiency/speed of
    # vector based vs. standard numeric calculations
    
    m1 = t(t(polyc*coefm)*A[1,])
    m11 = as.numeric(tapply(m1, col(m1)-row(m1), sum))[0:k+1] 
    
    m2 = t(t(polyc*coefm)*A[k+1,])
    m22 = as.numeric(tapply(m2, col(m2)-row(m2), sum))[0:k+1]
    
    intgl = (exp(-inputz*A[1,2])*colSums(t(zz)*m11)-
               exp(-inputz*A[k+1,2])*colSums(t(zz)*m22))/zd
    result = result+intgl
    ii=ii+k
  }
  
  # Computations over the last interval
  if(ii<n){
    nk = n-ii;
    A = xm[seq(ii,ii+nk),c(0:nk+1)]
    b = datay[seq(ii,ii+nk)]
    nc = as.numeric(solve(A,b))
    nc = c(nc,seq(0,0,length.out = k-nk))
    
    A = xm[seq(ii,ii+nk),]
    m1 = t(t(nc*coefm)*A[1,])
    m11 = as.numeric(tapply(m1, col(m1)-row(m1), sum))[0:k+1]
    
    m2 = t(t(nc*coefm)*A[nk+1,])
    m22 = as.numeric(tapply(m2, col(m2)-row(m2), sum))[0:k+1]
    
    # cc = colSums(coefm*polyc)
    intgl = (exp(-inputz*A[1,2])*colSums(t(zz)*m11)-
               exp(-inputz*A[nk+1,2])*colSums(t(zz)*m22))/zd
    result = result+intgl
  }
  #offset = 0.05*pi
  #result = result + datay[n]*(exp(-2*pi*inputz)-exp(-(2*pi+offset)*inputz))/inputz
  return(result)
}
```


```{r}
tensor_all<-array(dim=c(30,33,50,50))
for(m in 1:30)
{
  for(i in 1:33){
  Y=list_of_dfs_CommonFeatures[[m]][,i]
  poly_z<-NuLT(time_points, Y, complex_xy, k = 3, fitwarning = FALSE)
  dim(poly_z) = c(length(x2), length(y2))
  tensor_all[m,i,,]<-poly_z
}
}
```

```{r}
 X_tensor<-tensor_all[,-31,,]
 Y_tensor<-tensor_all[,31,,]
 
```

# Kimesurface of GDP for 30 EU countries

```{r}
magnitude_feature<-lapply(1:33,function(i) Mod(tensor_all[,i,,]))
phase_feature<-lapply(1:33,function(i) atan2(Im(tensor_all[,i,,]), Re(tensor_all[,i,,])))
#xy2<-expand.grid(1:20,1:20)
colorscale = cbind(seq(0, 1, by=1/(length(x2) - 1)), rainbow(length(x2)))
commonefeature<-colnames(list_of_dfs_CommonFeatures[[31]])
p_feature<- plot_ly(hoverinfo="none", showscale = FALSE)  %>% layout(title=commonefeature[31])

for (j in 1:5)
{
  for (i in 1:6){
    xx2<-1:50+50*(j-1)
    yy2<-1:50+50*(i-1)
  p_feature <- p_feature %>%
    add_trace(x=xx2,y=yy2, z =magnitude_feature[[31]][j+(i-1)*5,,], 
              surfacecolor=phase_feature[[31]][j+(i-1)*5,,], colorscale=colorscale,   #Phase-based color
              type = 'surface',name=substr(countryNames[j+(i-1)*5],0,50),opacity=0.7,showlegend=TRUE)

  }
}
```

```{r,message=F, warning=F}
p_feature
```

# Inverse Laplace Transform

```{r}
ctILT = function(
  LTF,
  tini = 0.001,
  tend = 9,
  nnt = 200){
  if (TRUE){
    a=8; ns=100; nd=29;  
  }   #% implicit parameters
  
  N = ns+nd+1

  radt=seq(tini,tend*nnt /(nnt + 0.5),length.out = nnt); # time vector
  
  if (tini==0){
      #radt=radt[c(2:nnt)]
  } # t=0 is not allowed
  #tic    % measure the CPU time
  
  alfa = seq(1,ns+1+nd)
  beta = alfa
  
  for (j in seq(1,ns+1+nd)){#     % prepare necessary coefficients
     alfa[j]=a+(j-1)*pi*1i;
     beta[j]=-exp(a)*(-1)^j;
  }
  
  #print(beta)
  n = c(1:nd)
  bdif=rev(cumsum(gamma(nd+1)/gamma(nd+2-n)/gamma(n)))/(2^nd)
  #print(beta[ns+2:ns+1+nd])
  temp = beta[seq(ns+2,ns+1+nd)]*bdif
  print(temp)
  beta[seq(ns+2,ns+1+nd)]= temp
  beta[1]=beta[1]/2;
  ft2 = seq(1,nnt)
  
  Qz = c()
  
  for (kt in seq(1,nnt)){  # cycle for time t
     tt=radt[kt];
     s=alfa/tt;                     # complex frequency s
     Qz = c(Qz,s)
  }
   
  LTQz = LTF(Qz)

  for (kt in seq(1,nnt)){  # cycle for time t
     tt=radt[kt];
     s=alfa/tt;                     # complex frequency s
     bt=beta/tt;
     #btF=bt*NuLT(datax, datay, s);  # functional value F(s)
     btF = bt*LTQz[seq((kt-1)*N+1,kt*N)]
     ft2[kt]=sum(Re(btF));          # original f(tt)
     if(is.na(ft2[kt])){
       print(kt)
       print(LTQz[seq((kt-1)*N+1,kt*N)])
       print(btF)
     }
     
  }
  return(ft2)
}

```


```{r,echo = T, results = 'hide'}

rge = 2*pi
tnd = 2*pi



z2_funct<- function(z) NuLT(time_points,list_of_dfs_CommonFeatures[[2]][,31],inputz = z,mirror = TRUE, range  = rge)
 inv_result<-ctILT(z2_funct,tend = tnd, nnt=72*2)
z2_funct<- function(z) NuLT(time_points,list_of_dfs_CommonFeatures[[3]][,31],inputz = z,mirror = TRUE, range  = rge)
 inv_result_2<-ctILT(z2_funct,tend = tnd, nnt=72*2)
z2_funct<- function(z) NuLT(time_points,list_of_dfs_CommonFeatures[[10]][,31],inputz = z,mirror = TRUE, range  = rge)
 inv_result_3<-ctILT(z2_funct,tend = tnd, nnt=72*2)
z2_funct<- function(z) NuLT(time_points,list_of_dfs_CommonFeatures[[21]][,31],inputz = z,mirror = TRUE, range  = rge)
 inv_result_4<-ctILT(z2_funct,tend = tnd, nnt=72*2)
```

```{r}
valsn_df_1 <- as.data.frame(cbind(inv_result=inv_result[1:72],
                                    time_series=list_of_dfs_CommonFeatures[[2]][,31], time_points=time_points))
x <- list(
  title = "Time"
  
)
y <- list(
  title = "GDP of Belgium"
  
)
p1=plot_ly(valsn_df_1, x = ~time_points,y = ~time_series, name = 'original_ts',mode = 'lines', type = 'scatter')

p1<- p1 %>% add_trace(y = ~ inv_result, name = 'inv_result',mode = 'lines', line=list(dash='dot'))
p1 <- p1 %>% layout(xaxis = x, yaxis = y)


```

```{r}
valsn_df_2 <- as.data.frame(cbind(inv_result=inv_result_2[1:72],
                                    time_series=list_of_dfs_CommonFeatures[[3]][,31], time_points=time_points))

x <- list(
  title = "Time"
  
)
y <- list(
  title = "GDP of Bulgaria"
  
)
p2=plot_ly(valsn_df_2, x = ~time_points,y = ~time_series, name = 'original_ts',mode = 'lines', type = 'scatter')

p2<- p2 %>% add_trace(y = ~ inv_result, name = 'inv_result',mode = 'lines', line=list(dash='dot'))
p2 <- p2 %>% layout(xaxis = x, yaxis = y)


```


```{r}


valsn_df_3 <- as.data.frame(cbind(inv_result=inv_result_3[1:72],
                                    time_series=list_of_dfs_CommonFeatures[[10]][,31], time_points=time_points))

x <- list(
  title = "Time"
  
)
y <- list(
  title = "GDP of France"
  
)
p3=plot_ly(valsn_df_3, x = ~time_points,y = ~time_series, name = 'original_ts',mode = 'lines', type = 'scatter')

p3 <- p3 %>% add_trace(y = ~ inv_result, name = 'inv_result',mode = 'lines', line=list(dash='dot'))
p3 <- p3 %>% layout(xaxis = x, yaxis = y)

```

```{r}
valsn_df_4 <- as.data.frame(cbind(inv_result=inv_result_4[1:72],
                                    time_series=list_of_dfs_CommonFeatures[[21]][,31], time_points=time_points))

x <- list(
  title = "Time"
  
)
y <- list(
  title = "GDP of Netherlands"
  
)
p4=plot_ly(valsn_df_4, x = ~time_points,y = ~time_series, name = 'original_ts',mode = 'lines', type = 'scatter')

p4 <- p4 %>% add_trace(y = ~ inv_result, name = 'inv_result',mode = 'lines', line=list(dash='dot'))
p4 <- p4 %>% layout(xaxis = x, yaxis = y)

```

```{r}
fig<-subplot(
  p1,
  p2,
  p3,
  p4,
  nrows = 2,
  titleY = TRUE,margin = 0.05,shareX = TRUE
 )%>% layout(title ="Original Time-series and Reconstructed Time-series by ILT")
fig
```

# Data Preprocessing {.tabset .tabset-fade .tabset-dropdown}

Loading all packages that are required.

```{r, message=F, warning=F}
library(dplyr)
library(arm)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(plot3D)
library(scatterplot3d)
library(plotly)
library(fastDummies)
library(forecast)

# Load Previously Computed Workspace:
load("C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_Econ_SpaceKime.RData")
```

Load the data and preprocess the data.

```{r message=F, error=F, warning=F}
setwd("C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics")
eu <- read.csv("Master_Aggregate_EU_Econ_Data_11_29_2018_TimeTransform.csv", stringsAsFactors = F)[,-5]
colnames(eu) <- c("country","time","feature","value")
eu <- filter(eu,!country %in% c("European Union (25 countries)","D1_Country",""))
eu$value <- sapply(c(1:nrow(eu)),function(x) as.numeric(gsub(":|,","",eu$value[x])))
eu <- filter(eu, feature != "")
dim(eu)
```

# Reformat the data into a 3D array (country \* feature \* time)

```{r warning=F}
unq_country <- sort(unique(eu$country))
unq_time <- sort(unique(eu$time))
unq_fea <- sort(unique(eu$feature))
num_country <- length(unq_country)
num_time <- length(unq_time)
num_fea <- length(unq_fea)
eu <- arrange(eu,country,time,feature)
eu_3d_array <- array(NA,dim = c(num_country,num_time,num_fea),dimnames = list(unq_country,unq_time,unq_fea))
for (i in 1:num_country){
  for (j in 1:num_time){
    for (k in 1:num_fea){
      eu_3d_array[i,j,k] = eu$value[(i-1)*num_time*num_fea + (j-1)*num_fea + k]
    }
  }
}
eu_3d_array[1:10,1:10,1]
```

# Data visualization

```{r warning=F}
eu <- arrange(eu,time,feature,country)
eu_visualization <- dplyr::select(eu,time,feature,country,value)
eu_visualization$time <- sapply(c(1:nrow(eu_visualization)),function(x) as.numeric(gsub("Q",".",eu_visualization$time[x])))
eu_visualization$feature <- as.factor(eu_visualization$feature)
eu_visualization$country <- as.factor(eu_visualization$country)
eu_visualization$value <- as.numeric(eu_visualization$value)
eu_visualization$feature <- sapply(c(1:nrow(eu_visualization)),function(x) substr(eu_visualization$feature[x],1,20))
#plot_ly(eu_visualization, x = ~time, y = ~country, z = ~value, color = ~feature,split = ~ country,type = 'scatter3d', mode = 'lines')
```

# Time series format

```{r warning=F}
#Find the duplicates
eu_time_series <- na.omit(eu)
allFeatures = as.character(unique(eu_time_series$feature))
allTime = unique(eu_time_series$time)
allCountry = as.character(unique(eu_time_series$country))
allCombination = length(allFeatures)*length(allTime)*length(allCountry)
dup = c()
for (i in 1:length(allFeatures)){
  for (j in 1:length(allCountry)){
    for (k in 1:length(allTime)){
      if (nrow(filter(eu_time_series,country == allCountry[j] & feature == allFeatures[i] & time == allTime[k]))>1){
        dup = c(dup,as.character(allFeatures[i]))
        break
      }
    }
    break
  }
}
dup #These features have multiple observations at the same time point
```

Remove duplicates.

```{r warning=F}
removeDup = filter(eu_time_series, feature != "Employment by sex, age and educational attainment level, Total, From 15 to 64 years, All ISCED 2011 levels" &
                     feature != "Labor cost for LCI excluding bonuses" &
                     feature != "Labor costs other than wages or salaries" &
                     feature != "Labour cost for LCI (compensation of employees plus taxes minus subsidies)" &
                     feature != "Labour cost for LCI excluding bonuses" &
                     feature != "Labour costs other than wages and salaries" &
                     feature != "Wages and salaries (total)")
time_series = spread(removeDup,feature,value)
dim(time_series)
```

# Fit the ARIMA model for each country

## Belgium 

Extract Belgium data.

```{r warning=F}
#Chose Belgium and fit the arima
belgium = filter(time_series,country == "Belgium")
belgium = belgium[, colSums(is.na(belgium)) != nrow(belgium)] #delete the feature that is missing at all the time point
belgium = dplyr::select(belgium,-time,-country)
dim(belgium)
```

Manually clean (preprocess) the Belgium data and fit ARIMAX model 
$$Y = BelguimSuperSample\$'Unemployment,\ Females,\ From\ 15-64\ years,\ Total',$$

$$X = XReg\ (all\ other\ covariates\ -starts\_with('Unemployment')).$$

```{r warning=F}
#super sample the dataset
cleardata <- function(mat) {
  for (i in 1:ncol(mat)) {
    mat[is.na(mat[,i]),i]<-mean(mat[,i],na.rm = T) + rnorm(sum(is.na(mat[,i])),sd = sd(mat[,i],na.rm = T))
  }
  return(mat)
}

#use spline regression model to expand the dataset
splinecreate <- function(mat) {
  res<-NULL
  for (i in 1:ncol(mat)) {
    sp<-smooth.spline(seq(1:72),mat[,i])
    spresult<-predict(sp,seq(from=0,by=1/5,length.out = 360))
    spfeat<-spresult$y+rnorm(360,sd=sd(mat[,i]))
    res<-cbind(res,spfeat)
  }
  colnames(res)<-colnames(mat)
  return(res)
}
BelguimMatrix = as.matrix(belgium)
BelguimMatrix = cleardata(BelguimMatrix)
BelguimMatrix = splinecreate(BelguimMatrix)
BelguimSuperSample = as.data.frame(BelguimMatrix)

#############################
# Outcome Features to examine, predict, forecast, spacekime-analyze
#   "Gross domestic product at market prices"
#   "Unemployment , Females, From 15-64 years, Total"
#   "Capital transfers, payable"       
#   "Capital transfers, receivable"
#   "Debt securities"
#   "Government consolidated gross debt"
#   ... View(colnames(BelguimMatrix))
############################# 

################################################
#    "Unemployment , Females, From 15-64 years, Total"
Y = dplyr::select(BelguimSuperSample, "Unemployment , Females, From 15-64 years, Total"); dim(Y)
X = dplyr::select(BelguimSuperSample, -starts_with("Unemployment")); dim(X)
fitArimaX = auto.arima(Y, xreg=as.matrix(X[,-112])); fitArimaX$arma

################################################
#    "Government consolidated gross debt"
#Y = select(BelguimSuperSample, "Government consolidated gross debt"); dim(Y)
#X = select(BelguimSuperSample, -matches("debt|Debt")); dim(X)
# X_scale <- scale(X); Matrix::rankMatrix(X_scale); any(is.na(X_scale)); any(is.infinite(X_scale))
#     remove columns with infinite or missing value  
# X_scale_1 <- X_scale[ , !is.infinite(colSums(X_scale)) & !is.na(colSums(X_scale))]
#X_1 <- X[ , !is.infinite(colSums(X)) & !is.na(colSums(X))]; dim(X_1); Matrix::rankMatrix(X_1)
#X_2 <- X_1[, qr(X_1)$pivot[seq_len(qr(X_1)$rank)]]; dim(X_2)
# 
#fitArimaX = auto.arima(Y, xreg=as.matrix(X_2), method = "CSS", # "SANN", method = "CSS", optim.method = "BFGS"
#          optim.control=list(maxit = 20000, temp = 20), optim.method = "BFGS")
#fitArimaX; View(sort(fitArimaX$coef)[1:10]); fitArimaX$arma

################################################
#   "Gross domestic product at market prices"
Y = dplyr::select(BelguimSuperSample, "Gross domestic product at market prices"); dim(Y)
X = dplyr::select(BelguimSuperSample, -matches("debt|Debt")); dim(X)  # 360 167
X <- X[, qr(X)$pivot[seq_len(qr(X)$rank)]]; dim(X)
ts_Y <- ts(Y, start=c(2000,1), end=c(2017, 20), frequency = 20); length(ts_Y)
set.seed(1234)
fitArimaX = auto.arima(ts_Y, xreg=as.matrix(X[ , -c(50:60)])); fitArimaX$arma
# 5  0  2  0 20  0  0
# sigma^2 estimated as 57.04:  log likelihood=-1132.78 AIC=2593.55   AICc=2871.09   BIC=3230.88
pred_arimaX_5_0_2_Y_Belgium_train300_Belgium_test60 <- 
predict(fitArimaX, n.ahead = 60, newxreg = as.matrix(X[301:360 , -c(50:60)]))$pred
plot(forecast(fitArimaX, xreg = as.matrix(X[301:360 , -c(50:60)])), # ARIMA forecast
     include=120, lwd=4, lty=3, xlab="Time", ylab="GDP", ylim=c(50, 150),
     main = "ARIMAX Analytics (Train: 2000-2017; Test: 2018-2020) GDP Forecasting\n
      based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
 lines(pred_arimaX_5_0_2_Y_Belgium_train300_Belgium_test60, col = "red", lwd = 4, lty=3) 
 legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2017)", 
                        "ARIMAX(5,0,2)-model GDP Forecasting (2018-2020)",
                        "ARIMAX(5,0,2)-model GDP Forecasting (2018-2020)"),
       col=c("black", "blue", "red"), 
       lty=c(3,3,3), lwd=c(4,4,4), cex=1.2, x.intersp=1.5, y.intersp=0.6)
text(2015, 60, expression(atop(paste("Training Region (2000-2017)"), 
                paste(Model(Unempl) %->% "ARIMAX(p, q, r) ;  ", 
                      XReg %==% X[i], " ", i %in% {1 : 167}))), cex=1.5)
text(2019.5, 60, expression(atop(paste("Validation Region (2018-2020)"), 
        paste(hat(Unempl) %<-% "ARIMAX(5,0 ,2); ", 
              XReg %==% X[i], " ", i %in% {1 : 167}))), cex=1.5)
```

* *GDP Description*: [GDP](https://datamarket.com/data/set/196v/gross-domestic-product-at-market-prices) (gross domestic product) is an indicator for a nation's economic situation. It reflects the total value of all goods and services produced less the value of goods and services used for intermediate consumption in their production. Expressing GDP in PPS (purchasing power standards) eliminates differences in price levels between countries, and calculations on a per head basis allows for the comparison of economies significantly different in absolute size.

[GDP unit of measure](https://ec.europa.eu/eurostat/statistics-explained/index.php/Glossary:GDP_per_capita_in_purchasing_power_standards) represents the *Current prices, euro per capita*. The volume index of GDP per capita in Purchasing Power Standards (PPS) is intended for cross-country comparisons rather than for temporal comparisons. GDP per capita when expressed in PPS eliminates the differences in price levels between countries allowing meaningful volume comparisons of GDP between countries. Expressed in relation to the European Union (*EU27 GDP = 100*), a country with an index that is higher than $100$ or lower than $100$ corresponds to that country's level of GDP per head being higher or lower than the EU average, respectively.


## Bulgaria

Define a new function that (1) cleans the data, and (2) fits the ARIMA model estimating the seasonal and non-seasonal time-series parameters $(p,d,q)$ and the effect-sizes ($\beta$'s) for the exogenous regression features ($X$).

```{r warning=F}
library(dplyr)
#write a function of clean the data and fit the ARIMA model
Fit_ARIMA <- function(countryData=Belgium, start=2000, end=2017, frequency=20,
                     feature="Unemployment , Females, From 15-64 years, Total")
{
  #delete features that are missing at all time points
  countryData = countryData[, colSums(is.na(countryData)) != nrow(countryData)]
  countryData = dplyr::select(countryData, -time, -country)
  DataMatrix = as.matrix(countryData)
  DataMatrix = cleardata(DataMatrix)
  DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove feature that only has one value
  DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove feature that all the values are 0
  DataMatrix = splinecreate(DataMatrix)
  DataSuperSample = as.data.frame(DataMatrix)
  if (feature=="Unemployment , Females, From 15-64 years, Total") {
      Y = dplyr::select(DataSuperSample, "Unemployment , Females, From 15-64 years, Total")
      X = dplyr::select(DataSuperSample, -starts_with("Unemployment"))
  } else if (feature=="Gross domestic product at market prices") {
    Y = dplyr::select(DataSuperSample, "Gross domestic product at market prices"); dim(Y)
    X = dplyr::select(DataSuperSample, -matches("debt|Debt")); dim(X)  # 360 167
    print(paste0("dim(X)=(", dim(X)[1], ",", dim(X)[2], ");  ",
                 " dim(Y)=(", dim(Y)[1], ",", dim(Y)[2], ") ..."))
    X <- X[, qr(X)$pivot[seq_len(qr(X)$rank)]]; dim(X)  # ensure full-rank design matrix, X
  }
  else {
    print(paste0("This feature ", feature, " is not implemented yet! Exiting Fit_ARIMA() method ...")) 
    return(NULL)
  }
  ts_Y <- ts(Y, start=c(start, 1), end=c(end, frequency), frequency = frequency); length(ts_Y)
  set.seed(1234)
  fitArimaX = auto.arima(ts_Y, xreg=as.matrix(X))
  return(fitArimaX)
}
Bulgaria = filter(time_series,country == "Bulgaria")
BulgariaARIMA = Fit_ARIMA(countryData=Bulgaria, start=2000, end=2017, frequency=20, 
                     feature="Gross domestic product at market prices")
BulgariaARIMA$arma

# Extend the Fit-ARIMA method to ensure testing-training modeling/assessment for 2 countries works
preprocess_ARIMA <- function(countryData=Belgium, start=2000, end=2017, frequency=20,
                     feature="Unemployment , Females, From 15-64 years, Total")
{
  #delete features that are missing at all time points
  countryData = countryData[, colSums(is.na(countryData)) != nrow(countryData)]
  countryData = dplyr::select(countryData, !any_of(c("time", "country")))
  DataMatrix = as.matrix(countryData)
  DataMatrix = cleardata(DataMatrix)
  DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove features with only 1 value
  DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove features with all values=0
  DataMatrix = splinecreate(DataMatrix)
  DataSuperSample = as.data.frame(DataMatrix) # super-Sample the data
  print(paste0("Processing feature: ...", feature, "... "))
      
  if (feature=="Unemployment , Females, From 15-64 years, Total") {
      Y = dplyr::select(DataSuperSample, "Unemployment , Females, From 15-64 years, Total")
      X = dplyr::select(DataSuperSample, -starts_with("Unemployment"))
  } else if (feature=="Gross domestic product at market prices") {
      Y = dplyr::select(DataSuperSample, "Gross domestic product at market prices"); dim(Y)
      X = dplyr::select(DataSuperSample, -matches("debt|Debt")); 
      X <- X [, -c(50:80)]; dim(X)  # 360 167
  } else {
    print(paste0("This feature: ...", feature, "... is not implemented yet! Exiting preprocess_ARIMA() method ...")) 
    return(NULL)
  }
  
  # reduce the number of observations (matrix rows) to specified time range
  len_1 <- (end + 1 - start) * frequency; print(paste0("dim(X)[1]=", len_1))
  X <- X[1:len_1 , qr(X[1:len_1 , ])$pivot[seq_len(qr(X[1:len_1 , ])$rank)]]; dim(X)  
  # ensure full-rank design matrix, X
  Y <- as.data.frame(Y[1:len_1 , ])
  print(paste0("dim(X)=(", dim(X)[1], ",", dim(X)[2], ");  ",         # 300 136
                 " dim(Y)=(", dim(Y)[1], ",", dim(Y)[2], ") ..."))    # 300 1
  return(list("X"=X, "Y"=Y))
}

# Outcome Variable to be modeled, as a timeseries: 2000 - 2017 (18 years, Quarterly measures)
# Spline interpolation *5;   2000-01 - 2014-20 (300 observations for training): 60 observations (2015-2017) for Testing

Belgium  <- filter(time_series, country == "Belgium")
Bulgaria <- filter(time_series, country == "Bulgaria")
Netherlands <- filter(time_series, country == "Netherlands")
# Test preprocess_ARIMA()
#preprocess_Belgium <- preprocess_ARIMA(countryData=Belgium, start=2000, end=2014,
#                frequency=20, feature="Gross domestic product at market prices")
#preprocess_Bulgaria <- preprocess_ARIMA(countryData=Bulgaria, start=2000, end=2014,
#                frequency=20, feature="Gross domestic product at market prices")


# General function that ensures the XReg predictors for 2 countries are homologous
homologousX_features <- function (X_Country1, X_Country2){
  # Check if the Belgium and Bulgaria Xreg are homologous (same feature columns)
  common_cols <- intersect(colnames(X_Country1), colnames(X_Country2))
  X_Country1 <- subset(X_Country1, select = common_cols)
  X_Country2 <- subset(X_Country2, select = common_cols)
  print(paste0("dim(X1)=(", dim(X_Country1)[1], ",", dim(X_Country1)[2], ");  ", # 300 131
              " dim(X2)=(", dim(X_Country2)[1], ",", dim(X_Country2)[2], ")!"))  # 300 131
  return(list("X_Country1"=X_Country1, "X_Country2"=X_Country2))
}
# Test homologousX_features
# homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
# X_Belgium  <- homoFeat$X_Country1
# X_Bulgaria <- homoFeat$X_Country2

fit_ARIMA <- function(country1Data=Belgium, country2Data=Bulgaria, 
                     start=2000, end=2014, frequency=20,
                     feature="Gross domestic product at market prices") {
  preprocess_Country1 <- preprocess_ARIMA(countryData=country1Data, 
                    start=start, end=end, frequency=frequency, feature=feature)
  preprocess_Country2 <- preprocess_ARIMA(countryData=country2Data, 
                    start=start, end=end, frequency=frequency, feature=feature)
  ts_Y_Country1 <- ts(preprocess_Country1$Y, start=c(start, 1), 
            end=c(end, frequency), frequency = frequency); length(ts_Y_Country1)
  
  homoFeat <- homologousX_features(preprocess_Country1$X, preprocess_Country2$X)
  X_Country1 <- homoFeat$X_Country1
  X_Country2 <- homoFeat$X_Country2
  
  set.seed(1234)
  fitArimaX_Country1 = auto.arima(ts_Y_Country1, xreg=as.matrix(X_Country1))
  return(fitArimaX_Country1)
}

# Belgium = filter(time_series,country == "Belgium")
BelgiumARIMA = fit_ARIMA(country1Data=Belgium, 
                    country2Data=Bulgaria, # country2Data=Netherlands,
                    start=2000, end=2014, frequency=20,
                    feature="Gross domestic product at market prices")
BelgiumARIMA$arma    # [1]  4  0  2  0 20  0  0

# sigma^2 estimated as 45.99:  log likelihood=-919.13 AIC=2116.26 AICc=2359.51 BIC=2631.09

# Outcome Variable to be modeled, as a timeseries: 2000 - 2017 (18 years, Quarterly measures)
# Spline interpolation x5;   2000-01 - 2014-20 (300 observations for training)
# ts_Y_Belgium_train <- ts(Y_Belgium_train, start=c(2000,1), end=c(2014, 20), frequency=20)
# length(ts_Y_Belgium_train)
# ts_Y_Belgium_test <- ts(Y_Belgium_test, start=c(2015,1), end=c(2017, 20), frequency = 20)
#length(ts_Y_Belgium_test)
# Find ARIMAX model
# arimaX_Belgium_train <- auto.arima(ts_Y_Belgium_train, xreg=as.matrix(X_Belgium_train)); # arimaX_Belgium_train$arma
```

# TS Forecasting

## Perform ARIMAX modeling 

<!-- Predict `Y={Unemployment , Females, From 15-64 years, Total}`, using all other features not directly related to "unemployment". Recall the core data organization: -->

Predict `Y={Gross domestic product at market prices}`, using all other features not directly related to ***GDP**. Recall the core data organization:

 - 131 (common BE + BG) `Xreg` Predictors and 1 Outcome Variable to be modeled as a timeseries: 2000 - 2014 (15 years, Quarterly measures, 5-fold spline interpolation, $15*4*5=300$);
 - Spline interpolation x5 (freq=20 observations per year);   2000-01 to 2014-20 (300 timepoint observations over 15 years, for `training`). The remaining 60 timepoints used for `testing` (2015-01 to 2017-20).

Report `ARIMA(p,d,q)` params and quality metrics AIC/BIC.

```{r message=F, error=F, warning=F}
# Previously, we already extracted the Belgium and Bulgaria Data, preprocess it, 
# and fit the ARIMAX (Belgium-training) model. Now, we will assess the model on Bulgaria_testing sets
# BelgiumARIMA = fit_ARIMA(country1Data=Belgium, country2Data=Bulgaria,
#                    start=2000, end=2014, frequency=20,
#                    feature="Gross domestic product at market prices")
# BelgiumARIMA$arma    # [1]  4  0  2  0 20  0  0

# View rank-ordered ARIMAX effects:
# View(BelgiumARIMA$coef[order(BelgiumARIMA$coef)])
sort(BelgiumARIMA$coef)[1:10]
#           Unemployment , Females, From 15-64 years, From 18 to 23 months 
#                                                               -1.5203281 
#                                                                      ar2 
#                                                               -1.1808472 
#                                 Labor cost other than wages and salaries 
#                                                               -0.8380554 
#           Unemployment , Females, From 15-64 years, From 12 to 17 months 
#                                                               -0.7037336 
#                                                                      ar4 
#                                                               -0.6360880 
#                                                                     sar2 
#                                                               -0.6260002 
#             Unemployment , Females, From 15-64 years, From 3 to 5 months 
#                                                               -0.5262376 
#  Labor cost for LCI (compensation of employees plus taxes minus subsidies) 
#                                                               -0.2885038 
#              Unemployment , Females, From 15-64 years, 48 months or over 
#                                                               -0.2773650 
#               Unemployment , Males, From 15-64 years, from 3 to 5 months 
#                                                               -0.2567934 

#Get the Prospective Xreg=X design matrices ready (2015-2017, 60 rows)
preprocess_Belgium <- preprocess_ARIMA(countryData=Belgium, 
                    start=2000, end=2017, frequency=20, 
                    feature="Gross domestic product at market prices")
preprocess_Bulgaria <- preprocess_ARIMA(countryData=Bulgaria,  #Netherlands
                    start=2000, end=2017, frequency=20, 
                    feature="Gross domestic product at market prices")
homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
X_Belgium_test  <- homoFeat$X_Country1[301:360, ]; dim(X_Belgium_test)
X_Bulgaria_test <- homoFeat$X_Country2[301:360, ]; dim(X_Bulgaria_test)
 
# Get Predictions
pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60 <- 
      forecast(BelgiumARIMA, xreg = as.matrix(X_Bulgaria_test))$mean
pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60 <- 
      forecast(BelgiumARIMA, xreg = as.matrix(X_Belgium_test))$mean
pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60 <-
  forecast(BelgiumARIMA, xreg = as.matrix(X_Bulgaria_test))$mean +
  mean(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60) -
  mean(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60)

cor(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, ts_Y_Belgium_test)  # 0.11
mean(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60) # [1] 118

# Alternative predictions:
# X_Country1 <- X_Belgium_test; X_Country2 <- X_Bulgaria_test
# pred_arimaX_1_0_2_Y_Belgium_train300_Bulgaria_test60 <- predict(BelgiumARIMA, n.ahead = 60, newxreg = X_Bulgaria_test)$pred
# pred_arimaX_1_0_2_Y_Belgium_train300_Belgium_test60 <- predict(BelgiumARIMA, n.ahead = 60, newxreg = X_Belgium_test)$pred
```

## Display the alternative (spacetime) analytical models (ARIMAX)

Plot only the last 5-years of training (2010-2014), $100TimePoints=5Years\times 4Quarters\times 5SuperSample$ and the 3-year prospective forecasting $60TimePoints=3Years\times 4Quarters\times 5SuperSample$.

```{r message=F, error=F, warning=F, fig.width=8, fig.height=8}
ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ], 
                        start=c(2015,1), end=c(2017, 20), frequency = 20)
length(ts_Y_Belgium_test)

# windows(width=14, height=10)
plot(forecast(BelgiumARIMA, xreg = as.matrix(X_Belgium_test)),   # ARIMA forecast
     include=100, lwd=4, lty=3, xlab="Time", ylab="GDP Purchasing Power Standards (PPS)",
     ylim=c(25, 150),
     main = "ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n
      based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, col = "green", lwd = 4, lty=2)   # Belgium train+test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60, col = "purple", lwd = 4, lty=1) # Belgium train+Bulgaria test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60, col = "orange", lwd = 4, lty=1) 
# Belgium train+ Offset Bulgaria test: 188.3753 - 416.5375
lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1)       # Observed Y_Test timeseries
legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)", 
                        "ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)",
                        "ARIMAX(4,0,2) Belgium train + XReg=Belgium test (2015-2017)",
                        "ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
                        "Offset ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
                        "Belgium Official Reported GDP (2015-2017)"),
       col=c("black", "blue", "green", "purple", "orange", "red"), 
       lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7)
text(2012.5, 30, expression(atop(paste("Training Region (2000-2014)"), 
                paste(Model(GDP) %->% "ARIMAX(p, q, r) ;  ", 
                      XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
text(2016.5, 30, expression(atop(paste("Validation Region (2015-2017)"), 
        paste(hat(GDP) %<-% "ARIMAX(4, 0, 2); ", 
              XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
```

Plot entire 15-year training time-span  (2000-2014), $300TimePoints=15Years\times 4Quarters\times 5SuperSample$ and the 3-year prospective forecasting $60TimePoints=3Years\times 4Quarters\times 5SuperSample$.

```{r message=F, error=F, warning=F, fig.width=8, fig.height=8}
ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ], 
                        start=c(2015,1), end=c(2017, 20), frequency = 20)
length(ts_Y_Belgium_test)

# windows(width=14, height=10)
plot(forecast(BelgiumARIMA, xreg = as.matrix(X_Belgium_test)),     # ARIMA forecast
     lwd=4, lty=3, xlab="Time", ylab="GDP Purchasing Power Standards (PPS)",
     ylim=c(25, 150),
     main = "ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n
      based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, col = "green", lwd = 4, lty=2)   # Belgium train+test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60, col = "purple", lwd = 4, lty=1) # Belgium train+Bulgaria test
lines(pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60, col = "orange", lwd = 4, lty=1) 
# Belgium train+ Offset Bulgaria test: 188.3753 - 416.5375
lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1)       # Observed Y_Test timeseries
legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)", 
                        "ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)",
                        "ARIMAX(4,0,2) Belgium train + XReg=Belgium test (2015-2017)",
                        "ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
                        "Offset ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)",
                        "Belgium Official Reported GDP (2015-2017)"),
       col=c("black", "blue", "green", "purple", "orange", "red"), 
       lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7)
text(2005, 30, expression(atop(paste("Training Region (2000-2014)"), 
                paste(Model(GDP) %->% "ARIMAX(p, q, r) ;  ", 
                      XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
text(2015, 30, expression(atop(paste("Validation Region (2015-2017)"), 
        paste(hat(GDP) %<-% "ARIMAX(4, 0, 2); ", 
              XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
```

# Spacekime analytics

## Generic K-Space transformations (FT/IFT)

Let's start by defining the generic k-space transformation.

```{r message=F, error=F, warning=F, fig.width=12, fig.height=12}
# FT/Spacekime Analytics
# 1D timeseries FFT SHIFT
fftshift1D <- function(img_ff) {
  rows <- length(img_ff)   
  rows_half <- ceiling(rows/2)
  return(append(img_ff[(rows_half+1):rows], img_ff[1:rows_half]))
}

# Generic function to Transform Data ={all predictors (X) and outcome (Y)} to k-space (Fourier domain)
kSpaceTransform <- function(data, inverse = FALSE, reconPhases = NULL) {
  # ForwardFT (rawData, FALSE, NULL)
  # InverseFT(magnitudes, TRUE, reconPhasesToUse) or InverseFT(FT_data, TRUE, NULL)
  FT_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
  mag_FT_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
  phase_FT_data <- array(complex(), c(dim(data)[1], dim(data)[2]))
  IFT_reconPhases_data <- array(complex(), c(dim(data)[1], dim(data)[2]))

  for (i in 1:dim(data)[2]) {
    if (inverse == FALSE | is.null(reconPhases)) {
      FT_data[ , i] <- fft(data[ , i], inverse)
      X2 <- FT_data[ , i]
      # plot(fftshift1D(log(Re(X2)+2)), main = "log(fftshift1D(Re(FFT(timeseries))))") 
      mag_FT_data[ , i] <- sqrt(Re(X2)^2+Im(X2)^2); 
      # plot(log(fftshift1D(Re(mag_FT_MCSI_data))), main = "log(Magnitude(FFT(timeseries)))") 
      phase_FT_data[ , i] <- atan2(Im(X2), Re(X2)); 
      # plot(Re(fftshift1D(phase_FT_MCSI_data[ , 1])), main = "Shift(Phase(FFT(timeseries)))")
    }
    else {  # for IFT synthesis using user-provided Phases, typically from kime-phase aggregators
      Real <- data[ , i] * cos(reconPhases[ , i])  
      Imaginary <- data[ , i] * sin(reconPhases[ , i]) 
      IFT_reconPhases_data[ ,i] <- 
          Re(fft(Real+1i*Imaginary, inverse = TRUE)/length(data[ , i]))
    }
  }
    ######### Test the FT-IFT analysis-synthesis back-and-forth transform process 
    #         to confirm calculations
    # X2 <- FT_data[ , 1]; mag_FT_data[ , 1] <- sqrt(Re(X2)^2+Im(X2)^2); 
    # phase_FT_data[ , 1] <- atan2(Im(X2), Re(X2)); 
    # Real2 = mag_FT_data[ , 1] * cos(phase_FT_data[ , 1])
    # Imaginary2 = mag_FT_data[ , 1] * sin(phase_FT_data[ , 1])
    # man_hat_X2 = Re(fft(Real2 + 1i*Imaginary2, inverse = T)/length(X2))
    # ifelse(abs(man_hat_X2[5] - data[5, 1]) < 0.001, "Perfect Synthesis", "Problems!!!")
    #########
  
    if (inverse == FALSE | is.null(reconPhases)) {
      return(list("magnitudes"=mag_FT_data, "phases"=phase_FT_data))
      # Use kSpaceTransform$magnitudes & kSpaceTransform$phases to retrieve teh Mags and Phases
    }
    else {
      return(IFT_reconPhases_data)
      # Use Re(kSpaceTransform) to extract spacetime Real-valued reconstructed data
    }
}
```

## Kime-Phase Distributions

Examine the Kime-direction Distributions of the Phases for all *Belgium* features (predictors + outcome). Define a generic function that plots the Phase distributions.

```{r message=F, error=F, warning=F, fig.width=12, fig.height=12}
library(tidyr)
library(ggplot2)

plotPhaseDistributions <- function (dataFT, dataColnames, size=10, ...) {
  df.phase <- as.data.frame(Re(dataFT$phases))
  df.phase %>% gather() %>% head()
  colnames(df.phase) <- dataColnames
  phaseDistributions <- gather(df.phase)
  colnames(phaseDistributions) <- c("Feature", "Phase")
  if (is.null(size)) size=10
  
  # map the value as our x variable, and use facet_wrap to separate by the key column:
  ggplot(phaseDistributions, aes(Phase)) + 
    # geom_histogram(bins = 10) + 
    geom_histogram(aes(y=..density..), bins = 10) + 
    facet_wrap( ~Feature, scales = 'free_x') +
    xlim(-pi, pi) + 
    theme(strip.text.x = element_text(size = size, colour = "black", angle = 0))
}

# homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
X_Belgium <- homoFeat$X_Country1; dim(X_Belgium)
Y_Belgium <- preprocess_Belgium$Y; dim(Y_Belgium)

FT_Belgium <- kSpaceTransform(cbind(X_Belgium, Y_Belgium), FALSE, NULL)
dataColnames <- c(colnames(X_Belgium), "Y_GDP_Belgium")
plotPhaseDistributions(FT_Belgium, dataColnames)

IFT_FT_Belgium <- kSpaceTransform(FT_Belgium$magnitudes, TRUE, FT_Belgium$phases)
# Check IFT(FT) == I: 
# ifelse(abs(cbind(X_Belgium, Y_Belgium)[5,4] - Re(IFT_FT_Belgium[5,4])) < 0.001, "Perfect Synthesis", "Problems!!!")
```

## Nil-Phase Synthesis and ARIMAX re-modeling 

Perform Nil-Phase reconstruction - `IFT_NilPhase_FT_Belgium` - and then re-fit the ARIMAX model

```{r message=F, error=F, warning=F} 
# 1. Nil-Phase data synthesis (reconstruction)
temp_Data <- cbind(X_Belgium, Y_Belgium)
nilPhase_FT_data <- array(complex(real=0, imaginary=0), c(dim(temp_Data)[1], dim(temp_Data)[2]))
dim(nilPhase_FT_data)    # ;  head(nilPhase_FT_data)
# [1] 360 132
IFT_NilPhase_FT_Belgium <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

#       Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal with nil-phase
IFT_NilPhase_FT_Belgium <- Re(kSpaceTransform(FT_Belgium$magnitudes, TRUE, nilPhase_FT_data))

colnames(IFT_NilPhase_FT_Belgium) <- c(colnames(X_Belgium), "Y_GDP_Belgium")
dim(IFT_NilPhase_FT_Belgium); dim(FT_Belgium$magnitudes)
# colnames(IFT_NilPhase_FT_Belgium); head(IFT_NilPhase_FT_Belgium); # head(temp_Data)

# 2. Perform ARIMAX modeling on IFT_NilPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC
# library(forecast)
IFT_NilPhase_FT_Belgium_Y_train <- IFT_NilPhase_FT_Belgium[1:300, 132]; length(IFT_NilPhase_FT_Belgium_Y_train)
IFT_NilPhase_FT_Belgium_Y_test <- IFT_NilPhase_FT_Belgium[301:360]; length(IFT_NilPhase_FT_Belgium_Y_test)

# Training and Testing Data Covariates explaining the longitudinal outcome (Y)
IFT_NilPhase_FT_Belgium_X_train <- as.data.frame(IFT_NilPhase_FT_Belgium)[1:300, 1:131]; dim(IFT_NilPhase_FT_Belgium_X_train)
IFT_NilPhase_FT_Belgium_X_test <- as.data.frame(IFT_NilPhase_FT_Belgium)[301:360, 1:131]; dim(IFT_NilPhase_FT_Belgium_X_test)

# Outcome Variable to be ARIMAX-modeled, as a timeseries
ts_IFT_NilPhase_FT_Belgium_Y_train <- 
         ts(IFT_NilPhase_FT_Belgium_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20)

# Find ARIMAX model: 2  1  2  0 20  0  0
set.seed(1234)
modArima_IFT_NilPhase_FT_Belgium_Y_train <- 
        auto.arima(ts_IFT_NilPhase_FT_Belgium_Y_train, xreg=as.matrix(IFT_NilPhase_FT_Belgium_X_train))
modArima_IFT_NilPhase_FT_Belgium_Y_train$arma

# Regression with ARIMA(2,0,1)(2,0,0)[20] errors 
# Coefficients:
#          ar1      ar2     ma1    sar1     sar2  Acquisitions less disposals of non-financial non-produced assets
#      -1.4232  -0.8592  -0.987  0.0941  -0.3451                                                            0.0123
#s.e.   0.0341   0.0311     NaN  0.0688   0.0742                                                            0.0007
# sigma^2 estimated as 0.8622:  log likelihood=-320.39 AIC=914.79   AICc=1148.19   BIC=1422.2

pred_arimax_2_0_1_Nil <- forecast(modArima_IFT_NilPhase_FT_Belgium_Y_train, xreg = as.matrix(IFT_NilPhase_FT_Belgium_X_test))
pred_arimax_2_0_1_Nil_2015_2017 <- 
  ts(pred_arimax_2_0_1_Nil$mean, frequency=20, start=c(2015,1), end=c(2017,20))
pred_arimax_2_0_1_Nil_2015_2017
# alternatively:
# pred_arimax_1_0_1_2015_2017 <- predict(modArima_IFT_NilPhase_FT_Belgium_Y_train, 
#                                              n.ahead = 3*20, newxreg = IFT_NilPhase_FT_Belgium_X_test)$pred
sort(modArima_IFT_NilPhase_FT_Belgium_Y_train$coef)[1:10]
# Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-1.5972295 
#                                                                      ar1, effect=-1.4231617 
#                                 Labor cost other than wages and salaries, effect=-1.2213214 
#                                                                      ma1, effect=-0.9869571 
#                                                                      ar2, effect=-0.8591937 
#           Unemployment , Females, From 15-64 years, From 12 to 17 months, effect=-0.7075454 
#             Unemployment , Total, From 15-64 years, From 18 to 23 months, effect=-0.5797656 
#               Unemployment , Males, From 15-64 years, from 3 to 5 months, effect=-0.5026139 
#                                                                     sar2, effect=-0.3450866 
#             Unemployment , Males, From 15-64 years, from 24 to 47 months, effect=-0.2965540 
cor(pred_arimax_2_0_1_Nil$mean, ts_Y_Belgium_test)  # 0.14
mean(pred_arimax_2_0_1_Nil_2015_2017) # [1] 105
```

## Swapped-Phase Synthesis and ARIMAX Modeling

Space-time reconstructions by inverting back the `mag_FT` for each feature signal after swapping the feature-phases. In other words, we randomly shuffle the `columns` of the Phases-matrix (Training & Testing XReg Data) and use these *swapped* phases to synthesize the design covariate matrix ($Xreg$).

```{r message=F, error=F, warning=F}
# 1. Swap Feature Phases and then synthesize the data (reconstruction)
# temp_Data <- cbind(X_Belgium, Y_Belgium)
swapped_phase_FT_Belgium_data <- FT_Belgium$phases
colnames(swapped_phase_FT_Belgium_data) <- c(colnames(X_Belgium), "Y_GDP_Belgium")
swapped_phase_FT_Belgium_data1 <- swapped_phase_FT_Belgium_data
dim(swapped_phase_FT_Belgium_data)    # ;  head(swappedPhase_FT_data)
# [1] 360 132
IFT_SwappedPhase_FT_Belgium <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

set.seed(12345)   # sample randomly Phase-columns for each of the 131 covariates (X)
#swap_phase_FT_Belgium_indices <- sample(ncol(swapped_phase_FT_Belgium_data)-1)
# for (j in 1:131) {  # for all columns of the design Xreg matrix, excluding Y, randomly swap columns phases
#  swapped_phase_FT_Belgium_data1[ , j] <- swapped_phase_FT_Belgium_data[, swap_phase_FT_Belgium_indices[j]]
#}
swapped_phase_FT_Belgium_data1 <- as.data.frame(cbind(
  swapped_phase_FT_Belgium_data[ , sample(ncol(swapped_phase_FT_Belgium_data[ , 1:131]))], 
  swapped_phase_FT_Belgium_data[ , 132]))
swapped_phase_FT_Belgium_data <- swapped_phase_FT_Belgium_data1
colnames(swapped_phase_FT_Belgium_data)[132] <- "Y_GDP_Belgium"
# colnames(swapped_phase_FT_Belgium_data)
dim(swapped_phase_FT_Belgium_data); dim(FT_Belgium$phases)

# Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal using the feature swapped phases
IFT_SwappedPhase_FT_Belgium <- Re(kSpaceTransform(FT_Belgium$magnitudes, TRUE, swapped_phase_FT_Belgium_data))

colnames(IFT_SwappedPhase_FT_Belgium) <- c(colnames(X_Belgium), "Y_GDP_Belgium")
dim(IFT_SwappedPhase_FT_Belgium); dim(FT_Belgium$magnitudes)
colnames(IFT_SwappedPhase_FT_Belgium); tail(IFT_SwappedPhase_FT_Belgium); # tail(temp_Data)

# 2. Perform ARIMAX modeling on IFT_SwappedPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC
# library(forecast)
IFT_SwappedPhase_FT_Belgium_Y_train <- IFT_SwappedPhase_FT_Belgium[1:300, 132]; length(IFT_SwappedPhase_FT_Belgium_Y_train)
IFT_SwappedPhase_FT_Belgium_Y_test <- IFT_SwappedPhase_FT_Belgium[301:360]; length(IFT_SwappedPhase_FT_Belgium_Y_test)

# Training and Testing Data Covariates explaining the longitudinal outcome (Y)
IFT_SwappedPhase_FT_Belgium_X_train <- as.data.frame(IFT_SwappedPhase_FT_Belgium)[1:300, 1:131]
dim(IFT_SwappedPhase_FT_Belgium_X_train)
IFT_SwappedPhase_FT_Belgium_X_test <- as.data.frame(IFT_SwappedPhase_FT_Belgium)[301:360, 1:131]
dim(IFT_SwappedPhase_FT_Belgium_X_test)

# Outcome Variable to be ARIMAX-modeled, as a timeseries
ts_IFT_SwappedPhase_FT_Belgium_Y_train <- 
         ts(IFT_SwappedPhase_FT_Belgium_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20)

# Find ARIMAX model: 1  0  2  0 20  0  0
set.seed(1234)
modArima_IFT_SwappedPhase_FT_Belgium_Y_train <- 
        auto.arima(ts_IFT_SwappedPhase_FT_Belgium_Y_train, xreg=as.matrix(IFT_SwappedPhase_FT_Belgium_X_train))
modArima_IFT_SwappedPhase_FT_Belgium_Y_train$arma

# Regression with ARIMA(1,0,0)(2,0,0)[20] errors 
# Coefficients:
#          ar1     sar1    sar2  intercept  Acquisitions less disposals of non-financial non-produced assets
#      -0.3837  -0.2196  0.2827    46.1704                                                            0.0063
#s.e.   0.1113   0.0903  0.0779    36.8492                                                            0.0080
# sigma^2 estimated as 70:  log likelihood=-976.01 AIC=2224.02   AICc=2452.63   BIC=2727.73

pred_arimax_1_0_0_Swapped <- forecast(modArima_IFT_SwappedPhase_FT_Belgium_Y_train, xreg = as.matrix(IFT_SwappedPhase_FT_Belgium_X_test))
pred_arimax_1_0_0_Swapped_2015_2017 <- 
  ts(pred_arimax_1_0_0_Swapped$mean, frequency=20, start=c(2015,1), end=c(2017,20))
pred_arimax_1_0_0_Swapped_2015_2017
# alternatively:
# pred_arimax_1_0_0_Swapped_2015_2017 <- predict(modArima_IFT_SwappedPhase_FT_Belgium_Y_train, 
#                                              n.ahead = 3*20, newxreg = IFT_SwappedPhase_FT_Belgium_X_test)$pred
sort(modArima_IFT_SwappedPhase_FT_Belgium_Y_train$coef)[1:10]
#                                                                     ar1, effect=-0.38372043 
#           Unemployment , Females, From 15-64 years, From 18 to 23 months, effect=-0.31137514 
#                                 Labor cost other than wages and salaries, effect=-0.31094561 
#                                                                     sar1, effect=-0.21964957 
#            Unemployment , Females, From 15-64 years, From 6 to 11 months, effect=-0.20878853 
#Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-0.12497311 
#                Unemployment , Males, From 15-64 years, Less than 1 month, effect=-0.10849013 
#                Unemployment , Total, From 15-64 years, 48 months or over, effect=-0.09066684 
#               Unemployment , Total, From 15-64 years, From 1 to 2 months, effect=-0.05852382 
#              Unemployment , Females, From 15-64 years, 48 months or over, effect=-0.05695172 
cor(pred_arimax_1_0_0_Swapped$mean, ts_Y_Belgium_test)  # 0.0
mean(pred_arimax_1_0_0_Swapped_2015_2017) # [1] 105
```

## Random-Phase Synthesis and ARIMAX re-modeling 

Perform Random-Phase reconstruction - `IFT_RandPhase_FT_Belgium` - by randomly sampling from the phase distributions for each feature and then re-fitting the ARIMAX model

```{r message=F, error=F, warning=F} 
# 1. Random-Phase data synthesis (reconstruction)
# temp_Data <- cbind(X_Belgium, Y_Belgium)
randPhase_FT_data <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))
dim(randPhase_FT_data)    # ;  head(randPhase_FT_data)
# [1] 360 132
IFT_RandPhase_FT_Belgium <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

randPhase_FT_data <- FT_Belgium$phases
for (i in 1:(dim(randPhase_FT_data)[2] -1)) {
  if (i < dim(randPhase_FT_data)[2]) {
    set.seed(12345)   # sample randomly Phases for each of the 131 predictors covariates (X)
    randPhase_FT_data[ , i] <- FT_Belgium$phases[sample(nrow(FT_Belgium$phases)), i]
  } else {   } # for the Y outcome (Last Column) - do not change the phases of the Y
}
#       Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal with avg-phase
IFT_RandPhase_FT_Belgium <- Re(kSpaceTransform(FT_Belgium$magnitudes, TRUE, randPhase_FT_data))
colnames(IFT_RandPhase_FT_Belgium) <- c(colnames(X_Belgium), "Y_GDP_Belgium")
dim(IFT_RandPhase_FT_Belgium); dim(FT_Belgium$magnitudes)
# colnames(IFT_RandPhase_FT_Belgium); tail(IFT_RandPhase_FT_Belgium); # tail(temp_Data)

dim(IFT_RandPhase_FT_Belgium); head(Re(IFT_RandPhase_FT_Belgium)); tail(Re(IFT_RandPhase_FT_Belgium))


# 2. Perform ARIMAX modeling on IFT_RandPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC
# library(forecast)
IFT_RandPhase_FT_Belgium_Y_train <- IFT_RandPhase_FT_Belgium[1:300, 132]; length(IFT_RandPhase_FT_Belgium_Y_train)
IFT_RandPhase_FT_Belgium_Y_test <- IFT_RandPhase_FT_Belgium[301:360]; length(IFT_RandPhase_FT_Belgium_Y_test)

# Training and Testing Data Covariates explaining the longitudinal outcome (Y)
IFT_RandPhase_FT_Belgium_X_train <- as.data.frame(IFT_RandPhase_FT_Belgium)[1:300, 1:131]; dim(IFT_RandPhase_FT_Belgium_X_train)
IFT_RandPhase_FT_Belgium_X_test <- as.data.frame(IFT_RandPhase_FT_Belgium)[301:360, 1:131]; dim(IFT_RandPhase_FT_Belgium_X_test)

# Outcome Variable to be ARIMAX-modeled, as a timeseries
ts_IFT_RandPhase_FT_Belgium_Y_train <- 
         ts(IFT_RandPhase_FT_Belgium_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20)

# Find ARIMAX model: 0  0  2  0 20  0  0
set.seed(1234)
modArima_IFT_RandPhase_FT_Belgium_Y_train <- 
        auto.arima(ts_IFT_RandPhase_FT_Belgium_Y_train, xreg=as.matrix(IFT_RandPhase_FT_Belgium_X_train))
modArima_IFT_RandPhase_FT_Belgium_Y_train$arma

# Regression with ARIMA(0,0,0)(2,0,0)[20] errors 
# Coefficients:
#         sar1    sar2  Acquisitions less disposals of non-financial non-produced assets
#      -0.0743  0.5766                                                            0.0162
#s.e.   0.0625  0.0752                                                            0.0100 
#sigma^2 estimated as 72.17:  log likelihood=-988.06 AIC=2244.12   AICc=2463.4   BIC=2740.43

pred_arimax_0_0_0_Rand <- forecast(modArima_IFT_RandPhase_FT_Belgium_Y_train, xreg = as.matrix(IFT_RandPhase_FT_Belgium_X_test))
pred_arimax_0_0_0_Rand_2015_2017 <- 
  ts(pred_arimax_0_0_0_Rand$mean, frequency=20, start=c(2015,1), end=c(2017,20))
pred_arimax_0_0_0_Rand_2015_2017
# alternatively:
# pred_arimax_1_0_1_Rand_2015_2017 <- predict(modArima_IFT_RandPhase_FT_Belgium_Y_train, 
#                                              n.ahead = 3*20, newxreg = IFT_RandPhase_FT_Belgium_X_test)$pred
sort(modArima_IFT_RandPhase_FT_Belgium_Y_train$coef)[1:10]
# Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-0.71989958 
#           Unemployment , Females, From 15-64 years, From 18 to 23 months, effect=-0.54541627 
#              Unemployment , Females, From 15-64 years, 48 months or over, effect=-0.44230677 
#                                 Labor cost other than wages and salaries, effect=-0.32854422 
#              Unemployment , Males, From 15-64 years, from 6 to 11 months, effect=-0.24511374 
#             Unemployment , Total, From 15-64 years, From 24 to 47 months, effect=-0.19283037 
#      Agriculture, forestry and fishing - Employers' social contributions, effect=-0.11994897 
#           Unemployment , Females, From 15-64 years, From 24 to 47 months, effect=-0.10835175 
#             Unemployment , Females, From 15-64 years, From 1 to 2 months, effect=-0.09093252 
#             Unemployment , Total, From 15-64 years, From 18 to 23 months, effect=-0.07427297 
cor(pred_arimax_0_0_0_Rand$mean, ts_Y_Belgium_test)  # -0.15
mean(pred_arimax_0_0_0_Rand_2015_2017) # [1] 87.74201
```

## Result Visualization

```{r message=F, error=F, warning=F, fig.width=8, fig.height=8}
## Plot the results of the model ARIMAX fitting
ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ], 
                        start=c(2015,1), end=c(2017, 20), frequency = 20)
length(ts_Y_Belgium_test)

# windows(width=14, height=10)
plot(forecast(BelgiumARIMA, xreg = as.matrix(X_Belgium_test)),         # ARIMA forecast
     include=60, lwd=4, lty=3, xlab="Time", ylab="GDP Purchasing Power Standards (PPS)",
     ylim=c(60, 160),
     main = "Spacekime ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n
      based on fitting ARIMAX Models on spline interpolated & kime-transformed Belgium data")
lines(pred_arimax_2_0_1_Nil_2015_2017, col = "green", lwd = 4, lty=2)   # Belgium Xreg Nil-Phase Reconstructions
lines(pred_arimax_1_0_0_Swapped_2015_2017, col = "purple", lwd = 4, lty=1) # Belgium Xreg Swapped-Phase Reconstructions
lines(pred_arimax_0_0_0_Rand_2015_2017, col = "orange", lwd = 4, lty=1) # Belgium Xreg Random-Phase Reconstructions
lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1)       # Observed Y_Test timeseries
legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)", 
                        "ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)",
                        "ARIMAX(2,0,1) Belgium Xreg Nil-Phase Reconstruction (2015-2017)",
                        "ARIMAX(1,0,0) Belgium Xreg Swapped-Phase Reconstructions (2015-2017)",
                        "ARIMAX(0,0,0)(2,0,0)[20]  Belgium Xreg Random-Phase Reconstructions (2015-2017)",
                        "Belgium Official Reported GDP (2015-2017)"),
       col=c("black", "blue", "green", "purple", "orange", "red"), 
       lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7)
text(2013.5, 65, expression(atop(paste("Training Region (2000-2014)"), 
                paste(Model(GDP) %->% "ARIMAX(p, q, r) ;  ", 
                      XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
text(2016.5, 65, expression(atop(paste("Validation Region (2015-2017)"), 
        paste(hat(GDP) %<-% "ARIMAX(., ., .); ", 
              XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2)
```

# ML Data Analytics

## Data Preprocessing

Find all "Common" features (highly-observed and congruent Econ indicators)

```{r message=F, error=F, warning=F}
# 1. Find all "Common" features (highly-observed and congruent Econ indicators)
countryNames <- unique(time_series$country); length(countryNames); # countryNames
# initialize 3D array of DF's that will store the data for each of the countries into a 2D frame
countryData <- list()  # countryData[[listID==Country]][1-time-72, 1-feature-197]

for (i in 1:length(countryNames)) {
  countryData[[i]] <- filter(time_series, country == countryNames[i])
}
# Check countryData[[2]][2, 3] == Belgium[2,3]

list_of_dfs_CommonFeatures <- list()  # list of data for supersampled countries 360 * 197

# 2. General function that ensures the XReg predictors for ALL 31 EU countries are homologous
completeHomologousX_features <- function (list_of_dfs) {
  # delete features that are missing at all time points
  for (j in 1:length(list_of_dfs)) {
    print(paste0("Pre-processing Country: ...", countryNames[j], "... "))
    data = list_of_dfs[[j]]
    data = data[ , colSums(is.na(data)) != nrow(data)]
    data = dplyr::select(data, !any_of(c("time", "country")))
    DataMatrix = as.matrix(data)
    DataMatrix = cleardata(DataMatrix)
    DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove features with only 1 value
    DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove features with all values=0
    # Supersample 72 --*5--> 360 timepoints 
    DataMatrix = splinecreate(DataMatrix)
    DataSuperSample = as.data.frame(DataMatrix) # super-Sample the data
    # remove some of features  
    DataSuperSample = DataSuperSample[, -c(50:80)]; dim(X)  # 360 167
    # ensure full-rank design matrix, DataSuperSample
    DataSuperSample <- 
      DataSuperSample[ , qr(DataSuperSample)$pivot[seq_len(qr(DataSuperSample)$rank)]]
    print(paste0("dim()=(", dim(DataSuperSample)[1], ",", dim(DataSuperSample)[2], ") ..."))
    # update the current DF/Country
    list_of_dfs_CommonFeatures[[j]] <- DataSuperSample
  }

  # Identify All Xreg features that are homologous (same feature columns) across All 31 countries
  # Identify Common Columns (features)
  comCol <- Reduce(intersect, lapply(list_of_dfs_CommonFeatures, colnames))
  list_of_dfs_CommonFeatures <- lapply(list_of_dfs_CommonFeatures, function(x) x[comCol])

  for (j in 1:length(list_of_dfs_CommonFeatures)) {
    list_of_dfs_CommonFeatures[[j]] <- subset(list_of_dfs_CommonFeatures[[j]], select = comCol)
    print(paste0("dim(", countryNames[j], ")=(", dim(list_of_dfs_CommonFeatures[[j]])[1], 
                 ",", dim(list_of_dfs_CommonFeatures[[j]])[2], ")!"))  # 72 * 197
  }
  return(list_of_dfs_CommonFeatures)
}
# Test completeHomologousX_features: dim(AllCountries)=(360,42)!
list_of_dfs_CommonFeatures <- completeHomologousX_features(countryData); 
length(list_of_dfs_CommonFeatures); dim(list_of_dfs_CommonFeatures[[1]]) # Austria data matrix 360*42
```

For each country ($n$) and each common feature ($k$), fit ARIMA model and estimate the parameters $(p,d,q)$ (non-exogenous, just the timeseries model for this feature), (p,d,q) triples for non-seasonal and seasonal effects.  For each (Country, Feature) pair, the 9 ARIMA-derived vector includes:
** (ts_avg, forecast_avg, non-seasonal AR, non-seasonal MA, seasonal AR, seasonal MA, period, non-seasonal Diff, seasonal differences)**.

```{r message=F, error=F, warning=F}
# 3. For each country (n) and each common feature (k), compute (p,d,q) ARIMA models (non-exogenous, 
# just the timeseries model for this feature), (p,d,q) triples
# Country * Feature
arimaModels_DF <- list() 
#data.frame(matrix(NA, nrow = length(countryNames), 
#  ncol = dim(list_of_dfs_CommonFeatures[[1]])[2]), row.names=countryNames, stringsAsFactors=T)

# colnames(arimaModels_DF) <- colnames(list_of_dfs_CommonFeatures[[1]])

# list_index <- 1
# arimaModels_ARMA_coefs <- list()  # array( , c(31, 9*dim(list_of_dfs_CommonFeatures[[1]])[2]))
# dim(arimaModels_ARMA_coefs) # [1]  31 x 378 == 31 x (9 * 42)
# For each (Country, feature) index, the 9 ARIMA-derived vector includes:
# (ts_avg, forecast_avg, non-seasonal AR, non-seasonal MA, seasonal AR, seasonal MA, period, non-seasonal Diff, seasonal differences)

for(n in 1:(length(list_of_dfs_CommonFeatures))) {          # for each Country 1<=n<=31
  for (k in 1:(dim(list_of_dfs_CommonFeatures[[1]])[2])) {  # for each feature 1<=k<=42
    # extract one timeseries (the feature+country time course)
    ts = ts(list_of_dfs_CommonFeatures[[n]][ , k],
            frequency=20, start=c(2000,1), end=c(2017,20))
    set.seed(1234)
    arimaModels_DF[[list_index]] <- auto.arima(ts)
      # pred_arimaModels_DF = forecast(arimaModels_DF[[list_index]])
      # ts_pred_arimaModels_DF <- 
      #  ts(pred_arimaModels_DF$mean, frequency=20, start=c(2015,1), end=c(2017,20))
      # ts_pred_arimaModels_DF
    arimaModels_ARMA_coefs[[list_index]] <- c (
        mean(ts),                           # time-series average (retrospective) 
        mean(forecast(arimaModels_DF[[list_index]])$mean), # forecasted TS average (prospective)
        arimaModels_DF[[list_index]]$arma)  # 7 ARMA estimated parameters
    # cat("arimaModels_ARMA_coefs[country=", countryNames[n], ", feature=",
    #          colnames(list_of_dfs_CommonFeatures[[1]])[k],
    #          "] Derived-Features=(", round(arimaModels_ARMA_coefs[[list_index]], 2), ") ...")
    #print(paste0("arimaModels_DF[country=", countryNames[i], ", feature=",
    #         colnames(list_of_dfs_CommonFeatures[[1]])[k],
    #         "]$arma =", arimaModels_DF[[list_index]]$arma))
    list_index <- list_index + 1
  }
}

length(arimaModels_ARMA_coefs) == 31*42  # [1] TRUE  # Each list-element consists of 9 values, see above
#  == dim(list_of_dfs_CommonFeatures[[1]])[1] * dim(list_of_dfs_CommonFeatures[[1]])[2]
# Maps to convert between 1D indices and 2D (Country, Feature) pairs
index2CountryFeature <- function(indx=1) { 
  if (indx<1 | indx>length(arimaModels_ARMA_coefs)) {
    cat("Index out of bounds: indx=", indx, "; must be between 1 and ", 
        length(arimaModels_ARMA_coefs), " ... Exiting ...")
    return (NULL)
  } else { 
    feature = (indx-1) %% (dim(list_of_dfs_CommonFeatures[[1]])[2])
    country = floor((indx - feature)/(dim(list_of_dfs_CommonFeatures[[1]])[2]))
    return(list("feature"=(feature+1), "country"=(country+1)))  }
}

countryFeature2Index <- function(countryIndx=1, featureIndx=1) { 
  if (countryIndx<1 | countryIndx>(dim(list_of_dfs_CommonFeatures[[1]])[1]) |
      featureIndx<1 | featureIndx>(dim(list_of_dfs_CommonFeatures[[1]])[2])) {
    cat("Indices out of bounds: countryIndx=", countryIndx, "; featureIndx=", featureIndx, "; Exiting ...")
    return (NULL)
  } else { return (featureIndx + (countryIndx-1)*(dim(list_of_dfs_CommonFeatures[[1]])[2])) }
}

# test forward and reverse index mapping functions
index2CountryFeature(42);  index2CountryFeature(45)$country;  index2CountryFeature(45)$feature
countryFeature2Index(countryIndx=2, featureIndx=3)

# Column/Feature Names: colnames(list_of_dfs_CommonFeatures[[1]])
# Country/Row Names: countryNames
arimaModels_ARMA_coefs[[1]] # Austria/Feature1  1:9 feature vector
# Cuntry2=Bulgaria, feature 5, 1:9 vector
arimaModels_ARMA_coefs[[countryFeature2Index(countryIndx=2, featureIndx=5)]]
```

Convert list of ARIMA models to a Data.Frame `[Countries, megaFeatures]` that can be put through ML data analytics. Augment the features using the [EU_SOCR_Country_Ranking_Data_2011 dataset](http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_2008_World_CountriesRankings).

```{r message=F, error=F, warning=F}
# 4. Add the country ranking as a new feature, using the OA ranks here:
# (http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_2008_World_CountriesRankings)
EU_SOCR_Country_Ranking_Data_2011 <- read.csv2("C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_SOCR_Country_Ranking_Data_2011.csv", header=T, sep=",")

length(arimaModels_ARMA_coefs) # 31*42 
arima_df <- data.frame(matrix(NA, 
          nrow=length(countryNames), ncol=length(colnames(list_of_dfs_CommonFeatures[[1]]))*9))
dim(arima_df)   # [1]  31 378

for(n in 1:dim(arima_df)[1]) {        # for each Country 1<=n<=31
  for (k in 1:length(colnames(list_of_dfs_CommonFeatures[[1]]))) {     # for each feature 1<=k<=42
    for (l in 1:9) {                  # for each arima vector 1:9, see above
        arima_df[n, (k-1)*9 + l] <- 
          round(arimaModels_ARMA_coefs[[countryFeature2Index(countryIndx=n, featureIndx=k)]][l], 1)
        if (n==dim(arima_df)[1]) colnames(arima_df)[(k-1)*9 + l] <- paste0("Feature_",k, "_ArimaVec_",l)
    }
  }
}

# DF Conversion Validation
arimaModels_ARMA_coefs[[countryFeature2Index(countryIndx=3, featureIndx=5)]][2] == arima_df[3, (5-1)*9 + 2]
# [1] 1802.956

# Aggregate 2 datasets
dim(EU_SOCR_Country_Ranking_Data_2011)  # [1] 31 10

aggregate_arima_vector_country_ranking_df <- 
  as.data.frame(cbind(arima_df, EU_SOCR_Country_Ranking_Data_2011[ , -1]))
dim(aggregate_arima_vector_country_ranking_df)   # [1]  Country=31 * Features=387 (ARIMA=378 + Ranking=9)
# View(aggregate_arima_vector_country_ranking_df)
rownames(aggregate_arima_vector_country_ranking_df) <- countryNames
write.csv(aggregate_arima_vector_country_ranking_df, row.names = T, fileEncoding = "UTF-16LE",
           "C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_aggregate_arima_vector_country_ranking.csv")
```

## `Spacetime` Analytics

### Supervised classification

Use [Model-based and Model-free methods](https://dspa.predictive.space/) to predict the *overall* (OA) country ranking.

```{r message=F, error=F, warning=F, fig.width=12, fig.height=12}
# 1. LASSO regression/feature extraction
library(glmnet)
library(arm)
library(knitr)
# subset test data
Y = aggregate_arima_vector_country_ranking_df$OA
X = aggregate_arima_vector_country_ranking_df[ , -387]
# remove columns containing NAs and convert character DF to numeric type
X = as.data.frame(apply(X[ , colSums(is.na(X)) == 0], 2, as.numeric)); dim(X)  # [1]  31 386

fitRidge = glmnet(as.matrix(X), Y, alpha = 0)  # Ridge Regression
fitLASSO = glmnet(as.matrix(X), Y, alpha = 1)  # The LASSO
# LASSO
plot(fitLASSO, xvar="lambda", label="TRUE", lwd=3)
# add label to upper x-axis
mtext("LASSO regularizer: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Ridge
plot(fitRidge, xvar="lambda", label="TRUE", lwd=3)
# add label to upper x-axis
mtext("Ridge regularizer: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

#### 10-fold cross validation ####
# LASSO
library(doParallel)
registerDoParallel(6)
set.seed(1234)  # set seed 
# (10-fold) cross validation for the LASSO
cvLASSO = cv.glmnet(data.matrix(X), Y, alpha = 1, parallel=TRUE)
cvRidge = cv.glmnet(data.matrix(X), Y, alpha = 0, parallel=TRUE)
plot(cvLASSO)
mtext("CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO <-  predict(cvLASSO, s = cvLASSO$lambda.min, newx = data.matrix(X))
testMSE_LASSO <- mean((predLASSO - Y)^2); testMSE_LASSO

predLASSO = predict(cvLASSO, s = cvLASSO$lambda.min, newx = data.matrix(X))
predRidge = predict(fitRidge, s = cvRidge$lambda.min, newx = data.matrix(X))

# calculate test set MSE
testMSELASSO = mean((predLASSO - Y)^2)
testMSERidge = mean((predRidge - Y)^2)
##################################Use only ARIMA effects, no SOCR meta-data#####
set.seed(4321)
cvLASSO_lim = cv.glmnet(data.matrix(X[ , 1:(42*9)]), Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_lim)
mtext("CV LASSO (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_lim <-  predict(cvLASSO_lim, s = 3, # cvLASSO_lim$lambda.min, 
                          newx = data.matrix(X[ , 1:(42*9)]))
coefList_lim <- coef(cvLASSO_lim, s=3) # 'lambda.min')
coefList_lim <- data.frame(coefList_lim@Dimnames[[1]][coefList_lim@i+1],coefList_lim@x)
names(coefList_lim) <- c('Feature','EffectSize')
arrange(coefList_lim, -abs(EffectSize))[2:10, ]
cor(Y, predLASSO_lim[, 1])  # 0.84
################################################################################

# Plot Regression Coefficients: create variable names for plotting 
library("arm")
# par(mar=c(2, 13, 1, 1))   # extra large left margin # par(mar=c(5,5,5,5))
varNames <- colnames(X); # varNames; 
length(varNames)

betaHatLASSO = as.double(coef(fitLASSO, s = cvLASSO$lambda.min))  # cvLASSO$lambda.1se
betaHatRidge = as.double(coef(fitRidge, s = cvRidge$lambda.min))

#coefplot(betaHatLASSO[2:386], sd = rep(0, 385), pch=0, cex.pts = 3, main = "LASSO-Regularized Regression Coefficient Estimates", varnames = varNames)
coefplot(betaHatLASSO[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", main = "LASSO-Regularized Regression Coefficient Estimates", varnames = varNames[377:386])
coefplot(betaHatRidge[377:386], sd = rep(0, 10), pch=2, add = TRUE, col.pts = "blue", cex.pts = 3)
legend("bottomleft", c("LASSO", "Ridge"), col = c("red", "blue"), pch = c(1 , 2), bty = "o", cex = 2)

varImp <- function(object, lambda = NULL, ...) {
  ## skipping a few lines
  beta <- predict(object, s = lambda, type = "coef")
  if(is.list(beta)) {
    out <- do.call("cbind", lapply(beta, function(x) x[,1]))
    out <- as.data.frame(out)
  } else out <- data.frame(Overall = beta[,1])
  out <- abs(out[rownames(out) != "(Intercept)",,drop = FALSE])
  out
}

varImp(cvLASSO, lambda = cvLASSO$lambda.min)

coefList <- coef(cvLASSO, s='lambda.min')
coefList <- data.frame(coefList@Dimnames[[1]][coefList@i+1],coefList@x)
names(coefList) <- c('Feature','EffectSize')
arrange(coefList, -abs(EffectSize))[2:10, ]
#                      var        val  # Feature names: colnames(list_of_dfs_CommonFeatures[[1]])
#1            (Intercept) 49.4896874
#2   Feature_1_ArimaVec_8 -2.4050811   # Feature 1 = Active population: Females 15 to 64 years
#3  Feature_20_ArimaVec_8 -1.4015001   # Feature 20= "Employment: Females 15 to 64 years
#4            IncomeGroup -1.2271177
#5   Feature_9_ArimaVec_8 -1.0629835   # Feature 9= Active population: Total 15 to 64 years
#6                     ED -0.7481041
#7                     PE -0.5167668
#8  Feature_25_ArimaVec_5  0.4416775   # Feature 25= Property income
#9   Feature_9_ArimaVec_4 -0.2217804
#10                   QOL -0.1965342
#                                     ARIMA: 4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#
#9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

# [1] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels"                                                     
# [2] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"     
# [3] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Tertiary education (levels 5-8)"                                           
# [4] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"
# [5] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, All ISCED 2011 levels"                                                       
# [6] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"       
# [7] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Tertiary education (levels 5-8)"                                             
# [8] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"  
# [9] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, All ISCED 2011 levels"  
#[10] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" 
#[11] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Tertiary education (levels 5-8)"        
#[12] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"
#[13] "All ISCED 2011 levels " 
# [14] "All ISCED 2011 levels, Females"  
# [15] "All ISCED 2011 levels, Males"
# [16] "Capital transfers, payable"  
# [17] "Capital transfers, receivable"  
# [18] "Compensation of employees, payable"  
# [19] "Current taxes on income, wealth, etc., receivable"
#[20] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels " 
# [21] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"            
# [22] "Other current transfers, payable"
# [23] "Other current transfers, receivable" 
# [24] "Property income, payable" 
# [25] "Property income, receivable" 
# [26] "Savings, gross" 
# [27] "Subsidies, payable"
# [28] "Taxes on production and imports, receivable"
# [29] "Total general government expenditure"
# [30] "Total general government revenue"
# [31] "Unemployment , Females, From 15-64 years, Total"
# [32] "Unemployment , Males, From 15-64 years" 
# [33] "Unemployment , Males, From 15-64 years, from 1 to 2 months" 
# [34] "Unemployment , Males, From 15-64 years, from 3 to 5 months"
# [35] "Unemployment , Males, From 15-64 years, from 6 to 11 months"  
# [36] "Unemployment , Total, From 15-64 years, From 1 to 2 months"
# [37] "Unemployment , Total, From 15-64 years, From 12 to 17 months"
# [38] "Unemployment , Total, From 15-64 years, From 3 to 5 months" 
# [39] "Unemployment , Total, From 15-64 years, From 6 to 11 months"  
# [40] "Unemployment , Total, From 15-64 years, Less than 1 month" 
# [41] "Unemployment by sex, age, duration. DurationNA not started"
# [42] "VAT, receivable"   

coef(cvLASSO, s = "lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

validation <- data.frame(matrix(NA, nrow = dim(predLASSO)[1], ncol=3), row.names=countryNames)
validation [ , 1] <- Y; validation [ , 2] <- predLASSO_lim[, 1]; validation [ , 3] <- predRidge[, 1]
colnames(validation) <- c("Y", "LASSO", "Ridge")
dim(validation)
head(validation)

# Prediction correlations: 
cor(validation[ , 1], validation[, 2])  # Y=observed OA rank vs. LASSO-pred  0.96 (lim) 0.84
cor(validation[ , 1], validation[, 3])  # Y=observed OA rank vs. Ridge-pred  0.95

# Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^
linFit1 <- lm(validation[ , 1] ~ predLASSO)
plot(validation[ , 1] ~ predLASSO,
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="LASSO 9/(42*9) param model",
     main = sprintf("Observed (X) vs. LASSO-Predicted (Y) Overall Country Ranking, cor=%.02f", 
                    cor(validation[ , 1], validation[, 2])))
abline(linFit1, lwd=3, col="red")

# Plot observed LASSO (9-parameters) predicted Y^ vs. Y (Overall Country ranking) 
linFit1 <- lm(predLASSO_lim ~ validation[ , 1])
plot(predLASSO_lim ~ validation[ , 1],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="LASSO 9/(42*9) param model",
     main = sprintf("Observed (X) vs. LASSO-Predicted (Y) Overall Country Ranking, cor=%.02f", 
                    cor(validation[ , 1], validation[, 2])))
abline(linFit1, lwd=3, col="red")
```

<!-- ### Unsupervised clustering -->

<!-- Use [hierarchical, k-means and spectral clustering](https://dspa.predictive.space/) to generate derived-computed phenotypes of countries. Do these derived labels relate (correspond to) the overall (OA) country ranking? -->

<!-- ```{r message=F, error=F, warning=F} -->

<!-- # ... -->

<!-- ``` -->

### Dimensionality Reduction

PCA and t-SNE 2D and 3D projections using [SOCR Dimensionality Reduction Webapp](https://socr.umich.edu/HTML5/SOCR_TensorBoard_UKBB/UsersData.html).

 - the *tensor* file `EU_Econ_TensorData_31Countries_By_386Features.txt` contains 31 rows (countries) and 386 = 42*9(ARIMA) + 8(meta-data) columns
 - the *labels* file `EU_Econ_Labels_31Countries_and OverallRankingOutcome.txt` includes 31 rows with 2 columns specifying the Countries and their OA (Overall rankings).
 - The reader can load these *tensor* data files into the [SOCR Dimensionality Reduction Webapp](https://socr.umich.edu/HTML5/SOCR_TensorBoard_UKBB/UsersData.html) and generate dynamic views of the 2D or 3D projections (PCA=Euclidean) and (t-SNE manifold).

```{r message=F, error=F, warning=F}
# https://www.socr.umich.edu/people/dinov/courses/DSPA_notes/05_DimensionalityReduction.html 
# ...
```


## `Spacekime` Analytics

### Supervised classification

Use [Model-based and Model-free methods](https://dspa.predictive.space/) to predict the *overall* (OA) country ranking.

```{r message=F, error=F, warning=F, fig.width=12, fig.height=12}

# Generic function to Transform Data ={all predictors (X) and outcome (Y)} to k-space (Fourier domain): kSpaceTransform(data, inverse = FALSE, reconPhases = NULL) 
  # ForwardFT (rawData, FALSE, NULL)
  # InverseFT(magnitudes, TRUE, reconPhasesToUse) or InverseFT(FT_data, TRUE, NULL)

# DATA
# subset test data
Y = aggregate_arima_vector_country_ranking_df$OA
X = aggregate_arima_vector_country_ranking_df[ , -387]
# remove columns containing NAs
X = as.data.frame(apply(X[ , colSums(is.na(X)) == 0], 2, as.numeric)); dim(X)  # [1]  31 386
length(Y); dim(X)

FT_aggregate_arima_vector_country_ranking_df <- 
  kSpaceTransform(as.data.frame(apply(
    aggregate_arima_vector_country_ranking_df[ , colSums(is.na(aggregate_arima_vector_country_ranking_df)) == 0], 
    2, as.numeric)), inverse = FALSE, reconPhases = NULL) 
  
## Kime-Phase Distributions
# Examine the Kime-direction Distributions of the Phases for all *Belgium* features (predictors + outcome). Define a generic function that plots the Phase distributions.
# plotPhaseDistributions(dataFT, dataColnames)
plotPhaseDistributions(FT_aggregate_arima_vector_country_ranking_df,
                       colnames(aggregate_arima_vector_country_ranking_df), size=4, cex=0.1)

IFT_FT_aggregate_arima_vector_country_ranking_df <- 
  kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                  TRUE, FT_aggregate_arima_vector_country_ranking_df$phases)
# Check IFT(FT) == I: 
# ifelse(aggregate_arima_vector_country_ranking_df[5,4] -
#     Re(IFT_FT_aggregate_arima_vector_country_ranking_df[5,4]) < 0.001, "Perfect Synthesis", "Problems!!!")

##############################################
# Nil-Phase Synthesis and LASSO model estimation
# 1. Nil-Phase data synthesis (reconstruction)
temp_Data <- aggregate_arima_vector_country_ranking_df
nilPhase_FT_aggregate_arima_vector <- 
  array(complex(real=0, imaginary=0), c(dim(temp_Data)[1], dim(temp_Data)[2]))
dim(nilPhase_FT_aggregate_arima_vector)    # ;  head(nilPhase_FT_aggregate_arima_vector)
# [1] 31 387
IFT_NilPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

#       Invert back to spacetime the 
# FT_aggregate_arima_vector_country_ranking_df$magnitudes[ , i] signal with nil-phase
IFT_NilPhase_FT_aggregate_arima_vector <- 
  Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                     TRUE, nilPhase_FT_aggregate_arima_vector))

colnames(IFT_NilPhase_FT_aggregate_arima_vector) <-
  colnames(aggregate_arima_vector_country_ranking_df)
rownames(IFT_NilPhase_FT_aggregate_arima_vector) <-
  rownames(aggregate_arima_vector_country_ranking_df)
dim(IFT_NilPhase_FT_aggregate_arima_vector)
dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes)
colnames(IFT_NilPhase_FT_aggregate_arima_vector)
# IFT_NilPhase_FT_aggregate_arima_vector[1:5, 1:4];  temp_Data[1:5, 1:4]

# 2. Perform LASSO modeling on IFT_NilPhase_FT_aggregate_arima_vector; 
# report param estimates and quality metrics AIC/BIC
# library(forecast)
set.seed(54321)
cvLASSO_kime = cv.glmnet(data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , -387]), 
           # IFT_NilPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE)
           Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_kime)
mtext("(Spacekime, Nil-phase) CV LASSO: Number of Nonzero (Active) Coefficients", 
      side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_kime <-  predict(cvLASSO_kime, s = cvLASSO_kime$lambda.min, 
        newx = data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , -387])); predLASSO_kime
# testMSE_LASSO_kime <- mean((predLASSO_kime - IFT_NilPhase_FT_aggregate_arima_vector[ , 387])^2)
# testMSE_LASSO_kime
predLASSO_kime = predict(cvLASSO_kime, s = exp(1/3), # cvLASSO_kime$lambda.min, 
        newx = data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , -387])); predLASSO_kime

##################################Use only ARIMA effects, no SOCR meta-data#####
set.seed(12345)
cvLASSO_kime_lim = cv.glmnet(data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:(42*9)]), 
                        Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_kime_lim)
mtext("CV LASSO Nil-Phase (using only Timeseries data): Number of Nonzero (Active) Coefficients",
      side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_kime_lim <-  predict(cvLASSO_kime_lim, s = 1, 
                          newx = data.matrix(X[ , 1:(42*9)]))
coefList_kime_lim <- coef(cvLASSO_kime_lim, s=1)
coefList_kime_lim <- data.frame(coefList_kime_lim@Dimnames[[1]][coefList_kime_lim@i+1],coefList_kime_lim@x)
names(coefList_kime_lim) <- c('Feature','EffectSize')
arrange(coefList_kime_lim, -abs(EffectSize))[2:10, ]
cor(Y, predLASSO_kime_lim[, 1])  # 0.1142824
################################################################################

# Plot Regression Coefficients: create variable names for plotting 
library("arm")
# par(mar=c(2, 13, 1, 1))   # extra large left margin # par(mar=c(5,5,5,5))
# varNames <- colnames(X); varNames; length(varNames)

betaHatLASSO_kime = as.double(coef(cvLASSO_kime, s=cvLASSO_kime$lambda.min))
#cvLASSO_kime$lambda.1se

coefplot(betaHatLASSO_kime[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", 
         main = "(Spacekime) LASSO-Regularized Regression Coefficient Estimates",
         varnames = varNames[377:386])

varImp(cvLASSO_kime, lambda = cvLASSO_kime$lambda.min)

coefList_kime <- coef(cvLASSO_kime, s=1)  # 'lambda.min')
coefList_kime <- data.frame(coefList_kime@Dimnames[[1]][coefList_kime@i+1], coefList_kime@x)
names(coefList_kime) <- c('Feature','EffectSize')
arrange(coefList_kime, -abs(EffectSize))[1:9, ]
#                 Feature  EffectSize
#1           (Intercept) 26.069326257
#2 Feature_12_ArimaVec_8 -8.662856430
#3 Feature_11_ArimaVec_4  8.585283751
#4 Feature_12_ArimaVec_4 -5.023601842
#5 Feature_30_ArimaVec_4  2.242157842
#6 Feature_26_ArimaVec_6  1.760267216
#7 Feature_39_ArimaVec_5 -1.256101950
#8 Feature_34_ArimaVec_5 -1.148865337
#9 Feature_37_ArimaVec_2  0.001322367
#     ARIMA-spacetime:     4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#     ARIMA-spacekimeNil:  2=forecast_avg, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 8=non-seasonal Diff
#9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

coef(cvLASSO_kime, s = 1/5) %>%  ### "lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("(Spacekime) Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

# pack a 31*3 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, Y)
validation_kime <- cbind(predLASSO_kime[, 1], 
                         IFT_NilPhase_FT_aggregate_arima_vector[ , 387], Y)  
colnames(validation_kime) <- c("predLASSO_kime", "IFT_NilPhase_FT_Y", "Orig_Y")
head(validation_kime)

# Prediction correlations: 
cor(validation_kime[ , 1], validation_kime[, 2])  # Y=predLASSO_kime OA rank vs. kime_LASSO_pred:  0.99
cor(validation_kime[ , 1], validation_kime[, 3])  # Y=predLASSO_kime OA rank vs. Orig_Y:  0.64

# Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^
linFit1_kime <- lm(predLASSO_kime ~ validation_kime[ , 3])
plot(predLASSO_kime ~ validation_kime[ , 3],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="IFT_NilPhase predLASSO_kime",
     main = sprintf("Observed (x) vs. IFT_NilPhase Predicted (y) Overall Country Ranking, cor=%.02f", 
                    cor(validation_kime[ , 1], validation_kime[, 3])))
abline(linFit1_kime, lwd=3, col="red")
# abline(linFit1, lwd=3, col="green")


##############################################
# 3. Swap Feature Phases and then synthesize the data (reconstruction)
# temp_Data <- aggregate_arima_vector_country_ranking_df
swappedPhase_FT_aggregate_arima_vector <- FT_aggregate_arima_vector_country_ranking_df$phases
dim(swappedPhase_FT_aggregate_arima_vector)    # ;  head(swappedPhase_FT_aggregate_arima_vector)
# [1] 31 387
IFT_SwappedPhase_FT_aggregate_arima_vector <- array(complex(), 
                          c(dim(temp_Data)[1], dim(temp_Data)[2]))

set.seed(12345)   # sample randomly Phase-columns for each of the 131 covariates (X)
swappedPhase_FT_aggregate_arima_vector1 <- as.data.frame(cbind(
  swappedPhase_FT_aggregate_arima_vector[ , 
          sample(ncol(swappedPhase_FT_aggregate_arima_vector[ , 1:378]))],  # mix ARIMA signature phases
  swappedPhase_FT_aggregate_arima_vector[ , 
          sample(ncol(swappedPhase_FT_aggregate_arima_vector[ , 379:386]))],# mix the meta-data phases
  swappedPhase_FT_aggregate_arima_vector[ , 387]))                          # add correct Outcome phase
swappedPhase_FT_aggregate_arima_vector <- swappedPhase_FT_aggregate_arima_vector1

colnames(swappedPhase_FT_aggregate_arima_vector) <- colnames(temp_Data)
colnames(swappedPhase_FT_aggregate_arima_vector); dim(swappedPhase_FT_aggregate_arima_vector)
# 31 387

#       Invert back to spacetime the 
# FT_aggregate_arima_vector$magnitudes[ , i] signal with swapped-X-phases (Y-phase is fixed)
IFT_SwappedPhase_FT_aggregate_arima_vector <- 
  Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                     TRUE, swappedPhase_FT_aggregate_arima_vector))

colnames(IFT_SwappedPhase_FT_aggregate_arima_vector) <-
  colnames(aggregate_arima_vector_country_ranking_df)
rownames(IFT_SwappedPhase_FT_aggregate_arima_vector) <-
  rownames(aggregate_arima_vector_country_ranking_df)
dim(IFT_SwappedPhase_FT_aggregate_arima_vector)
dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes)
colnames(IFT_SwappedPhase_FT_aggregate_arima_vector)
# IFT_SwappedPhase_FT_aggregate_arima_vector[1:5, 1:4];  temp_Data[1:5, 1:4]

# 2. Perform LASSO modeling on IFT_SwappedPhase_FT_aggregate_arima_vector; 
# report param estimates and quality metrics AIC/BIC
set.seed(12345)
cvLASSO_kime_swapped = 
  cv.glmnet(data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , -387]), 
           # IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE)
           Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_kime_swapped)
mtext("(Spacekime, Swapped-Phases) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

##################################Use only ARIMA effects, no SOCR meta-data#####
set.seed(12345)
cvLASSO_kime_swapped_lim = cv.glmnet(data.matrix(
  IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:(42*9)]), Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_kime_swapped_lim)
mtext("CV LASSO Swapped-Phase (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_kime_swapped_lim <-  predict(cvLASSO_kime_swapped_lim, 
              s = cvLASSO_kime_swapped_lim$lambda.min, 
              newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:(42*9)]))
coefList_kime_swapped_lim <- coef(cvLASSO_kime_swapped_lim, s='lambda.min')
coefList_kime_swapped_lim <- data.frame(coefList_kime_swapped_lim@Dimnames[[1]][coefList_kime_swapped_lim@i+1],coefList_kime_swapped_lim@x)
names(coefList_kime_swapped_lim) <- c('Feature','EffectSize')
arrange(coefList_kime_swapped_lim, -abs(EffectSize))[2:10, ]
cor(Y, predLASSO_kime_swapped_lim[, 1])  # 0.86
################################################################################


# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_kime_swapped <-  predict(cvLASSO_kime_swapped, s = cvLASSO_kime_swapped$lambda.min, 
        newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , -387]))
# testMSE_LASSO_kime_swapped <- 
#        mean((predLASSO_kime_swapped - IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387])^2)
# testMSE_LASSO_kime_swapped
predLASSO_kime_swapped = predict(cvLASSO_kime_swapped, s = 3, 
        newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , -387]))
predLASSO_kime_swapped

# Plot Regression Coefficients: create variable names for plotting 
betaHatLASSO_kime_swapped = as.double(coef(cvLASSO_kime_swapped,
                                           s=cvLASSO_kime_swapped$lambda.min))
#cvLASSO_kime_swapped$lambda.1se

coefplot(betaHatLASSO_kime_swapped[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", 
         main = "(Spacekime, Swapped-Phases) LASSO-Regularized Regression Coefficient Estimates",
         varnames = varNames[377:386])

varImp(cvLASSO_kime_swapped, lambda = cvLASSO_kime_swapped$lambda.min)

coefList_kime_swapped <- coef(cvLASSO_kime_swapped, s=3)    # 'lambda.min')
coefList_kime_swapped <- data.frame(coefList_kime_swapped@Dimnames[[1]][coefList_kime_swapped@i+1], coefList_kime_swapped@x)
names(coefList_kime_swapped) <- c('Feature','EffectSize')
arrange(coefList_kime_swapped, -abs(EffectSize))[2:10, ]
#                 Feature  EffectSize
#2  Feature_32_ArimaVec_6  3.3820076
#3   Feature_1_ArimaVec_3  2.2133139
#4  Feature_21_ArimaVec_4  1.5376447
#5  Feature_22_ArimaVec_3  1.0546605
#6  Feature_14_ArimaVec_5  0.7428693
#7                     ED  0.6525794
#8  Feature_24_ArimaVec_5  0.5987113
#9  Feature_12_ArimaVec_5  0.3177650
#10 Feature_37_ArimaVec_6  0.1598574
#
#     ARIMA-spacetime:        4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#     ARIMA-spacekime_nill:   3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA
#    ARIMA-spacekime_swapped: 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA
# 9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

coef(cvLASSO_kime_swapped, s = 3) %>%  # "lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("(Spacekime, Swapped-Phases) Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

# pack a 31*4 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, 
#                      IFT_SwappedPhase_FT_aggregate_arima_vector_Y, Y)
validation_kime_swapped <- cbind(predLASSO_lim[, 1], 
                         predLASSO_kime[ , 1], predLASSO_kime_swapped[ , 1], Y)  
colnames(validation_kime_swapped) <- c("predLASSO (spacetime)", "predLASSO_IFT_NilPhase",
                               "predLASSO_IFT_SwappedPhase", "Orig_Y")
head(validation_kime_swapped); dim(validation_kime_swapped)

# Prediction correlations: 
cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])  
# predLASSO_IFT_SwappedPhase OA rank vs. predLASSO_spacekime:  0.7
cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 3])  
# predLASSO (spacetime) vs.  predLASSO_IFT_SwappedPhase OA rank:  0.83

# Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^
linFit1_kime_swapped <- lm(validation_kime_swapped[ , 4] ~ predLASSO)
plot(validation_kime_swapped[ , 4] ~ predLASSO,
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="predLASSO_spacekime Country Overall Ranking", ylab="predLASSO_IFT_SwappedPhase_FT_Y",
     main = sprintf("Spacetime Predicted (x) vs. Kime IFT_SwappedPhase_FT_Y (y) Overall Country Ranking, cor=%.02f", 
                    cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])))
abline(linFit1_kime_swapped, lwd=3, col="red")
#abline(linFit1_kime, lwd=3, col="green")

# Plot Spacetime LASSO forecasting
# Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^
linFit1_spacetime <- lm(validation_kime_swapped[ , 1] ~ validation_kime_swapped[ , 4])
plot(validation_kime_swapped[ , 1] ~ validation_kime_swapped[ , 4],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="predLASSO_spacetime",
     main = sprintf("Spacetime Predicted (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", 
                    cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 4])))
abline(linFit1_spacetime, lwd=3, col="red")

# test with using swapped-phases LASSO estimates
linFit1_spacekime <- lm(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4])
plot(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="predLASSO_spacekime Swapped-Phases",
     main = sprintf("Spacekime Predicted, Swapped-Phases (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", 
                    cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])))
abline(linFit1_spacekime, lwd=3, col="red")

# add Top_30_Ranking_Indicator
validation_kime_swapped <- as.data.frame(cbind(validation_kime_swapped, ifelse (validation_kime_swapped[,4]<=30, 1, 0)))
colnames(validation_kime_swapped)[5] <- "Top30Rank"
head(validation_kime_swapped)

library("ggrepel")
# Spacetime LASSO modeling
myPlotSpacetime <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y,
            y=`predLASSO (spacetime)`, label=rownames(validation_kime_swapped))) +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) +
     geom_label_repel(aes(label = rownames(validation_kime_swapped),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacetime LASSO Prediction (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 4])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacetime LASSO Rank Forecasting")
myPlotSpacetime

# NIL-PHASE KIME reconstruction
myPlotNilPhase <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y,
            y=predLASSO_kime, label=rownames(validation_kime_swapped))) +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) +
     geom_label_repel(aes(label = rownames(validation_kime_swapped),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacekime LASSO Predicted, Nil-Phases, (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_kime_swapped[ , 2], validation_kime_swapped[, 4])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacekime LASSO Predicted, using Nil-Phases")
myPlotNilPhase

# SWAPPED PHASE KIME reconstruction
myPlotSwappedPhase <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y,
            y=predLASSO_kime_swapped, label=rownames(validation_kime_swapped))) +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) +
     geom_label_repel(aes(label = rownames(validation_kime_swapped),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacekime LASSO Predicted, Swapped-Phases, (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacekime LASSO Predicted, using Swapped-Phases")
myPlotSwappedPhase

countryNames[11]<-"Germany"
aggregateResults <- (rbind(cbind(as.character(countryNames), "predLASSO_spacetime", as.numeric(predLASSO)), 
                           cbind(as.character(countryNames), "predLASSO_lim", predLASSO_lim),
                           cbind(as.character(countryNames), "predLASSO_nil", predLASSO_kime),
                           cbind(as.character(countryNames), "predLASSO_swapped", predLASSO_kime_swapped),
                           cbind(as.character(countryNames), "observed", Y)
                     ))
aggregateResults <- data.frame(aggregateResults[ , -3], as.numeric(aggregateResults[,3]))
colnames(aggregateResults) <- c("country", "estimate_method", "ranking")
ggplot(aggregateResults, aes(x=country, y=ranking, color=estimate_method)) +
  geom_point(aes(shape=estimate_method, color=estimate_method, size=estimate_method)) + geom_point(size = 5) +
  geom_line(data = aggregateResults[aggregateResults$estimate_method == "observed", ], 
            aes(group = estimate_method), size=2, linetype = "dashed") +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
  # theme(legend.position = "bottom") +
  # scale_shape_manual(values = as.factor(aggregateResults$estimate_method)) +
  theme(text = element_text(size = 15), legend.position = c(0.3, 0.85), 
           axis.text=element_text(size=16),
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold"))
#  + scale_fill_discrete(
#    name="Country Overall Ranking",
#    breaks=c("predLASSO_spacetime", "predLASSO_lim", "predLASSO_nil", "predLASSO_swapped", "observed"),
#    labels=c(sprintf("predLASSO_spacetime LASSO Predicted (386), cor=%.02f",  cor(predLASSO, Y)),
#             sprintf("predLASSO_lim LASSO Predicted (378), cor=%.02f",  cor(predLASSO_lim, Y)), 
#             sprintf("predLASSO_nil (spacekime) LASSO Predicted, cor=%.02f",  cor(predLASSO_kime, Y)), 
#             sprintf("predLASSO_swapped (spacekime) LASSO Predicted, cor=%.02f",  cor(predLASSO_kime_swapped, Y)), 
#             "observed"))
```

### Core redesign

Generic Functions

```{r message=F, error=F, warning=F}
# Plotting the coefficients
coef_plot <- function(betahat, varn, plotname) {
  betahat<-betahat[-1]
  P <- coefplot(betahat[which(betahat!=0)], sd = rep(0, length(betahat[which(betahat!=0)])), 
                pch=0, cex.pts = 3, col="red", main = plotname, varnames = varn[which(betahat!=0)])
  return(P)
}

# Plotting the coefficients for those two methods
findfeatures <- function(lassobeta, ridgebeta=NULL) {
  lassobeta<-lassobeta[-1]
  feat1 <- which(lassobeta!=0)
  features <- feat1
  if (!is.null(ridgebeta)) {
    ridgebeta<-ridgebeta[-1]
    feat2 <- order(abs(ridgebeta),decreasing = TRUE)[1:10]
    features <- union(feat1, feat2)
  }
  return(features)
}

varImp <- function(object, lambda = NULL, ...) {
  ## skipping a few lines
  beta <- predict(object, s = lambda, type = "coef")
  if(is.list(beta)) {
    out <- do.call("cbind", lapply(beta, function(x) x[,1]))
    out <- as.data.frame(out)
    s <- rowSums(out)
    out <- out[which(s)!=0,,drop=FALSE]
  } else  {
    out<-data.frame(Overall = beta[,1])
    out<-out[which(out!=0),,drop=FALSE]
  }
  out <- abs(out[rownames(out) != "(Intercept)",,drop = FALSE])
  out
}
```

#### Weak-signal Analytics

Using only the 378 ARIMA signatures for the prediction (out of the total of 386 features).

##### Space-Time Analytics

```{r message=F, error=F, warning=F}
# 1. LASSO regression/feature extraction
library(glmnet)
library(arm)
library(knitr)
# subset test data
Y = aggregate_arima_vector_country_ranking_df$OA
X = aggregate_arima_vector_country_ranking_df[ , 1:378]
# remove columns containing NAs
X = X[ , colSums(is.na(X)) == 0]; dim(X)  # [1]  31 378

#### 10-fold cross validation: for the LASSO
library("glmnet")
library(doParallel)
registerDoParallel(6)

set.seed(4321)
cvLASSO_lim = cv.glmnet(data.matrix(X[ , 1:(42*9)]), Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_lim)
mtext("CV LASSO (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_lim <-  predict(cvLASSO_lim, s = 3, # cvLASSO_lim$lambda.min, 
                          newx = data.matrix(X[ , 1:(42*9)]))
coefList_lim <- coef(cvLASSO_lim, s=3) # 'lambda.min')
coefList_lim <- data.frame(coefList_lim@Dimnames[[1]][coefList_lim@i+1],coefList_lim@x)
names(coefList_lim) <- c('Feature','EffectSize')
arrange(coefList_lim, -abs(EffectSize))[2:10, ]
cor(Y, predLASSO_lim[, 1])  # 0.84
################################################################################

varImp(cvLASSO_lim, lambda = cvLASSO_lim$lambda.min)
#2   Feature_1_ArimaVec_8 -2.3864299
#3  Feature_19_ArimaVec_8  2.0871310
#4  Feature_16_ArimaVec_3  2.0465254
#5  Feature_13_ArimaVec_8 -1.7348553
#6  Feature_15_ArimaVec_4 -1.4588173
#7  Feature_22_ArimaVec_4 -1.1068801
#8  Feature_25_ArimaVec_5  0.9336800
#9  Feature_35_ArimaVec_4 -0.9276244
#10 Feature_25_ArimaVec_4 -0.8486434

#coefList_lim <- coef(cvLASSO_lim, s='lambda.min')
#coefList_lim <- data.frame(coefList_lim@Dimnames[[1]][coefList_lim@i+1], coefList_lim@x)
#names(coefList_lim) <- c('Feature','EffectSize')
#arrange(coefList_lim, -abs(EffectSize))[2:10, ]
#
#      9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

# [1] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels"                                                     
# [2] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"     
# [3] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Tertiary education (levels 5-8)"                                           
# [4] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"
# [5] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, All ISCED 2011 levels"                                                       
# [6] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"       
# [7] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Tertiary education (levels 5-8)"                                             
# [8] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"  
# [9] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, All ISCED 2011 levels"  
#[10] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" 
#[11] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Tertiary education (levels 5-8)"        
#[12] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"
#[13] "All ISCED 2011 levels " 
# [14] "All ISCED 2011 levels, Females"  
# [15] "All ISCED 2011 levels, Males"
# [16] "Capital transfers, payable"  
# [17] "Capital transfers, receivable"  
# [18] "Compensation of employees, payable"  
# [19] "Current taxes on income, wealth, etc., receivable"
#[20] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels " 
# [21] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"            
# [22] "Other current transfers, payable"
# [23] "Other current transfers, receivable" 
# [24] "Property income, payable" 
# [25] "Property income, receivable" 
# [26] "Savings, gross" 
# [27] "Subsidies, payable"
# [28] "Taxes on production and imports, receivable"
# [29] "Total general government expenditure"
# [30] "Total general government revenue"
# [31] "Unemployment , Females, From 15-64 years, Total"
# [32] "Unemployment , Males, From 15-64 years" 
# [33] "Unemployment , Males, From 15-64 years, from 1 to 2 months" 
# [34] "Unemployment , Males, From 15-64 years, from 3 to 5 months"
# [35] "Unemployment , Males, From 15-64 years, from 6 to 11 months"  
# [36] "Unemployment , Total, From 15-64 years, From 1 to 2 months"
# [37] "Unemployment , Total, From 15-64 years, From 12 to 17 months"
# [38] "Unemployment , Total, From 15-64 years, From 3 to 5 months" 
# [39] "Unemployment , Total, From 15-64 years, From 6 to 11 months"  
# [40] "Unemployment , Total, From 15-64 years, Less than 1 month" 
# [41] "Unemployment by sex, age, duration. DurationNA not started"
# [42] "VAT, receivable"   

coef(cvLASSO_lim, s = 3) %>%  # "lambda.min"
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

validation_lim <- data.frame(matrix(NA, nrow = dim(predLASSO_lim)[1], ncol=2), row.names=countryNames)
validation_lim [ , 1] <- Y; validation_lim[ , 2] <- predLASSO_lim[, 1]
colnames(validation_lim) <- c("Orig_Y", "LASSO")
dim(validation_lim); head(validation_lim)

# add Top_30_Ranking_Indicator
validation_lim <- as.data.frame(cbind(validation_lim, ifelse (validation_lim[, 1]<=30, 1, 0)))
colnames(validation_lim)[3] <- "Top30Rank"
head(validation_lim)

# Prediction correlations: 
cor(validation_lim[ , 1], validation_lim[, 2])  # Y=observed OA rank vs. LASSO-pred  0.98 (lim) 0.84

# Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^
linFit_lim <- lm(validation_lim[ , 1] ~ validation_lim[, 2])
plot(validation_lim[ , 1] ~ validation_lim[, 2],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="LASSO (42*9 +8) param model",
     main = sprintf("Observed (X) vs. LASSO-Predicted (Y) Overall Country Ranking, cor=%.02f", 
                    cor(validation_lim[ , 1], validation_lim[, 2])))
abline(linFit_lim, lwd=3, col="red")

# Plot
myPlot <- ggplot(as.data.frame(validation_lim), aes(x=validation_lim[ , 1],
            y=validation_lim[ , 2], label=rownames(validation_lim))) +
     geom_smooth(method='lm') +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_lim)))) +
     geom_label_repel(aes(label = rownames(validation_lim),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacetime LASSO Predicted (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_lim[ , 1], validation_lim[, 2])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacetime LASSO Predicted")
myPlot
```

##### Space-Kime Analytics

**Nil-Phase Synthesis** and LASSO model estimation ...
 
```{r message=F, error=F, warning=F}
# Generic function to Transform Data ={all predictors (X) and outcome (Y)} to k-space (Fourier domain): kSpaceTransform(data, inverse = FALSE, reconPhases = NULL) 
  # ForwardFT (rawData, FALSE, NULL)
  # InverseFT(magnitudes, TRUE, reconPhasesToUse) or InverseFT(FT_data, TRUE, NULL)
# DATA
# subset test data
aggregate_arima_vector_country_ranking_df <- as.data.frame(apply(
    aggregate_arima_vector_country_ranking_df[ , colSums(is.na(aggregate_arima_vector_country_ranking_df)) == 0], 
    2, as.numeric))

Y = aggregate_arima_vector_country_ranking_df$OA
X = aggregate_arima_vector_country_ranking_df[ , 1:378]
# remove columns containing NAs
# X = X[ , colSums(is.na(X)) == 0]; dim(X)  # [1]  31 386
length(Y); dim(X)

FT_aggregate_arima_vector_country_ranking_df <- 
  kSpaceTransform(aggregate_arima_vector_country_ranking_df, inverse = FALSE, reconPhases = NULL) 
  
## Kime-Phase Distributions
# Examine the Kime-direction Distributions of the Phases for all *Belgium* features (predictors + outcome). Define a generic function that plots the Phase distributions.
# plotPhaseDistributions(dataFT, dataColnames)
plotPhaseDistributions(FT_aggregate_arima_vector_country_ranking_df,
                       colnames(aggregate_arima_vector_country_ranking_df), size=4, cex=0.1)

IFT_FT_aggregate_arima_vector_country_ranking_df <- 
  kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                  TRUE, FT_aggregate_arima_vector_country_ranking_df$phases)
# Check IFT(FT) == I: 
# ifelse(aggregate_arima_vector_country_ranking_df[5,4] -
#     Re(IFT_FT_aggregate_arima_vector_country_ranking_df[5,4]) < 0.001, "Perfect Synthesis", "Problems!!!")

##############################################
# Nil-Phase Synthesis and LASSO model estimation
# 1. Nil-Phase data synthesis (reconstruction)
temp_Data <- aggregate_arima_vector_country_ranking_df
nilPhase_FT_aggregate_arima_vector <- 
  array(complex(real=0, imaginary=0), c(dim(temp_Data)[1], dim(temp_Data)[2]))
dim(nilPhase_FT_aggregate_arima_vector)    # ;  head(nilPhase_FT_aggregate_arima_vector)
# [1] 31 387
IFT_NilPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

#       Invert back to spacetime the 
# FT_aggregate_arima_vector_country_ranking_df$magnitudes[ , i] signal with nil-phase
IFT_NilPhase_FT_aggregate_arima_vector <- 
  Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                     TRUE, nilPhase_FT_aggregate_arima_vector))

colnames(IFT_NilPhase_FT_aggregate_arima_vector) <-
  colnames(aggregate_arima_vector_country_ranking_df)
rownames(IFT_NilPhase_FT_aggregate_arima_vector) <-
  rownames(aggregate_arima_vector_country_ranking_df)
dim(IFT_NilPhase_FT_aggregate_arima_vector)
dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes)
colnames(IFT_NilPhase_FT_aggregate_arima_vector)
# IFT_NilPhase_FT_aggregate_arima_vector[1:5, 1:4];  temp_Data[1:5, 1:4]

# 2. Perform LASSO modeling on IFT_NilPhase_FT_aggregate_arima_vector; 
# report param estimates and quality metrics AIC/BIC
# library(forecast)
set.seed(123)
cvLASSO_nil_kime = cv.glmnet(data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:378]), 
           # IFT_NilPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE)
           Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_nil_kime)
mtext("(Spacekime, Nil-phase) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_nil_kime <-  predict(cvLASSO_nil_kime, s = cvLASSO_nil_kime$lambda.min, 
        newx = data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:378])); predLASSO_nil_kime
# testMSE_LASSO_nil_kime <- mean((predLASSO_nil_kime - IFT_NilPhase_FT_aggregate_arima_vector[ , 387])^2)
# testMSE_LASSO_nil_kime

# Plot Regression Coefficients: create variable names for plotting 
library("arm")
# par(mar=c(2, 13, 1, 1))   # extra large left margin # par(mar=c(5,5,5,5))
# varNames <- colnames(X); varNames; length(varNames)
#betaHatLASSO_kime = as.double(coef(cvLASSO_kime, s=cvLASSO_kime$lambda.min))
#cvLASSO_kime$lambda.1se
#
#coefplot(betaHatLASSO_kime[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", 
#         main = "(Spacekime) LASSO-Regularized Regression Coefficient Estimates",
#         varnames = varNames[377:386])

varImp(cvLASSO_nil_kime, lambda = cvLASSO_nil_kime$lambda.min)

coefList_nil_kime <- coef(cvLASSO_nil_kime, s='lambda.min')
coefList_nil_kime <- data.frame(coefList_nil_kime@Dimnames[[1]][coefList_nil_kime@i+1], coefList_nil_kime@x)
names(coefList_nil_kime) <- c('Feature','EffectSize')
arrange(coefList_nil_kime, -abs(EffectSize))[1:9, ]
#                 Feature  EffectSize
#1           (Intercept) 26.385520159
#2 Feature_12_ArimaVec_8 -9.312528495
#3 Feature_11_ArimaVec_4  8.561417371
#4 Feature_12_ArimaVec_4 -5.220797416
#5 Feature_30_ArimaVec_4  2.623218791
#6 Feature_26_ArimaVec_6  1.927773213
#7 Feature_39_ArimaVec_5 -1.534741402
#8 Feature_34_ArimaVec_5 -1.171720008
#9 Feature_37_ArimaVec_2  0.004213823
#     ARIMA-spacetime:     4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#     ARIMA-spacekimeNil:  2=forecast_avg, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 8=non-seasonal Diff
#9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

coef(cvLASSO_nil_kime, s = "lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("(Spacekime) Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

# pack a 31*3 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, Y)
validation_nil_kime <- cbind(predLASSO_nil_kime[, 1], 
                         IFT_NilPhase_FT_aggregate_arima_vector[ , 387], Y)  
colnames(validation_nil_kime) <- c("predLASSO_kime", "IFT_NilPhase_FT_Y", "Orig_Y")
rownames(validation_nil_kime)[11] <- "Germany"
head(validation_nil_kime)

validation_nil_kime <- as.data.frame(cbind(validation_nil_kime, ifelse (validation_nil_kime[,3]<=30, 1, 0)))
colnames(validation_nil_kime)[4] <- "Top30Rank"
head(validation_nil_kime)

# Prediction correlations: 
# cor(validation_nil_kime[ , 1], validation_nil_kime[, 2])  # Y=predLASSO_kime OA rank vs. kime_LASSO_pred:  0.99
cor(validation_nil_kime[ , 1], validation_nil_kime[, 3])  # Y=predLASSO_kime OA rank vs. Orig_Y:  0.64

# Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^
linFit1_nil_kime <- lm(predLASSO_nil_kime ~ validation_nil_kime[ , 3])
plot(predLASSO_nil_kime ~ validation_nil_kime[ , 3],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="IFT_NilPhase predLASSO_kime",
     main = sprintf("Observed (x) vs. IFT_NilPhase Predicted (y) Overall Country Ranking, cor=%.02f", 
                    cor(validation_nil_kime[ , 1], validation_nil_kime[, 3])))
abline(linFit1_kime, lwd=3, col="red")
# abline(linFit1, lwd=3, col="green")

# Spacetime LASSO modeling
myPlotNilPhase <- ggplot(as.data.frame(validation_nil_kime), aes(x=Orig_Y,
            y=predLASSO_nil_kime, label=rownames(validation_nil_kime))) +
     geom_smooth(method='lm') +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_nil_kime)))) +
     geom_label_repel(aes(label = rownames(validation_nil_kime),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacekime LASSO Predicted, Nil-Phases (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_nil_kime[ , 1], validation_nil_kime[, 3])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacekime LASSO Predicted, using Nil-Phases")
myPlotNilPhase
```

**Swapped Feature Phases** and then synthesize the data (reconstruction)

```{r message=F, error=F, warning=F}
# temp_Data <- aggregate_arima_vector_country_ranking_df
swappedPhase_FT_aggregate_arima_vector <- FT_aggregate_arima_vector_country_ranking_df$phases
dim(swappedPhase_FT_aggregate_arima_vector)    # ;  head(swappedPhase_FT_aggregate_arima_vector)
# [1] 31 387
IFT_SwappedPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

set.seed(12345)   # sample randomly Phase-columns for each of the 131 covariates (X)
swappedPhase_FT_aggregate_arima_vector1 <- as.data.frame(cbind(
  swappedPhase_FT_aggregate_arima_vector[ , 
          sample(ncol(swappedPhase_FT_aggregate_arima_vector[ , 1:378]))],  # mix ARIMA signature phases
  swappedPhase_FT_aggregate_arima_vector[ , 
          sample(ncol(swappedPhase_FT_aggregate_arima_vector[ , 379:386]))],# mix the meta-data phases
  swappedPhase_FT_aggregate_arima_vector[ , 387]))                          # add correct Outcome phase
swappedPhase_FT_aggregate_arima_vector <- swappedPhase_FT_aggregate_arima_vector1

colnames(swappedPhase_FT_aggregate_arima_vector) <- colnames(temp_Data)
colnames(swappedPhase_FT_aggregate_arima_vector); dim(swappedPhase_FT_aggregate_arima_vector)
# 31 387

#       Invert back to spacetime the 
# FT_aggregate_arima_vector$magnitudes[ , i] signal with swapped-X-phases (Y-phase is fixed)
IFT_SwappedPhase_FT_aggregate_arima_vector <- 
  Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                     TRUE, swappedPhase_FT_aggregate_arima_vector))

colnames(IFT_SwappedPhase_FT_aggregate_arima_vector) <-
  colnames(aggregate_arima_vector_country_ranking_df)
rownames(IFT_SwappedPhase_FT_aggregate_arima_vector) <-
  rownames(aggregate_arima_vector_country_ranking_df)
dim(IFT_SwappedPhase_FT_aggregate_arima_vector)
dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes)
colnames(IFT_SwappedPhase_FT_aggregate_arima_vector)
# IFT_SwappedPhase_FT_aggregate_arima_vector[1:5, 1:4];  temp_Data[1:5, 1:4]

# 2. Perform LASSO modeling on IFT_SwappedPhase_FT_aggregate_arima_vector; 
# report param estimates and quality metrics AIC/BIC
set.seed(12)
cvLASSO_kime_swapped = 
  cv.glmnet(data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:378]), 
           # IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE)
           Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_kime_swapped)
mtext("(Spacekime, Swapped-Phases) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_kime_swapped <-  predict(cvLASSO_kime_swapped, s = 3,    # cvLASSO_kime_swapped$lambda.min, 
        newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:378]))
# testMSE_LASSO_kime_swapped <- 
#        mean((predLASSO_kime_swapped - IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387])^2)
# testMSE_LASSO_kime_swapped
predLASSO_kime_swapped

# Plot Regression Coefficients: create variable names for plotting 
betaHatLASSO_kime_swapped = as.double(coef(cvLASSO_kime_swapped,
                   s=3))   # cvLASSO_kime_swapped$lambda.min))
#cvLASSO_kime_swapped$lambda.1se
#coefplot(betaHatLASSO_kime_swapped[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", 
#         main = "(Spacekime, Swapped-Phases) LASSO-Regularized Regression Coefficient Estimates",
#         varnames = varNames[377:386])
varImp(cvLASSO_kime_swapped, lambda = 3)  #cvLASSO_kime_swapped$lambda.min)

coefList_kime_swapped <- coef(cvLASSO_kime_swapped, s=3)    # 'lambda.min')
coefList_kime_swapped <- data.frame(coefList_kime_swapped@Dimnames[[1]][coefList_kime_swapped@i+1], coefList_kime_swapped@x)
names(coefList_kime_swapped) <- c('Feature','EffectSize')
arrange(coefList_kime_swapped, -abs(EffectSize))[2:10, ]
#                 Feature  EffectSize
#2   Feature_3_ArimaVec_8  5.4414240889
#3  Feature_24_ArimaVec_5 -4.9895032906
#4  Feature_41_ArimaVec_6  1.7580440109
#5   Feature_7_ArimaVec_6 -1.6317407164
#6  Feature_41_ArimaVec_3 -0.4666980893
#7  Feature_42_ArimaVec_3  0.4100416326
#8   Feature_6_ArimaVec_3 -0.2052325091
#9   Feature_7_ArimaVec_1 -0.0007922646
#10 Feature_24_ArimaVec_1 -0.0002003192
#
#     ARIMA-spacetime:        4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#     ARIMA-spacekime_nill:   3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA
#    ARIMA-spacekime_swapped: 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA
# 9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

coef(cvLASSO_kime_swapped, s = 3) %>%  # "lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("(Spacekime, Swapped-Phases) Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

# pack a 31*4 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, 
#                      IFT_SwappedPhase_FT_aggregate_arima_vector_Y, Y)
validation_kime_swapped <- cbind(predLASSO_lim[, 1], predLASSO_nil_kime[, 1], 
                         predLASSO_kime_swapped[ , 1], Y)  
colnames(validation_kime_swapped) <- c("predLASSO (spacetime)", "predLASSO_IFT_NilPhase",
                               "predLASSO_IFT_SwappedPhase", "Orig_Y")
head(validation_kime_swapped); dim(validation_kime_swapped)

# Prediction correlations: 
cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])  
# predLASSO_IFT_SwappedPhase OA rank vs. predLASSO_spacekime:  0.7
cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 3])  
# predLASSO (spacetime) vs.  predLASSO_IFT_SwappedPhase OA rank:  0.83

# Plot observed Y (Overall Country ranking), x-axis vs. Kime-Swapped LASSO (9-parameters) predicted Y^
linFit1_kime_swapped <- lm(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4])
plot(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="predLASSO_IFT_SwappedPhase_FT_Y", ylab="predLASSO_spacekime_swapped Country Overall Ranking",
     main = sprintf("Observed (x) vs. Kime IFT_SwappedPhase_FT_Y (y) Overall Country Ranking, cor=%.02f", 
                    cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])))
abline(linFit1_kime_swapped, lwd=3, col="red")
#abline(linFit1_kime, lwd=3, col="green")

# Plot Spacetime LASSO forecasting
# Plot observed Y (Overall Country ranking), x-axis vs. LASSO (9-parameters) predicted Y^, y-axis
linFit1_spacetime <- lm(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4])
plot(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="predLASSO_spacetime",
     main = sprintf("Predicted (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", 
                    cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])))
abline(linFit1_spacetime, lwd=3, col="red")

# add Top_30_Ranking_Indicator
validation_kime_swapped <- as.data.frame(cbind(validation_kime_swapped, ifelse (validation_kime_swapped[,4]<=30, 1, 0)))
colnames(validation_kime_swapped)[5] <- "Top30Rank"
rownames(validation_kime_swapped)[11] <- "Germany"
head(validation_kime_swapped)

# Spacetime LASSO modeling
myPlotSwappedPhase <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y,
            y=validation_kime_swapped[, 3], label=rownames(validation_kime_swapped))) +
     geom_smooth(method='lm') +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) +
     geom_label_repel(aes(label = rownames(validation_kime_swapped),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacekime LASSO Predicted, Swapped-Phases (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacekime LASSO Predicted, using Swapped-Phases")
myPlotSwappedPhase
```

### Stronger Signal Analytics

Using all 386 features (378 ARIMA signatures + 8 meta-data).

##### Space-Time Analytics

```{r message=F, error=F, warning=F}
# 1. LASSO regression/feature extraction
library(glmnet)
library(arm)
library(knitr)
# subset test data
Y = aggregate_arima_vector_country_ranking_df$OA
X = aggregate_arima_vector_country_ranking_df[ , 1:386]
# remove columns containing NAs
X = X[ , colSums(is.na(X)) == 0]; dim(X)  # [1]  31 378

#### 10-fold cross validation: for the LASSO
set.seed(4321)
cvLASSO_lim_all = cv.glmnet(data.matrix(X), Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_lim)
mtext("CV LASSO (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_lim_all <-  predict(cvLASSO_lim_all, s = 1.1,  # cvLASSO_lim$lambda.min, 
                          newx = data.matrix(X))
coefList_lim_all <- coef(cvLASSO_lim_all, s='lambda.min')
coefList_lim_all <- data.frame(coefList_lim_all@Dimnames[[1]][coefList_lim_all@i+1],coefList_lim_all@x)
names(coefList_lim_all) <- c('Feature','EffectSize')
arrange(coefList_lim_all, -abs(EffectSize))[2:10, ]
cor(Y, predLASSO_lim_all[, 1])  # 0.9974121

varImp(cvLASSO_lim_all, lambda = 1.1) # cvLASSO_lim_all$lambda.min)
#Feature_1_ArimaVec_8  2.7518241
#Feature_9_ArimaVec_4  0.2662136
#Feature_9_ArimaVec_8  1.0871240
#Feature_20_ArimaVec_8 1.6851990
#Feature_25_ArimaVec_5 0.5113345
#IncomeGroup           1.1787811
#ED                    0.7508295
#QOL                   0.2057181
#PE                    0.5131427

coef(cvLASSO_lim_all, s = 1.1) %>%  #"lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

countryNames[11] <- "Germany"
validation_lim_all <- data.frame(matrix(NA, nrow = dim(predLASSO_lim_all)[1], ncol=2), row.names=countryNames)
validation_lim_all [ , 1] <- Y; validation_lim_all[ , 2] <- predLASSO_lim_all[, 1]
colnames(validation_lim_all) <- c("Orig_Y", "LASSO")
dim(validation_lim_all); head(validation_lim_all)

# add Top_30_Ranking_Indicator
validation_lim_all <- as.data.frame(cbind(validation_lim_all, ifelse (validation_lim_all[, 1]<=30, 1, 0)))
colnames(validation_lim_all)[3] <- "Top30Rank"
head(validation_lim_all)

# Prediction correlations: 
cor(validation_lim_all[ , 1], validation_lim_all[, 2])  # Y=observed OA rank vs. LASSO-pred  0.98 (lim) 0.84

# Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^
linFit_lim_all <- lm(validation_lim_all[ , 1] ~ validation_lim_all[, 2])
plot(validation_lim_all[ , 1] ~ validation_lim_all[, 2],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="LASSO (42*9 +8) param model",
     main = sprintf("Observed (X) vs. LASSO-Predicted (Y) Overall Country Ranking, cor=%.02f", 
                    cor(validation_lim_all[ , 1], validation_lim_all[, 2])))
abline(linFit_lim_all, lwd=3, col="red")

# Plot
myPlot_all <- ggplot(as.data.frame(validation_lim_all), aes(x=validation_lim_all[ , 1],
            y=validation_lim_all[ , 2], label=rownames(validation_lim_all))) +
     geom_smooth(method='lm') +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_lim_all)))) +
     geom_label_repel(aes(label = rownames(validation_lim_all),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacetime LASSO Predicted (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_lim_all[ , 1], validation_lim_all[, 2])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacetime LASSO Predicted")
myPlot_all
```

##### Space-Kime Analytics

**Nil-Phase Synthesis** and LASSO model estimation ...

```{r message=F, error=F, warning=F}
library(glmnet)
# Generic function to Transform Data ={all predictors (X) and outcome (Y)} to k-space (Fourier domain): kSpaceTransform(data, inverse = FALSE, reconPhases = NULL) 
  # ForwardFT (rawData, FALSE, NULL)
  # InverseFT(magnitudes, TRUE, reconPhasesToUse) or InverseFT(FT_data, TRUE, NULL)
# DATA
# subset test data
Y = aggregate_arima_vector_country_ranking_df$OA
X = aggregate_arima_vector_country_ranking_df[ , 1:386]
# remove columns containing NAs
# X = X[ , colSums(is.na(X)) == 0]; dim(X)  # [1]  31 386
length(Y); dim(X)

FT_aggregate_arima_vector_country_ranking_df <- 
  kSpaceTransform(aggregate_arima_vector_country_ranking_df, inverse = FALSE, reconPhases = NULL) 
  
## Kime-Phase Distributions
# Examine the Kime-direction Distributions of the Phases for all *Belgium* features (predictors + outcome). Define a generic function that plots the Phase distributions.
# plotPhaseDistributions(dataFT, dataColnames)
plotPhaseDistributions(FT_aggregate_arima_vector_country_ranking_df,
                       colnames(aggregate_arima_vector_country_ranking_df), size=4, cex=0.1)

IFT_FT_aggregate_arima_vector_country_ranking_df <- 
  kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                  TRUE, FT_aggregate_arima_vector_country_ranking_df$phases)
# Check IFT(FT) == I: 
# ifelse(aggregate_arima_vector_country_ranking_df[5,4] -
#     Re(IFT_FT_aggregate_arima_vector_country_ranking_df[5,4]) < 0.001, "Perfect Synthesis", "Problems!!!")

##############################################
# Nil-Phase Synthesis and LASSO model estimation
# 1. Nil-Phase data synthesis (reconstruction)
temp_Data <- aggregate_arima_vector_country_ranking_df
nilPhase_FT_aggregate_arima_vector <- 
  array(complex(real=0, imaginary=0), c(dim(temp_Data)[1], dim(temp_Data)[2]))
dim(nilPhase_FT_aggregate_arima_vector)    # ;  head(nilPhase_FT_aggregate_arima_vector)
# [1] 31 387
IFT_NilPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

#       Invert back to spacetime the 
# FT_aggregate_arima_vector_country_ranking_df$magnitudes[ , i] signal with nil-phase
IFT_NilPhase_FT_aggregate_arima_vector <- 
  Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                     TRUE, nilPhase_FT_aggregate_arima_vector))

colnames(IFT_NilPhase_FT_aggregate_arima_vector) <-
  colnames(aggregate_arima_vector_country_ranking_df)
rownames(IFT_NilPhase_FT_aggregate_arima_vector) <-
  rownames(aggregate_arima_vector_country_ranking_df)
dim(IFT_NilPhase_FT_aggregate_arima_vector)
dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes)
colnames(IFT_NilPhase_FT_aggregate_arima_vector)
# IFT_NilPhase_FT_aggregate_arima_vector[1:5, 1:4];  temp_Data[1:5, 1:4]

# 2. Perform LASSO modeling on IFT_NilPhase_FT_aggregate_arima_vector; 
# report param estimates and quality metrics AIC/BIC
# library(forecast)
set.seed(123)
cvLASSO_nil_kime_all = cv.glmnet(data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:386]), 
           # IFT_NilPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE)
           Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_nil_kime_all)
mtext("(Spacekime, Nil-phase) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_nil_kime_all <-  predict(cvLASSO_nil_kime_all, s = exp(-1/4),    # cvLASSO_nil_kime$lambda.min, 
        newx = data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:386])); predLASSO_nil_kime_all
# testMSE_LASSO_nil_kime <- mean((predLASSO_nil_kime - IFT_NilPhase_FT_aggregate_arima_vector[ , 387])^2)
# testMSE_LASSO_nil_kime

varImp(cvLASSO_nil_kime_all, lambda = exp(-1/4)) # cvLASSO_nil_kime_all$lambda.min)

coefList_nil_kime_all <- coef(cvLASSO_nil_kime_all, s=exp(-1/4))   # 'lambda.min')
coefList_nil_kime_all <- data.frame(coefList_nil_kime_all@Dimnames[[1]][coefList_nil_kime_all@i+1],
                                    coefList_nil_kime_all@x)
names(coefList_nil_kime_all) <- c('Feature','EffectSize')
arrange(coefList_nil_kime_all, -abs(EffectSize))[1:9, ]
#                 Feature  EffectSize
#1           (Intercept) 26.385520159
#Feature_2_ArimaVec_6   0.07190025
#Feature_6_ArimaVec_6   0.49799326
#Feature_11_ArimaVec_4  8.45132661
#Feature_12_ArimaVec_4  5.38002499
#Feature_12_ArimaVec_8 10.37633956
#Feature_26_ArimaVec_6  2.06530937
#Feature_30_ArimaVec_4  3.29579474
#Feature_34_ArimaVec_5  0.96507673
#Feature_37_ArimaVec_2  0.01033978
#Feature_39_ArimaVec_5  2.08659578
#     ARIMA-spacetime:     4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#     ARIMA-spacekimeNil:  2=forecast_avg, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 8=non-seasonal Diff
#9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

coef(cvLASSO_nil_kime_all, s = exp(-1/4)) %>%   # "lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("(Spacekime) Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

# pack a 31*3 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, Y)
validation_nil_kime_all <- cbind(predLASSO_nil_kime_all[, 1], 
                         IFT_NilPhase_FT_aggregate_arima_vector[ , 387], Y)  
colnames(validation_nil_kime_all) <- c("predLASSO_kime", "IFT_NilPhase_FT_Y", "Orig_Y")
rownames(validation_nil_kime_all)[11] <- "Germany"
head(validation_nil_kime_all)

validation_nil_kime_all <- as.data.frame(cbind(validation_nil_kime_all, 
                              ifelse (validation_nil_kime_all[,3]<=30, 1, 0)))
colnames(validation_nil_kime_all)[4] <- "Top30Rank"
head(validation_nil_kime_all)

# Prediction correlations: 
# cor(validation_nil_kime[ , 1], validation_nil_kime[, 2])  # Y=predLASSO_kime OA rank vs. kime_LASSO_pred:  0.99
cor(validation_nil_kime_all[ , 1], validation_nil_kime_all[, 3])  # Y=predLASSO_kime OA rank vs. Orig_Y:  0.64

# Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^
linFit1_nil_kime_all <- lm(predLASSO_nil_kime_all ~ validation_nil_kime_all[ , 3])
plot(predLASSO_nil_kime_all ~ validation_nil_kime_all[ , 3],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="IFT_NilPhase predLASSO_kime",
     main = sprintf("Observed (x) vs. IFT_NilPhase Predicted (y) Overall Country Ranking, cor=%.02f", 
                    cor(validation_nil_kime_all[ , 1], validation_nil_kime_all[, 3])))
abline(linFit1_nil_kime_all, lwd=3, col="red")
# abline(linFit1, lwd=3, col="green")

# Spacetime LASSO modeling
myPlotNilPhase_all <- ggplot(as.data.frame(validation_nil_kime_all), aes(x=Orig_Y,
            y=predLASSO_nil_kime_all, label=rownames(validation_nil_kime_all))) +
     geom_smooth(method='lm') +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_nil_kime)))) +
     geom_label_repel(aes(label = rownames(validation_nil_kime_all),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacekime LASSO Predicted, Nil-Phases (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_nil_kime_all[ , 1], validation_nil_kime_all[, 3])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacekime LASSO Predicted, using Nil-Phases")
myPlotNilPhase_all
```

**Swapped Feature Phases** and then synthesize the data (reconstruction)

```{r message=F, error=F, warning=F}
# temp_Data <- aggregate_arima_vector_country_ranking_df
swappedPhase_FT_aggregate_arima_vector <- FT_aggregate_arima_vector_country_ranking_df$phases
dim(swappedPhase_FT_aggregate_arima_vector)    # ;  head(swappedPhase_FT_aggregate_arima_vector)
# [1] 31 387
IFT_SwappedPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

set.seed(1234)   # sample randomly Phase-columns for each of the 131 covariates (X)
swappedPhase_FT_aggregate_arima_vector1 <- as.data.frame(cbind(
  swappedPhase_FT_aggregate_arima_vector[ , 
          sample(ncol(swappedPhase_FT_aggregate_arima_vector[ , 1:378]))],  # mix ARIMA signature phases
  swappedPhase_FT_aggregate_arima_vector[ , 
          sample(ncol(swappedPhase_FT_aggregate_arima_vector[ , 379:386]))],# mix the meta-data phases
  swappedPhase_FT_aggregate_arima_vector[ , 387]))                          # add correct Outcome phase
swappedPhase_FT_aggregate_arima_vector <- swappedPhase_FT_aggregate_arima_vector1

colnames(swappedPhase_FT_aggregate_arima_vector) <- colnames(temp_Data)
colnames(swappedPhase_FT_aggregate_arima_vector); dim(swappedPhase_FT_aggregate_arima_vector)
# 31 387

#       Invert back to spacetime the 
# FT_aggregate_arima_vector$magnitudes[ , i] signal with swapped-X-phases (Y-phase is fixed)
IFT_SwappedPhase_FT_aggregate_arima_vector <- 
  Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                     TRUE, swappedPhase_FT_aggregate_arima_vector))

# Save IFT_SwappedPhase_FT_aggregate_arima_vector out for PCA/t-SNE SpaceKime modeling
#options(digits = 2)
#write.table(format(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:386]), 
#                   #scientific=FALSE),  #, digits=2), 
#          fileEncoding = "UTF-16LE", append = FALSE, quote = FALSE, sep = "\t",
#          eol = "\n", na = "NA", dec = ".", row.names = FALSE, col.names = FALSE,
#          "C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_Econ_TensorData_31Countries_By_386Features_SpaceKime_SwappedPhase.txt")

colnames(IFT_SwappedPhase_FT_aggregate_arima_vector) <-
  colnames(aggregate_arima_vector_country_ranking_df)
rownames(IFT_SwappedPhase_FT_aggregate_arima_vector) <-
  rownames(aggregate_arima_vector_country_ranking_df)
dim(IFT_SwappedPhase_FT_aggregate_arima_vector)
dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes)
colnames(IFT_SwappedPhase_FT_aggregate_arima_vector)
# IFT_SwappedPhase_FT_aggregate_arima_vector[1:5, 1:4];  temp_Data[1:5, 1:4]

# 2. Perform LASSO modeling on IFT_SwappedPhase_FT_aggregate_arima_vector; 
# report param estimates and quality metrics AIC/BIC
set.seed(12)
cvLASSO_kime_swapped = 
  cv.glmnet(data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:378]), 
           Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_kime_swapped)
mtext("(Spacekime, Swapped-Phases) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_kime_swapped <-  predict(cvLASSO_kime_swapped, s = cvLASSO_kime_swapped$lambda.min, 
        newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:378]))
# testMSE_LASSO_kime_swapped <- 
#        mean((predLASSO_kime_swapped - IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387])^2)
# testMSE_LASSO_kime_swapped
predLASSO_kime_swapped

# Plot Regression Coefficients: create variable names for plotting 
betaHatLASSO_kime_swapped = as.double(coef(cvLASSO_kime_swapped,
                   s=cvLASSO_kime_swapped$lambda.min))
#cvLASSO_kime_swapped$lambda.1se
#coefplot(betaHatLASSO_kime_swapped[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", 
#         main = "(Spacekime, Swapped-Phases) LASSO-Regularized Regression Coefficient Estimates",
#         varnames = varNames[377:386])
varImp(cvLASSO_kime_swapped, lambda = cvLASSO_kime_swapped$lambda.min)

coefList_kime_swapped <- coef(cvLASSO_kime_swapped, s='lambda.min')
coefList_kime_swapped <- data.frame(coefList_kime_swapped@Dimnames[[1]][coefList_kime_swapped@i+1], coefList_kime_swapped@x)
names(coefList_kime_swapped) <- c('Feature','EffectSize')
arrange(coefList_kime_swapped, -abs(EffectSize))[2:10, ]
#                 Feature  EffectSize
#2   Feature_2_ArimaVec_8  1.484856e+15
#3  Feature_42_ArimaVec_7  1.970862e+14
#4  Feature_23_ArimaVec_7 -2.467246e+13
#5  Feature_37_ArimaVec_5 -2.983216e+00
#6  Feature_34_ArimaVec_4 -1.382639e+00
#7  Feature_36_ArimaVec_3 -1.198157e+00
#8   Feature_6_ArimaVec_3  1.106294e-01
#9  Feature_38_ArimaVec_2 -1.058259e-02
#10 Feature_38_ArimaVec_1 -1.124584e-03
#
#     ARIMA-spacetime:        4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#     ARIMA-spacekime_nill:   3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA
#    ARIMA-spacekime_swapped: 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA
# 9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

coef(cvLASSO_kime_swapped, s = "lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("(Spacekime, Swapped-Phases) Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

# pack a 31*4 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, 
#                      IFT_SwappedPhase_FT_aggregate_arima_vector_Y, Y)
validation_kime_swapped <- cbind(predLASSO[, 1], predLASSO_nil_kime_all[, 1], 
                         predLASSO_kime_swapped[ , 1], Y)  
colnames(validation_kime_swapped) <- c("predLASSO (spacetime)", "predLASSO_IFT_NilPhase",
                               "predLASSO_IFT_SwappedPhase", "Orig_Y")
head(validation_kime_swapped); dim(validation_kime_swapped)

# Prediction correlations: 
cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])  
# predLASSO_IFT_SwappedPhase OA rank vs. predLASSO_spacekime:  0.7
cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 3])  
# predLASSO (spacetime) vs.  predLASSO_IFT_SwappedPhase OA rank:  0.83

# Plot observed Y (Overall Country ranking), x-axis vs. Kime-Swapped LASSO (9-parameters) predicted Y^
linFit1_kime_swapped <- lm(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4])
plot(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="predLASSO_IFT_SwappedPhase_FT_Y", ylab="predLASSO_spacekime_swapped Country Overall Ranking",
     main = sprintf("Observed (x) vs. Kime IFT_SwappedPhase_FT_Y (y) Overall Country Ranking, cor=%.02f", 
                    cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])))
abline(linFit1_kime_swapped, lwd=3, col="red")
#abline(linFit1_kime, lwd=3, col="green")

# Plot Spacetime LASSO forecasting
# Plot observed Y (Overall Country ranking), x-axis vs. LASSO (9-parameters) predicted Y^, y-axis
linFit1_spacetime <- lm(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4])
plot(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="predLASSO_spacetime",
     main = sprintf("Predicted (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", 
                    cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])))
abline(linFit1_spacetime, lwd=3, col="red")

# add Top_30_Ranking_Indicator
validation_kime_swapped <- as.data.frame(cbind(validation_kime_swapped, ifelse (validation_kime_swapped[,4]<=30, 1, 0)))
colnames(validation_kime_swapped)[5] <- "Top30Rank"
rownames(validation_kime_swapped)[11] <- "Germany"
head(validation_kime_swapped)

# Write out the Binary top-30/Not-Top-30 labels
df_top30 <- cbind(countryNames, validation_kime_swapped[ , 5])
colnames(df_top30) <- c("Country", "Top-30-Label")
write.table(df_top30, 
          fileEncoding = "UTF-16LE", append = FALSE, quote = FALSE, sep = "\t",
          eol = "\n", na = "NA", dec = ".", row.names = FALSE, col.names = TRUE,
          "C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_Econ_Labels_31Countries_Top_30_BinaryOutcome.txt")

# Spacetime LASSO modeling
myPlotSwappedPhase <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y,
            y=validation_kime_swapped[, 3], label=rownames(validation_kime_swapped))) +
     geom_smooth(method='lm') +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) +
     geom_label_repel(aes(label = rownames(validation_kime_swapped),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacekime LASSO Predicted, Swapped-Phases (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacekime LASSO Predicted, using Swapped-Phases")
myPlotSwappedPhase
```


#### Overall Plots

```{r message=F, error=F, warning=F}
myPlotNilPhase <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y,
            y=predLASSO_kime, label=rownames(validation_kime_swapped))) +
     geom_smooth(method='lm') +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) +
     geom_label_repel(aes(label = rownames(validation_kime_swapped),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacekime LASSO Predicted, Nil-Phases, (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_kime_swapped[ , 2], validation_kime_swapped[, 4])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacekime LASSO Predicted, using Nil-Phases")
myPlotNilPhase

countryNames[11]<-"Germany"
aggregateResults <- (rbind(cbind(as.character(countryNames), "predLASSO_spacetime_386", as.numeric(predLASSO)), 
                     cbind(as.character(countryNames), "predLASSO_spacetime_378", predLASSO_lim),
                     cbind(as.character(countryNames), "predLASSO_nil_kime_386", predLASSO_nil_kime_all),
                     cbind(as.character(countryNames), "predLASSO_nil_kime_378", predLASSO_nil_kime),
                     cbind(as.character(countryNames), "predLASSO_swapped_kime_386", predLASSO_kime_swapped),
                     cbind(as.character(countryNames), "predLASSO_swapped_kime_378", predLASSO_kime_swapped_lim),
                     cbind(as.character(countryNames), "observed", Y)
                ))
aggregateResults <- data.frame(aggregateResults[ , -3], as.numeric(aggregateResults[,3]))
colnames(aggregateResults) <- c("country", "estimate_method", "ranking")
ggplot(aggregateResults, aes(x=country, y=ranking, color=estimate_method)) +
  geom_point(aes(shape=estimate_method, color=estimate_method, size=estimate_method)) + 
  geom_point(size = 5) +
  geom_line(data = aggregateResults[aggregateResults$estimate_method == "observed", ], 
            aes(group = estimate_method), size=2, linetype = "dashed") +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
  # theme(legend.position = "bottom") +
  # scale_shape_manual(values = as.factor(aggregateResults$estimate_method)) +
  # scale_color_gradientn(colours = rainbow(7)) +
  theme(text = element_text(size = 15), legend.position = c(0.3, 0.85), 
           axis.text=element_text(size=16),
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold"))

library(plotly)
plot_ly(aggregateResults) %>%
  add_markers(x = ~country, y = ~ranking, type = "scatter", 
        color = ~estimate_method, colors = c("black", "red", "blue", "green", "purple", "orange", "yellow"),
        mode = "markers", marker = list(size = ~ranking, opacity=~(1-(ranking-1)/49),
                                        line = list(color = "black", width = 2))) %>% 
    layout(title="Spacekime Analytics - Country Ranking using Different Phase Estimators",
           legend = list(orientation = "h",   # show legend horizontally
                     xanchor = "center",      # use center of legend as anchor
                     x = 0))                # put legend in center of x-axis)
```

### Unsupervised clustering

Use [hierarchical, k-means and spectral clustering](https://dspa.predictive.space/) to generate derived-computed phenotypes of countries. Do these derived labels relate (correspond to) the overall (OA) country ranking?

```{r message=F, error=F, warning=F}
load("C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_Econ_SpaceKime.RData")
# View(aggregate_arima_vector_country_ranking_df)
dim(aggregate_arima_vector_country_ranking_df)  
# 31(countries) 387(features)
# Features = country-index + 386 features (378 time-series derivatives + 8 meta-data features)
```

#### Spacetime analytics

 - **1.1 Lasso features selection**

```{r warning=F, error=F}
eudata <- aggregate_arima_vector_country_ranking_df
colnames(eudata) <- c("country",colnames(eudata[,-1]))
eudata <- eudata[ , -ncol(eudata)]
Y<-aggregate_arima_vector_country_ranking_df$OA

# Complete data 386 features (378 + 8)
X<-eudata[,-ncol(eudata)]; dim(X)
# TS-derivative features only (378)
X378 <- X[, -c(379:386)]; dim(X378)
countryinfo<-as.character(X[,1])
countryinfo[11]<-"Germany"
X<-X[,-1]
keeplist<-NULL
for (i in 1:ncol(X)) {
  if(FALSE %in% (X[,i]==mean(X[,i]))) {keeplist<-c(keeplist,i)}
}
X<-X[,keeplist]; dim(X)

# Reduced to 378 features 
#countryinfo<-as.character(X378[,1])
#countryinfo[11]<-"Germany"
#X378<-X378[,-1]
#keeplist<-NULL
#for (i in 1:ncol(X378)) {
#    if(FALSE %in% (X378[,i]==mean(X378[,i]))) {keeplist<-c(keeplist,i)}
#}
#X378<-X378[,keeplist]; dim(X378)

library(glmnet)
fitLASSO <- glmnet(as.matrix(X), Y, alpha = 1)
library(doParallel)
registerDoParallel(5)
#cross-validation
cvLASSO = cv.glmnet(data.matrix(X), Y, alpha = 1, parallel=TRUE)

# fitLASSO <- glmnet(as.matrix(X378), Y, alpha = 1)
#library(doParallel)
#registerDoParallel(5)
#cross-validation
#cvLASSO = cv.glmnet(data.matrix(X378), Y, alpha = 1, parallel=TRUE)

# To choose features we like to have based on lasso
chooselambda <- function(cvlasso, option, k=NULL) {
  lambmat<-cbind(cvlasso$glmnet.fit$df,cvlasso$glmnet.fit$lambda)
  result<-tapply(lambmat[,2],lambmat[,1],max)
  kresult<-result[which(names(result)==as.factor(k))]
  if(option==1) {return(result)}
  else if (option==2) {return(kresult)}
  else (return("Not a valid option"))
}
showfeatures <- function(object, lambda, k ,...) {
  lam<-lambda[which(names(lambda)==as.factor(k))]
  beta <- predict(object, s = lam, type = "coef")
  if(is.list(beta)) {
    out <- do.call("cbind", lapply(beta, function(x) x[,1]))
    out <- as.data.frame(out)
    s <- rowSums(out)
    out <- out[which(s)!=0,,drop=FALSE]
  } else  {out<-data.frame(Overall = beta[,1])
  out<-out[which(out!=0),,drop=FALSE]
  }
  out <- abs(out[rownames(out) != "(Intercept)",,drop = FALSE])
  out
}
```

 - ** 1.2 Comparison of different ML algorithms of different feature numbers**

```{r message=F, error=F, warning=F}
#test training data setup
randchoose <- function(matr) {
  leng<-nrow(matr)
  se<-seq(1:leng)
  sam<-sample(se,as.integer(leng*0.6))
  return(sam)
}

eusample<-X
eusample$Rank<-as.factor(ifelse(Y<30, 1, 0))
set.seed(1234)
eutrain<-eusample[randchoose(eusample), ]
set.seed(1234)
eutest<-eusample[-randchoose(eusample), ]

eusample378 <- X378
eusample378$Rank <- as.factor(ifelse(Y<30, 1, 0))
set.seed(1234)
eutrain378 <- eusample378[randchoose(eusample378), ]
set.seed(1234)
eutest378 <- eusample378[-randchoose(eusample378), ]

# Load Libraries
library(e1071)
library("randomForest")
library(ada); library(adabag)
library(caret)
library(kernlab)
library(cluster)
library(ipred)
library(ggplot2)

MLcomp <- function(fitlas, cvlas, trn, test, option=1) {
  allfeat<-as.numeric(names(chooselambda(cvlasso = cvlas, option = 1)))
  allfeat<-allfeat[which(allfeat>4)]
  trainlist<-as.list(NULL)
  for (i in 1:length(allfeat)) {
    trainlist[[i]]<-trn[,which(colnames(trn) %in% 
                                 c(row.names(showfeatures(fitlas, chooselambda(cvlas = cvlas,1), allfeat[i])), "Rank"))]
  }
resultframe<-data.frame(origin=rep(NA,length(allfeat)))
rownames(resultframe)<-allfeat
resultframe$Decision_tree_bagging<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    eubag<-ipred::bagging(Rank~.,data = trainlist[[i]],nbagg=100)
    bagtest<-predict(eubag, eutest)
    bagagg<-bagtest==eutest$Rank
    accuracy<-prop.table(table(bagagg))[c("TRUE")]
    resultframe$Decision_tree_bagging[i]<-accuracy
  }
resultframe$Random_forest<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    eurf<-randomForest(Rank~.,data=trainlist[[i]])
    rftest<-predict(eurf,eutest)
    rfagg<-rftest==eutest$Rank
    accuracy<-prop.table(table(rfagg))[c("TRUE")]
    resultframe$Random_forest[i]<-accuracy
  }
resultframe$Decision_tree_adaboost<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    enada<-ada(Rank~.,data = trainlist[[i]],iter=50)
    adatest<-predict(enada,eutest)
    adaagg<-adatest==eutest$Rank
    accuracy<-prop.table(table(adaagg))[c("TRUE")]
    resultframe$Decision_tree_adaboost[i]<-accuracy
  }
resultframe$GLM<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    euglm<-glm(Rank~.,data = trainlist[[i]],family = "binomial")
    glmtest<-predict(euglm,eutest)
    glmtest<-ifelse(glmtest<0,0,1)
    glmagg<-glmtest==eutest$Rank
    accuracy<-prop.table(table(glmagg))[c("TRUE")]
    resultframe$GLM[i]<-accuracy
  }
resultframe$SVM_best_Gamma_Cost<-rep(NA,length(allfeat))  
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    svmtune<-tune.svm(Rank~.,data = trainlist[[i]],gamma = 10^(-6:1),cost = 10^(-10:10))
    svmed<-svm(Rank~.,data=trainlist[[i]],gamma=svmtune$best.parameters[1],cost=svmtune$best.parameters[2])
    svmtest<-predict(svmed,eutest)
    svmagg<-svmtest==eutest$Rank
    accuracy<-prop.table(table(svmagg))[c("TRUE")]
    resultframe$SVM_best_Gamma_Cost[i]<-accuracy
  }
  resultframe$origin<-NULL
  if(option==1){return(resultframe)}
}
resultframe <- MLcomp(fitLASSO, cvLASSO, eutrain, eutest, 1)
resultframe_386_ST <- resultframe
# View(resultframe_386_ST)

# resultframe_378_ST <- MLcomp(fitLASSO, cvLASSO, eutrain378, eutest378, 1)

# Display results
resultframe$features<-as.factor(as.numeric(rownames(resultframe)))
ppframe<-data.frame(NULL)
for (i in 1:5) {
  FM <- data.frame(resultframe[,i], resultframe$features,
                   Methods<-rep(colnames(resultframe)[i], nrow(resultframe)))
  ppframe<-rbind(ppframe, FM)
}
colnames(ppframe)<-c("Accuracy", "Features", "Methods")
ggplot(ppframe, aes(x=Features, y=Accuracy, colour=Methods, 
                    group=Methods, shape=Methods))+
  geom_line(position=position_dodge(0.2), lwd=2)+
  ylim(0.2, 1.0) +
  geom_point(size=5, position=position_dodge(0.2))+
  theme(legend.position="top", legend.text=element_text(size=16))+
  ggtitle("Spacetime (386 features): Compare ML Forecasting Results")+
  theme(
    axis.text=element_text(size=16),
    plot.title = element_text(size=18, face="bold.italic"),
    axis.title.x = element_text(size=14, face="bold"),
    axis.title.y = element_text(size=14, face="bold"))

# spacetime (ST) 378_ST
resultframe_378_ST$features<-as.factor(as.numeric(rownames(resultframe_378_ST)))
ppframe_378_ST<-data.frame(NULL)
for (i in 1:5) {
  FM_378_ST <- data.frame(resultframe_378_ST[,i], resultframe_378_ST$features,
          Methods<-rep(colnames(resultframe_378_ST)[i], nrow(resultframe_378_ST)))
  ppframe_378_ST<-rbind(ppframe_378_ST, FM_378_ST)
}
colnames(ppframe_378_ST)<-c("Accuracy", "Features", "Methods")
ggplot(ppframe_378_ST, aes(x=Features, y=Accuracy, colour=Methods, 
                    group=Methods, shape=Methods))+
  geom_line(position=position_dodge(0.2), lwd=2)+
  ylim(0.2, 1.0) +
  geom_point(size=5, position=position_dodge(0.2))+
  theme(legend.position="top", legend.text=element_text(size=16))+
  ggtitle("Spacetime (386 features): Compare ML Forecasting Results")+
  theme(
    axis.text=element_text(size=16),
    plot.title = element_text(size=18, face="bold.italic"),
    axis.title.x = element_text(size=14, face="bold"),
    axis.title.y = element_text(size=14, face="bold"))
```

 - **1.3 Clustering**

```{r message=F, error=F, warning=F}
showfeatures(fitLASSO, chooselambda(cvLASSO,1), 10)
feat_5 <-  predict(fitLASSO, s = chooselambda(cvLASSO,2,10), newx = data.matrix(X))
df1 <- as.data.frame(rbind(as.numeric(feat_5),Y), 
                     row.names = c("Predicted Rank","OA Rank"))
colnames(df1) <- countryNames
df1 # View(t(df1))

# Clustering
cluster5 <- X[, which(colnames(X) %in%
                        row.names(showfeatures(fitLASSO, chooselambda(cvLASSO,1), 10)))]
rownames(cluster5) <- countryNames # countryinfo

#1. hierarchical clustering
scaled_cluster5 <- scale(cluster5)
##deal with NAN values
#scaled_country<-scaled_country[,which(is.nan(scaled_country[1,])==FALSE)]
dis_SC5 <- dist(scaled_cluster5)
H_clust_SC5 <- hclust(dis_SC5)

library("factoextra")
library("FactoMineR")
H_clust_SC5 <- eclust(scaled_cluster5, k=5, "hclust")
fviz_dend(H_clust_SC5, rect = TRUE, cex=0.5)
# fviz_dend(H_clust_SC5, lwd=2, rect = TRUE)


# ST 378
cluster5_378_ST <- X378[, which(colnames(X378) %in%
                        row.names(showfeatures(fitLASSO, chooselambda(cvLASSO,1), 10)))]
rownames(cluster5_378_ST) <- countryNames # countryinfo
#1. hierarchical clustering
scaled_cluster5_378_ST <- scale(cluster5_378_ST)
dis_SC5_378_ST <- dist(scaled_cluster5_378_ST)
H_clust_SC5_378_ST <- hclust(dis_SC5_378_ST)
H_clust_SC5_378_ST <- eclust(scaled_cluster5_378_ST, k=5, "hclust")
fviz_dend(H_clust_SC5_378_ST,rect = TRUE, cex=0.5)
```

#### Spacekime - Nil-Phase

 - ** 2.1 Lasso features selection**

 - ** 2.2 Comparison of different ML algorithms of different feature numbers**

 - ** 2.3 Clustering**
  
#### Spacekime - Swapped-Phase

 - ** 3.1 Lasso features selection**

```{r warning=F, error=F}
dim(IFT_SwappedPhase_FT_aggregate_arima_vector)
# [1]  31 387
eudata_SwappedPhase <- IFT_SwappedPhase_FT_aggregate_arima_vector
colnames(eudata_SwappedPhase) <- c("country", colnames(eudata_SwappedPhase[,-1]))
eudata_SwappedPhase <- as.data.frame(eudata_SwappedPhase[ , -ncol(eudata_SwappedPhase)])
Y <- as.data.frame(IFT_SwappedPhase_FT_aggregate_arima_vector)$OA

# Complete data 386 features (378 + 8)
X <- eudata_SwappedPhase
countryinfo<-as.character(X[,1])
countryinfo[11]<-"Germany"
X<-X[,-1]
keeplist<-NULL
for (i in 1:ncol(X)) {
  if(FALSE %in% (X[,i]==mean(X[,i]))) {keeplist<-c(keeplist,i)}
}
X<-X[,keeplist]; dim(X)   # 31 343

# Reduced to 378 features 
# TS-derivative features only (378)
# X378 <- X[, -c(379:386)]; dim(X378)
#countryinfo<-as.character(X378[,1])
#countryinfo[11]<-"Germany"
#X378<-X378[,-1]
#keeplist<-NULL
#for (i in 1:ncol(X378)) {
#    if(FALSE %in% (X378[,i]==mean(X378[,i]))) {keeplist<-c(keeplist,i)}
#}
#X378<-X378[,keeplist]; dim(X378)

library(glmnet)
fitLASSO_X <- glmnet(as.matrix(X), Y, alpha = 1)
library(doParallel)
registerDoParallel(5)
#cross-validation
cvLASSO_X = cv.glmnet(data.matrix(X), Y, alpha = 1, parallel=TRUE)

# fitLASSO_X <- glmnet(as.matrix(X378), Y, alpha = 1)
#library(doParallel)
#registerDoParallel(5)
#cross-validation
#cvLASSO_X = cv.glmnet(data.matrix(X378), Y, alpha = 1, parallel=TRUE)

# To choose features we like to have based on lasso
chooselambda <- function(cvlasso, option, k=NULL) {
  lambmat<-cbind(cvlasso$glmnet.fit$df,cvlasso$glmnet.fit$lambda)
  result<-tapply(lambmat[,2],lambmat[,1],max)
  kresult<-result[which(names(result)==as.factor(k))]
  if(option==1) {return(result)}
  else if (option==2) {return(kresult)}
  else (return("Not a valid option"))
}
showfeatures <- function(object, lambda, k ,...) {
  lam<-lambda[which(names(lambda)==as.factor(k))]
  beta <- predict(object, s = lam, type = "coef")
  if(is.list(beta)) {
    out <- do.call("cbind", lapply(beta, function(x) x[,1]))
    out <- as.data.frame(out)
    s <- rowSums(out)
    out <- out[which(s)!=0,,drop=FALSE]
  } else  {out<-data.frame(Overall = beta[,1])
  out<-out[which(out!=0),,drop=FALSE]
  }
  out <- abs(out[rownames(out) != "(Intercept)",,drop = FALSE])
  out
}
```

 - ** 3.2 Comparison of different ML algorithms of different feature numbers**

```{r message=F, error=F, warning=F}
#test training data setup
randchoose <- function(matr) {
  leng<-nrow(matr)
  se<-seq(1:leng)
  sam<-sample(se,as.integer(leng*0.6))
  return(sam)
}

Xsample <- X
Xsample$Rank <- as.factor(ifelse(Y<30, 1, 0))
set.seed(1234)
Xtrain <- Xsample[randchoose(Xsample), ]
set.seed(1234)
Xtest <- Xsample[-randchoose(Xsample), ]

#Xsample378 <- X378
#Xsample378$Rank <- as.factor(ifelse(Y<30, 1, 0))
#set.seed(1234)
#Xtrain378 <- Xsample378[randchoose(Xsample378), ]
#set.seed(1234)
#Xtest378 <- Xsample378[-randchoose(Xsample378), ]

# Load Libraries
library(e1071)
library("randomForest")
library(ada); library(adabag)
library(caret)
library(kernlab)
library(cluster)
library(ipred)
library(ggplot2)

# resultXframe <- MLcomp(fitLASSO, cvLASSO, Xtrain, Xtest, 1)
MLcompX <- function(fitlas, cvlas, trn, test, option=1) {
  allfeat<-as.numeric(names(chooselambda(cvlasso = cvlas, option = 1)))
  allfeat<-allfeat[which(allfeat>4)]
  trainlist<-as.list(NULL)
  for (i in 1:length(allfeat)) {
    trainlist[[i]]<-trn[,which(colnames(trn) %in% 
                                 c(row.names(showfeatures(fitlas, chooselambda(cvlas = cvlas,1), allfeat[i])), "Rank"))]
  }
 
  resultXframe<-data.frame(origin=rep(NA,length(allfeat)))
  rownames(resultXframe)<-allfeat
  resultXframe$Decision_tree_bagging<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    #ERROR HANDLING
    possibleError <- tryCatch(
        function () {
          set.seed(1234)
          Xbag<-ipred::bagging(Rank~ . ,data = trainlist[[i]], nbagg=100, 
                               control=rpart.control(minsplit=2, cp=0.1, xval=10))
          bagtest<-predict(Xbag, Xtest)
          bagagg<-bagtest==Xtest$Rank
          accuracy<-prop.table(table(bagagg))[c("TRUE")]
          resultXframe$Decision_tree_bagging[i]<-accuracy
        },
        error=function(e) e
    )
    if(inherits(possibleError, "error")) next
    # print(i)
  }

  resultXframe$Random_forest<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    Xrf<-randomForest(Rank~.,data=trainlist[[i]])
    rftest<-predict(Xrf,test)
    rfagg<-rftest==test$Rank
    accuracy<-prop.table(table(rfagg))[c("TRUE")]
    resultXframe$Random_forest[i]<-accuracy
  }

  resultXframe$Decision_tree_adaboost<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    Xada<-ada(Rank~.,data = trainlist[[i]],iter=50)
    adatest<-predict(Xada,test)
    adaagg<-adatest==test$Rank
    accuracy<-prop.table(table(adaagg))[c("TRUE")]
    resultXframe$Decision_tree_adaboost[i]<-accuracy
  }

  resultXframe$GLM<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    euglm<-glm(Rank~.,data = trainlist[[i]],family = "binomial")
    glmtest<-predict(euglm,test)
    glmtest<-ifelse(glmtest<0,0,1)
    glmagg<-glmtest==test$Rank
    accuracy<-prop.table(table(glmagg))[c("TRUE")]
    resultXframe$GLM[i]<-accuracy
  }

  resultXframe$SVM_best_Gamma_Cost<-rep(NA,length(allfeat))  
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    svmtune<-tune.svm(Rank~.,data = trainlist[[i]],gamma = 10^(-6:1),cost = 10^(-10:10))
    svmed<-svm(Rank~.,data=trainlist[[i]],gamma=svmtune$best.parameters[1],cost=svmtune$best.parameters[2])
    svmtest<-predict(svmed,test)
    svmagg<-svmtest==test$Rank
    accuracy<-prop.table(table(svmagg))[c("TRUE")]
    resultXframe$SVM_best_Gamma_Cost[i]<-accuracy
  }
  resultXframe$origin<-NULL
  if(option==1){return(resultXframe)}
}

resultXframe <- MLcompX(fitLASSO_X, cvLASSO_X, Xtrain, Xtest, 1)
resultXframe_386_SK_Swapped <- resultXframe
# View(resultXframe_386_SK_Swapped)

# resultXframe_378_SK_Swapped <- MLcompX(fitLASSO_X, cvLASSO_X, Xtrain378, Xtest378, 1)

# Display results
resultXframe$features<-as.factor(as.numeric(rownames(resultXframe)))
ppframeX<-data.frame(NULL)
for (i in 1:5) {
  FM <- data.frame(resultXframe[,i], resultXframe$features,
                   Methods<-rep(colnames(resultXframe)[i], nrow(resultXframe)))
  ppframeX<-rbind(ppframeX, FM)
}
colnames(ppframeX)<-c("Accuracy", "Features", "Methods")
ggplot(ppframeX, aes(x=Features, y=Accuracy, colour=Methods, 
                    group=Methods, shape=Methods))+
  geom_line(position=position_dodge(0.2), lwd=2)+
  ylim(0.2, 1.0) +
  geom_point(size=5, position=position_dodge(0.2))+
  theme(legend.position="top", legend.text=element_text(size=16))+
  ggtitle("Spacekime Swapped-Phases (386 features): Compare ML Forecasting Results")+
  theme(
    axis.text=element_text(size=16),
    plot.title = element_text(size=18, face="bold.italic"),
    axis.title.x = element_text(size=14, face="bold"),
    axis.title.y = element_text(size=14, face="bold"))

# spacetime (ST) 378_ST
resultframe_378_ST$features<-as.factor(as.numeric(rownames(resultframe_378_ST)))
ppframe_378_ST<-data.frame(NULL)
for (i in 1:5) {
  FM_378_ST <- data.frame(resultframe_378_ST[,i], resultframe_378_ST$features,
          Methods<-rep(colnames(resultframe_378_ST)[i], nrow(resultframe_378_ST)))
  ppframe_378_ST<-rbind(ppframe_378_ST, FM_378_ST)
}
colnames(ppframe_378_ST)<-c("Accuracy", "Features", "Methods")
ggplot(ppframe_378_ST, aes(x=Features, y=Accuracy, colour=Methods, 
                    group=Methods, shape=Methods))+
  geom_line(position=position_dodge(0.2), lwd=2)+
  ylim(0.2, 1.0) +
  geom_point(size=5, position=position_dodge(0.2))+
  theme(legend.position="top", legend.text=element_text(size=16))+
  ggtitle("Spacetime (386 features): Compare ML Forecasting Results")+
  theme(
    axis.text=element_text(size=16),
    plot.title = element_text(size=18, face="bold.italic"),
    axis.title.x = element_text(size=14, face="bold"),
    axis.title.y = element_text(size=14, face="bold"))

##################### for resultXframe_378_SK_Swapped
resultXframe_378_SK_Swapped$features<-as.factor(as.numeric(rownames(resultXframe_378_SK_Swapped)))
ppframeX<-data.frame(NULL)
for (i in 1:5) {
    FM <- data.frame(resultXframe_378_SK_Swapped[, i], resultXframe_378_SK_Swapped$features,
                     Methods<-rep(colnames(resultXframe_378_SK_Swapped)[i], nrow(resultXframe_378_SK_Swapped)))
    ppframeX<-rbind(ppframeX, FM)
}
colnames(ppframeX)<-c("Accuracy", "Features", "Methods")
ggplot(ppframeX, aes(x=Features, y=Accuracy, colour=Methods, 
                     group=Methods, shape=Methods))+
    geom_line(position=position_dodge(0.2), lwd=2)+
    ylim(0.2, 1.0) +
    geom_point(size=5, position=position_dodge(0.2))+
    theme(legend.position="top", legend.text=element_text(size=16))+
    ggtitle("Spacekime Swapped-Phases (386 features): Compare ML Forecasting Results")+
    theme(
        axis.text=element_text(size=16),
        plot.title = element_text(size=18, face="bold.italic"),
        axis.title.x = element_text(size=14, face="bold"),
        axis.title.y = element_text(size=14, face="bold"))
```

 - ** 3.3 Clustering**

```{r message=F, error=F, warning=F}
showfeatures(fitLASSO_X, chooselambda(cvLASSO_X, 1), 10)
feat_5 <-  predict(fitLASSO_X, s = chooselambda(cvLASSO_X, 2, 10), newx = data.matrix(X))
df1 <- as.data.frame(rbind(as.numeric(feat_5), Y), 
                     row.names = c("Predicted Rank","OA Rank"))
colnames(df1) <- countryNames
df1 # View(t(df1))

# Clustering
cluster5 <- X[, which(colnames(X) %in%
                        row.names(showfeatures(fitLASSO_X, chooselambda(cvLASSO_X, 1), 10)))]
rownames(cluster5) <- countryNames # countryinfo

#1. hierarchical clustering
scaled_cluster5 <- scale(cluster5)
##deal with NAN values
#scaled_country<-scaled_country[,which(is.nan(scaled_country[1,])==FALSE)]
dis_SC5 <- dist(scaled_cluster5)
H_clust_SC5 <- hclust(dis_SC5)

library("factoextra")
library("FactoMineR")
H_clust_SC5 <- eclust(scaled_cluster5, k=5, "hclust")
fviz_dend(H_clust_SC5, rect = TRUE, cex=0.5)
# fviz_dend(H_clust_SC5, lwd=2, rect = TRUE)


# ST 378
cluster5_378_SK <- X378[, which(colnames(X378) %in%
                        row.names(showfeatures(fitLASSO_X, chooselambda(cvLASSO_X, 1), 10)))]
rownames(cluster5_378_SK) <- countryNames # countryinfo
#1. hierarchical clustering
scaled_cluster5_378_SK <- scale(cluster5_378_SK)
dis_SC5_378_SK <- dist(scaled_cluster5_378_SK)
H_clust_SC5_378_SK <- hclust(dis_SC5_378_SK)
H_clust_SC5_378_SK <- eclust(scaled_cluster5_378_SK, k=5, "hclust")
fviz_dend(H_clust_SC5_378_SK,rect = TRUE, cex=0.5)
```

# Save Workspace

```{r message=F, error=F, warning=F}
# Save this entire Computed Workspace as an image:

# save.image("C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_Econ_SpaceKime.RData")
```


<!--html_preserve-->
<div>
    	<footer><center>
			<a href="https://www.socr.umich.edu/">SOCR Resource</a>
				Visitor number <img class="statcounter" src="https://c.statcounter.com/5714596/0/038e9ac4/0/" alt="Web Analytics" align="middle" border="0">
				<script type="text/javascript">
					var d = new Date();
					document.write(" | " + d.getFullYear() + " | ");
				</script> 
				<a href="https://socr.umich.edu/img/SOCR_Email.png"><img alt="SOCR Email"
	 			title="SOCR Email" src="https://socr.umich.edu/img/SOCR_Email.png"
	 			style="border: 0px solid ;"></a>
	 		 </center>
	 	</footer>

	<!-- Start of StatCounter Code -->
		<script type="text/javascript">
			var sc_project=5714596; 
			var sc_invisible=1; 
			var sc_partition=71; 
			var sc_click_stat=1; 
			var sc_security="038e9ac4"; 
		</script>
		
		<script type="text/javascript" src="https://www.statcounter.com/counter/counter.js"></script>
	<!-- End of StatCounter Code -->
	
	<!-- GoogleAnalytics -->
		<script src="https://www.google-analytics.com/urchin.js" type="text/javascript"> </script>
		<script type="text/javascript"> _uacct = "UA-676559-1"; urchinTracker(); </script>
	<!-- End of GoogleAnalytics Code -->
</div>
<!--/html_preserve-->