
1 Data Preprocessing

Find all “Common” features (highly-observed and congruent economic indicators).

# 1. Find all "Common" freatures (highly-observed and congruent Econ indicators)
countryNames <- unique(time_series$country); length(countryNames); # countryNames
# initialize 3D array of DF's that will store the data for each of hte countries into a 2D frame
countryData <- list()  # countryData[[listID==Country]][1-time-72, 1-feature-197]

for (i in 1:length(countryNames)) {
  countryData[[i]] <- filter(time_series, country == countryNames[i])
}
# Check countryData[[2]][2, 3] == Belgium[2,3]

list_of_dfs_CommonFeatures <- list()  # list of data for supersampled countries 360 * 197

# 2. General function that ensures the XReg predictors for ALL 31 EU countries are homologous
completeHomologousX_features <- function (list_of_dfs) {
  # delete features that are missing at all time points
  for (j in 1:length(list_of_dfs)) {
    print(paste0("Pre-processing Country: ...", countryNames[j], "... "))
    data = list_of_dfs[[j]]
    data = data[ , colSums(is.na(data)) != nrow(data)]
    data = select(data, -time, -country)
    DataMatrix = as.matrix(data)
    DataMatrix = cleardata(DataMatrix)
    DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove features that still contain NAs (e.g., single-valued features)
    DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove features with all values=0
    # Supersample 72 --*5--> 360 timepoints 
    DataMatrix = splinecreate(DataMatrix)
    DataSuperSample = as.data.frame(DataMatrix) # super-Sample the data
    # remove some of the features  
    DataSuperSample = DataSuperSample[, -c(50:80)]; dim(DataSuperSample)  # 360 167
    # ensure full-rank design matrix, DataSuperSample
    DataSuperSample <- 
      DataSuperSample[ , qr(DataSuperSample)$pivot[seq_len(qr(DataSuperSample)$rank)]]
    print(paste0("dim()=(", dim(DataSuperSample)[1], ",", dim(DataSuperSample)[2], ") ..."))
    # update the current DF/Country
    list_of_dfs_CommonFeatures[[j]] <- DataSuperSample
  }

  # Identify all Xreg features that are homologous (same feature columns) across all 31 countries
  # Identify the common columns (features)
  comCol <- Reduce(intersect, lapply(list_of_dfs_CommonFeatures, colnames))
  list_of_dfs_CommonFeatures <- lapply(list_of_dfs_CommonFeatures, function(x) x[comCol])

  for (j in 1:length(list_of_dfs_CommonFeatures)) {
    list_of_dfs_CommonFeatures[[j]] <- subset(list_of_dfs_CommonFeatures[[j]], select = comCol)
    print(paste0("dim(", countryNames[j], ")=(", dim(list_of_dfs_CommonFeatures[[j]])[1], 
                 ",", dim(list_of_dfs_CommonFeatures[[j]])[2], ")!"))  # 72 * 197
  }
  return(list_of_dfs_CommonFeatures)
}
# Test completeHomologousX_features: dim(AllCountries)=(360,42)!
list_of_dfs_CommonFeatures <- completeHomologousX_features(countryData); 
length(list_of_dfs_CommonFeatures); dim(list_of_dfs_CommonFeatures[[1]]) # Austria data matrix 360*42

For each country (\(n\)) and each common feature (\(k\)), fit an ARIMA model and estimate its \((p,d,q)\) parameters (non-exogenous, i.e., just the time series model for that feature), with \((p,d,q)\) triples for both non-seasonal and seasonal effects. For each (Country, Feature) pair, the 9-element ARIMA-derived vector includes: **(ts_avg, forecast_avg, non-seasonal AR, non-seasonal MA, seasonal AR, seasonal MA, period, non-seasonal Diff, seasonal Diff)**.
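As a minimal illustration (on a hypothetical white-noise series, ts_demo, and assuming the forecast package is available), the 9-element signature can be assembled directly from an auto.arima() fit, whose $arma slot stores the seven specification values \((p, q, P, Q, m, d, D)\):

library(forecast)
set.seed(1234)
ts_demo <- ts(rnorm(360), frequency = 20, start = c(2000, 1))  # hypothetical demo series
fit_demo <- auto.arima(ts_demo)
# fit_demo$arma stores (p, q, P, Q, period, d, D), i.e., positions 3-9 of the signature
sig9 <- c(mean(ts_demo), mean(forecast(fit_demo)$mean), fit_demo$arma)
names(sig9) <- c("ts_avg", "forecast_avg", "AR_p", "MA_q", "SAR_P", "SMA_Q",
                 "period", "diff_d", "seas_diff_D")
sig9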

# 3. For each country (n) and each common feature (k), compute (p,d,q) ARIMA models (non-exogenous, 
# just the timeseries model for this feature), (p,d,q) triples
# Country * Feature
arimaModels_DF <- list() 
#data.frame(matrix(NA, nrow = length(countryNames), 
#  ncol = dim(list_of_dfs_CommonFeatures[[1]])[2]), row.names=countryNames, stringsAsFactors=T)
# colnames(arimaModels_DF) <- colnames(list_of_dfs_CommonFeatures[[1]])  # only needed for the data.frame version above
list_index <- 1
arimaModels_ARMA_coefs <- list()  # array( , c(31, 9*dim(list_of_dfs_CommonFeatures[[1]])[2]))
# dim(arimaModels_ARMA_coefs) # [1]  31 x 378 == 31 x (9 * 42)
# For each (Country, feature) index, the 9 ARIMA-derived vector includes:
# (ts_avg, forecast_avg, non-seasonal AR, non-seasonal MA, seasonal AR, seasonal MA, period, non-seasonal Diff, seasonal differences)

for(n in 1:(length(list_of_dfs_CommonFeatures))) {          # for each Country 1<=n<=31
  for (k in 1:(dim(list_of_dfs_CommonFeatures[[1]])[2])) {  # for each feature 1<=k<=42
    # extract one timeseries (the feature+country time course)
    ts = ts(list_of_dfs_CommonFeatures[[n]][ , k],
            frequency=20, start=c(2000,1), end=c(2017,20))
    set.seed(1234)
    arimaModels_DF[[list_index]] <- auto.arima(ts)
      # pred_arimaModels_DF = forecast(arimaModels_DF[[list_index]])
      # ts_pred_arimaModels_DF <- 
      #  ts(pred_arimaModels_DF$mean, frequency=20, start=c(2015,1), end=c(2017,20))
      # ts_pred_arimaModels_DF
    arimaModels_ARMA_coefs[[list_index]] <- c (
        mean(ts),                           # time-series average (retrospective) 
        mean(forecast(arimaModels_DF[[list_index]])$mean), # forecasted TS average (prospective)
        arimaModels_DF[[list_index]]$arma)  # 7 ARMA estimated parameters
    cat("arimaModels_ARMA_coefs[country=", countryNames[n], ", feature=",
             colnames(list_of_dfs_CommonFeatures[[1]])[k],
             "] Derived-Features=(", round(arimaModels_ARMA_coefs[[list_index]], 2), ") ...")
    #print(paste0("arimaModels_DF[country=", countryNames[i], ", feature=",
    #         colnames(list_of_dfs_CommonFeatures[[1]])[k],
    #         "]$arma =", arimaModels_DF[[list_index]]$arma))
    list_index <- list_index + 1
  }
}

length(arimaModels_ARMA_coefs) == 31*42  # [1] TRUE  # Each list-element consists of 9 values, see above
#  == length(list_of_dfs_CommonFeatures) * dim(list_of_dfs_CommonFeatures[[1]])[2]
# Maps to convert between 1D indices and 2D (Country, Feature) pairs
index2CountryFeature <- function(indx=1) { 
  if (indx<1 | indx>length(arimaModels_ARMA_coefs)) {
    cat("Index out of bounds: indx=", indx, "; must be between 1 and ", 
        length(arimaModels_ARMA_coefs), " ... Exiting ...")
    return (NULL)
  } else { 
    feature = (indx-1) %% (dim(list_of_dfs_CommonFeatures[[1]])[2])
    country = floor((indx - feature)/(dim(list_of_dfs_CommonFeatures[[1]])[2]))
    return(list("feature"=(feature+1), "country"=(country+1)))  }
}

countryFeature2Index <- function(countryIndx=1, featureIndx=1) { 
  if (countryIndx<1 | countryIndx>length(list_of_dfs_CommonFeatures) |
      featureIndx<1 | featureIndx>(dim(list_of_dfs_CommonFeatures[[1]])[2])) {
    cat("Indices out of bounds: countryIndx=", countryIndx, "; featureIndx=", featureIndx, "; Exiting ...")
    return (NULL)
  } else { return (featureIndx + (countryIndx-1)*(dim(list_of_dfs_CommonFeatures[[1]])[2])) }
}

# test forward and reverse index mapping functions
index2CountryFeature(42);  index2CountryFeature(45)$country;  index2CountryFeature(45)$feature
countryFeature2Index(countryIndx=2, featureIndx=3)
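# Mapping arithmetic (K = 42 common features): index = (country - 1)*K + feature.
# Worked example: (country=2, feature=3) -> (2-1)*42 + 3 = 45, and index 45 maps back to (country=2, feature=3).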

# Column/Feature Names: colnames(list_of_dfs_CommonFeatures[[1]])
# Country/Row Names: countryNames
arimaModels_ARMA_coefs[[1]] # Austria/Feature1  1:9 feature vector
# Country 2 = Bulgaria, feature 5, 1:9 vector
arimaModels_ARMA_coefs[[countryFeature2Index(countryIndx=2, featureIndx=5)]]

Convert the list of ARIMA models to a data frame [Countries × megaFeatures] that can be fed into ML data analytics. Augment the features using the EU_SOCR_Country_Ranking_Data_2011 dataset.

# 4. Add the country ranking as a new feature, using the OA ranks here:
# (http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_2008_World_CountriesRankings)
EU_SOCR_Country_Ranking_Data_2011 <- read.csv2("E:/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_SOCR_Country_Ranking_Data_2011.csv", header=T, sep=",")

length(arimaModels_ARMA_coefs) # 31*42 
arima_df <- data.frame(matrix(NA, 
          nrow=length(countryNames), ncol=length(colnames(list_of_dfs_CommonFeatures[[1]]))*9))
dim(arima_df)   # [1]  31 378

for(n in 1:dim(arima_df)[1]) {        # for each Country 1<=n<=31
  for (k in 1:length(colnames(list_of_dfs_CommonFeatures[[1]]))) {     # for each feature 1<=k<=42
    for (l in 1:9) {                  # for each arima vector 1:9, see above
        arima_df[n, (k-1)*9 + l] <- 
          round(arimaModels_ARMA_coefs[[countryFeature2Index(countryIndx=n, featureIndx=k)]][l], 1)
        if (n==dim(arima_df)[1]) colnames(arima_df)[(k-1)*9 + l] <- 
            print(paste0("Feature_",k, "_ArimaVec_",l))
    }
  }
}

# DF Conversion Validation (arima_df stores values rounded to 1 decimal)
round(arimaModels_ARMA_coefs[[countryFeature2Index(countryIndx=3, featureIndx=5)]][2], 1) ==
  arima_df[3, (5-1)*9 + 2]
# [1] TRUE   (underlying value: 1802.956)

# Aggregate 2 datasets
dim(EU_SOCR_Country_Ranking_Data_2011)  # [1] 31 10

aggregate_arima_vector_country_ranking_df <- 
  as.data.frame(cbind(arima_df, EU_SOCR_Country_Ranking_Data_2011[ , -1]))
dim(aggregate_arima_vector_country_ranking_df)   # [1]  Country=31 * Features=387 (ARIMA=378 + Ranking=9)
# View(aggregate_arima_vector_country_ranking_df)
rownames(aggregate_arima_vector_country_ranking_df) <- countryNames
write.csv(aggregate_arima_vector_country_ranking_df, row.names = T, fileEncoding = "UTF-16LE",
           "E:/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_aggregate_arima_vector_country_ranking.csv")

2 Regularized Linear Modeling

2.1 Stronger Signal Analytics

Using all 386 features (378 ARIMA signatures + 8 meta-data).

Use Model-based and Model-free methods to predict the overall (OA) country ranking.
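For reference, glmnet fits the penalized elastic-net objective

\[ \min_{\beta_0,\,\beta}\; \frac{1}{2N}\sum_{i=1}^{N}\left(y_i-\beta_0-x_i^{\top}\beta\right)^2 \;+\; \lambda\left(\alpha\,\|\beta\|_1+\frac{1-\alpha}{2}\,\|\beta\|_2^2\right), \]

where \(\alpha=1\) yields the LASSO penalty and \(\alpha=0\) the ridge penalty used below.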

# 1. LASSO regression/feature extraction

# subset test data
Y = aggregate_arima_vector_country_ranking_df$OA
X = aggregate_arima_vector_country_ranking_df[ , -387]
# remove columns containing NAs
X = X[ , colSums(is.na(X)) == 0]; dim(X)  # [1]  31 386

fitRidge = glmnet(as.matrix(X), Y, alpha = 0)  # Ridge Regression
fitLASSO = glmnet(as.matrix(X), Y, alpha = 1)  # The LASSO
# LASSO
plot(fitLASSO, xvar="lambda", label="TRUE", lwd=3)
# add label to upper x-axis
mtext("LASSO regularizer: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Ridge
plot(fitRidge, xvar="lambda", label="TRUE", lwd=3)
# add label to upper x-axis
mtext("Ridge regularizer: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

#### 10-fold cross validation ####
# LASSO
library(doParallel)
registerDoParallel(6)
set.seed(1234)  # set seed 
# (10-fold) cross validation for the LASSO
cvLASSO = cv.glmnet(data.matrix(X), Y, alpha = 1, parallel=TRUE)
cvRidge = cv.glmnet(data.matrix(X), Y, alpha = 0, parallel=TRUE)
plot(cvLASSO)
mtext("CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO <- predict(cvLASSO, s = cvLASSO$lambda.min, newx = data.matrix(X))
predRidge <- predict(fitRidge, s = cvRidge$lambda.min, newx = data.matrix(X))

# calculate the (in-sample) MSE for each model
testMSE_LASSO <- mean((predLASSO - Y)^2); testMSE_LASSO
testMSE_Ridge <- mean((predRidge - Y)^2); testMSE_Ridge
##################################Use only ARIMA effects, no SOCR meta-data#####
set.seed(4321)
cvLASSO_lim = cv.glmnet(data.matrix(X[ , 1:(42*9)]), Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_lim)
mtext("CV LASSO (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_lim <-  predict(cvLASSO_lim, s = 3, # cvLASSO_lim$lambda.min, 
                          newx = data.matrix(X[ , 1:(42*9)]))
coefList_lim <- coef(cvLASSO_lim, s=3) # 'lambda.min')
coefList_lim <- data.frame(coefList_lim@Dimnames[[1]][coefList_lim@i+1],coefList_lim@x)
names(coefList_lim) <- c('Feature','EffectSize')
arrange(coefList_lim, -abs(EffectSize))[2:10, ]
cor(Y, predLASSO_lim[, 1])  # 0.84
################################################################################

# Plot Regression Coefficients: create variable names for plotting 

# par(mar=c(2, 13, 1, 1))   # extra large left margin # par(mar=c(5,5,5,5))
varNames <- colnames(X); varNames; length(varNames)

betaHatLASSO = as.double(coef(fitLASSO, s = cvLASSO$lambda.min))  # cvLASSO$lambda.1se
betaHatRidge = as.double(coef(fitRidge, s = cvRidge$lambda.min))

library(arm)   # provides coefplot()
#coefplot(betaHatLASSO[2:386], sd = rep(0, 385), pch=0, cex.pts = 3, main = "LASSO-Regularized Regression Coefficient Estimates", varnames = varNames)
coefplot(betaHatLASSO[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", main = "LASSO-Regularized Regression Coefficient Estimates", varnames = varNames[377:386])
coefplot(betaHatRidge[377:386], sd = rep(0, 10), pch=2, add = TRUE, col.pts = "blue", cex.pts = 3)
legend("bottomleft", c("LASSO", "Ridge"), col = c("red", "blue"), pch = c(1 , 2), bty = "o", cex = 2)

varImp <- function(object, lambda = NULL, ...) {
  ## skipping a few lines
  beta <- predict(object, s = lambda, type = "coef")
  if(is.list(beta)) {
    out <- do.call("cbind", lapply(beta, function(x) x[,1]))
    out <- as.data.frame(out)
  } else out <- data.frame(Overall = beta[,1])
  out <- abs(out[rownames(out) != "(Intercept)",,drop = FALSE])
  out
}

varImp(cvLASSO, lambda = cvLASSO$lambda.min)

coefList <- coef(cvLASSO, s='lambda.min')
coefList <- data.frame(coefList@Dimnames[[1]][coefList@i+1],coefList@x)
names(coefList) <- c('Feature','EffectSize')
arrange(coefList, -abs(EffectSize))[2:10, ]
#                      var        val  # Feature names: colnames(list_of_dfs_CommonFeatures[[1]])
#1            (Intercept) 49.4896874
#2   Feature_1_ArimaVec_8 -2.4050811   # Feature 1 = Active population: Females 15 to 64 years
#3  Feature_20_ArimaVec_8 -1.4015001   # Feature 20= "Employment: Females 15 to 64 years
#4            IncomeGroup -1.2271177
#5   Feature_9_ArimaVec_8 -1.0629835   # Feature 9= Active population: Total 15 to 64 years
#6                     ED -0.7481041
#7                     PE -0.5167668
#8  Feature_25_ArimaVec_5  0.4416775   # Feature 25= Property income
#9   Feature_9_ArimaVec_4 -0.2217804
#10                   QOL -0.1965342
#                                     ARIMA: 4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#
#9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

# [1] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels"                                                     
# [2] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"     
# [3] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Tertiary education (levels 5-8)"                                           
# [4] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"
# [5] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, All ISCED 2011 levels"                                                       
# [6] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"       
# [7] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Tertiary education (levels 5-8)"                                             
# [8] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"  
# [9] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, All ISCED 2011 levels"  
#[10] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" 
#[11] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Tertiary education (levels 5-8)"        
#[12] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"
#[13] "All ISCED 2011 levels " 
# [14] "All ISCED 2011 levels, Females"  
# [15] "All ISCED 2011 levels, Males"
# [16] "Capital transfers, payable"  
# [17] "Capital transfers, receivable"  
# [18] "Compensation of employees, payable"  
# [19] "Current taxes on income, wealth, etc., receivable"
#[20] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels " 
# [21] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"            
# [22] "Other current transfers, payable"
# [23] "Other current transfers, receivable" 
# [24] "Property income, payable" 
# [25] "Property income, receivable" 
# [26] "Savings, gross" 
# [27] "Subsidies, payable"
# [28] "Taxes on production and imports, receivable"
# [29] "Total general government expenditure"
# [30] "Total general government revenue"
# [31] "Unemployment , Females, From 15-64 years, Total"
# [32] "Unemployment , Males, From 15-64 years" 
# [33] "Unemployment , Males, From 15-64 years, from 1 to 2 months" 
# [34] "Unemployment , Males, From 15-64 years, from 3 to 5 months"
# [35] "Unemployment , Males, From 15-64 years, from 6 to 11 months"  
# [36] "Unemployment , Total, From 15-64 years, From 1 to 2 months"
# [37] "Unemployment , Total, From 15-64 years, From 12 to 17 months"
# [38] "Unemployment , Total, From 15-64 years, From 3 to 5 months" 
# [39] "Unemployment , Total, From 15-64 years, From 6 to 11 months"  
# [40] "Unemployment , Total, From 15-64 years, Less than 1 month" 
# [41] "Unemployment by sex, age, duration. DurationNA not started"
# [42] "VAT, receivable"   

coef(cvLASSO, s = "lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

validation <- data.frame(matrix(NA, nrow = dim(predLASSO)[1], ncol=3), row.names=countryNames)
validation [ , 1] <- Y; validation [ , 2] <- predLASSO_lim[, 1]; validation [ , 3] <- predRidge[, 1]
colnames(validation) <- c("Y", "LASSO", "Ridge")
dim(validation)
head(validation)

# Prediction correlations: 
cor(validation[ , 1], validation[, 2])  # Y=observed OA rank vs. LASSO-pred  0.96 (lim) 0.84
cor(validation[ , 1], validation[, 3])  # Y=observed OA rank vs. Ridge-pred  0.95

# Plot observed Y (Overall Country ranking) vs. LASSO (9-parameter) predicted Y^
linFit1 <- lm(validation[ , 1] ~ predLASSO)
plot(validation[ , 1] ~ predLASSO,
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="LASSO 9/(42*9) param model",
     main = sprintf("Observed (X) vs. LASSO-Predicted (Y) Overall Country Ranking, cor=%.02f", 
                    cor(validation[ , 1], validation[, 2])))
abline(linFit1, lwd=3, col="red")

# Plot LASSO (9-parameter) predicted Y^ vs. observed Y (Overall Country ranking)
linFit1 <- lm(predLASSO_lim ~ validation[ , 1])
plot(predLASSO_lim ~ validation[ , 1],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="LASSO 9/(42*9) param model",
     main = sprintf("Observed (X) vs. LASSO-Predicted (Y) Overall Country Ranking, cor=%.02f", 
                    cor(validation[ , 1], validation[, 2])))
abline(linFit1, lwd=3, col="red")

Next, transform all predictors (X) and the outcome (Y) to k-space (the Fourier domain) and examine how kime-phase manipulations affect the prediction of the overall (OA) country ranking.

# Generic function to Transform Data ={all predictors (X) and outcome (Y)} to k-space (Fourier domain): kSpaceTransform(data, inverse = FALSE, reconPhases = NULL) 
  # ForwardFT (rawData, FALSE, NULL)
  # InverseFT(magnitudes, TRUE, reconPhasesToUse) or InverseFT(FT_data, TRUE, NULL)
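# For orientation, a minimal sketch (an assumption, not the actual implementation) of what
# such a k-space utility may look like: a column-wise FFT returning magnitudes and phases,
# and an inverse synthesis recombining magnitudes with (possibly substituted) phases.
# The kSpaceTransform() used below is defined elsewhere in the TCIU materials.
kSpaceTransform_sketch <- function(data, inverse = FALSE, reconPhases = NULL) {
  data <- as.matrix(data)
  if (!inverse) {                      # forward FT: per-column magnitudes and phases
    FT <- apply(data, 2, fft)
    return(list(magnitudes = Mod(FT), phases = Arg(FT)))
  }
  # inverse FT: here 'data' holds magnitudes and 'reconPhases' the phases to synthesize with
  recon <- sapply(seq_len(ncol(data)), function(j)
    Re(fft(data[ , j] * exp(1i*reconPhases[ , j]), inverse = TRUE)) / nrow(data))
  colnames(recon) <- colnames(data)
  recon
}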

# DATA
# subset test data
# Y = aggregate_arima_vector_country_ranking_df$OA
# X = aggregate_arima_vector_country_ranking_df[ , -387]
# remove columns containing NAs
# X = X[ , colSums(is.na(X)) == 0]; dim(X)  # [1]  31 386
length(Y); dim(X)

FT_aggregate_arima_vector_country_ranking_df <- 
  kSpaceTransform(aggregate_arima_vector_country_ranking_df, inverse = FALSE, reconPhases = NULL) 
  
## Kime-Phase Distributions
# Examine the kime-direction distributions of the phases for all features (predictors + outcome)
# using a generic function that plots the phase distributions:
# plotPhaseDistributions(dataFT, dataColnames)
plotPhaseDistributions(FT_aggregate_arima_vector_country_ranking_df,
                       colnames(aggregate_arima_vector_country_ranking_df), size=4, cex=0.1)
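# A minimal sketch (assumption) of such a phase-distribution plotting utility, built on
# ggplot2 density facets (ggplot2 assumed loaded); the actual plotPhaseDistributions()
# used above is defined elsewhere in the TCIU materials.
plotPhaseDistributions_sketch <- function(dataFT, dataColnames, size = 4, cex = 0.1) {
  df <- data.frame(phase   = as.vector(dataFT$phases),              # column-major phase values
                   feature = rep(dataColnames, each = nrow(dataFT$phases)))
  ggplot(df, aes(x = phase)) +
    geom_density() +                                                # phase density per feature
    facet_wrap(~ feature, scales = "free_y") +
    theme(strip.text = element_text(size = size),                   # facet-title size
          axis.text  = element_text(size = 10*cex))                 # axis-label size
}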

IFT_FT_aggregate_arima_vector_country_ranking_df <- 
  kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                  TRUE, FT_aggregate_arima_vector_country_ranking_df$phases)
# Check IFT(FT) == I: 
# ifelse(aggregate_arima_vector_country_ranking_df[5,4] -
#     Re(IFT_FT_aggregate_arima_vector_country_ranking_df[5,4]) < 0.001, "Perfect Synthesis", "Problems!!!")

##############################################
# Nil-Phase Synthesis and LASSO model estimation
# 1. Nil-Phase data synthesis (reconstruction)
temp_Data <- aggregate_arima_vector_country_ranking_df
nilPhase_FT_aggregate_arima_vector <- 
  array(complex(real=0, imaginary=0), c(dim(temp_Data)[1], dim(temp_Data)[2]))
dim(nilPhase_FT_aggregate_arima_vector)    # ;  head(nilPhase_FT_aggregate_arima_vector)
# [1] 31 387
IFT_NilPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

#       Invert back to spacetime the 
# FT_aggregate_arima_vector_country_ranking_df$magnitudes[ , i] signal with nil-phase
IFT_NilPhase_FT_aggregate_arima_vector <- 
  Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                     TRUE, nilPhase_FT_aggregate_arima_vector))

colnames(IFT_NilPhase_FT_aggregate_arima_vector) <-
  colnames(aggregate_arima_vector_country_ranking_df)
rownames(IFT_NilPhase_FT_aggregate_arima_vector) <-
  rownames(aggregate_arima_vector_country_ranking_df)
dim(IFT_NilPhase_FT_aggregate_arima_vector)
dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes)
colnames(IFT_NilPhase_FT_aggregate_arima_vector)
# IFT_NilPhase_FT_aggregate_arima_vector[1:5, 1:4];  temp_Data[1:5, 1:4]

# 2. Perform LASSO modeling on IFT_NilPhase_FT_aggregate_arima_vector; 
# report param estimates and quality metrics AIC/BIC
# library(forecast)
set.seed(54321)
cvLASSO_kime = cv.glmnet(data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , -387]), 
           # IFT_NilPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE)
           Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_kime)
mtext("(Spacekime, Nil-phase) CV LASSO: Number of Nonzero (Active) Coefficients", 
      side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_kime <-  predict(cvLASSO_kime, s = cvLASSO_kime$lambda.min, 
        newx = data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , -387])); predLASSO_kime
# testMSE_LASSO_kime <- mean((predLASSO_kime - IFT_NilPhase_FT_aggregate_arima_vector[ , 387])^2)
# testMSE_LASSO_kime
predLASSO_kime = predict(cvLASSO_kime, s = exp(1/3), # cvLASSO_kime$lambda.min, 
        newx = data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , -387])); predLASSO_kime

##################################Use only ARIMA effects, no SOCR meta-data#####
set.seed(12345)
cvLASSO_kime_lim = cv.glmnet(data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:(42*9)]), 
                        Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_kime_lim)
mtext("CV LASSO Nil-Phase (using only Timeseries data): Number of Nonzero (Active) Coefficients",
      side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_kime_lim <-  predict(cvLASSO_kime_lim, s = 1, 
                          newx = data.matrix(X[ , 1:(42*9)]))
coefList_kime_lim <- coef(cvLASSO_kime_lim, s=1)
coefList_kime_lim <- data.frame(coefList_kime_lim@Dimnames[[1]][coefList_kime_lim@i+1],coefList_kime_lim@x)
names(coefList_kime_lim) <- c('Feature','EffectSize')
arrange(coefList_kime_lim, -abs(EffectSize))[2:10, ]
cor(Y, predLASSO_kime_lim[, 1])  # 0.1142824
################################################################################

# Plot Regression Coefficients: create variable names for plotting 
library("arm")
# par(mar=c(2, 13, 1, 1))   # extra large left margin # par(mar=c(5,5,5,5))
# varNames <- colnames(X); varNames; length(varNames)

betaHatLASSO_kime = as.double(coef(cvLASSO_kime, s=cvLASSO_kime$lambda.min))
#cvLASSO_kime$lambda.1se

coefplot(betaHatLASSO_kime[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", 
         main = "(Spacekime) LASSO-Regularized Regression Coefficient Estimates",
         varnames = varNames[377:386])

varImp(cvLASSO_kime, lambda = cvLASSO_kime$lambda.min)

coefList_kime <- coef(cvLASSO_kime, s=1)  # 'lambda.min')
coefList_kime <- data.frame(coefList_kime@Dimnames[[1]][coefList_kime@i+1], coefList_kime@x)
names(coefList_kime) <- c('Feature','EffectSize')
arrange(coefList_kime, -abs(EffectSize))[1:9, ]
#                 Feature  EffectSize
#1           (Intercept) 26.069326257
#2 Feature_12_ArimaVec_8 -8.662856430
#3 Feature_11_ArimaVec_4  8.585283751
#4 Feature_12_ArimaVec_4 -5.023601842
#5 Feature_30_ArimaVec_4  2.242157842
#6 Feature_26_ArimaVec_6  1.760267216
#7 Feature_39_ArimaVec_5 -1.256101950
#8 Feature_34_ArimaVec_5 -1.148865337
#9 Feature_37_ArimaVec_2  0.001322367
#     ARIMA-spacetime:     4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#     ARIMA-spacekimeNil:  2=forecast_avg, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 8=non-seasonal Diff
#9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

coef(cvLASSO_kime, s = 1/5) %>%  ### "lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("(Spacekime) Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

# pack a 31*3 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, Y)
validation_kime <- cbind(predLASSO_kime[, 1], 
                         IFT_NilPhase_FT_aggregate_arima_vector[ , 387], Y)  
colnames(validation_kime) <- c("predLASSO_kime", "IFT_NilPhase_FT_Y", "Orig_Y")
head(validation_kime)

# Prediction correlations: 
cor(validation_kime[ , 1], validation_kime[, 2])  # predLASSO_kime vs. IFT_NilPhase_FT_Y:  0.99
cor(validation_kime[ , 1], validation_kime[, 3])  # predLASSO_kime vs. observed Orig_Y:  0.64

# Plot observed Y (Overall Country ranking) vs. Nil-Phase LASSO predicted Y^
linFit1_kime <- lm(predLASSO_kime ~ validation_kime[ , 3])
plot(predLASSO_kime ~ validation_kime[ , 3],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="IFT_NilPhase predLASSO_kime",
     main = sprintf("Observed (x) vs. IFT_NilPhase Predicted (y) Overall Country Ranking, cor=%.02f", 
                    cor(validation_kime[ , 1], validation_kime[, 3])))
abline(linFit1_kime, lwd=3, col="red")
# abline(linFit1, lwd=3, col="green")


##############################################
# 3. Swap Feature Phases and then synthesize the data (reconstruction)
# temp_Data <- aggregate_arima_vector_country_ranking_df
swappedPhase_FT_aggregate_arima_vector <- FT_aggregate_arima_vector_country_ranking_df$phases
dim(swappedPhase_FT_aggregate_arima_vector)    # ;  head(swappedPhase_FT_aggregate_arima_vector)
# [1] 31 387
IFT_SwappedPhase_FT_aggregate_arima_vector <- array(complex(), 
                          c(dim(temp_Data)[1], dim(temp_Data)[2]))

set.seed(12345)   # randomly permute the phase-columns of the 386 covariates (X)
swappedPhase_FT_aggregate_arima_vector1 <- as.data.frame(cbind(
  swappedPhase_FT_aggregate_arima_vector[ , 
          sample(378)],          # mix the ARIMA-signature phases (columns 1:378)
  swappedPhase_FT_aggregate_arima_vector[ , 
          378 + sample(8)],      # mix the meta-data phases (columns 379:386)
  swappedPhase_FT_aggregate_arima_vector[ , 387]))   # keep the correct Outcome phase
swappedPhase_FT_aggregate_arima_vector <- swappedPhase_FT_aggregate_arima_vector1

colnames(swappedPhase_FT_aggregate_arima_vector) <- colnames(temp_Data)
colnames(swappedPhase_FT_aggregate_arima_vector); dim(swappedPhase_FT_aggregate_arima_vector)
# 31 387

#       Invert back to spacetime the 
# FT_aggregate_arima_vector$magnitudes[ , i] signal with swapped-X-phases (Y-phase is fixed)
IFT_SwappedPhase_FT_aggregate_arima_vector <- 
  Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                     TRUE, swappedPhase_FT_aggregate_arima_vector))

colnames(IFT_SwappedPhase_FT_aggregate_arima_vector) <-
  colnames(aggregate_arima_vector_country_ranking_df)
rownames(IFT_SwappedPhase_FT_aggregate_arima_vector) <-
  rownames(aggregate_arima_vector_country_ranking_df)
dim(IFT_SwappedPhase_FT_aggregate_arima_vector)
dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes)
colnames(IFT_SwappedPhase_FT_aggregate_arima_vector)
# IFT_SwappedPhase_FT_aggregate_arima_vector[1:5, 1:4];  temp_Data[1:5, 1:4]

# 4. Perform LASSO modeling on IFT_SwappedPhase_FT_aggregate_arima_vector; 
# report param estimates and quality metrics AIC/BIC
set.seed(12345)
cvLASSO_kime_swapped = 
  cv.glmnet(data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , -387]), 
           # IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE)
           Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_kime_swapped)
mtext("(Spacekime, Swapped-Phases) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

##################################Use only ARIMA effects, no SOCR meta-data#####
set.seed(12345)
cvLASSO_kime_swapped_lim = cv.glmnet(data.matrix(
  IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:(42*9)]), Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_kime_swapped_lim)
mtext("CV LASSO Swapped-Phase (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_kime_swapped_lim <-  predict(cvLASSO_kime_swapped_lim, 
              s = cvLASSO_kime_swapped_lim$lambda.min, 
              newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:(42*9)]))
coefList_kime_swapped_lim <- coef(cvLASSO_kime_swapped_lim, s='lambda.min')
coefList_kime_swapped_lim <- data.frame(coefList_kime_swapped_lim@Dimnames[[1]][coefList_kime_swapped_lim@i+1],coefList_kime_swapped_lim@x)
names(coefList_kime_swapped_lim) <- c('Feature','EffectSize')
arrange(coefList_kime_swapped_lim, -abs(EffectSize))[2:10, ]
cor(Y, predLASSO_kime_swapped_lim[, 1])  # 0.86
################################################################################


# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_kime_swapped <-  predict(cvLASSO_kime_swapped, s = cvLASSO_kime_swapped$lambda.min, 
        newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , -387]))
# testMSE_LASSO_kime_swapped <- 
#        mean((predLASSO_kime_swapped - IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387])^2)
# testMSE_LASSO_kime_swapped
predLASSO_kime_swapped = predict(cvLASSO_kime_swapped, s = 3, 
        newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , -387]))
predLASSO_kime_swapped

# Plot Regression Coefficients: create variable names for plotting 
betaHatLASSO_kime_swapped = as.double(coef(cvLASSO_kime_swapped,
                                           s=cvLASSO_kime_swapped$lambda.min))
#cvLASSO_kime_swapped$lambda.1se

coefplot(betaHatLASSO_kime_swapped[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", 
         main = "(Spacekime, Swapped-Phases) LASSO-Regularized Regression Coefficient Estimates",
         varnames = varNames[377:386])

varImp(cvLASSO_kime_swapped, lambda = cvLASSO_kime_swapped$lambda.min)

coefList_kime_swapped <- coef(cvLASSO_kime_swapped, s=3)    # 'lambda.min')
coefList_kime_swapped <- data.frame(coefList_kime_swapped@Dimnames[[1]][coefList_kime_swapped@i+1], coefList_kime_swapped@x)
names(coefList_kime_swapped) <- c('Feature','EffectSize')
arrange(coefList_kime_swapped, -abs(EffectSize))[2:10, ]
#                 Feature  EffectSize
#2  Feature_32_ArimaVec_6  3.3820076
#3   Feature_1_ArimaVec_3  2.2133139
#4  Feature_21_ArimaVec_4  1.5376447
#5  Feature_22_ArimaVec_3  1.0546605
#6  Feature_14_ArimaVec_5  0.7428693
#7                     ED  0.6525794
#8  Feature_24_ArimaVec_5  0.5987113
#9  Feature_12_ArimaVec_5  0.3177650
#10 Feature_37_ArimaVec_6  0.1598574
#
#     ARIMA-spacetime:        4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#     ARIMA-spacekime_nill:   3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA
#    ARIMA-spacekime_swapped: 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA
# 9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

coef(cvLASSO_kime_swapped, s = 3) %>%  # "lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("(Spacekime, Swapped-Phases) Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

# pack a 31*4 DF with (predLASSO_spacetime, predLASSO_IFT_NilPhase, 
#                      predLASSO_IFT_SwappedPhase, Orig_Y)
validation_kime_swapped <- cbind(predLASSO_lim[, 1], 
                         predLASSO_kime[ , 1], predLASSO_kime_swapped[ , 1], Y)  
colnames(validation_kime_swapped) <- c("predLASSO (spacetime)", "predLASSO_IFT_NilPhase",
                               "predLASSO_IFT_SwappedPhase", "Orig_Y")
head(validation_kime_swapped); dim(validation_kime_swapped)

# Prediction correlations: 
cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])  
# predLASSO_IFT_SwappedPhase vs. observed Orig_Y OA rank:  0.7
cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 3])  
# predLASSO (spacetime) vs.  predLASSO_IFT_SwappedPhase OA rank:  0.83

# Plot observed Y (Overall Country ranking) vs. spacetime LASSO predicted Y^
linFit1_kime_swapped <- lm(validation_kime_swapped[ , 4] ~ predLASSO)
plot(validation_kime_swapped[ , 4] ~ predLASSO,
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Spacetime LASSO predicted Country Overall Ranking", 
     ylab="Observed Country Overall Ranking (Orig_Y)",
     main = sprintf("Spacetime Predicted (x) vs. Observed (y) Overall Country Ranking, cor=%.02f", 
                    cor(predLASSO[ , 1], validation_kime_swapped[, 4])))
abline(linFit1_kime_swapped, lwd=3, col="red")
#abline(linFit1_kime, lwd=3, col="green")

# Plot Spacetime LASSO forecasting
# Plot observed Y (Overall Counry ranking) vs. LASSO (9-parameters) predicted Y^
linFit1_spacetime <- lm(validation_kime_swapped[ , 1] ~ validation_kime_swapped[ , 4])
plot(validation_kime_swapped[ , 1] ~ validation_kime_swapped[ , 4],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="predLASSO_spacetime",
     main = sprintf("Spacetime Predicted (y) vs. Obaserved (x) Overall Country Ranking, cor=%.02f", 
                    cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 4])))
abline(linFit1_spacetime, lwd=3, col="red")

# test using the swapped-phases LASSO estimates
linFit1_spacekime <- lm(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4])
plot(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="predLASSO_spacekime Swapped-Phases",
     main = sprintf("Spacekime Predicted, Swapped-Phases (y) vs. Obaserved (x) Overall Country Ranking, cor=%.02f", 
                    cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])))
abline(linFit1_spacekime, lwd=3, col="red")

# add Top_30_Ranking_Indicator
validation_kime_swapped <- as.data.frame(cbind(validation_kime_swapped, ifelse (validation_kime_swapped[,4]<=30, 1, 0)))
colnames(validation_kime_swapped)[5] <- "Top30Rank"
head(validation_kime_swapped)
# Spacetime LASSO modeling
myPlotSpacetime <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y,
            y=`predLASSO (spacetime)`, label=rownames(validation_kime_swapped))) +
  geom_smooth(method='lm') +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) +
     geom_label_repel(aes(label = rownames(validation_kime_swapped),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacetime LASSO Prediction (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 4])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacetime LASSO Rank Forecasting")


# NIL-PHASE KIME reconstruction
myPlotNilPhase <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y,
            y=predLASSO_kime, label=rownames(validation_kime_swapped))) +
  geom_smooth(method='lm') +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) +
     geom_label_repel(aes(label = rownames(validation_kime_swapped),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacekime LASSO Predicted, Nil-Phases, (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_kime_swapped[ , 2], validation_kime_swapped[, 4])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacekime LASSO Predicted, using Nil-Phases")


# SWAPPED PHASE KIME reconstruction
myPlotSwappedPhase <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y,
            y=predLASSO_kime_swapped, label=rownames(validation_kime_swapped))) +
  geom_smooth(method='lm') +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) +
     geom_label_repel(aes(label = rownames(validation_kime_swapped),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacekime LASSO Predicted, Swapped-Phases, (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacekime LASSO Predicted, using Swapped-Phases")

2.2 Weak-signal Analytics

Using only the 378 ARIMA signatures for the prediction (out of the total of 386 features).

# 1. LASSO regression/feature extraction

# subset test data
Y = aggregate_arima_vector_country_ranking_df$OA
X = aggregate_arima_vector_country_ranking_df[ , 1:378]
# remove columns containing NAs
X = X[ , colSums(is.na(X)) == 0]; dim(X)  # [1]  31 378

#### 10-fold cross validation: for the LASSO

registerDoParallel(6)

set.seed(4321)
cvLASSO_lim = cv.glmnet(data.matrix(X[ , 1:(42*9)]), Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_lim)
mtext("CV LASSO (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_lim <-  predict(cvLASSO_lim, s = 3, # cvLASSO_lim$lambda.min, 
                          newx = data.matrix(X[ , 1:(42*9)]))
coefList_lim <- coef(cvLASSO_lim, s=3) # 'lambda.min')
coefList_lim <- data.frame(coefList_lim@Dimnames[[1]][coefList_lim@i+1],coefList_lim@x)
names(coefList_lim) <- c('Feature','EffectSize')
arrange(coefList_lim, -abs(EffectSize))[2:10, ]
cor(Y, predLASSO_lim[, 1])  # 0.84
################################################################################

varImp(cvLASSO_lim, lambda = cvLASSO_lim$lambda.min)
#2   Feature_1_ArimaVec_8 -2.3864299
#3  Feature_19_ArimaVec_8  2.0871310
#4  Feature_16_ArimaVec_3  2.0465254
#5  Feature_13_ArimaVec_8 -1.7348553
#6  Feature_15_ArimaVec_4 -1.4588173
#7  Feature_22_ArimaVec_4 -1.1068801
#8  Feature_25_ArimaVec_5  0.9336800
#9  Feature_35_ArimaVec_4 -0.9276244
#10 Feature_25_ArimaVec_4 -0.8486434

#coefList_lim <- coef(cvLASSO_lim, s='lambda.min')
#coefList_lim <- data.frame(coefList_lim@Dimnames[[1]][coefList_lim@i+1], coefList_lim@x)
#names(coefList_lim) <- c('Feature','EffectSize')
#arrange(coefList_lim, -abs(EffectSize))[2:10, ]
#
#      9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

# [1] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels"                                                     
# [2] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"     
# [3] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Tertiary education (levels 5-8)"                                           
# [4] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"
# [5] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, All ISCED 2011 levels"                                                       
# [6] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"       
# [7] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Tertiary education (levels 5-8)"                                             
# [8] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"  
# [9] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, All ISCED 2011 levels"  
#[10] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" 
#[11] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Tertiary education (levels 5-8)"        
#[12] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"
#[13] "All ISCED 2011 levels " 
# [14] "All ISCED 2011 levels, Females"  
# [15] "All ISCED 2011 levels, Males"
# [16] "Capital transfers, payable"  
# [17] "Capital transfers, receivable"  
# [18] "Compensation of employees, payable"  
# [19] "Current taxes on income, wealth, etc., receivable"
#[20] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels " 
# [21] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"            
# [22] "Other current transfers, payable"
# [23] "Other current transfers, receivable" 
# [24] "Property income, payable" 
# [25] "Property income, receivable" 
# [26] "Savings, gross" 
# [27] "Subsidies, payable"
# [28] "Taxes on production and imports, receivable"
# [29] "Total general government expenditure"
# [30] "Total general government revenue"
# [31] "Unemployment , Females, From 15-64 years, Total"
# [32] "Unemployment , Males, From 15-64 years" 
# [33] "Unemployment , Males, From 15-64 years, from 1 to 2 months" 
# [34] "Unemployment , Males, From 15-64 years, from 3 to 5 months"
# [35] "Unemployment , Males, From 15-64 years, from 6 to 11 months"  
# [36] "Unemployment , Total, From 15-64 years, From 1 to 2 months"
# [37] "Unemployment , Total, From 15-64 years, From 12 to 17 months"
# [38] "Unemployment , Total, From 15-64 years, From 3 to 5 months" 
# [39] "Unemployment , Total, From 15-64 years, From 6 to 11 months"  
# [40] "Unemployment , Total, From 15-64 years, Less than 1 month" 
# [41] "Unemployment by sex, age, duration. DurationNA not started"
# [42] "VAT, receivable"   

coef(cvLASSO_lim, s = 3) %>%  # "lambda.min"
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

validation_lim <- data.frame(matrix(NA, nrow = dim(predLASSO_lim)[1], ncol=2), row.names=countryNames)
validation_lim [ , 1] <- Y; validation_lim[ , 2] <- predLASSO_lim[, 1]
colnames(validation_lim) <- c("Orig_Y", "LASSO")
dim(validation_lim); head(validation_lim)

# add Top_30_Ranking_Indicator
validation_lim <- as.data.frame(cbind(validation_lim, ifelse (validation_lim[, 1]<=30, 1, 0)))
colnames(validation_lim)[3] <- "Top30Rank"
head(validation_lim)

# Prediction correlations: 
cor(validation_lim[ , 1], validation_lim[, 2])  # Y=observed OA rank vs. LASSO-pred  0.98 (lim) 0.84

# Plot observed Y (Overall Country ranking) vs. LASSO (9-parameter) predicted Y^
linFit_lim <- lm(validation_lim[ , 1] ~ validation_lim[, 2])
plot(validation_lim[ , 1] ~ validation_lim[, 2],
     col="blue", xaxt='n', yaxt='n', pch = 16, cex=3,
     xlab="Observed Country Overall Ranking", ylab="LASSO (42*9 ARIMA-signature) model",
     main = sprintf("Observed (X) vs. LASSO-Predicted (Y) Overall Country Ranking, cor=%.02f", 
                    cor(validation_lim[ , 1], validation_lim[, 2])))
abline(linFit_lim, lwd=3, col="red")

# Plot
myPlot <- ggplot(as.data.frame(validation_lim), aes(x=validation_lim[ , 1],
            y=validation_lim[ , 2], label=rownames(validation_lim))) +
     geom_smooth(method='lm') +
     geom_point() +
     # Color by groups
     # geom_text(aes(color=factor(rownames(validation_lim)))) +
     geom_label_repel(aes(label = rownames(validation_lim),
                    fill = factor(Top30Rank)), color = 'black', size = 5,  
                    point.padding = unit(0.3, "lines")) +
     # theme(legend.position = "bottom") +
     theme(legend.position = c(0.1, 0.9), 
           legend.text = element_text(colour="black", size=12, face="bold"),
           legend.title = element_text(colour="black", size=14, face="bold")) +
     scale_fill_discrete(name = "Country Overall Ranking", 
                         labels = c("Below 30 Rank", "Top 30 Rank")) +
     labs(title=sprintf("Spacetime LASSO Predicted (y) vs. Observed (x) Overall Country Ranking, cor=%.02f",  cor(validation_lim[ , 1], validation_lim[, 2])),
          x ="Observed Overall Country Ranking (1 is 'best')", 
          y = "Spacetime LASSO Predicted")

Nil-Phase Synthesis and LASSO model estimation …

IFT_FT_aggregate_arima_vector_country_ranking_df <- 
  kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                  TRUE, FT_aggregate_arima_vector_country_ranking_df$phases)
# Check IFT(FT) == I: 
# ifelse(aggregate_arima_vector_country_ranking_df[5,4] -
#     Re(IFT_FT_aggregate_arima_vector_country_ranking_df[5,4]) < 0.001, "Perfect Synthesis", "Problems!!!")

##############################################
# Nil-Phase Synthesis and LASSO model estimation
# 1. Nil-Phase data synthesis (reconstruction)
temp_Data <- aggregate_arima_vector_country_ranking_df
nilPhase_FT_aggregate_arima_vector <- 
  array(complex(real=0, imaginary=0), c(dim(temp_Data)[1], dim(temp_Data)[2]))
dim(nilPhase_FT_aggregate_arima_vector)    # ;  head(nilPhase_FT_aggregate_arima_vector)
# [1] 31 387
IFT_NilPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

#       Invert back to spacetime the 
# FT_aggregate_arima_vector_country_ranking_df$magnitudes[ , i] signal with nil-phase
IFT_NilPhase_FT_aggregate_arima_vector <- 
  Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                     TRUE, nilPhase_FT_aggregate_arima_vector))

colnames(IFT_NilPhase_FT_aggregate_arima_vector) <-
  colnames(aggregate_arima_vector_country_ranking_df)
rownames(IFT_NilPhase_FT_aggregate_arima_vector) <-
  rownames(aggregate_arima_vector_country_ranking_df)
dim(IFT_NilPhase_FT_aggregate_arima_vector)
dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes)
colnames(IFT_NilPhase_FT_aggregate_arima_vector)
# IFT_NilPhase_FT_aggregate_arima_vector[1:5, 1:4];  temp_Data[1:5, 1:4]

# 2. Perform LASSO modeling on IFT_NilPhase_FT_aggregate_arima_vector; 
# report param estimates and quality metrics AIC/BIC
# library(forecast)
set.seed(123)
cvLASSO_nil_kime = cv.glmnet(data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:378]), 
           # IFT_NilPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE)
           Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_nil_kime)
mtext("(Spacekime, Nil-phase) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5)
# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_nil_kime <-  predict(cvLASSO_nil_kime, s = cvLASSO_nil_kime$lambda.min, 
        newx = data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:378])); predLASSO_nil_kime
# testMSE_LASSO_nil_kime <- mean((predLASSO_nil_kime - IFT_NilPhase_FT_aggregate_arima_vector[ , 387])^2)
# testMSE_LASSO_nil_kime

# Plot Regression Coefficients: create variable names for plotting 
library("arm")
# par(mar=c(2, 13, 1, 1))   # extra large left margin # par(mar=c(5,5,5,5))
# varNames <- colnames(X); varNames; length(varNames)
#betaHatLASSO_kime = as.double(coef(cvLASSO_kime, s=cvLASSO_kime$lambda.min))
#cvLASSO_kime$lambda.1se
#
#coefplot(betaHatLASSO_kime[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", 
#         main = "(Spacekime) LASSO-Regularized Regression Coefficient Estimates",
#         varnames = varNames[377:386])

varImp(cvLASSO_nil_kime, lambda = cvLASSO_nil_kime$lambda.min)

coefList_nil_kime <- coef(cvLASSO_nil_kime, s='lambda.min')
coefList_nil_kime <- data.frame(coefList_nil_kime@Dimnames[[1]][coefList_nil_kime@i+1], coefList_nil_kime@x)
names(coefList_nil_kime) <- c('Feature','EffectSize')
arrange(coefList_nil_kime, -abs(EffectSize))[1:9, ]
#                 Feature  EffectSize
#1           (Intercept) 26.385520159
#2 Feature_12_ArimaVec_8 -9.312528495
#3 Feature_11_ArimaVec_4  8.561417371
#4 Feature_12_ArimaVec_4 -5.220797416
#5 Feature_30_ArimaVec_4  2.623218791
#6 Feature_26_ArimaVec_6  1.927773213
#7 Feature_39_ArimaVec_5 -1.534741402
#8 Feature_34_ArimaVec_5 -1.171720008
#9 Feature_37_ArimaVec_2  0.004213823
#     ARIMA-spacetime:     4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#     ARIMA-spacekimeNil:  2=forecast_avg, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 8=non-seasonal Diff
#9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

coef(cvLASSO_nil_kime, s = "lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("(Spacekime) Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

Swap the feature phases and then synthesize the data (reconstruction).

# temp_Data <- aggregate_arima_vector_country_ranking_df
swappedPhase_FT_aggregate_arima_vector <- FT_aggregate_arima_vector_country_ranking_df$phases
dim(swappedPhase_FT_aggregate_arima_vector)    # ;  head(swappedPhase_FT_aggregate_arima_vector)
# [1] 31 387
IFT_SwappedPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2]))

set.seed(12345)   # randomly permute the phase-columns of the 386 covariates (X)
swappedPhase_FT_aggregate_arima_vector1 <- as.data.frame(cbind(
  swappedPhase_FT_aggregate_arima_vector[ , 
          sample(378)],          # mix the ARIMA-signature phases (columns 1:378)
  swappedPhase_FT_aggregate_arima_vector[ , 
          378 + sample(8)],      # mix the meta-data phases (columns 379:386)
  swappedPhase_FT_aggregate_arima_vector[ , 387]))   # keep the correct Outcome phase
swappedPhase_FT_aggregate_arima_vector <- swappedPhase_FT_aggregate_arima_vector1

colnames(swappedPhase_FT_aggregate_arima_vector) <- colnames(temp_Data)
colnames(swappedPhase_FT_aggregate_arima_vector); dim(swappedPhase_FT_aggregate_arima_vector)
# 31 387

#       Invert back to spacetime the 
# FT_aggregate_arima_vector$magnitudes[ , i] signal with swapped-X-phases (Y-phase is fixed)
IFT_SwappedPhase_FT_aggregate_arima_vector <- 
  Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, 
                     TRUE, swappedPhase_FT_aggregate_arima_vector))

colnames(IFT_SwappedPhase_FT_aggregate_arima_vector) <-
  colnames(aggregate_arima_vector_country_ranking_df)
rownames(IFT_SwappedPhase_FT_aggregate_arima_vector) <-
  rownames(aggregate_arima_vector_country_ranking_df)
dim(IFT_SwappedPhase_FT_aggregate_arima_vector)
dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes)
colnames(IFT_SwappedPhase_FT_aggregate_arima_vector)
# IFT_SwappedPhase_FT_aggregate_arima_vector[1:5, 1:4];  temp_Data[1:5, 1:4]

# 2. Perform LASSO modeling on IFT_SwappedPhase_FT_aggregate_arima_vector; 
# report param estimates and quality metrics AIC/BIC
set.seed(12)
cvLASSO_kime_swapped = 
  cv.glmnet(data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:378]), 
           # IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE)
           Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_kime_swapped)
mtext("(Spacekime, Swapped-Phases) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5)
# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_kime_swapped <-  predict(cvLASSO_kime_swapped, s = 3,    # cvLASSO_kime_swapped$lambda.min, 
        newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:378]))
# testMSE_LASSO_kime_swapped <- 
#        mean((predLASSO_kime_swapped - IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387])^2)
# testMSE_LASSO_kime_swapped
predLASSO_kime_swapped

# Plot Regression Coefficients: create variable names for plotting 
betaHatLASSO_kime_swapped = as.double(coef(cvLASSO_kime_swapped,
                   s=3))   # cvLASSO_kime_swapped$lambda.min))
#cvLASSO_kime_swapped$lambda.1se
#coefplot(betaHatLASSO_kime_swapped[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", 
#         main = "(Spacekime, Swapped-Phases) LASSO-Regularized Regression Coefficient Estimates",
#         varnames = varNames[377:386])
varImp(cvLASSO_kime_swapped, lambda = 3)  #cvLASSO_kime_swapped$lambda.min)

coefList_kime_swapped <- coef(cvLASSO_kime_swapped, s=3)    # 'lambda.min')
coefList_kime_swapped <- data.frame(coefList_kime_swapped@Dimnames[[1]][coefList_kime_swapped@i+1], coefList_kime_swapped@x)
names(coefList_kime_swapped) <- c('Feature','EffectSize')
arrange(coefList_kime_swapped, -abs(EffectSize))[2:10, ]
#                 Feature  EffectSize
#2   Feature_3_ArimaVec_8  5.4414240889
#3  Feature_24_ArimaVec_5 -4.9895032906
#4  Feature_41_ArimaVec_6  1.7580440109
#5   Feature_7_ArimaVec_6 -1.6317407164
#6  Feature_41_ArimaVec_3 -0.4666980893
#7  Feature_42_ArimaVec_3  0.4100416326
#8   Feature_6_ArimaVec_3 -0.2052325091
#9   Feature_7_ArimaVec_1 -0.0007922646
#10 Feature_24_ArimaVec_1 -0.0002003192
#
#     ARIMA-spacetime:         4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#     ARIMA-spacekime_nil:     3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA
#     ARIMA-spacekime_swapped: 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA
# 9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

coef(cvLASSO_kime_swapped, s = 3) %>%  # "lambda.min") %>%
  broom::tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(100, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value), color = value > 0)) +
  geom_point(show.legend = FALSE, aes(size = abs(value))) +
  ggtitle("(Spacekime, Swapped-Phases) Top 9 salient features (LASSO penalty)") +
  xlab("Effect-size") +
  ylab(NULL)

3 Unsupervised clustering

Use hierarchical, k-means, and spectral clustering to generate derived (computed) phenotypes of the countries. Do these derived labels relate (correspond) to the overall (OA) country ranking?
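The following is a minimal illustrative sketch (not the original analysis) of how such clustering could be run on the country-by-feature matrix and compared against the OA ranking; it assumes the design matrix X (rows = countries) and outcome Y as constructed in the Spacetime analytics subsection below.

# Minimal clustering sketch (illustrative only; assumes X and Y from the subsection below)
library(kernlab)
Xs <- scale(as.matrix(X))                          # standardize the features
# 1. Hierarchical clustering (Ward linkage on Euclidean distances)
hcLab <- cutree(hclust(dist(Xs), method = "ward.D2"), k = 3)
# 2. k-means clustering
set.seed(1234)
kmLab <- kmeans(Xs, centers = 3, nstart = 25)$cluster
# 3. Spectral clustering (RBF kernel)
set.seed(1234)
scLab <- kernlab::specc(Xs, centers = 3)@.Data
# Compare the derived labels to the overall (OA) ranking, e.g., via cluster-wise medians
tapply(Y, hcLab, median); tapply(Y, kmLab, median); tapply(Y, scLab, median)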

3.1 Spacetime analytics

  • 1.1 LASSO feature selection
eudata <- aggregate_arima_vector_country_ranking_df
colnames(eudata) <- c("country",colnames(eudata[,-1]))
eudata <- eudata[ , -ncol(eudata)]
Y<-aggregate_arima_vector_country_ranking_df$OA

# Complete data: 386 features (378 + 8)
X<-eudata[,-ncol(eudata)]; dim(X)
# TS-derivative features only (378)
X378 <- X[, -c(379:386)]; dim(X378)
countryinfo<-as.character(X[,1])
countryinfo[11]<-"Germany"
X<-X[,-1]
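# keep only non-constant features (drop columns where every value equals the column mean)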
keeplist<-NULL
for (i in 1:ncol(X)) {
  if(FALSE %in% (X[,i]==mean(X[,i]))) {keeplist<-c(keeplist,i)}
}
X<-X[,keeplist]; dim(X)

# Reduced to 378 features 
#countryinfo<-as.character(X378[,1])
#countryinfo[11]<-"Germany"
#X378<-X378[,-1]
#keeplist<-NULL
#for (i in 1:ncol(X378)) {
#    if(FALSE %in% (X378[,i]==mean(X378[,i]))) {keeplist<-c(keeplist,i)}
#}
#X378<-X378[,keeplist]; dim(X378)

library(glmnet)
fitLASSO <- glmnet(as.matrix(X), Y, alpha = 1)
library(doParallel)
registerDoParallel(5)
#cross-validation
cvLASSO = cv.glmnet(data.matrix(X), Y, alpha = 1, parallel=TRUE)

# fitLASSO <- glmnet(as.matrix(X378), Y, alpha = 1)
#library(doParallel)
#registerDoParallel(5)
#cross-validation
#cvLASSO = cv.glmnet(data.matrix(X378), Y, alpha = 1, parallel=TRUE)

# Choose the LASSO penalty (lambda) that yields a desired number of active features
chooselambda <- function(cvlasso, option, k=NULL) {
  lambmat<-cbind(cvlasso$glmnet.fit$df,cvlasso$glmnet.fit$lambda)
  result<-tapply(lambmat[,2],lambmat[,1],max)
  kresult<-result[which(names(result)==as.factor(k))]
  if(option==1) {return(result)}
  else if (option==2) {return(kresult)}
  else (return("Not a valid option"))
}
showfeatures <- function(object, lambda, k ,...) {
  lam<-lambda[which(names(lambda)==as.factor(k))]
  beta <- predict(object, s = lam, type = "coef")
  if(is.list(beta)) {
    out <- do.call("cbind", lapply(beta, function(x) x[,1]))
    out <- as.data.frame(out)
    s <- rowSums(out)
    out <- out[which(s != 0), , drop=FALSE]
  } else  {out<-data.frame(Overall = beta[,1])
  out<-out[which(out!=0),,drop=FALSE]
  }
  out <- abs(out[rownames(out) != "(Intercept)",,drop = FALSE])
  out
}
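As a quick illustration of how these two helpers fit together (the choice of k = 9 active features is arbitrary and only works if a 9-feature model occurs on the fitted LASSO path):

# Illustrative usage (k = 9 is an arbitrary example of a desired feature count)
lambdasByK <- chooselambda(cvLASSO, option = 1)          # largest lambda for each active-set size
lambda9    <- chooselambda(cvLASSO, option = 2, k = 9)   # lambda yielding ~9 active features
showfeatures(fitLASSO, lambdasByK, 9)                    # names & |effect sizes| of those features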
  • 1.2 Comparison of different ML algorithms across different numbers of features
#test training data setup
randchoose <- function(matr) {
  leng<-nrow(matr)
  se<-seq(1:leng)
  sam<-sample(se,as.integer(leng*0.6))
  return(sam)
}

eusample<-X
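# binarize the outcome: Rank = 1 for top-ranked countries (OA < 30), 0 otherwise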
eusample$Rank<-as.factor(ifelse(Y<30, 1, 0))
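# re-seeding with the same value before each randchoose() call reproduces the same indices,
# so eutest is the exact row-complement of eutrain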
set.seed(1234)
eutrain<-eusample[randchoose(eusample), ]
set.seed(1234)
eutest<-eusample[-randchoose(eusample), ]

eusample378 <- X378
eusample378$Rank <- as.factor(ifelse(Y<30, 1, 0))
set.seed(1234)
eutrain378 <- eusample378[randchoose(eusample378), ]
set.seed(1234)
eutest378 <- eusample378[-randchoose(eusample378), ]

# Load Libraries
library(e1071)
library("randomForest")
library(ada); library(adabag)
library(caret)
library(kernlab)
library(cluster)
library(ipred)
library(ggplot2)

MLcomp <- function(fitlas, cvlas, trn, test, option=1) {
  allfeat<-as.numeric(names(chooselambda(cvlasso = cvlas, option = 1)))
  allfeat<-allfeat[which(allfeat>4)]
  trainlist<-as.list(NULL)
  for (i in 1:length(allfeat)) {
    trainlist[[i]]<-trn[,which(colnames(trn) %in% 
                                 c(row.names(showfeatures(fitlas, chooselambda(cvlasso = cvlas, option = 1), allfeat[i])), "Rank"))]
  }
resultframe<-data.frame(origin=rep(NA,length(allfeat)))
rownames(resultframe)<-allfeat
resultframe$Decision_tree_bagging<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    eubag<-ipred::bagging(Rank~.,data = trainlist[[i]],nbagg=100)
    bagtest<-predict(eubag, test)
    bagagg<-bagtest==test$Rank
    accuracy<-prop.table(table(bagagg))[c("TRUE")]
    resultframe$Decision_tree_bagging[i]<-accuracy
  }
resultframe$Random_forest<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    eurf<-randomForest(Rank~.,data=trainlist[[i]])
    rftest<-predict(eurf, test)
    rfagg<-rftest==test$Rank
    accuracy<-prop.table(table(rfagg))[c("TRUE")]
    resultframe$Random_forest[i]<-accuracy
  }
resultframe$Decision_tree_adaboost<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    enada<-ada(Rank~.,data = trainlist[[i]],iter=50)
    adatest<-predict(enada, test)
    adaagg<-adatest==test$Rank
    accuracy<-prop.table(table(adaagg))[c("TRUE")]
    resultframe$Decision_tree_adaboost[i]<-accuracy
  }
resultframe$GLM<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    euglm<-glm(Rank~.,data = trainlist[[i]],family = "binomial")
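    # glm() predictions are on the logit (link) scale, so threshold at 0 (i.e., probability 0.5)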
    glmtest<-predict(euglm, test)
    glmtest<-ifelse(glmtest<0,0,1)
    glmagg<-glmtest==test$Rank
    accuracy<-prop.table(table(glmagg))[c("TRUE")]
    resultframe$GLM[i]<-accuracy
  }
resultframe$SVM_best_Gamma_Cost<-rep(NA,length(allfeat))  
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    svmtune<-tune.svm(Rank~.,data = trainlist[[i]],gamma = 10^(-6:1),cost = 10^(-10:10))
    svmed<-svm(Rank~.,data=trainlist[[i]],gamma=svmtune$best.parameters[1],cost=svmtune$best.parameters[2])
    svmtest<-predict(svmed, test)
    svmagg<-svmtest==test$Rank
    accuracy<-prop.table(table(svmagg))[c("TRUE")]
    resultframe$SVM_best_Gamma_Cost[i]<-accuracy
  }
  resultframe$origin<-NULL
  if(option==1){return(resultframe)}
}
resultframe <- MLcomp(fitLASSO, cvLASSO, eutrain, eutest, 1)
resultframe_386_ST <- resultframe
# View(resultframe_386_ST)

# resultframe_378_ST <- MLcomp(fitLASSO, cvLASSO, eutrain378, eutest378, 1)

# Display results
resultframe$features<-as.factor(as.numeric(rownames(resultframe)))
ppframe<-data.frame(NULL)
for (i in 1:5) {
  FM <- data.frame(resultframe[,i], resultframe$features,
                   Methods = rep(colnames(resultframe)[i], nrow(resultframe)))
  ppframe<-rbind(ppframe, FM)
}
colnames(ppframe)<-c("Accuracy", "Features", "Methods")
ggplot(ppframe, aes(x=Features, y=Accuracy, colour=Methods, 
                    group=Methods, shape=Methods))+
  geom_line(position=position_dodge(0.2), lwd=2)+
  ylim(0.2, 1.0) +
  geom_point(size=5, position=position_dodge(0.2))+
  theme(legend.position="top", legend.text=element_text(size=16))+
  ggtitle("Spacetime (386 features): Compare ML Forecasting Results")+
  theme(
    axis.text=element_text(size=16),
    plot.title = element_text(size=18, face="bold.italic"),
    axis.title.x = element_text(size=14, face="bold"),
    axis.title.y = element_text(size=14, face="bold"))

# spacetime (ST) 378_ST (requires resultframe_378_ST from the commented-out MLcomp call above)
resultframe_378_ST$features<-as.factor(as.numeric(rownames(resultframe_378_ST)))
ppframe_378_ST<-data.frame(NULL)
for (i in 1:5) {
  FM_378_ST <- data.frame(resultframe_378_ST[,i], resultframe_378_ST$features,
          Methods = rep(colnames(resultframe_378_ST)[i], nrow(resultframe_378_ST)))
  ppframe_378_ST<-rbind(ppframe_378_ST, FM_378_ST)
}
colnames(ppframe_378_ST)<-c("Accuracy", "Features", "Methods")

  • 1.3 Clustering

3.2 Spacekime - Nil-Phase

  • 2.1 LASSO feature selection

  • 2.2 Comparison of different ML algorithms across different numbers of features

  • 2.3 Clustering

3.3 Spacekime - Swapped-Phase

  • 3.1 LASSO feature selection
dim(IFT_SwappedPhase_FT_aggregate_arima_vector)
# [1]  31 387
eudata_SwappedPhase <- IFT_SwappedPhase_FT_aggregate_arima_vector
colnames(eudata_SwappedPhase) <- c("country", colnames(eudata_SwappedPhase[,-1]))
eudata_SwappedPhase <- as.data.frame(eudata_SwappedPhase[ , -ncol(eudata_SwappedPhase)])
Y <- as.data.frame(IFT_SwappedPhase_FT_aggregate_arima_vector)$OA

# Complete data: 386 features (378 + 8)
X <- eudata_SwappedPhase
countryinfo<-as.character(X[,1])
countryinfo[11]<-"Germany"
X<-X[,-1]
keeplist<-NULL
for (i in 1:ncol(X)) {
  if(FALSE %in% (X[,i]==mean(X[,i]))) {keeplist<-c(keeplist,i)}
}
X<-X[,keeplist]; dim(X)   # 31 343

# Reduced to 378 features 
# TS-derivative features only (378)
# X378 <- X[, -c(379:386)]; dim(X378)
#countryinfo<-as.character(X378[,1])
#countryinfo[11]<-"Germany"
#X378<-X378[,-1]
#keeplist<-NULL
#for (i in 1:ncol(X378)) {
#    if(FALSE %in% (X378[,i]==mean(X378[,i]))) {keeplist<-c(keeplist,i)}
#}
#X378<-X378[,keeplist]; dim(X378)

library(glmnet)
fitLASSO_X <- glmnet(as.matrix(X), Y, alpha = 1)
library(doParallel)
registerDoParallel(5)
#cross-validation
cvLASSO_X = cv.glmnet(data.matrix(X), Y, alpha = 1, parallel=TRUE)

# fitLASSO_X <- glmnet(as.matrix(X378), Y, alpha = 1)
#library(doParallel)
#registerDoParallel(5)
#cross-validation
#cvLASSO_X = cv.glmnet(data.matrix(X378), Y, alpha = 1, parallel=TRUE)

# Choose the LASSO penalty (lambda) that yields a desired number of active features
chooselambda <- function(cvlasso, option, k=NULL) {
  lambmat<-cbind(cvlasso$glmnet.fit$df,cvlasso$glmnet.fit$lambda)
  result<-tapply(lambmat[,2],lambmat[,1],max)
  kresult<-result[which(names(result)==as.factor(k))]
  if(option==1) {return(result)}
  else if (option==2) {return(kresult)}
  else (return("Not a valid option"))
}
showfeatures <- function(object, lambda, k ,...) {
  lam<-lambda[which(names(lambda)==as.factor(k))]
  beta <- predict(object, s = lam, type = "coef")
  if(is.list(beta)) {
    out <- do.call("cbind", lapply(beta, function(x) x[,1]))
    out <- as.data.frame(out)
    s <- rowSums(out)
    out <- out[which(s != 0), , drop=FALSE]
  } else  {out<-data.frame(Overall = beta[,1])
  out<-out[which(out!=0),,drop=FALSE]
  }
  out <- abs(out[rownames(out) != "(Intercept)",,drop = FALSE])
  out
}
  • 3.2 Comparison of different ML algorithms across different numbers of features
#test training data setup
randchoose <- function(matr) {
  leng<-nrow(matr)
  se<-seq(1:leng)
  sam<-sample(se,as.integer(leng*0.6))
  return(sam)
}

Xsample <- X
Xsample$Rank <- as.factor(ifelse(Y<30, 1, 0))
set.seed(1234)
Xtrain <- Xsample[randchoose(Xsample), ]
set.seed(1234)
Xtest <- Xsample[-randchoose(Xsample), ]

#Xsample378 <- X378
#Xsample378$Rank <- as.factor(ifelse(Y<30, 1, 0))
#set.seed(1234)
#Xtrain378 <- Xsample378[randchoose(Xsample378), ]
#set.seed(1234)
#Xtest378 <- Xsample378[-randchoose(Xsample378), ]

# Load Libraries
library(e1071)
library("randomForest")
library(ada); library(adabag)
library(caret)
library(kernlab)
library(cluster)
library(ipred)
library(ggplot2)

# resultXframe <- MLcomp(fitLASSO, cvLASSO, Xtrain, Xtest, 1)
MLcompX <- function(fitlas, cvlas, trn, test, option=1) {
  allfeat<-as.numeric(names(chooselambda(cvlasso = cvlas, option = 1)))
  allfeat<-allfeat[which(allfeat>4)]
  trainlist<-as.list(NULL)
  for (i in 1:length(allfeat)) {
    trainlist[[i]]<-trn[,which(colnames(trn) %in% 
                                 c(row.names(showfeatures(fitlas, chooselambda(cvlasso = cvlas, option = 1), allfeat[i])), "Rank"))]
  }
 
  resultXframe<-data.frame(origin=rep(NA,length(allfeat)))
  rownames(resultXframe)<-allfeat
  resultXframe$Decision_tree_bagging<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    # ERROR HANDLING: evaluate the bagging step inside tryCatch() and skip feature-set sizes where it fails
    possibleError <- tryCatch({
          set.seed(1234)
          Xbag<-ipred::bagging(Rank~ . ,data = trainlist[[i]], nbagg=100, 
                               control=rpart.control(minsplit=2, cp=0.1, xval=10))
          bagtest<-predict(Xbag, test)
          bagagg<-bagtest==test$Rank
          accuracy<-prop.table(table(bagagg))[c("TRUE")]
          resultXframe$Decision_tree_bagging[i]<-accuracy
        },
        error=function(e) e
    )
    if(inherits(possibleError, "error")) next
    # print(i)
  }

  resultXframe$Random_forest<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    Xrf<-randomForest(Rank~.,data=trainlist[[i]])
    rftest<-predict(Xrf,test)
    rfagg<-rftest==test$Rank
    accuracy<-prop.table(table(rfagg))[c("TRUE")]
    resultXframe$Random_forest[i]<-accuracy
  }

  resultXframe$Decision_tree_adaboost<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    Xada<-ada(Rank~.,data = trainlist[[i]],iter=50)
    adatest<-predict(Xada,test)
    adaagg<-adatest==test$Rank
    accuracy<-prop.table(table(adaagg))[c("TRUE")]
    resultXframe$Decision_tree_adaboost[i]<-accuracy
  }

  resultXframe$GLM<-rep(NA,length(allfeat))
  for (i in 1:length(allfeat)) {
    euglm<-glm(Rank~.,data = trainlist[[i]],family = "binomial")
    glmtest<-predict(euglm,test)
    glmtest<-ifelse(glmtest<0,0,1)
    glmagg<-glmtest==test$Rank
    accuracy<-prop.table(table(glmagg))[c("TRUE")]
    resultXframe$GLM[i]<-accuracy
  }

  resultXframe$SVM_best_Gamma_Cost<-rep(NA,length(allfeat))  
  for (i in 1:length(allfeat)) {
    set.seed(1234)
    svmtune<-tune.svm(Rank~.,data = trainlist[[i]],gamma = 10^(-6:1),cost = 10^(-10:10))
    svmed<-svm(Rank~.,data=trainlist[[i]],gamma=svmtune$best.parameters[1],cost=svmtune$best.parameters[2])
    svmtest<-predict(svmed,test)
    svmagg<-svmtest==test$Rank
    accuracy<-prop.table(table(svmagg))[c("TRUE")]
    resultXframe$SVM_best_Gamma_Cost[i]<-accuracy
  }
  resultXframe$origin<-NULL
  if(option==1){return(resultXframe)}
}

resultXframe <- MLcompX(fitLASSO_X, cvLASSO_X, Xtrain, Xtest, 1)
resultXframe_386_SK_Swapped <- resultXframe
# View(resultXframe_386_SK_Swapped)

# resultXframe_378_SK_Swapped <- MLcompX(fitLASSO_X, cvLASSO_X, Xtrain378, Xtest378, 1)

  • 3.3 Clustering
