SOCR ≫ TCIU Website ≫ TCIU GitHub ≫

1 Data Preprocessing

Find all “Common” freatures (highly-observed and congruent Econ indicators)

# 1. Find all "Common" freatures (highly-observed and congruent Econ indicators)
countryNames <- unique(time_series$country); length(countryNames); # countryNames
# initialize 3D array of DF's that will store the data for each of hte countries into a 2D frame
countryData <- list()  # countryData[[listID==Country]][1-time-72, 1-feature-197]

for (i in 1:length(countryNames)) {
  countryData[[i]] <- filter(time_series, country == countryNames[i])
}
# Check countryData[[2]][2, 3] == Belgium[2,3]

list_of_dfs_CommonFeatures <- list()  # list of data for supersampled countries 360 * 197

# 2. General function that ensures the XReg predictors for ALL 31 EU countries are homologous
completeHomologousX_features <- function (list_of_dfs) {
  # delete features that are missing at all time points
  for (j in 1:length(list_of_dfs)) {
    print(paste0("Pre-processing Country: ...", countryNames[j], "... "))
    data = list_of_dfs[[j]]
    data = data[ , colSums(is.na(data)) != nrow(data)]
    data = select(data, -time, -country)
    DataMatrix = as.matrix(data)
    DataMatrix = cleardata(DataMatrix)
    DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove features with only 1 value
    DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove features with all values=0
    # Supersample 72 --*5--> 360 timepoints 
    DataMatrix = splinecreate(DataMatrix)
    DataSuperSample = as.data.frame(DataMatrix) # super-Sample the data
    # remove some of features  
    DataSuperSample = DataSuperSample[, -c(50:80)]; dim(X)  # 360 167
    # ensure full-rank design matrix, DataSuperSample
    DataSuperSample <- 
      DataSuperSample[ , qr(DataSuperSample)$pivot[seq_len(qr(DataSuperSample)$rank)]]
    print(paste0("dim()=(", dim(DataSuperSample)[1], ",", dim(DataSuperSample)[2], ") ..."))
    # update the current DF/Country
    list_of_dfs_CommonFeatures[[j]] <- DataSuperSample
  }

  # Identify All Xreg features that are homologous (same feature columns) across All 31 countries
  # Identify Common Columns (freatures)
  comCol <- Reduce(intersect, lapply(list_of_dfs_CommonFeatures, colnames))
  list_of_dfs_CommonFeatures <- lapply(list_of_dfs_CommonFeatures, function(x) x[comCol])

  for (j in 1:length(list_of_dfs_CommonFeatures)) {
    list_of_dfs_CommonFeatures[[j]] <- subset(list_of_dfs_CommonFeatures[[j]], select = comCol)
    print(paste0("dim(", countryNames[j], ")=(", dim(list_of_dfs_CommonFeatures[[j]])[1], 
                 ",", dim(list_of_dfs_CommonFeatures[[j]])[2], ")!"))  # 72 * 197
  }
  return(list_of_dfs_CommonFeatures)
}
# Test completeHomologousX_features: dim(AllCountries)=(360,42)!
list_of_dfs_CommonFeatures <- completeHomologousX_features(countryData); 
length(list_of_dfs_CommonFeatures); dim(list_of_dfs_CommonFeatures[[1]]) # Austria data matrix 360*42

For each country (\(n\)) and each common feature (\(k\)), fit ARIMA model and estimate the parameters \((p,d,q)\) (non-exogenous, just the timeseries model for this feature), (p,d,q) triples for non-seasonal and seasonal effects. For each (Country, Feature) pair, the 9 ARIMA-derived vector includes: ** (ts_avg, forecast_avg, non-seasonal AR, non-seasonal MA, seasonal AR, seasonal MA, period, non-seasonal Diff, seasonal differences)**.

# 3. For each country (n) and each common feature (k), compute (p,d,q) ARIMA models (non-exogenous, 
# just the timeseries model for this feature), (p,d,q) triples
# Country * Feature
arimaModels_DF <- list() 
#data.frame(matrix(NA, nrow = length(countryNames), 
#  ncol = dim(list_of_dfs_CommonFeatures[[1]])[2]), row.names=countryNames, stringsAsFactors=T)
colnames(arimaModels_DF) <- colnames(list_of_dfs_CommonFeatures[[1]])
list_index <- 1
arimaModels_ARMA_coefs <- list()  # array( , c(31, 9*dim(list_of_dfs_CommonFeatures[[1]])[2]))
# dim(arimaModels_ARMA_coefs) # [1]  31 x 378 == 31 x (9 * 42)
# For each (Country, feature) index, the 9 ARIMA-derived vector includes:
# (ts_avg, forecast_avg, non-seasonal AR, non-seasonal MA, seasonal AR, seasonal MA, period, non-seasonal Diff, seasonal differences)

for(n in 1:(length(list_of_dfs_CommonFeatures))) {          # for each Country 1<=n<=31
  for (k in 1:(dim(list_of_dfs_CommonFeatures[[1]])[2])) {  # for each feature 1<=k<=42
    # extract one timeseries (the feature+country time course)
    ts = ts(list_of_dfs_CommonFeatures[[n]][ , k],
            frequency=20, start=c(2000,1), end=c(2017,20))
    set.seed(1234)
    arimaModels_DF[[list_index]] <- auto.arima(ts)
      # pred_arimaModels_DF = forecast(arimaModels_DF[[list_index]])
      # ts_pred_arimaModels_DF <- 
      #  ts(pred_arimaModels_DF$mean, frequency=20, start=c(2015,1), end=c(2017,20))
      # ts_pred_arimaModels_DF
    arimaModels_ARMA_coefs[[list_index]] <- c (
        mean(ts),                           # time-series average (retrospective) 
        mean(forecast(arimaModels_DF[[list_index]])$mean), # forecasted TS average (prospective)
        arimaModels_DF[[list_index]]$arma)  # 7 ARMA estimated parameters
    cat("arimaModels_ARMA_coefs[country=", countryNames[n], ", feature=",
             colnames(list_of_dfs_CommonFeatures[[1]])[k],
             "] Derived-Features=(", round(arimaModels_ARMA_coefs[[list_index]], 2), ") ...")
    #print(paste0("arimaModels_DF[country=", countryNames[i], ", feature=",
    #         colnames(list_of_dfs_CommonFeatures[[1]])[k],
    #         "]$arma =", arimaModels_DF[[list_index]]$arma))
    list_index <- list_index + 1
  }
}

length(arimaModels_ARMA_coefs) == 31*42  # [1] TRUE  # Each list-element consists of 9 values, see above
#  == dim(list_of_dfs_CommonFeatures[[1]])[1] * dim(list_of_dfs_CommonFeatures[[1]])[2]
# Maps to convert between 1D indices and 2D (Country, Feature) pairs
index2CountryFeature <- function(indx=1) { 
  if (indx<1 | indx>length(arimaModels_ARMA_coefs)) {
    cat("Index out of bounds: indx=", indx, "; must be between 1 and ", 
        length(arimaModels_ARMA_coefs), " ... Exiting ...")
    return (NULL)
  } else { 
    feature = (indx-1) %% (dim(list_of_dfs_CommonFeatures[[1]])[2])
    country = floor((indx - feature)/(dim(list_of_dfs_CommonFeatures[[1]])[2]))
    return(list("feature"=(feature+1), "country"=(country+1)))  }
}

countryFeature2Index <- function(countryIndx=1, featureIndx=1) { 
  if (countryIndx<1 | countryIndx>(dim(list_of_dfs_CommonFeatures[[1]])[1]) |
      featureIndx<1 | featureIndx>(dim(list_of_dfs_CommonFeatures[[1]])[2])) {
    cat("Indices out of bounds: countryIndx=", countryIndx, "; featureIndx=", featureIndx, "; Exiting ...")
    return (NULL)
  } else { return (featureIndx + (countryIndx-1)*(dim(list_of_dfs_CommonFeatures[[1]])[2])) }
}

# test forward and reverse index mapping functions
index2CountryFeature(42);  index2CountryFeature(45)$country;  index2CountryFeature(45)$feature
countryFeature2Index(countryIndx=2, featureIndx=3)

# Column/Feature Names: colnames(list_of_dfs_CommonFeatures[[1]])
# Country/Row Names: countryNames
arimaModels_ARMA_coefs[[1]] # Austria/Feature1  1:9 feature vector
# Cuntry2=Bulgaria, feature 5, 1:9 vector
arimaModels_ARMA_coefs[[countryFeature2Index(countryIndx=2, featureIndx=5)]]

Convert list of ARIMA models to a Data.Frame [Countries, megaFeatures] that can be put through ML data analytics. Augment the features using the EU_SOCR_Country_Ranking_Data_2011 dataset.

# 4. Add the country ranking as a new feature, using the OA ranks here:
# (http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_2008_World_CountriesRankings)
EU_SOCR_Country_Ranking_Data_2011 <- read.csv2("E:/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_SOCR_Country_Ranking_Data_2011.csv", header=T, sep=",")

length(arimaModels_ARMA_coefs) # 31*42 
arima_df <- data.frame(matrix(NA, 
          nrow=length(countryNames), ncol=length(colnames(list_of_dfs_CommonFeatures[[1]]))*9))
dim(arima_df)   # [1]  31 378

for(n in 1:dim(arima_df)[1]) {        # for each Country 1<=n<=31
  for (k in 1:length(colnames(list_of_dfs_CommonFeatures[[1]]))) {     # for each feature 1<=k<=42
    for (l in 1:9) {                  # for each arima vector 1:9, see above
        arima_df[n, (k-1)*9 + l] <- 
          round(arimaModels_ARMA_coefs[[countryFeature2Index(countryIndx=n, featureIndx=k)]][l], 1)
        if (n==dim(arima_df)[1]) colnames(arima_df)[(k-1)*9 + l] <- 
            print(paste0("Feature_",k, "_ArimaVec_",l))
    }
  }
}

# DF Conversion Validation
arimaModels_ARMA_coefs[[countryFeature2Index(countryIndx=3, featureIndx=5)]][2] == arima_df[3, (5-1)*9 + 2]
# [1] 1802.956

# Aggregate 2 datasets
dim(EU_SOCR_Country_Ranking_Data_2011)  # [1] 31 10

aggregate_arima_vector_country_ranking_df <- 
  as.data.frame(cbind(arima_df, EU_SOCR_Country_Ranking_Data_2011[ , -1]))
dim(aggregate_arima_vector_country_ranking_df)   # [1]  Country=31 * Features=387 (ARIMA=378 + Ranking=9)
# View(aggregate_arima_vector_country_ranking_df)
rownames(aggregate_arima_vector_country_ranking_df) <- countryNames
write.csv(aggregate_arima_vector_country_ranking_df, row.names = T, fileEncoding = "UTF-16LE",
           "E:/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_aggregate_arima_vector_country_ranking.csv")

2 Regularized Linear Modeling

2.1 Stronger Signal Analytics

Using all 386 features (378 ARIMA signatures + 8 meta-data).

Use Model-based and Model-free methods to predict the overall (OA) country ranking.

# 1. LASSO regression/feature extraction

# subset test data
Y = aggregate_arima_vector_country_ranking_df$OA
X = aggregate_arima_vector_country_ranking_df[ , -387]
# remove columns containing NAs
X = X[ , colSums(is.na(X)) == 0]; dim(X)  # [1]  31 386

fitRidge = glmnet(as.matrix(X), Y, alpha = 0)  # Ridge Regression
fitLASSO = glmnet(as.matrix(X), Y, alpha = 1)  # The LASSO
# LASSO
plot(fitLASSO, xvar="lambda", label="TRUE", lwd=3)
# add label to upper x-axis
mtext("LASSO regularizer: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Ridge
plot(fitRidge, xvar="lambda", label="TRUE", lwd=3)
# add label to upper x-axis
mtext("Ridge regularizer: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

#### 10-fold cross validation ####
# LASSO
library(doParallel)
registerDoParallel(6)
set.seed(1234)  # set seed 
# (10-fold) cross validation for the LASSO
cvLASSO = cv.glmnet(data.matrix(X), Y, alpha = 1, parallel=TRUE)
cvRidge = cv.glmnet(data.matrix(X), Y, alpha = 0, parallel=TRUE)
plot(cvLASSO)
mtext("CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO <-  predict(cvLASSO, s = cvLASSO$lambda.min, newx = data.matrix(X))
testMSE_LASSO <- mean((predLASSO - Y)^2); testMSE_LASSO

predLASSO = predict(cvLASSO, s = cvLASSO$lambda.min, newx = data.matrix(X))
predRidge = predict(fitRidge, s = cvRidge$lambda.min, newx = data.matrix(X))

# calculate test set MSE
testMSELASSO = mean((predLASSO - Y)^2)
testMSERidge = mean((predRidge - Y)^2)
##################################Use only ARIMA effects, no SOCR meta-data#####
set.seed(4321)
cvLASSO_lim = cv.glmnet(data.matrix(X[ , 1:(42*9)]), Y, alpha = 1, parallel=TRUE)
plot(cvLASSO_lim)
mtext("CV LASSO (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5)

# Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome
predLASSO_lim <-  predict(cvLASSO_lim, s = 3, # cvLASSO_lim$lambda.min, 
                          newx = data.matrix(X[ , 1:(42*9)]))
coefList_lim <- coef(cvLASSO_lim, s=3) # 'lambda.min')
coefList_lim <- data.frame(coefList_lim@Dimnames[[1]][coefList_lim@i+1],coefList_lim@x)
names(coefList_lim) <- c('Feature','EffectSize')
arrange(coefList_lim, -abs(EffectSize))[2:10, ]
cor(Y, predLASSO_lim[, 1])  # 0.84
################################################################################

# Plot Regression Coefficients: create variable names for plotting 

# par(mar=c(2, 13, 1, 1))   # extra large left margin # par(mar=c(5,5,5,5))
varNames <- colnames(X); varNames; length(varNames)

betaHatLASSO = as.double(coef(fitLASSO, s = cvLASSO$lambda.min))  # cvLASSO$lambda.1se
betaHatRidge = as.double(coef(fitRidge, s = cvRidge$lambda.min))

#coefplot(betaHatLASSO[2:386], sd = rep(0, 385), pch=0, cex.pts = 3, main = "LASSO-Regularized Regression Coefficient Estimates", varnames = varNames)
coefplot(betaHatLASSO[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", main = "LASSO-Regularized Regression Coefficient Estimates", varnames = varNames[377:386])
coefplot(betaHatRidge[377:386], sd = rep(0, 10), pch=2, add = TRUE, col.pts = "blue", cex.pts = 3)
legend("bottomleft", c("LASSO", "Ridge"), col = c("red", "blue"), pch = c(1 , 2), bty = "o", cex = 2)

varImp <- function(object, lambda = NULL, ...) {
  ## skipping a few lines
  beta <- predict(object, s = lambda, type = "coef")
  if(is.list(beta)) {
    out <- do.call("cbind", lapply(beta, function(x) x[,1]))
    out <- as.data.frame(out)
  } else out <- data.frame(Overall = beta[,1])
  out <- abs(out[rownames(out) != "(Intercept)",,drop = FALSE])
  out
}

varImp(cvLASSO, lambda = cvLASSO$lambda.min)

coefList <- coef(cvLASSO, s='lambda.min')
coefList <- data.frame(coefList@Dimnames[[1]][coefList@i+1],coefList@x)
names(coefList) <- c('Feature','EffectSize')
arrange(coefList, -abs(EffectSize))[2:10, ]
#                      var        val  # Feature names: colnames(list_of_dfs_CommonFeatures[[1]])
#1            (Intercept) 49.4896874
#2   Feature_1_ArimaVec_8 -2.4050811   # Feature 1 = Active population: Females 15 to 64 years
#3  Feature_20_ArimaVec_8 -1.4015001   # Feature 20= "Employment: Females 15 to 64 years
#4            IncomeGroup -1.2271177
#5   Feature_9_ArimaVec_8 -1.0629835   # Feature 9= Active population: Total 15 to 64 years
#6                     ED -0.7481041
#7                     PE -0.5167668
#8  Feature_25_ArimaVec_5  0.4416775   # Feature 25= Property income
#9   Feature_9_ArimaVec_4 -0.2217804
#10                   QOL -0.1965342
#                                     ARIMA: 4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff
#
#9 ARIMA-derived vector includes:
# (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 
#  7=period, 8=non-seasonal Diff, 9=seasonal differences)

# [1] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels"                                                     
# [2] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"     
# [3] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Tertiary education (levels 5-8)"                                           
# [4] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"
# [5] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, All ISCED 2011 levels"                                                       
# [6] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"       
# [7] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Tertiary education (levels 5-8)"                                             
# [8] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"  
# [9] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, All ISCED 2011 levels"  
#[10] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" 
#[11] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Tertiary education (levels 5-8)"        
#[12] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)"
#[13] "All ISCED 2011 levels " 
# [14] "All ISCED 2011 levels, Females"  
# [15] "All ISCED 2011 levels, Males"
# [16] "Capital transfers, payable"  
# [17] "Capital transfers, receivable"  
# [18] "Compensation of employees, payable"  
# [19] "Current taxes on income, wealth, etc., receivable"
#[20] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels " 
# [21] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)"            
# [22] "Other current transfers, payable"
# [23] "Other current transfers, receivable" 
# [24] "Property income, payable" 
# [25] "Property income, receivable" 
# [26] "Savings, gross" 
# [27] "Subsidies, payable"