--- title: "Spacekime Analytics (Time Complexity and Inferential Uncertainty)" subtitle: "European Economic Spacekime Analysis Part 1 – Longitudinal Modeling" author: "SOCR Team " date: "`r format(Sys.time(),'%m/%d/%Y')`" output: html_document: theme: spacelab highlight: tango includes: before_body: TCIU_header.html toc: true number_sections: true toc_depth: 2 toc_float: collapsed: false smooth_scroll: true code_folding: hide --- ```{r error=F, message=F, warning=F} library(dplyr) library(plotly) library(pracma) library(ggplot2) require(magrittr) library(gridExtra) ``` # Data Preprocessing {.tabset .tabset-fade .tabset-dropdown} - Load the data - Find all "Common" features (highly-observed and congruent Econ indicators) - Establish homologies between the XReg predictors for ALL 31 EU countries. ```{r} load("EU_Econ_rawdata.RData") ``` ```{r,echo = T, results = 'hide', message=F, warning=F} #super sample the dataset cleardata <- function(mat) { for (i in 1:ncol(mat)) { mat[is.na(mat[,i]),i]<-mean(mat[,i],na.rm = T) + rnorm(sum(is.na(mat[,i])),sd = sd(mat[,i],na.rm = T)) } return(mat) } # 1. Find all "Common" features (highly-observed and congruent Econ indicators) countryNames <- unique(time_series$country); length(countryNames); # countryNames # initialize 3D array of DF's that will store the data for each of the countries into a 2D frame countryData <- list() # countryData[[listID==Country]][1-time-72, 1-feature-197] for (i in 1:length(countryNames)) { countryData[[i]] <- filter(time_series, country == countryNames[i]) } # Check countryData[[2]][2, 3] == Belgium[2,3] list_of_dfs_CommonFeatures <- list() # list of data for supersampled countries 360 * 197 # 2. General function that ensures the XReg predictors for ALL 31 EU countries are homologous completeHomologousX_features <- function (list_of_dfs) { # delete features that are missing at all time points for (j in 1:length(list_of_dfs)) { print(paste0("Pre-processing Country: ...", countryNames[j], "... ")) data = list_of_dfs[[j]] data = data[ , colSums(is.na(data)) != nrow(data)] data = dplyr::select(data, !any_of(c("time", "country"))) DataMatrix = as.matrix(data) DataMatrix = cleardata(DataMatrix) DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove features with only 1 value DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove features with all values=0 # Supersample 72 --*5--> 360 timepoints #DataMatrix = splinecreate(DataMatrix) DataSuperSample = as.data.frame(DataMatrix) # super-Sample the data # remove some of features #DataSuperSample = DataSuperSample[, -c(50:80)]; dim(X) # 360 167 # ensure full-rank design matrix, DataSuperSample DataSuperSample <- DataSuperSample[ , qr(DataSuperSample)$pivot[seq_len(qr(DataSuperSample)$rank)]] print(paste0("dim()=(", dim(DataSuperSample)[1], ",", dim(DataSuperSample)[2], ") ...")) # update the current DF/Country list_of_dfs_CommonFeatures[[j]] <- DataSuperSample } # Identify All Xreg features that are homologous (same feature columns) across All 31 countries # Identify Common Columns (features) comCol <- Reduce(intersect, lapply(list_of_dfs_CommonFeatures, colnames)) list_of_dfs_CommonFeatures <- lapply(list_of_dfs_CommonFeatures, function(x) x[comCol]) for (j in 1:length(list_of_dfs_CommonFeatures)) { list_of_dfs_CommonFeatures[[j]] <- subset(list_of_dfs_CommonFeatures[[j]], select = comCol) print(paste0("dim(", countryNames[j], ")=(", dim(list_of_dfs_CommonFeatures[[j]])[1], ",", dim(list_of_dfs_CommonFeatures[[j]])[2], ")!")) # 72 * 197 } return(list_of_dfs_CommonFeatures) } # Test completeHomologousX_features: dim(AllCountries)=(360,42)! list_of_dfs_CommonFeatures <- completeHomologousX_features(countryData); length(list_of_dfs_CommonFeatures); dim(list_of_dfs_CommonFeatures[[1]]) # Austria data matrix 360*42 ``` # Laplace Transform Define the *forward* and *reverse* *Laplace Transforms* (LT/ILT). ```{r} x2 = seq(from = 1, to = 11, length.out = 50) # drop the first row to avoid real part value of 0 y2 = seq(from = -5, to = 5, length.out = 50) # drop the first column to avoid imaginary part value of 0 XY = expand.grid(X=x2, Y=y2) complex_xy = mapply(complex, real=XY$X,imaginary=XY$Y) X<-1:72 time_points <- seq(0+0.01, 2*pi, length.out = 72) ``` ```{r} # create the LT NuLT = function(datax, datay, inputz, k = 3, fitwarning = FALSE, mirror = FALSE, range = 2*pi) { datax = as.numeric(datax) datay = as.numeric(datay) n = length(datax) x1 = n/(n+0.5)*((datax-min(datax))/(max(datax)-min(datax)))*range if(mirror){ x1 = c(x1,rev(2*range - x1))/2 n = 2*n datay = c(datay, rev(datay)) #plot(x1, datay) } #generate the coefficients in indefinite integral of t^n*exp(-zt) coef = 1; coefm = as.matrix(coef) for(i in 1:k){ coefm = cbind(coefm,0) coef = c(coef*i,1) coefm = rbind(coefm,coef) } # these coefficients ordered by ^0, ^1, ^2, ... in column format # compute 1, z, z^2...,z^k zz = cbind(1,inputz) zt = inputz for (i in 2:k){ zt = zt*inputz zz = cbind(zz,zt) } zd = zt*inputz # compute 1, x, x^2...,x^k tx = x1; xm = cbind(1,x1) for (i in 2:k){ tx = tx*x1 xm = cbind(xm,tx) } # sum over intervals result = 0*inputz ii = 1 while(ii+k<=n) { A = xm[seq(ii,ii+k),c(0:k+1)] b = datay[seq(ii,ii+k)] # polyfit might be faster when using polynomial basis, while matrix inverse, `solve()`, # is the more general approach that works for any function basis polyc = as.numeric(solve(A,b)) #ordered by ^0, ^1, ^2, ... in column format # Enter a new function variable qualityCheck=FALSE # check fit quality; this step can be skipped for speed/efficiency # if (qualityCheck) { .... } if (fitwarning){ xx = seq(A[1,2],A[k+1,2],length.out = 100); yy = polyval(rev(polyc),xx) if(max(abs(yy-mean(b)))>2*max(abs(b-mean(b)))){ print(c("Warning: Poor Fit at ",ii,", Largest Deviation is",max(abs(yy-mean(b))))) print(c("Spline Polynomial is", polyc),3); #print(c(polyval(rev(polyc),A[,2]),b)) plot(xx, yy, main="Polynomial fit", ylab="", type="l", col="blue") lines(A[,2],b, col="red") legend("topleft",c("fit","data"),fill=c("blue","red")) print(" ") } } # Use vector/matrix operations to avoid looping, # some of the expressions look weird # May need to actually compare the efficiency/speed of # vector based vs. standard numeric calculations m1 = t(t(polyc*coefm)*A[1,]) m11 = as.numeric(tapply(m1, col(m1)-row(m1), sum))[0:k+1] m2 = t(t(polyc*coefm)*A[k+1,]) m22 = as.numeric(tapply(m2, col(m2)-row(m2), sum))[0:k+1] intgl = (exp(-inputz*A[1,2])*colSums(t(zz)*m11)- exp(-inputz*A[k+1,2])*colSums(t(zz)*m22))/zd result = result+intgl ii=ii+k } # Computations over the last interval if(ii% layout(title=commonefeature[31]) for (j in 1:5) { for (i in 1:6){ xx2<-1:50+50*(j-1) yy2<-1:50+50*(i-1) p_feature <- p_feature %>% add_trace(x=xx2,y=yy2, z =magnitude_feature[[31]][j+(i-1)*5,,], surfacecolor=phase_feature[[31]][j+(i-1)*5,,], colorscale=colorscale, #Phase-based color type = 'surface',name=substr(countryNames[j+(i-1)*5],0,50),opacity=0.7,showlegend=TRUE) } } ``` ```{r,message=F, warning=F} p_feature ``` # Inverse Laplace Transform ```{r} ctILT = function( LTF, tini = 0.001, tend = 9, nnt = 200){ if (TRUE){ a=8; ns=100; nd=29; } #% implicit parameters N = ns+nd+1 radt=seq(tini,tend*nnt /(nnt + 0.5),length.out = nnt); # time vector if (tini==0){ #radt=radt[c(2:nnt)] } # t=0 is not allowed #tic % measure the CPU time alfa = seq(1,ns+1+nd) beta = alfa for (j in seq(1,ns+1+nd)){# % prepare necessary coefficients alfa[j]=a+(j-1)*pi*1i; beta[j]=-exp(a)*(-1)^j; } #print(beta) n = c(1:nd) bdif=rev(cumsum(gamma(nd+1)/gamma(nd+2-n)/gamma(n)))/(2^nd) #print(beta[ns+2:ns+1+nd]) temp = beta[seq(ns+2,ns+1+nd)]*bdif print(temp) beta[seq(ns+2,ns+1+nd)]= temp beta[1]=beta[1]/2; ft2 = seq(1,nnt) Qz = c() for (kt in seq(1,nnt)){ # cycle for time t tt=radt[kt]; s=alfa/tt; # complex frequency s Qz = c(Qz,s) } LTQz = LTF(Qz) for (kt in seq(1,nnt)){ # cycle for time t tt=radt[kt]; s=alfa/tt; # complex frequency s bt=beta/tt; #btF=bt*NuLT(datax, datay, s); # functional value F(s) btF = bt*LTQz[seq((kt-1)*N+1,kt*N)] ft2[kt]=sum(Re(btF)); # original f(tt) if(is.na(ft2[kt])){ print(kt) print(LTQz[seq((kt-1)*N+1,kt*N)]) print(btF) } } return(ft2) } ``` ```{r,echo = T, results = 'hide'} rge = 2*pi tnd = 2*pi z2_funct<- function(z) NuLT(time_points,list_of_dfs_CommonFeatures[[2]][,31],inputz = z,mirror = TRUE, range = rge) inv_result<-ctILT(z2_funct,tend = tnd, nnt=72*2) z2_funct<- function(z) NuLT(time_points,list_of_dfs_CommonFeatures[[3]][,31],inputz = z,mirror = TRUE, range = rge) inv_result_2<-ctILT(z2_funct,tend = tnd, nnt=72*2) z2_funct<- function(z) NuLT(time_points,list_of_dfs_CommonFeatures[[10]][,31],inputz = z,mirror = TRUE, range = rge) inv_result_3<-ctILT(z2_funct,tend = tnd, nnt=72*2) z2_funct<- function(z) NuLT(time_points,list_of_dfs_CommonFeatures[[21]][,31],inputz = z,mirror = TRUE, range = rge) inv_result_4<-ctILT(z2_funct,tend = tnd, nnt=72*2) ``` ```{r} valsn_df_1 <- as.data.frame(cbind(inv_result=inv_result[1:72], time_series=list_of_dfs_CommonFeatures[[2]][,31], time_points=time_points)) x <- list( title = "Time" ) y <- list( title = "GDP of Belgium" ) p1=plot_ly(valsn_df_1, x = ~time_points,y = ~time_series, name = 'original_ts',mode = 'lines', type = 'scatter') p1<- p1 %>% add_trace(y = ~ inv_result, name = 'inv_result',mode = 'lines', line=list(dash='dot')) p1 <- p1 %>% layout(xaxis = x, yaxis = y) ``` ```{r} valsn_df_2 <- as.data.frame(cbind(inv_result=inv_result_2[1:72], time_series=list_of_dfs_CommonFeatures[[3]][,31], time_points=time_points)) x <- list( title = "Time" ) y <- list( title = "GDP of Bulgaria" ) p2=plot_ly(valsn_df_2, x = ~time_points,y = ~time_series, name = 'original_ts',mode = 'lines', type = 'scatter') p2<- p2 %>% add_trace(y = ~ inv_result, name = 'inv_result',mode = 'lines', line=list(dash='dot')) p2 <- p2 %>% layout(xaxis = x, yaxis = y) ``` ```{r} valsn_df_3 <- as.data.frame(cbind(inv_result=inv_result_3[1:72], time_series=list_of_dfs_CommonFeatures[[10]][,31], time_points=time_points)) x <- list( title = "Time" ) y <- list( title = "GDP of France" ) p3=plot_ly(valsn_df_3, x = ~time_points,y = ~time_series, name = 'original_ts',mode = 'lines', type = 'scatter') p3 <- p3 %>% add_trace(y = ~ inv_result, name = 'inv_result',mode = 'lines', line=list(dash='dot')) p3 <- p3 %>% layout(xaxis = x, yaxis = y) ``` ```{r} valsn_df_4 <- as.data.frame(cbind(inv_result=inv_result_4[1:72], time_series=list_of_dfs_CommonFeatures[[21]][,31], time_points=time_points)) x <- list( title = "Time" ) y <- list( title = "GDP of Netherlands" ) p4=plot_ly(valsn_df_4, x = ~time_points,y = ~time_series, name = 'original_ts',mode = 'lines', type = 'scatter') p4 <- p4 %>% add_trace(y = ~ inv_result, name = 'inv_result',mode = 'lines', line=list(dash='dot')) p4 <- p4 %>% layout(xaxis = x, yaxis = y) ``` ```{r} fig<-subplot( p1, p2, p3, p4, nrows = 2, titleY = TRUE,margin = 0.05,shareX = TRUE )%>% layout(title ="Original Time-series and Reconstructed Time-series by ILT") fig ``` # Data Preprocessing {.tabset .tabset-fade .tabset-dropdown} Loading all packages that are required. ```{r, message=F, warning=F} library(dplyr) library(arm) library(tidyr) library(ggplot2) library(ggrepel) library(plot3D) library(scatterplot3d) library(plotly) library(fastDummies) library(forecast) # Load Previously Computed Workspace: load("C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_Econ_SpaceKime.RData") ``` Load the data and preprocess the data. ```{r message=F, error=F, warning=F} setwd("C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics") eu <- read.csv("Master_Aggregate_EU_Econ_Data_11_29_2018_TimeTransform.csv", stringsAsFactors = F)[,-5] colnames(eu) <- c("country","time","feature","value") eu <- filter(eu,!country %in% c("European Union (25 countries)","D1_Country","")) eu$value <- sapply(c(1:nrow(eu)),function(x) as.numeric(gsub(":|,","",eu$value[x]))) eu <- filter(eu, feature != "") dim(eu) ``` # Reformat the data into a 3D array (country \* feature \* time) ```{r warning=F} unq_country <- sort(unique(eu$country)) unq_time <- sort(unique(eu$time)) unq_fea <- sort(unique(eu$feature)) num_country <- length(unq_country) num_time <- length(unq_time) num_fea <- length(unq_fea) eu <- arrange(eu,country,time,feature) eu_3d_array <- array(NA,dim = c(num_country,num_time,num_fea),dimnames = list(unq_country,unq_time,unq_fea)) for (i in 1:num_country){ for (j in 1:num_time){ for (k in 1:num_fea){ eu_3d_array[i,j,k] = eu$value[(i-1)*num_time*num_fea + (j-1)*num_fea + k] } } } eu_3d_array[1:10,1:10,1] ``` # Data visualization ```{r warning=F} eu <- arrange(eu,time,feature,country) eu_visualization <- dplyr::select(eu,time,feature,country,value) eu_visualization$time <- sapply(c(1:nrow(eu_visualization)),function(x) as.numeric(gsub("Q",".",eu_visualization$time[x]))) eu_visualization$feature <- as.factor(eu_visualization$feature) eu_visualization$country <- as.factor(eu_visualization$country) eu_visualization$value <- as.numeric(eu_visualization$value) eu_visualization$feature <- sapply(c(1:nrow(eu_visualization)),function(x) substr(eu_visualization$feature[x],1,20)) #plot_ly(eu_visualization, x = ~time, y = ~country, z = ~value, color = ~feature,split = ~ country,type = 'scatter3d', mode = 'lines') ``` # Time series format ```{r warning=F} #Find the duplicates eu_time_series <- na.omit(eu) allFeatures = as.character(unique(eu_time_series$feature)) allTime = unique(eu_time_series$time) allCountry = as.character(unique(eu_time_series$country)) allCombination = length(allFeatures)*length(allTime)*length(allCountry) dup = c() for (i in 1:length(allFeatures)){ for (j in 1:length(allCountry)){ for (k in 1:length(allTime)){ if (nrow(filter(eu_time_series,country == allCountry[j] & feature == allFeatures[i] & time == allTime[k]))>1){ dup = c(dup,as.character(allFeatures[i])) break } } break } } dup #These features have multiple observations at the same time point ``` Remove duplicates. ```{r warning=F} removeDup = filter(eu_time_series, feature != "Employment by sex, age and educational attainment level, Total, From 15 to 64 years, All ISCED 2011 levels" & feature != "Labor cost for LCI excluding bonuses" & feature != "Labor costs other than wages or salaries" & feature != "Labour cost for LCI (compensation of employees plus taxes minus subsidies)" & feature != "Labour cost for LCI excluding bonuses" & feature != "Labour costs other than wages and salaries" & feature != "Wages and salaries (total)") time_series = spread(removeDup,feature,value) dim(time_series) ``` # Fit the ARIMA model for each country ## Belgium Extract Belgium data. ```{r warning=F} #Chose Belgium and fit the arima belgium = filter(time_series,country == "Belgium") belgium = belgium[, colSums(is.na(belgium)) != nrow(belgium)] #delete the feature that is missing at all the time point belgium = dplyr::select(belgium,-time,-country) dim(belgium) ``` Manually clean (preprocess) the Belgium data and fit ARIMAX model $$Y = BelguimSuperSample\$'Unemployment,\ Females,\ From\ 15-64\ years,\ Total',$$ $$X = XReg\ (all\ other\ covariates\ -starts\_with('Unemployment')).$$ ```{r warning=F} #super sample the dataset cleardata <- function(mat) { for (i in 1:ncol(mat)) { mat[is.na(mat[,i]),i]<-mean(mat[,i],na.rm = T) + rnorm(sum(is.na(mat[,i])),sd = sd(mat[,i],na.rm = T)) } return(mat) } #use spline regression model to expand the dataset splinecreate <- function(mat) { res<-NULL for (i in 1:ncol(mat)) { sp<-smooth.spline(seq(1:72),mat[,i]) spresult<-predict(sp,seq(from=0,by=1/5,length.out = 360)) spfeat<-spresult$y+rnorm(360,sd=sd(mat[,i])) res<-cbind(res,spfeat) } colnames(res)<-colnames(mat) return(res) } BelguimMatrix = as.matrix(belgium) BelguimMatrix = cleardata(BelguimMatrix) BelguimMatrix = splinecreate(BelguimMatrix) BelguimSuperSample = as.data.frame(BelguimMatrix) ############################# # Outcome Features to examine, predict, forecast, spacekime-analyze # "Gross domestic product at market prices" # "Unemployment , Females, From 15-64 years, Total" # "Capital transfers, payable" # "Capital transfers, receivable" # "Debt securities" # "Government consolidated gross debt" # ... View(colnames(BelguimMatrix)) ############################# ################################################ # "Unemployment , Females, From 15-64 years, Total" Y = dplyr::select(BelguimSuperSample, "Unemployment , Females, From 15-64 years, Total"); dim(Y) X = dplyr::select(BelguimSuperSample, -starts_with("Unemployment")); dim(X) fitArimaX = auto.arima(Y, xreg=as.matrix(X[,-112])); fitArimaX$arma ################################################ # "Government consolidated gross debt" #Y = select(BelguimSuperSample, "Government consolidated gross debt"); dim(Y) #X = select(BelguimSuperSample, -matches("debt|Debt")); dim(X) # X_scale <- scale(X); Matrix::rankMatrix(X_scale); any(is.na(X_scale)); any(is.infinite(X_scale)) # remove columns with infinite or missing value # X_scale_1 <- X_scale[ , !is.infinite(colSums(X_scale)) & !is.na(colSums(X_scale))] #X_1 <- X[ , !is.infinite(colSums(X)) & !is.na(colSums(X))]; dim(X_1); Matrix::rankMatrix(X_1) #X_2 <- X_1[, qr(X_1)$pivot[seq_len(qr(X_1)$rank)]]; dim(X_2) # #fitArimaX = auto.arima(Y, xreg=as.matrix(X_2), method = "CSS", # "SANN", method = "CSS", optim.method = "BFGS" # optim.control=list(maxit = 20000, temp = 20), optim.method = "BFGS") #fitArimaX; View(sort(fitArimaX$coef)[1:10]); fitArimaX$arma ################################################ # "Gross domestic product at market prices" Y = dplyr::select(BelguimSuperSample, "Gross domestic product at market prices"); dim(Y) X = dplyr::select(BelguimSuperSample, -matches("debt|Debt")); dim(X) # 360 167 X <- X[, qr(X)$pivot[seq_len(qr(X)$rank)]]; dim(X) ts_Y <- ts(Y, start=c(2000,1), end=c(2017, 20), frequency = 20); length(ts_Y) set.seed(1234) fitArimaX = auto.arima(ts_Y, xreg=as.matrix(X[ , -c(50:60)])); fitArimaX$arma # 5 0 2 0 20 0 0 # sigma^2 estimated as 57.04: log likelihood=-1132.78 AIC=2593.55 AICc=2871.09 BIC=3230.88 pred_arimaX_5_0_2_Y_Belgium_train300_Belgium_test60 <- predict(fitArimaX, n.ahead = 60, newxreg = as.matrix(X[301:360 , -c(50:60)]))$pred plot(forecast(fitArimaX, xreg = as.matrix(X[301:360 , -c(50:60)])), # ARIMA forecast include=120, lwd=4, lty=3, xlab="Time", ylab="GDP", ylim=c(50, 150), main = "ARIMAX Analytics (Train: 2000-2017; Test: 2018-2020) GDP Forecasting\n based on fitting ARIMAX Models on raw (spline interpolated) Belgium data") lines(pred_arimaX_5_0_2_Y_Belgium_train300_Belgium_test60, col = "red", lwd = 4, lty=3) legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2017)", "ARIMAX(5,0,2)-model GDP Forecasting (2018-2020)", "ARIMAX(5,0,2)-model GDP Forecasting (2018-2020)"), col=c("black", "blue", "red"), lty=c(3,3,3), lwd=c(4,4,4), cex=1.2, x.intersp=1.5, y.intersp=0.6) text(2015, 60, expression(atop(paste("Training Region (2000-2017)"), paste(Model(Unempl) %->% "ARIMAX(p, q, r) ; ", XReg %==% X[i], " ", i %in% {1 : 167}))), cex=1.5) text(2019.5, 60, expression(atop(paste("Validation Region (2018-2020)"), paste(hat(Unempl) %<-% "ARIMAX(5,0 ,2); ", XReg %==% X[i], " ", i %in% {1 : 167}))), cex=1.5) ``` * *GDP Description*: [GDP](https://datamarket.com/data/set/196v/gross-domestic-product-at-market-prices) (gross domestic product) is an indicator for a nation's economic situation. It reflects the total value of all goods and services produced less the value of goods and services used for intermediate consumption in their production. Expressing GDP in PPS (purchasing power standards) eliminates differences in price levels between countries, and calculations on a per head basis allows for the comparison of economies significantly different in absolute size. [GDP unit of measure](https://ec.europa.eu/eurostat/statistics-explained/index.php/Glossary:GDP_per_capita_in_purchasing_power_standards) represents the *Current prices, euro per capita*. The volume index of GDP per capita in Purchasing Power Standards (PPS) is intended for cross-country comparisons rather than for temporal comparisons. GDP per capita when expressed in PPS eliminates the differences in price levels between countries allowing meaningful volume comparisons of GDP between countries. Expressed in relation to the European Union (*EU27 GDP = 100*), a country with an index that is higher than $100$ or lower than $100$ corresponds to that country's level of GDP per head being higher or lower than the EU average, respectively. ## Bulgaria Define a new function that (1) cleans the data, and (2) fits the ARIMA model estimating the seasonal and non-seasonal time-series parameters $(p,d,q)$ and the effect-sizes ($\beta$'s) for the exogenous regression features ($X$). ```{r warning=F} library(dplyr) #write a function of clean the data and fit the ARIMA model Fit_ARIMA <- function(countryData=Belgium, start=2000, end=2017, frequency=20, feature="Unemployment , Females, From 15-64 years, Total") { #delete features that are missing at all time points countryData = countryData[, colSums(is.na(countryData)) != nrow(countryData)] countryData = dplyr::select(countryData, -time, -country) DataMatrix = as.matrix(countryData) DataMatrix = cleardata(DataMatrix) DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove feature that only has one value DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove feature that all the values are 0 DataMatrix = splinecreate(DataMatrix) DataSuperSample = as.data.frame(DataMatrix) if (feature=="Unemployment , Females, From 15-64 years, Total") { Y = dplyr::select(DataSuperSample, "Unemployment , Females, From 15-64 years, Total") X = dplyr::select(DataSuperSample, -starts_with("Unemployment")) } else if (feature=="Gross domestic product at market prices") { Y = dplyr::select(DataSuperSample, "Gross domestic product at market prices"); dim(Y) X = dplyr::select(DataSuperSample, -matches("debt|Debt")); dim(X) # 360 167 print(paste0("dim(X)=(", dim(X)[1], ",", dim(X)[2], "); ", " dim(Y)=(", dim(Y)[1], ",", dim(Y)[2], ") ...")) X <- X[, qr(X)$pivot[seq_len(qr(X)$rank)]]; dim(X) # ensure full-rank design matrix, X } else { print(paste0("This feature ", feature, " is not implemented yet! Exiting Fit_ARIMA() method ...")) return(NULL) } ts_Y <- ts(Y, start=c(start, 1), end=c(end, frequency), frequency = frequency); length(ts_Y) set.seed(1234) fitArimaX = auto.arima(ts_Y, xreg=as.matrix(X)) return(fitArimaX) } Bulgaria = filter(time_series,country == "Bulgaria") BulgariaARIMA = Fit_ARIMA(countryData=Bulgaria, start=2000, end=2017, frequency=20, feature="Gross domestic product at market prices") BulgariaARIMA$arma # Extend the Fit-ARIMA method to ensure testing-training modeling/assessment for 2 countries works preprocess_ARIMA <- function(countryData=Belgium, start=2000, end=2017, frequency=20, feature="Unemployment , Females, From 15-64 years, Total") { #delete features that are missing at all time points countryData = countryData[, colSums(is.na(countryData)) != nrow(countryData)] countryData = dplyr::select(countryData, !any_of(c("time", "country"))) DataMatrix = as.matrix(countryData) DataMatrix = cleardata(DataMatrix) DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove features with only 1 value DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove features with all values=0 DataMatrix = splinecreate(DataMatrix) DataSuperSample = as.data.frame(DataMatrix) # super-Sample the data print(paste0("Processing feature: ...", feature, "... ")) if (feature=="Unemployment , Females, From 15-64 years, Total") { Y = dplyr::select(DataSuperSample, "Unemployment , Females, From 15-64 years, Total") X = dplyr::select(DataSuperSample, -starts_with("Unemployment")) } else if (feature=="Gross domestic product at market prices") { Y = dplyr::select(DataSuperSample, "Gross domestic product at market prices"); dim(Y) X = dplyr::select(DataSuperSample, -matches("debt|Debt")); X <- X [, -c(50:80)]; dim(X) # 360 167 } else { print(paste0("This feature: ...", feature, "... is not implemented yet! Exiting preprocess_ARIMA() method ...")) return(NULL) } # reduce the number of observations (matrix rows) to specified time range len_1 <- (end + 1 - start) * frequency; print(paste0("dim(X)[1]=", len_1)) X <- X[1:len_1 , qr(X[1:len_1 , ])$pivot[seq_len(qr(X[1:len_1 , ])$rank)]]; dim(X) # ensure full-rank design matrix, X Y <- as.data.frame(Y[1:len_1 , ]) print(paste0("dim(X)=(", dim(X)[1], ",", dim(X)[2], "); ", # 300 136 " dim(Y)=(", dim(Y)[1], ",", dim(Y)[2], ") ...")) # 300 1 return(list("X"=X, "Y"=Y)) } # Outcome Variable to be modeled, as a timeseries: 2000 - 2017 (18 years, Quarterly measures) # Spline interpolation *5; 2000-01 - 2014-20 (300 observations for training): 60 observations (2015-2017) for Testing Belgium <- filter(time_series, country == "Belgium") Bulgaria <- filter(time_series, country == "Bulgaria") Netherlands <- filter(time_series, country == "Netherlands") # Test preprocess_ARIMA() #preprocess_Belgium <- preprocess_ARIMA(countryData=Belgium, start=2000, end=2014, # frequency=20, feature="Gross domestic product at market prices") #preprocess_Bulgaria <- preprocess_ARIMA(countryData=Bulgaria, start=2000, end=2014, # frequency=20, feature="Gross domestic product at market prices") # General function that ensures the XReg predictors for 2 countries are homologous homologousX_features <- function (X_Country1, X_Country2){ # Check if the Belgium and Bulgaria Xreg are homologous (same feature columns) common_cols <- intersect(colnames(X_Country1), colnames(X_Country2)) X_Country1 <- subset(X_Country1, select = common_cols) X_Country2 <- subset(X_Country2, select = common_cols) print(paste0("dim(X1)=(", dim(X_Country1)[1], ",", dim(X_Country1)[2], "); ", # 300 131 " dim(X2)=(", dim(X_Country2)[1], ",", dim(X_Country2)[2], ")!")) # 300 131 return(list("X_Country1"=X_Country1, "X_Country2"=X_Country2)) } # Test homologousX_features # homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X) # X_Belgium <- homoFeat$X_Country1 # X_Bulgaria <- homoFeat$X_Country2 fit_ARIMA <- function(country1Data=Belgium, country2Data=Bulgaria, start=2000, end=2014, frequency=20, feature="Gross domestic product at market prices") { preprocess_Country1 <- preprocess_ARIMA(countryData=country1Data, start=start, end=end, frequency=frequency, feature=feature) preprocess_Country2 <- preprocess_ARIMA(countryData=country2Data, start=start, end=end, frequency=frequency, feature=feature) ts_Y_Country1 <- ts(preprocess_Country1$Y, start=c(start, 1), end=c(end, frequency), frequency = frequency); length(ts_Y_Country1) homoFeat <- homologousX_features(preprocess_Country1$X, preprocess_Country2$X) X_Country1 <- homoFeat$X_Country1 X_Country2 <- homoFeat$X_Country2 set.seed(1234) fitArimaX_Country1 = auto.arima(ts_Y_Country1, xreg=as.matrix(X_Country1)) return(fitArimaX_Country1) } # Belgium = filter(time_series,country == "Belgium") BelgiumARIMA = fit_ARIMA(country1Data=Belgium, country2Data=Bulgaria, # country2Data=Netherlands, start=2000, end=2014, frequency=20, feature="Gross domestic product at market prices") BelgiumARIMA$arma # [1] 4 0 2 0 20 0 0 # sigma^2 estimated as 45.99: log likelihood=-919.13 AIC=2116.26 AICc=2359.51 BIC=2631.09 # Outcome Variable to be modeled, as a timeseries: 2000 - 2017 (18 years, Quarterly measures) # Spline interpolation x5; 2000-01 - 2014-20 (300 observations for training) # ts_Y_Belgium_train <- ts(Y_Belgium_train, start=c(2000,1), end=c(2014, 20), frequency=20) # length(ts_Y_Belgium_train) # ts_Y_Belgium_test <- ts(Y_Belgium_test, start=c(2015,1), end=c(2017, 20), frequency = 20) #length(ts_Y_Belgium_test) # Find ARIMAX model # arimaX_Belgium_train <- auto.arima(ts_Y_Belgium_train, xreg=as.matrix(X_Belgium_train)); # arimaX_Belgium_train$arma ``` # TS Forecasting ## Perform ARIMAX modeling Predict `Y={Gross domestic product at market prices}`, using all other features not directly related to ***GDP**. Recall the core data organization: - 131 (common BE + BG) `Xreg` Predictors and 1 Outcome Variable to be modeled as a timeseries: 2000 - 2014 (15 years, Quarterly measures, 5-fold spline interpolation, $15*4*5=300$); - Spline interpolation x5 (freq=20 observations per year); 2000-01 to 2014-20 (300 timepoint observations over 15 years, for `training`). The remaining 60 timepoints used for `testing` (2015-01 to 2017-20). Report `ARIMA(p,d,q)` params and quality metrics AIC/BIC. ```{r message=F, error=F, warning=F} # Previously, we already extracted the Belgium and Bulgaria Data, preprocess it, # and fit the ARIMAX (Belgium-training) model. Now, we will assess the model on Bulgaria_testing sets # BelgiumARIMA = fit_ARIMA(country1Data=Belgium, country2Data=Bulgaria, # start=2000, end=2014, frequency=20, # feature="Gross domestic product at market prices") # BelgiumARIMA$arma # [1] 4 0 2 0 20 0 0 # View rank-ordered ARIMAX effects: # View(BelgiumARIMA$coef[order(BelgiumARIMA$coef)]) sort(BelgiumARIMA$coef)[1:10] # Unemployment , Females, From 15-64 years, From 18 to 23 months # -1.5203281 # ar2 # -1.1808472 # Labor cost other than wages and salaries # -0.8380554 # Unemployment , Females, From 15-64 years, From 12 to 17 months # -0.7037336 # ar4 # -0.6360880 # sar2 # -0.6260002 # Unemployment , Females, From 15-64 years, From 3 to 5 months # -0.5262376 # Labor cost for LCI (compensation of employees plus taxes minus subsidies) # -0.2885038 # Unemployment , Females, From 15-64 years, 48 months or over # -0.2773650 # Unemployment , Males, From 15-64 years, from 3 to 5 months # -0.2567934 #Get the Prospective Xreg=X design matrices ready (2015-2017, 60 rows) preprocess_Belgium <- preprocess_ARIMA(countryData=Belgium, start=2000, end=2017, frequency=20, feature="Gross domestic product at market prices") preprocess_Bulgaria <- preprocess_ARIMA(countryData=Bulgaria, #Netherlands start=2000, end=2017, frequency=20, feature="Gross domestic product at market prices") homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X) X_Belgium_test <- homoFeat$X_Country1[301:360, ]; dim(X_Belgium_test) X_Bulgaria_test <- homoFeat$X_Country2[301:360, ]; dim(X_Bulgaria_test) # Get Predictions pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60 <- forecast(BelgiumARIMA, xreg = as.matrix(X_Bulgaria_test))$mean pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60 <- forecast(BelgiumARIMA, xreg = as.matrix(X_Belgium_test))$mean pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60 <- forecast(BelgiumARIMA, xreg = as.matrix(X_Bulgaria_test))$mean + mean(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60) - mean(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60) cor(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, ts_Y_Belgium_test) # 0.11 mean(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60) # [1] 118 # Alternative predictions: # X_Country1 <- X_Belgium_test; X_Country2 <- X_Bulgaria_test # pred_arimaX_1_0_2_Y_Belgium_train300_Bulgaria_test60 <- predict(BelgiumARIMA, n.ahead = 60, newxreg = X_Bulgaria_test)$pred # pred_arimaX_1_0_2_Y_Belgium_train300_Belgium_test60 <- predict(BelgiumARIMA, n.ahead = 60, newxreg = X_Belgium_test)$pred ``` ## Display the alternative (spacetime) analytical models (ARIMAX) Plot only the last 5-years of training (2010-2014), $100TimePoints=5Years\times 4Quarters\times 5SuperSample$ and the 3-year prospective forecasting $60TimePoints=3Years\times 4Quarters\times 5SuperSample$. ```{r message=F, error=F, warning=F, fig.width=8, fig.height=8} ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ], start=c(2015,1), end=c(2017, 20), frequency = 20) length(ts_Y_Belgium_test) # windows(width=14, height=10) plot(forecast(BelgiumARIMA, xreg = as.matrix(X_Belgium_test)), # ARIMA forecast include=100, lwd=4, lty=3, xlab="Time", ylab="GDP Purchasing Power Standards (PPS)", ylim=c(25, 150), main = "ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n based on fitting ARIMAX Models on raw (spline interpolated) Belgium data") lines(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, col = "green", lwd = 4, lty=2) # Belgium train+test lines(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60, col = "purple", lwd = 4, lty=1) # Belgium train+Bulgaria test lines(pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60, col = "orange", lwd = 4, lty=1) # Belgium train+ Offset Bulgaria test: 188.3753 - 416.5375 lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1) # Observed Y_Test timeseries legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)", "ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)", "ARIMAX(4,0,2) Belgium train + XReg=Belgium test (2015-2017)", "ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)", "Offset ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)", "Belgium Official Reported GDP (2015-2017)"), col=c("black", "blue", "green", "purple", "orange", "red"), lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7) text(2012.5, 30, expression(atop(paste("Training Region (2000-2014)"), paste(Model(GDP) %->% "ARIMAX(p, q, r) ; ", XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2) text(2016.5, 30, expression(atop(paste("Validation Region (2015-2017)"), paste(hat(GDP) %<-% "ARIMAX(4, 0, 2); ", XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2) ``` Plot entire 15-year training time-span (2000-2014), $300TimePoints=15Years\times 4Quarters\times 5SuperSample$ and the 3-year prospective forecasting $60TimePoints=3Years\times 4Quarters\times 5SuperSample$. ```{r message=F, error=F, warning=F, fig.width=8, fig.height=8} ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ], start=c(2015,1), end=c(2017, 20), frequency = 20) length(ts_Y_Belgium_test) # windows(width=14, height=10) plot(forecast(BelgiumARIMA, xreg = as.matrix(X_Belgium_test)), # ARIMA forecast lwd=4, lty=3, xlab="Time", ylab="GDP Purchasing Power Standards (PPS)", ylim=c(25, 150), main = "ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n based on fitting ARIMAX Models on raw (spline interpolated) Belgium data") lines(pred_arimaX_4_0_2_Y_Belgium_train300_Belgium_test60, col = "green", lwd = 4, lty=2) # Belgium train+test lines(pred_arimaX_4_0_2_Y_Belgium_train300_Bulgaria_test60, col = "purple", lwd = 4, lty=1) # Belgium train+Bulgaria test lines(pred_arimaX_4_0_2_Y_Belgium_train300_Offset_Bulgaria_test60, col = "orange", lwd = 4, lty=1) # Belgium train+ Offset Bulgaria test: 188.3753 - 416.5375 lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1) # Observed Y_Test timeseries legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)", "ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)", "ARIMAX(4,0,2) Belgium train + XReg=Belgium test (2015-2017)", "ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)", "Offset ARIMAX(4,0,2) Belgium train + XReg=Bulgaria test (2015-2017)", "Belgium Official Reported GDP (2015-2017)"), col=c("black", "blue", "green", "purple", "orange", "red"), lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7) text(2005, 30, expression(atop(paste("Training Region (2000-2014)"), paste(Model(GDP) %->% "ARIMAX(p, q, r) ; ", XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2) text(2015, 30, expression(atop(paste("Validation Region (2015-2017)"), paste(hat(GDP) %<-% "ARIMAX(4, 0, 2); ", XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2) ``` # Spacekime analytics ## Generic K-Space transformations (FT/IFT) Let's start by defining the generic k-space transformation. ```{r message=F, error=F, warning=F, fig.width=12, fig.height=12} # FT/Spacekime Analytics # 1D timeseries FFT SHIFT fftshift1D <- function(img_ff) { rows <- length(img_ff) rows_half <- ceiling(rows/2) return(append(img_ff[(rows_half+1):rows], img_ff[1:rows_half])) } # Generic function to Transform Data ={all predictors (X) and outcome (Y)} to k-space (Fourier domain) kSpaceTransform <- function(data, inverse = FALSE, reconPhases = NULL) { # ForwardFT (rawData, FALSE, NULL) # InverseFT(magnitudes, TRUE, reconPhasesToUse) or InverseFT(FT_data, TRUE, NULL) FT_data <- array(complex(), c(dim(data)[1], dim(data)[2])) mag_FT_data <- array(complex(), c(dim(data)[1], dim(data)[2])) phase_FT_data <- array(complex(), c(dim(data)[1], dim(data)[2])) IFT_reconPhases_data <- array(complex(), c(dim(data)[1], dim(data)[2])) for (i in 1:dim(data)[2]) { if (inverse == FALSE | is.null(reconPhases)) { FT_data[ , i] <- fft(data[ , i], inverse) X2 <- FT_data[ , i] # plot(fftshift1D(log(Re(X2)+2)), main = "log(fftshift1D(Re(FFT(timeseries))))") mag_FT_data[ , i] <- sqrt(Re(X2)^2+Im(X2)^2); # plot(log(fftshift1D(Re(mag_FT_MCSI_data))), main = "log(Magnitude(FFT(timeseries)))") phase_FT_data[ , i] <- atan2(Im(X2), Re(X2)); # plot(Re(fftshift1D(phase_FT_MCSI_data[ , 1])), main = "Shift(Phase(FFT(timeseries)))") } else { # for IFT synthesis using user-provided Phases, typically from kime-phase aggregators Real <- data[ , i] * cos(reconPhases[ , i]) Imaginary <- data[ , i] * sin(reconPhases[ , i]) IFT_reconPhases_data[ ,i] <- Re(fft(Real+1i*Imaginary, inverse = TRUE)/length(data[ , i])) } } ######### Test the FT-IFT analysis-synthesis back-and-forth transform process # to confirm calculations # X2 <- FT_data[ , 1]; mag_FT_data[ , 1] <- sqrt(Re(X2)^2+Im(X2)^2); # phase_FT_data[ , 1] <- atan2(Im(X2), Re(X2)); # Real2 = mag_FT_data[ , 1] * cos(phase_FT_data[ , 1]) # Imaginary2 = mag_FT_data[ , 1] * sin(phase_FT_data[ , 1]) # man_hat_X2 = Re(fft(Real2 + 1i*Imaginary2, inverse = T)/length(X2)) # ifelse(abs(man_hat_X2[5] - data[5, 1]) < 0.001, "Perfect Synthesis", "Problems!!!") ######### if (inverse == FALSE | is.null(reconPhases)) { return(list("magnitudes"=mag_FT_data, "phases"=phase_FT_data)) # Use kSpaceTransform$magnitudes & kSpaceTransform$phases to retrieve teh Mags and Phases } else { return(IFT_reconPhases_data) # Use Re(kSpaceTransform) to extract spacetime Real-valued reconstructed data } } ``` ## Kime-Phase Distributions Examine the Kime-direction Distributions of the Phases for all *Belgium* features (predictors + outcome). Define a generic function that plots the Phase distributions. ```{r message=F, error=F, warning=F, fig.width=12, fig.height=12} library(tidyr) library(ggplot2) plotPhaseDistributions <- function (dataFT, dataColnames, size=10, ...) { df.phase <- as.data.frame(Re(dataFT$phases)) df.phase %>% gather() %>% head() colnames(df.phase) <- dataColnames phaseDistributions <- gather(df.phase) colnames(phaseDistributions) <- c("Feature", "Phase") if (is.null(size)) size=10 # map the value as our x variable, and use facet_wrap to separate by the key column: ggplot(phaseDistributions, aes(Phase)) + # geom_histogram(bins = 10) + geom_histogram(aes(y=..density..), bins = 10) + facet_wrap( ~Feature, scales = 'free_x') + xlim(-pi, pi) + theme(strip.text.x = element_text(size = size, colour = "black", angle = 0)) } # homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X) X_Belgium <- homoFeat$X_Country1; dim(X_Belgium) Y_Belgium <- preprocess_Belgium$Y; dim(Y_Belgium) FT_Belgium <- kSpaceTransform(cbind(X_Belgium, Y_Belgium), FALSE, NULL) dataColnames <- c(colnames(X_Belgium), "Y_GDP_Belgium") plotPhaseDistributions(FT_Belgium, dataColnames) IFT_FT_Belgium <- kSpaceTransform(FT_Belgium$magnitudes, TRUE, FT_Belgium$phases) # Check IFT(FT) == I: # ifelse(abs(cbind(X_Belgium, Y_Belgium)[5,4] - Re(IFT_FT_Belgium[5,4])) < 0.001, "Perfect Synthesis", "Problems!!!") ``` ## Nil-Phase Synthesis and ARIMAX re-modeling Perform Nil-Phase reconstruction - `IFT_NilPhase_FT_Belgium` - and then re-fit the ARIMAX model ```{r message=F, error=F, warning=F} # 1. Nil-Phase data synthesis (reconstruction) temp_Data <- cbind(X_Belgium, Y_Belgium) nilPhase_FT_data <- array(complex(real=0, imaginary=0), c(dim(temp_Data)[1], dim(temp_Data)[2])) dim(nilPhase_FT_data) # ; head(nilPhase_FT_data) # [1] 360 132 IFT_NilPhase_FT_Belgium <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2])) # Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal with nil-phase IFT_NilPhase_FT_Belgium <- Re(kSpaceTransform(FT_Belgium$magnitudes, TRUE, nilPhase_FT_data)) colnames(IFT_NilPhase_FT_Belgium) <- c(colnames(X_Belgium), "Y_GDP_Belgium") dim(IFT_NilPhase_FT_Belgium); dim(FT_Belgium$magnitudes) # colnames(IFT_NilPhase_FT_Belgium); head(IFT_NilPhase_FT_Belgium); # head(temp_Data) # 2. Perform ARIMAX modeling on IFT_NilPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC # library(forecast) IFT_NilPhase_FT_Belgium_Y_train <- IFT_NilPhase_FT_Belgium[1:300, 132]; length(IFT_NilPhase_FT_Belgium_Y_train) IFT_NilPhase_FT_Belgium_Y_test <- IFT_NilPhase_FT_Belgium[301:360]; length(IFT_NilPhase_FT_Belgium_Y_test) # Training and Testing Data Covariates explaining the longitudinal outcome (Y) IFT_NilPhase_FT_Belgium_X_train <- as.data.frame(IFT_NilPhase_FT_Belgium)[1:300, 1:131]; dim(IFT_NilPhase_FT_Belgium_X_train) IFT_NilPhase_FT_Belgium_X_test <- as.data.frame(IFT_NilPhase_FT_Belgium)[301:360, 1:131]; dim(IFT_NilPhase_FT_Belgium_X_test) # Outcome Variable to be ARIMAX-modeled, as a timeseries ts_IFT_NilPhase_FT_Belgium_Y_train <- ts(IFT_NilPhase_FT_Belgium_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20) # Find ARIMAX model: 2 1 2 0 20 0 0 set.seed(1234) modArima_IFT_NilPhase_FT_Belgium_Y_train <- auto.arima(ts_IFT_NilPhase_FT_Belgium_Y_train, xreg=as.matrix(IFT_NilPhase_FT_Belgium_X_train)) modArima_IFT_NilPhase_FT_Belgium_Y_train$arma # Regression with ARIMA(2,0,1)(2,0,0)[20] errors # Coefficients: # ar1 ar2 ma1 sar1 sar2 Acquisitions less disposals of non-financial non-produced assets # -1.4232 -0.8592 -0.987 0.0941 -0.3451 0.0123 #s.e. 0.0341 0.0311 NaN 0.0688 0.0742 0.0007 # sigma^2 estimated as 0.8622: log likelihood=-320.39 AIC=914.79 AICc=1148.19 BIC=1422.2 pred_arimax_2_0_1_Nil <- forecast(modArima_IFT_NilPhase_FT_Belgium_Y_train, xreg = as.matrix(IFT_NilPhase_FT_Belgium_X_test)) pred_arimax_2_0_1_Nil_2015_2017 <- ts(pred_arimax_2_0_1_Nil$mean, frequency=20, start=c(2015,1), end=c(2017,20)) pred_arimax_2_0_1_Nil_2015_2017 # alternatively: # pred_arimax_1_0_1_2015_2017 <- predict(modArima_IFT_NilPhase_FT_Belgium_Y_train, # n.ahead = 3*20, newxreg = IFT_NilPhase_FT_Belgium_X_test)$pred sort(modArima_IFT_NilPhase_FT_Belgium_Y_train$coef)[1:10] # Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-1.5972295 # ar1, effect=-1.4231617 # Labor cost other than wages and salaries, effect=-1.2213214 # ma1, effect=-0.9869571 # ar2, effect=-0.8591937 # Unemployment , Females, From 15-64 years, From 12 to 17 months, effect=-0.7075454 # Unemployment , Total, From 15-64 years, From 18 to 23 months, effect=-0.5797656 # Unemployment , Males, From 15-64 years, from 3 to 5 months, effect=-0.5026139 # sar2, effect=-0.3450866 # Unemployment , Males, From 15-64 years, from 24 to 47 months, effect=-0.2965540 cor(pred_arimax_2_0_1_Nil$mean, ts_Y_Belgium_test) # 0.14 mean(pred_arimax_2_0_1_Nil_2015_2017) # [1] 105 ``` ## Swapped-Phase Synthesis and ARIMAX Modeling Space-time reconstructions by inverting back the `mag_FT` for each feature signal after swapping the feature-phases. In other words, we randomly shuffle the `columns` of the Phases-matrix (Training & Testing XReg Data) and use these *swapped* phases to synthesize the design covariate matrix ($Xreg$). ```{r message=F, error=F, warning=F} # 1. Swap Feature Phases and then synthesize the data (reconstruction) # temp_Data <- cbind(X_Belgium, Y_Belgium) swapped_phase_FT_Belgium_data <- FT_Belgium$phases colnames(swapped_phase_FT_Belgium_data) <- c(colnames(X_Belgium), "Y_GDP_Belgium") swapped_phase_FT_Belgium_data1 <- swapped_phase_FT_Belgium_data dim(swapped_phase_FT_Belgium_data) # ; head(swappedPhase_FT_data) # [1] 360 132 IFT_SwappedPhase_FT_Belgium <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2])) set.seed(12345) # sample randomly Phase-columns for each of the 131 covariates (X) #swap_phase_FT_Belgium_indices <- sample(ncol(swapped_phase_FT_Belgium_data)-1) # for (j in 1:131) { # for all columns of the design Xreg matrix, excluding Y, randomly swap columns phases # swapped_phase_FT_Belgium_data1[ , j] <- swapped_phase_FT_Belgium_data[, swap_phase_FT_Belgium_indices[j]] #} swapped_phase_FT_Belgium_data1 <- as.data.frame(cbind( swapped_phase_FT_Belgium_data[ , sample(ncol(swapped_phase_FT_Belgium_data[ , 1:131]))], swapped_phase_FT_Belgium_data[ , 132])) swapped_phase_FT_Belgium_data <- swapped_phase_FT_Belgium_data1 colnames(swapped_phase_FT_Belgium_data)[132] <- "Y_GDP_Belgium" # colnames(swapped_phase_FT_Belgium_data) dim(swapped_phase_FT_Belgium_data); dim(FT_Belgium$phases) # Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal using the feature swapped phases IFT_SwappedPhase_FT_Belgium <- Re(kSpaceTransform(FT_Belgium$magnitudes, TRUE, swapped_phase_FT_Belgium_data)) colnames(IFT_SwappedPhase_FT_Belgium) <- c(colnames(X_Belgium), "Y_GDP_Belgium") dim(IFT_SwappedPhase_FT_Belgium); dim(FT_Belgium$magnitudes) colnames(IFT_SwappedPhase_FT_Belgium); tail(IFT_SwappedPhase_FT_Belgium); # tail(temp_Data) # 2. Perform ARIMAX modeling on IFT_SwappedPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC # library(forecast) IFT_SwappedPhase_FT_Belgium_Y_train <- IFT_SwappedPhase_FT_Belgium[1:300, 132]; length(IFT_SwappedPhase_FT_Belgium_Y_train) IFT_SwappedPhase_FT_Belgium_Y_test <- IFT_SwappedPhase_FT_Belgium[301:360]; length(IFT_SwappedPhase_FT_Belgium_Y_test) # Training and Testing Data Covariates explaining the longitudinal outcome (Y) IFT_SwappedPhase_FT_Belgium_X_train <- as.data.frame(IFT_SwappedPhase_FT_Belgium)[1:300, 1:131] dim(IFT_SwappedPhase_FT_Belgium_X_train) IFT_SwappedPhase_FT_Belgium_X_test <- as.data.frame(IFT_SwappedPhase_FT_Belgium)[301:360, 1:131] dim(IFT_SwappedPhase_FT_Belgium_X_test) # Outcome Variable to be ARIMAX-modeled, as a timeseries ts_IFT_SwappedPhase_FT_Belgium_Y_train <- ts(IFT_SwappedPhase_FT_Belgium_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20) # Find ARIMAX model: 1 0 2 0 20 0 0 set.seed(1234) modArima_IFT_SwappedPhase_FT_Belgium_Y_train <- auto.arima(ts_IFT_SwappedPhase_FT_Belgium_Y_train, xreg=as.matrix(IFT_SwappedPhase_FT_Belgium_X_train)) modArima_IFT_SwappedPhase_FT_Belgium_Y_train$arma # Regression with ARIMA(1,0,0)(2,0,0)[20] errors # Coefficients: # ar1 sar1 sar2 intercept Acquisitions less disposals of non-financial non-produced assets # -0.3837 -0.2196 0.2827 46.1704 0.0063 #s.e. 0.1113 0.0903 0.0779 36.8492 0.0080 # sigma^2 estimated as 70: log likelihood=-976.01 AIC=2224.02 AICc=2452.63 BIC=2727.73 pred_arimax_1_0_0_Swapped <- forecast(modArima_IFT_SwappedPhase_FT_Belgium_Y_train, xreg = as.matrix(IFT_SwappedPhase_FT_Belgium_X_test)) pred_arimax_1_0_0_Swapped_2015_2017 <- ts(pred_arimax_1_0_0_Swapped$mean, frequency=20, start=c(2015,1), end=c(2017,20)) pred_arimax_1_0_0_Swapped_2015_2017 # alternatively: # pred_arimax_1_0_0_Swapped_2015_2017 <- predict(modArima_IFT_SwappedPhase_FT_Belgium_Y_train, # n.ahead = 3*20, newxreg = IFT_SwappedPhase_FT_Belgium_X_test)$pred sort(modArima_IFT_SwappedPhase_FT_Belgium_Y_train$coef)[1:10] # ar1, effect=-0.38372043 # Unemployment , Females, From 15-64 years, From 18 to 23 months, effect=-0.31137514 # Labor cost other than wages and salaries, effect=-0.31094561 # sar1, effect=-0.21964957 # Unemployment , Females, From 15-64 years, From 6 to 11 months, effect=-0.20878853 #Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-0.12497311 # Unemployment , Males, From 15-64 years, Less than 1 month, effect=-0.10849013 # Unemployment , Total, From 15-64 years, 48 months or over, effect=-0.09066684 # Unemployment , Total, From 15-64 years, From 1 to 2 months, effect=-0.05852382 # Unemployment , Females, From 15-64 years, 48 months or over, effect=-0.05695172 cor(pred_arimax_1_0_0_Swapped$mean, ts_Y_Belgium_test) # 0.0 mean(pred_arimax_1_0_0_Swapped_2015_2017) # [1] 105 ``` ## Random-Phase Synthesis and ARIMAX re-modeling Perform Random-Phase reconstruction - `IFT_RandPhase_FT_Belgium` - by randomly sampling from the phase distributions for each feature and then re-fitting the ARIMAX model ```{r message=F, error=F, warning=F} # 1. Random-Phase data synthesis (reconstruction) # temp_Data <- cbind(X_Belgium, Y_Belgium) randPhase_FT_data <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2])) dim(randPhase_FT_data) # ; head(randPhase_FT_data) # [1] 360 132 IFT_RandPhase_FT_Belgium <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2])) randPhase_FT_data <- FT_Belgium$phases for (i in 1:(dim(randPhase_FT_data)[2] -1)) { if (i < dim(randPhase_FT_data)[2]) { set.seed(12345) # sample randomly Phases for each of the 131 predictors covariates (X) randPhase_FT_data[ , i] <- FT_Belgium$phases[sample(nrow(FT_Belgium$phases)), i] } else { } # for the Y outcome (Last Column) - do not change the phases of the Y } # Invert back to spacetime the FT_Belgium$magnitudes[ , i] signal with avg-phase IFT_RandPhase_FT_Belgium <- Re(kSpaceTransform(FT_Belgium$magnitudes, TRUE, randPhase_FT_data)) colnames(IFT_RandPhase_FT_Belgium) <- c(colnames(X_Belgium), "Y_GDP_Belgium") dim(IFT_RandPhase_FT_Belgium); dim(FT_Belgium$magnitudes) # colnames(IFT_RandPhase_FT_Belgium); tail(IFT_RandPhase_FT_Belgium); # tail(temp_Data) dim(IFT_RandPhase_FT_Belgium); head(Re(IFT_RandPhase_FT_Belgium)); tail(Re(IFT_RandPhase_FT_Belgium)) # 2. Perform ARIMAX modeling on IFT_RandPhase_FT_Belgium; report (p,d,q) params and quality metrics AIC/BIC # library(forecast) IFT_RandPhase_FT_Belgium_Y_train <- IFT_RandPhase_FT_Belgium[1:300, 132]; length(IFT_RandPhase_FT_Belgium_Y_train) IFT_RandPhase_FT_Belgium_Y_test <- IFT_RandPhase_FT_Belgium[301:360]; length(IFT_RandPhase_FT_Belgium_Y_test) # Training and Testing Data Covariates explaining the longitudinal outcome (Y) IFT_RandPhase_FT_Belgium_X_train <- as.data.frame(IFT_RandPhase_FT_Belgium)[1:300, 1:131]; dim(IFT_RandPhase_FT_Belgium_X_train) IFT_RandPhase_FT_Belgium_X_test <- as.data.frame(IFT_RandPhase_FT_Belgium)[301:360, 1:131]; dim(IFT_RandPhase_FT_Belgium_X_test) # Outcome Variable to be ARIMAX-modeled, as a timeseries ts_IFT_RandPhase_FT_Belgium_Y_train <- ts(IFT_RandPhase_FT_Belgium_Y_train, start=c(2000,1), end=c(2014, 20), frequency = 20) # Find ARIMAX model: 0 0 2 0 20 0 0 set.seed(1234) modArima_IFT_RandPhase_FT_Belgium_Y_train <- auto.arima(ts_IFT_RandPhase_FT_Belgium_Y_train, xreg=as.matrix(IFT_RandPhase_FT_Belgium_X_train)) modArima_IFT_RandPhase_FT_Belgium_Y_train$arma # Regression with ARIMA(0,0,0)(2,0,0)[20] errors # Coefficients: # sar1 sar2 Acquisitions less disposals of non-financial non-produced assets # -0.0743 0.5766 0.0162 #s.e. 0.0625 0.0752 0.0100 #sigma^2 estimated as 72.17: log likelihood=-988.06 AIC=2244.12 AICc=2463.4 BIC=2740.43 pred_arimax_0_0_0_Rand <- forecast(modArima_IFT_RandPhase_FT_Belgium_Y_train, xreg = as.matrix(IFT_RandPhase_FT_Belgium_X_test)) pred_arimax_0_0_0_Rand_2015_2017 <- ts(pred_arimax_0_0_0_Rand$mean, frequency=20, start=c(2015,1), end=c(2017,20)) pred_arimax_0_0_0_Rand_2015_2017 # alternatively: # pred_arimax_1_0_1_Rand_2015_2017 <- predict(modArima_IFT_RandPhase_FT_Belgium_Y_train, # n.ahead = 3*20, newxreg = IFT_RandPhase_FT_Belgium_X_test)$pred sort(modArima_IFT_RandPhase_FT_Belgium_Y_train$coef)[1:10] # Labor cost for LCI (compensation of employees plus taxes minus subsidies), effect=-0.71989958 # Unemployment , Females, From 15-64 years, From 18 to 23 months, effect=-0.54541627 # Unemployment , Females, From 15-64 years, 48 months or over, effect=-0.44230677 # Labor cost other than wages and salaries, effect=-0.32854422 # Unemployment , Males, From 15-64 years, from 6 to 11 months, effect=-0.24511374 # Unemployment , Total, From 15-64 years, From 24 to 47 months, effect=-0.19283037 # Agriculture, forestry and fishing - Employers' social contributions, effect=-0.11994897 # Unemployment , Females, From 15-64 years, From 24 to 47 months, effect=-0.10835175 # Unemployment , Females, From 15-64 years, From 1 to 2 months, effect=-0.09093252 # Unemployment , Total, From 15-64 years, From 18 to 23 months, effect=-0.07427297 cor(pred_arimax_0_0_0_Rand$mean, ts_Y_Belgium_test) # -0.15 mean(pred_arimax_0_0_0_Rand_2015_2017) # [1] 87.74201 ``` ## Result Visualization ```{r message=F, error=F, warning=F, fig.width=8, fig.height=8} ## Plot the results of the model ARIMAX fitting ts_Y_Belgium_test <- ts(preprocess_Belgium$Y[301:360, ], start=c(2015,1), end=c(2017, 20), frequency = 20) length(ts_Y_Belgium_test) # windows(width=14, height=10) plot(forecast(BelgiumARIMA, xreg = as.matrix(X_Belgium_test)), # ARIMA forecast include=60, lwd=4, lty=3, xlab="Time", ylab="GDP Purchasing Power Standards (PPS)", ylim=c(60, 160), main = "Spacekime ARIMAX Analytics (Train: 2000-2014; Test: 2015-2017) GDP (PPS) Forecasting\n based on fitting ARIMAX Models on spline interpolated & kime-transformed Belgium data") lines(pred_arimax_2_0_1_Nil_2015_2017, col = "green", lwd = 4, lty=2) # Belgium Xreg Nil-Phase Reconstructions lines(pred_arimax_1_0_0_Swapped_2015_2017, col = "purple", lwd = 4, lty=1) # Belgium Xreg Swapped-Phase Reconstructions lines(pred_arimax_0_0_0_Rand_2015_2017, col = "orange", lwd = 4, lty=1) # Belgium Xreg Random-Phase Reconstructions lines(ts_Y_Belgium_test, col = "red", lwd = 6, lty=1) # Observed Y_Test timeseries legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2014)", "ARIMAX(4,0,2)-model GDP Forecasting (2015-2017)", "ARIMAX(2,0,1) Belgium Xreg Nil-Phase Reconstruction (2015-2017)", "ARIMAX(1,0,0) Belgium Xreg Swapped-Phase Reconstructions (2015-2017)", "ARIMAX(0,0,0)(2,0,0)[20] Belgium Xreg Random-Phase Reconstructions (2015-2017)", "Belgium Official Reported GDP (2015-2017)"), col=c("black", "blue", "green", "purple", "orange", "red"), lty=c(3,1,2,1, 1, 1), lwd=c(4,4,4,4,4, 6), cex=1.2, x.intersp=1.5, y.intersp=0.7) text(2013.5, 65, expression(atop(paste("Training Region (2000-2014)"), paste(Model(GDP) %->% "ARIMAX(p, q, r) ; ", XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2) text(2016.5, 65, expression(atop(paste("Validation Region (2015-2017)"), paste(hat(GDP) %<-% "ARIMAX(., ., .); ", XReg %==% X[i], " ", i %in% {1 : 131}))), cex=1.2) ``` # ML Data Analytics ## Data Preprocessing Find all "Common" features (highly-observed and congruent Econ indicators) ```{r message=F, error=F, warning=F} # 1. Find all "Common" features (highly-observed and congruent Econ indicators) countryNames <- unique(time_series$country); length(countryNames); # countryNames # initialize 3D array of DF's that will store the data for each of the countries into a 2D frame countryData <- list() # countryData[[listID==Country]][1-time-72, 1-feature-197] for (i in 1:length(countryNames)) { countryData[[i]] <- filter(time_series, country == countryNames[i]) } # Check countryData[[2]][2, 3] == Belgium[2,3] list_of_dfs_CommonFeatures <- list() # list of data for supersampled countries 360 * 197 # 2. General function that ensures the XReg predictors for ALL 31 EU countries are homologous completeHomologousX_features <- function (list_of_dfs) { # delete features that are missing at all time points for (j in 1:length(list_of_dfs)) { print(paste0("Pre-processing Country: ...", countryNames[j], "... ")) data = list_of_dfs[[j]] data = data[ , colSums(is.na(data)) != nrow(data)] data = dplyr::select(data, !any_of(c("time", "country"))) DataMatrix = as.matrix(data) DataMatrix = cleardata(DataMatrix) DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove features with only 1 value DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove features with all values=0 # Supersample 72 --*5--> 360 timepoints DataMatrix = splinecreate(DataMatrix) DataSuperSample = as.data.frame(DataMatrix) # super-Sample the data # remove some of features DataSuperSample = DataSuperSample[, -c(50:80)]; dim(X) # 360 167 # ensure full-rank design matrix, DataSuperSample DataSuperSample <- DataSuperSample[ , qr(DataSuperSample)$pivot[seq_len(qr(DataSuperSample)$rank)]] print(paste0("dim()=(", dim(DataSuperSample)[1], ",", dim(DataSuperSample)[2], ") ...")) # update the current DF/Country list_of_dfs_CommonFeatures[[j]] <- DataSuperSample } # Identify All Xreg features that are homologous (same feature columns) across All 31 countries # Identify Common Columns (features) comCol <- Reduce(intersect, lapply(list_of_dfs_CommonFeatures, colnames)) list_of_dfs_CommonFeatures <- lapply(list_of_dfs_CommonFeatures, function(x) x[comCol]) for (j in 1:length(list_of_dfs_CommonFeatures)) { list_of_dfs_CommonFeatures[[j]] <- subset(list_of_dfs_CommonFeatures[[j]], select = comCol) print(paste0("dim(", countryNames[j], ")=(", dim(list_of_dfs_CommonFeatures[[j]])[1], ",", dim(list_of_dfs_CommonFeatures[[j]])[2], ")!")) # 72 * 197 } return(list_of_dfs_CommonFeatures) } # Test completeHomologousX_features: dim(AllCountries)=(360,42)! list_of_dfs_CommonFeatures <- completeHomologousX_features(countryData); length(list_of_dfs_CommonFeatures); dim(list_of_dfs_CommonFeatures[[1]]) # Austria data matrix 360*42 ``` For each country ($n$) and each common feature ($k$), fit ARIMA model and estimate the parameters $(p,d,q)$ (non-exogenous, just the timeseries model for this feature), (p,d,q) triples for non-seasonal and seasonal effects. For each (Country, Feature) pair, the 9 ARIMA-derived vector includes: ** (ts_avg, forecast_avg, non-seasonal AR, non-seasonal MA, seasonal AR, seasonal MA, period, non-seasonal Diff, seasonal differences)**. ```{r message=F, error=F, warning=F} # 3. For each country (n) and each common feature (k), compute (p,d,q) ARIMA models (non-exogenous, # just the timeseries model for this feature), (p,d,q) triples # Country * Feature arimaModels_DF <- list() #data.frame(matrix(NA, nrow = length(countryNames), # ncol = dim(list_of_dfs_CommonFeatures[[1]])[2]), row.names=countryNames, stringsAsFactors=T) # colnames(arimaModels_DF) <- colnames(list_of_dfs_CommonFeatures[[1]]) # list_index <- 1 # arimaModels_ARMA_coefs <- list() # array( , c(31, 9*dim(list_of_dfs_CommonFeatures[[1]])[2])) # dim(arimaModels_ARMA_coefs) # [1] 31 x 378 == 31 x (9 * 42) # For each (Country, feature) index, the 9 ARIMA-derived vector includes: # (ts_avg, forecast_avg, non-seasonal AR, non-seasonal MA, seasonal AR, seasonal MA, period, non-seasonal Diff, seasonal differences) for(n in 1:(length(list_of_dfs_CommonFeatures))) { # for each Country 1<=n<=31 for (k in 1:(dim(list_of_dfs_CommonFeatures[[1]])[2])) { # for each feature 1<=k<=42 # extract one timeseries (the feature+country time course) ts = ts(list_of_dfs_CommonFeatures[[n]][ , k], frequency=20, start=c(2000,1), end=c(2017,20)) set.seed(1234) arimaModels_DF[[list_index]] <- auto.arima(ts) # pred_arimaModels_DF = forecast(arimaModels_DF[[list_index]]) # ts_pred_arimaModels_DF <- # ts(pred_arimaModels_DF$mean, frequency=20, start=c(2015,1), end=c(2017,20)) # ts_pred_arimaModels_DF arimaModels_ARMA_coefs[[list_index]] <- c ( mean(ts), # time-series average (retrospective) mean(forecast(arimaModels_DF[[list_index]])$mean), # forecasted TS average (prospective) arimaModels_DF[[list_index]]$arma) # 7 ARMA estimated parameters # cat("arimaModels_ARMA_coefs[country=", countryNames[n], ", feature=", # colnames(list_of_dfs_CommonFeatures[[1]])[k], # "] Derived-Features=(", round(arimaModels_ARMA_coefs[[list_index]], 2), ") ...") #print(paste0("arimaModels_DF[country=", countryNames[i], ", feature=", # colnames(list_of_dfs_CommonFeatures[[1]])[k], # "]$arma =", arimaModels_DF[[list_index]]$arma)) list_index <- list_index + 1 } } length(arimaModels_ARMA_coefs) == 31*42 # [1] TRUE # Each list-element consists of 9 values, see above # == dim(list_of_dfs_CommonFeatures[[1]])[1] * dim(list_of_dfs_CommonFeatures[[1]])[2] # Maps to convert between 1D indices and 2D (Country, Feature) pairs index2CountryFeature <- function(indx=1) { if (indx<1 | indx>length(arimaModels_ARMA_coefs)) { cat("Index out of bounds: indx=", indx, "; must be between 1 and ", length(arimaModels_ARMA_coefs), " ... Exiting ...") return (NULL) } else { feature = (indx-1) %% (dim(list_of_dfs_CommonFeatures[[1]])[2]) country = floor((indx - feature)/(dim(list_of_dfs_CommonFeatures[[1]])[2])) return(list("feature"=(feature+1), "country"=(country+1))) } } countryFeature2Index <- function(countryIndx=1, featureIndx=1) { if (countryIndx<1 | countryIndx>(dim(list_of_dfs_CommonFeatures[[1]])[1]) | featureIndx<1 | featureIndx>(dim(list_of_dfs_CommonFeatures[[1]])[2])) { cat("Indices out of bounds: countryIndx=", countryIndx, "; featureIndx=", featureIndx, "; Exiting ...") return (NULL) } else { return (featureIndx + (countryIndx-1)*(dim(list_of_dfs_CommonFeatures[[1]])[2])) } } # test forward and reverse index mapping functions index2CountryFeature(42); index2CountryFeature(45)$country; index2CountryFeature(45)$feature countryFeature2Index(countryIndx=2, featureIndx=3) # Column/Feature Names: colnames(list_of_dfs_CommonFeatures[[1]]) # Country/Row Names: countryNames arimaModels_ARMA_coefs[[1]] # Austria/Feature1 1:9 feature vector # Cuntry2=Bulgaria, feature 5, 1:9 vector arimaModels_ARMA_coefs[[countryFeature2Index(countryIndx=2, featureIndx=5)]] ``` Convert list of ARIMA models to a Data.Frame `[Countries, megaFeatures]` that can be put through ML data analytics. Augment the features using the [EU_SOCR_Country_Ranking_Data_2011 dataset](http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_2008_World_CountriesRankings). ```{r message=F, error=F, warning=F} # 4. Add the country ranking as a new feature, using the OA ranks here: # (http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_2008_World_CountriesRankings) EU_SOCR_Country_Ranking_Data_2011 <- read.csv2("C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_SOCR_Country_Ranking_Data_2011.csv", header=T, sep=",") length(arimaModels_ARMA_coefs) # 31*42 arima_df <- data.frame(matrix(NA, nrow=length(countryNames), ncol=length(colnames(list_of_dfs_CommonFeatures[[1]]))*9)) dim(arima_df) # [1] 31 378 for(n in 1:dim(arima_df)[1]) { # for each Country 1<=n<=31 for (k in 1:length(colnames(list_of_dfs_CommonFeatures[[1]]))) { # for each feature 1<=k<=42 for (l in 1:9) { # for each arima vector 1:9, see above arima_df[n, (k-1)*9 + l] <- round(arimaModels_ARMA_coefs[[countryFeature2Index(countryIndx=n, featureIndx=k)]][l], 1) if (n==dim(arima_df)[1]) colnames(arima_df)[(k-1)*9 + l] <- paste0("Feature_",k, "_ArimaVec_",l) } } } # DF Conversion Validation arimaModels_ARMA_coefs[[countryFeature2Index(countryIndx=3, featureIndx=5)]][2] == arima_df[3, (5-1)*9 + 2] # [1] 1802.956 # Aggregate 2 datasets dim(EU_SOCR_Country_Ranking_Data_2011) # [1] 31 10 aggregate_arima_vector_country_ranking_df <- as.data.frame(cbind(arima_df, EU_SOCR_Country_Ranking_Data_2011[ , -1])) dim(aggregate_arima_vector_country_ranking_df) # [1] Country=31 * Features=387 (ARIMA=378 + Ranking=9) # View(aggregate_arima_vector_country_ranking_df) rownames(aggregate_arima_vector_country_ranking_df) <- countryNames write.csv(aggregate_arima_vector_country_ranking_df, row.names = T, fileEncoding = "UTF-16LE", "C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_aggregate_arima_vector_country_ranking.csv") ``` ## `Spacetime` Analytics ### Supervised classification Use [Model-based and Model-free methods](https://dspa.predictive.space/) to predict the *overall* (OA) country ranking. ```{r message=F, error=F, warning=F, fig.width=12, fig.height=12} # 1. LASSO regression/feature extraction library(glmnet) library(arm) library(knitr) # subset test data Y = aggregate_arima_vector_country_ranking_df$OA X = aggregate_arima_vector_country_ranking_df[ , -387] # remove columns containing NAs and convert character DF to numeric type X = as.data.frame(apply(X[ , colSums(is.na(X)) == 0], 2, as.numeric)); dim(X) # [1] 31 386 fitRidge = glmnet(as.matrix(X), Y, alpha = 0) # Ridge Regression fitLASSO = glmnet(as.matrix(X), Y, alpha = 1) # The LASSO # LASSO plot(fitLASSO, xvar="lambda", label="TRUE", lwd=3) # add label to upper x-axis mtext("LASSO regularizer: Number of Nonzero (Active) Coefficients", side=3, line=2.5) # Ridge plot(fitRidge, xvar="lambda", label="TRUE", lwd=3) # add label to upper x-axis mtext("Ridge regularizer: Number of Nonzero (Active) Coefficients", side=3, line=2.5) #### 10-fold cross validation #### # LASSO library(doParallel) registerDoParallel(6) set.seed(1234) # set seed # (10-fold) cross validation for the LASSO cvLASSO = cv.glmnet(data.matrix(X), Y, alpha = 1, parallel=TRUE) cvRidge = cv.glmnet(data.matrix(X), Y, alpha = 0, parallel=TRUE) plot(cvLASSO) mtext("CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5) # Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome predLASSO <- predict(cvLASSO, s = cvLASSO$lambda.min, newx = data.matrix(X)) testMSE_LASSO <- mean((predLASSO - Y)^2); testMSE_LASSO predLASSO = predict(cvLASSO, s = cvLASSO$lambda.min, newx = data.matrix(X)) predRidge = predict(fitRidge, s = cvRidge$lambda.min, newx = data.matrix(X)) # calculate test set MSE testMSELASSO = mean((predLASSO - Y)^2) testMSERidge = mean((predRidge - Y)^2) ##################################Use only ARIMA effects, no SOCR meta-data##### set.seed(4321) cvLASSO_lim = cv.glmnet(data.matrix(X[ , 1:(42*9)]), Y, alpha = 1, parallel=TRUE) plot(cvLASSO_lim) mtext("CV LASSO (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5) # Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome predLASSO_lim <- predict(cvLASSO_lim, s = 3, # cvLASSO_lim$lambda.min, newx = data.matrix(X[ , 1:(42*9)])) coefList_lim <- coef(cvLASSO_lim, s=3) # 'lambda.min') coefList_lim <- data.frame(coefList_lim@Dimnames[[1]][coefList_lim@i+1],coefList_lim@x) names(coefList_lim) <- c('Feature','EffectSize') arrange(coefList_lim, -abs(EffectSize))[2:10, ] cor(Y, predLASSO_lim[, 1]) # 0.84 ################################################################################ # Plot Regression Coefficients: create variable names for plotting library("arm") # par(mar=c(2, 13, 1, 1)) # extra large left margin # par(mar=c(5,5,5,5)) varNames <- colnames(X); # varNames; length(varNames) betaHatLASSO = as.double(coef(fitLASSO, s = cvLASSO$lambda.min)) # cvLASSO$lambda.1se betaHatRidge = as.double(coef(fitRidge, s = cvRidge$lambda.min)) #coefplot(betaHatLASSO[2:386], sd = rep(0, 385), pch=0, cex.pts = 3, main = "LASSO-Regularized Regression Coefficient Estimates", varnames = varNames) coefplot(betaHatLASSO[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", main = "LASSO-Regularized Regression Coefficient Estimates", varnames = varNames[377:386]) coefplot(betaHatRidge[377:386], sd = rep(0, 10), pch=2, add = TRUE, col.pts = "blue", cex.pts = 3) legend("bottomleft", c("LASSO", "Ridge"), col = c("red", "blue"), pch = c(1 , 2), bty = "o", cex = 2) varImp <- function(object, lambda = NULL, ...) { ## skipping a few lines beta <- predict(object, s = lambda, type = "coef") if(is.list(beta)) { out <- do.call("cbind", lapply(beta, function(x) x[,1])) out <- as.data.frame(out) } else out <- data.frame(Overall = beta[,1]) out <- abs(out[rownames(out) != "(Intercept)",,drop = FALSE]) out } varImp(cvLASSO, lambda = cvLASSO$lambda.min) coefList <- coef(cvLASSO, s='lambda.min') coefList <- data.frame(coefList@Dimnames[[1]][coefList@i+1],coefList@x) names(coefList) <- c('Feature','EffectSize') arrange(coefList, -abs(EffectSize))[2:10, ] # var val # Feature names: colnames(list_of_dfs_CommonFeatures[[1]]) #1 (Intercept) 49.4896874 #2 Feature_1_ArimaVec_8 -2.4050811 # Feature 1 = Active population: Females 15 to 64 years #3 Feature_20_ArimaVec_8 -1.4015001 # Feature 20= "Employment: Females 15 to 64 years #4 IncomeGroup -1.2271177 #5 Feature_9_ArimaVec_8 -1.0629835 # Feature 9= Active population: Total 15 to 64 years #6 ED -0.7481041 #7 PE -0.5167668 #8 Feature_25_ArimaVec_5 0.4416775 # Feature 25= Property income #9 Feature_9_ArimaVec_4 -0.2217804 #10 QOL -0.1965342 # ARIMA: 4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff # #9 ARIMA-derived vector includes: # (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, # 7=period, 8=non-seasonal Diff, 9=seasonal differences) # [1] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels" # [2] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" # [3] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Tertiary education (levels 5-8)" # [4] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)" # [5] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, All ISCED 2011 levels" # [6] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" # [7] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Tertiary education (levels 5-8)" # [8] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)" # [9] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, All ISCED 2011 levels" #[10] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" #[11] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Tertiary education (levels 5-8)" #[12] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)" #[13] "All ISCED 2011 levels " # [14] "All ISCED 2011 levels, Females" # [15] "All ISCED 2011 levels, Males" # [16] "Capital transfers, payable" # [17] "Capital transfers, receivable" # [18] "Compensation of employees, payable" # [19] "Current taxes on income, wealth, etc., receivable" #[20] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels " # [21] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" # [22] "Other current transfers, payable" # [23] "Other current transfers, receivable" # [24] "Property income, payable" # [25] "Property income, receivable" # [26] "Savings, gross" # [27] "Subsidies, payable" # [28] "Taxes on production and imports, receivable" # [29] "Total general government expenditure" # [30] "Total general government revenue" # [31] "Unemployment , Females, From 15-64 years, Total" # [32] "Unemployment , Males, From 15-64 years" # [33] "Unemployment , Males, From 15-64 years, from 1 to 2 months" # [34] "Unemployment , Males, From 15-64 years, from 3 to 5 months" # [35] "Unemployment , Males, From 15-64 years, from 6 to 11 months" # [36] "Unemployment , Total, From 15-64 years, From 1 to 2 months" # [37] "Unemployment , Total, From 15-64 years, From 12 to 17 months" # [38] "Unemployment , Total, From 15-64 years, From 3 to 5 months" # [39] "Unemployment , Total, From 15-64 years, From 6 to 11 months" # [40] "Unemployment , Total, From 15-64 years, Less than 1 month" # [41] "Unemployment by sex, age, duration. DurationNA not started" # [42] "VAT, receivable" coef(cvLASSO, s = "lambda.min") %>% broom::tidy() %>% filter(row != "(Intercept)") %>% top_n(100, wt = abs(value)) %>% ggplot(aes(value, reorder(row, value), color = value > 0)) + geom_point(show.legend = FALSE, aes(size = abs(value))) + ggtitle("Top 9 salient features (LASSO penalty)") + xlab("Effect-size") + ylab(NULL) validation <- data.frame(matrix(NA, nrow = dim(predLASSO)[1], ncol=3), row.names=countryNames) validation [ , 1] <- Y; validation [ , 2] <- predLASSO_lim[, 1]; validation [ , 3] <- predRidge[, 1] colnames(validation) <- c("Y", "LASSO", "Ridge") dim(validation) head(validation) # Prediction correlations: cor(validation[ , 1], validation[, 2]) # Y=observed OA rank vs. LASSO-pred 0.96 (lim) 0.84 cor(validation[ , 1], validation[, 3]) # Y=observed OA rank vs. Ridge-pred 0.95 # Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^ linFit1 <- lm(validation[ , 1] ~ predLASSO) plot(validation[ , 1] ~ predLASSO, col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="Observed Country Overall Ranking", ylab="LASSO 9/(42*9) param model", main = sprintf("Observed (X) vs. LASSO-Predicted (Y) Overall Country Ranking, cor=%.02f", cor(validation[ , 1], validation[, 2]))) abline(linFit1, lwd=3, col="red") # Plot observed LASSO (9-parameters) predicted Y^ vs. Y (Overall Country ranking) linFit1 <- lm(predLASSO_lim ~ validation[ , 1]) plot(predLASSO_lim ~ validation[ , 1], col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="Observed Country Overall Ranking", ylab="LASSO 9/(42*9) param model", main = sprintf("Observed (X) vs. LASSO-Predicted (Y) Overall Country Ranking, cor=%.02f", cor(validation[ , 1], validation[, 2]))) abline(linFit1, lwd=3, col="red") ``` ### Dimensionality Reduction PCA and t-SNE 2D and 3D projections using [SOCR Dimensionality Reduction Webapp](https://socr.umich.edu/HTML5/SOCR_TensorBoard_UKBB/UsersData.html). - the *tensor* file `EU_Econ_TensorData_31Countries_By_386Features.txt` contains 31 rows (countries) and 386 = 42*9(ARIMA) + 8(meta-data) columns - the *labels* file `EU_Econ_Labels_31Countries_and OverallRankingOutcome.txt` includes 31 rows with 2 columns specifying the Countries and their OA (Overall rankings). - The reader can load these *tensor* data files into the [SOCR Dimensionality Reduction Webapp](https://socr.umich.edu/HTML5/SOCR_TensorBoard_UKBB/UsersData.html) and generate dynamic views of the 2D or 3D projections (PCA=Euclidean) and (t-SNE manifold). ```{r message=F, error=F, warning=F} # https://www.socr.umich.edu/people/dinov/courses/DSPA_notes/05_DimensionalityReduction.html # ... ``` ## `Spacekime` Analytics ### Supervised classification Use [Model-based and Model-free methods](https://dspa.predictive.space/) to predict the *overall* (OA) country ranking. ```{r message=F, error=F, warning=F, fig.width=12, fig.height=12} # Generic function to Transform Data ={all predictors (X) and outcome (Y)} to k-space (Fourier domain): kSpaceTransform(data, inverse = FALSE, reconPhases = NULL) # ForwardFT (rawData, FALSE, NULL) # InverseFT(magnitudes, TRUE, reconPhasesToUse) or InverseFT(FT_data, TRUE, NULL) # DATA # subset test data Y = aggregate_arima_vector_country_ranking_df$OA X = aggregate_arima_vector_country_ranking_df[ , -387] # remove columns containing NAs X = as.data.frame(apply(X[ , colSums(is.na(X)) == 0], 2, as.numeric)); dim(X) # [1] 31 386 length(Y); dim(X) FT_aggregate_arima_vector_country_ranking_df <- kSpaceTransform(as.data.frame(apply( aggregate_arima_vector_country_ranking_df[ , colSums(is.na(aggregate_arima_vector_country_ranking_df)) == 0], 2, as.numeric)), inverse = FALSE, reconPhases = NULL) ## Kime-Phase Distributions # Examine the Kime-direction Distributions of the Phases for all *Belgium* features (predictors + outcome). Define a generic function that plots the Phase distributions. # plotPhaseDistributions(dataFT, dataColnames) plotPhaseDistributions(FT_aggregate_arima_vector_country_ranking_df, colnames(aggregate_arima_vector_country_ranking_df), size=4, cex=0.1) IFT_FT_aggregate_arima_vector_country_ranking_df <- kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, TRUE, FT_aggregate_arima_vector_country_ranking_df$phases) # Check IFT(FT) == I: # ifelse(aggregate_arima_vector_country_ranking_df[5,4] - # Re(IFT_FT_aggregate_arima_vector_country_ranking_df[5,4]) < 0.001, "Perfect Synthesis", "Problems!!!") ############################################## # Nil-Phase Synthesis and LASSO model estimation # 1. Nil-Phase data synthesis (reconstruction) temp_Data <- aggregate_arima_vector_country_ranking_df nilPhase_FT_aggregate_arima_vector <- array(complex(real=0, imaginary=0), c(dim(temp_Data)[1], dim(temp_Data)[2])) dim(nilPhase_FT_aggregate_arima_vector) # ; head(nilPhase_FT_aggregate_arima_vector) # [1] 31 387 IFT_NilPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2])) # Invert back to spacetime the # FT_aggregate_arima_vector_country_ranking_df$magnitudes[ , i] signal with nil-phase IFT_NilPhase_FT_aggregate_arima_vector <- Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, TRUE, nilPhase_FT_aggregate_arima_vector)) colnames(IFT_NilPhase_FT_aggregate_arima_vector) <- colnames(aggregate_arima_vector_country_ranking_df) rownames(IFT_NilPhase_FT_aggregate_arima_vector) <- rownames(aggregate_arima_vector_country_ranking_df) dim(IFT_NilPhase_FT_aggregate_arima_vector) dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes) colnames(IFT_NilPhase_FT_aggregate_arima_vector) # IFT_NilPhase_FT_aggregate_arima_vector[1:5, 1:4]; temp_Data[1:5, 1:4] # 2. Perform LASSO modeling on IFT_NilPhase_FT_aggregate_arima_vector; # report param estimates and quality metrics AIC/BIC # library(forecast) set.seed(54321) cvLASSO_kime = cv.glmnet(data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , -387]), # IFT_NilPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE) Y, alpha = 1, parallel=TRUE) plot(cvLASSO_kime) mtext("(Spacekime, Nil-phase) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5) # Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome predLASSO_kime <- predict(cvLASSO_kime, s = cvLASSO_kime$lambda.min, newx = data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , -387])); predLASSO_kime # testMSE_LASSO_kime <- mean((predLASSO_kime - IFT_NilPhase_FT_aggregate_arima_vector[ , 387])^2) # testMSE_LASSO_kime predLASSO_kime = predict(cvLASSO_kime, s = exp(1/3), # cvLASSO_kime$lambda.min, newx = data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , -387])); predLASSO_kime ##################################Use only ARIMA effects, no SOCR meta-data##### set.seed(12345) cvLASSO_kime_lim = cv.glmnet(data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:(42*9)]), Y, alpha = 1, parallel=TRUE) plot(cvLASSO_kime_lim) mtext("CV LASSO Nil-Phase (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5) # Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome predLASSO_kime_lim <- predict(cvLASSO_kime_lim, s = 1, newx = data.matrix(X[ , 1:(42*9)])) coefList_kime_lim <- coef(cvLASSO_kime_lim, s=1) coefList_kime_lim <- data.frame(coefList_kime_lim@Dimnames[[1]][coefList_kime_lim@i+1],coefList_kime_lim@x) names(coefList_kime_lim) <- c('Feature','EffectSize') arrange(coefList_kime_lim, -abs(EffectSize))[2:10, ] cor(Y, predLASSO_kime_lim[, 1]) # 0.1142824 ################################################################################ # Plot Regression Coefficients: create variable names for plotting library("arm") # par(mar=c(2, 13, 1, 1)) # extra large left margin # par(mar=c(5,5,5,5)) # varNames <- colnames(X); varNames; length(varNames) betaHatLASSO_kime = as.double(coef(cvLASSO_kime, s=cvLASSO_kime$lambda.min)) #cvLASSO_kime$lambda.1se coefplot(betaHatLASSO_kime[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", main = "(Spacekime) LASSO-Regularized Regression Coefficient Estimates", varnames = varNames[377:386]) varImp(cvLASSO_kime, lambda = cvLASSO_kime$lambda.min) coefList_kime <- coef(cvLASSO_kime, s=1) # 'lambda.min') coefList_kime <- data.frame(coefList_kime@Dimnames[[1]][coefList_kime@i+1], coefList_kime@x) names(coefList_kime) <- c('Feature','EffectSize') arrange(coefList_kime, -abs(EffectSize))[1:9, ] # Feature EffectSize #1 (Intercept) 26.069326257 #2 Feature_12_ArimaVec_8 -8.662856430 #3 Feature_11_ArimaVec_4 8.585283751 #4 Feature_12_ArimaVec_4 -5.023601842 #5 Feature_30_ArimaVec_4 2.242157842 #6 Feature_26_ArimaVec_6 1.760267216 #7 Feature_39_ArimaVec_5 -1.256101950 #8 Feature_34_ArimaVec_5 -1.148865337 #9 Feature_37_ArimaVec_2 0.001322367 # ARIMA-spacetime: 4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff # ARIMA-spacekimeNil: 2=forecast_avg, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 8=non-seasonal Diff #9 ARIMA-derived vector includes: # (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, # 7=period, 8=non-seasonal Diff, 9=seasonal differences) coef(cvLASSO_kime, s = 1/5) %>% ### "lambda.min") %>% broom::tidy() %>% filter(row != "(Intercept)") %>% top_n(100, wt = abs(value)) %>% ggplot(aes(value, reorder(row, value), color = value > 0)) + geom_point(show.legend = FALSE, aes(size = abs(value))) + ggtitle("(Spacekime) Top 9 salient features (LASSO penalty)") + xlab("Effect-size") + ylab(NULL) # pack a 31*3 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, Y) validation_kime <- cbind(predLASSO_kime[, 1], IFT_NilPhase_FT_aggregate_arima_vector[ , 387], Y) colnames(validation_kime) <- c("predLASSO_kime", "IFT_NilPhase_FT_Y", "Orig_Y") head(validation_kime) # Prediction correlations: cor(validation_kime[ , 1], validation_kime[, 2]) # Y=predLASSO_kime OA rank vs. kime_LASSO_pred: 0.99 cor(validation_kime[ , 1], validation_kime[, 3]) # Y=predLASSO_kime OA rank vs. Orig_Y: 0.64 # Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^ linFit1_kime <- lm(predLASSO_kime ~ validation_kime[ , 3]) plot(predLASSO_kime ~ validation_kime[ , 3], col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="Observed Country Overall Ranking", ylab="IFT_NilPhase predLASSO_kime", main = sprintf("Observed (x) vs. IFT_NilPhase Predicted (y) Overall Country Ranking, cor=%.02f", cor(validation_kime[ , 1], validation_kime[, 3]))) abline(linFit1_kime, lwd=3, col="red") # abline(linFit1, lwd=3, col="green") ############################################## # 3. Swap Feature Phases and then synthesize the data (reconstruction) # temp_Data <- aggregate_arima_vector_country_ranking_df swappedPhase_FT_aggregate_arima_vector <- FT_aggregate_arima_vector_country_ranking_df$phases dim(swappedPhase_FT_aggregate_arima_vector) # ; head(swappedPhase_FT_aggregate_arima_vector) # [1] 31 387 IFT_SwappedPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2])) set.seed(12345) # sample randomly Phase-columns for each of the 131 covariates (X) swappedPhase_FT_aggregate_arima_vector1 <- as.data.frame(cbind( swappedPhase_FT_aggregate_arima_vector[ , sample(ncol(swappedPhase_FT_aggregate_arima_vector[ , 1:378]))], # mix ARIMA signature phases swappedPhase_FT_aggregate_arima_vector[ , sample(ncol(swappedPhase_FT_aggregate_arima_vector[ , 379:386]))],# mix the meta-data phases swappedPhase_FT_aggregate_arima_vector[ , 387])) # add correct Outcome phase swappedPhase_FT_aggregate_arima_vector <- swappedPhase_FT_aggregate_arima_vector1 colnames(swappedPhase_FT_aggregate_arima_vector) <- colnames(temp_Data) colnames(swappedPhase_FT_aggregate_arima_vector); dim(swappedPhase_FT_aggregate_arima_vector) # 31 387 # Invert back to spacetime the # FT_aggregate_arima_vector$magnitudes[ , i] signal with swapped-X-phases (Y-phase is fixed) IFT_SwappedPhase_FT_aggregate_arima_vector <- Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, TRUE, swappedPhase_FT_aggregate_arima_vector)) colnames(IFT_SwappedPhase_FT_aggregate_arima_vector) <- colnames(aggregate_arima_vector_country_ranking_df) rownames(IFT_SwappedPhase_FT_aggregate_arima_vector) <- rownames(aggregate_arima_vector_country_ranking_df) dim(IFT_SwappedPhase_FT_aggregate_arima_vector) dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes) colnames(IFT_SwappedPhase_FT_aggregate_arima_vector) # IFT_SwappedPhase_FT_aggregate_arima_vector[1:5, 1:4]; temp_Data[1:5, 1:4] # 2. Perform LASSO modeling on IFT_SwappedPhase_FT_aggregate_arima_vector; # report param estimates and quality metrics AIC/BIC set.seed(12345) cvLASSO_kime_swapped = cv.glmnet(data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , -387]), # IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE) Y, alpha = 1, parallel=TRUE) plot(cvLASSO_kime_swapped) mtext("(Spacekime, Swapped-Phases) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5) ##################################Use only ARIMA effects, no SOCR meta-data##### set.seed(12345) cvLASSO_kime_swapped_lim = cv.glmnet(data.matrix( IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:(42*9)]), Y, alpha = 1, parallel=TRUE) plot(cvLASSO_kime_swapped_lim) mtext("CV LASSO Swapped-Phase (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5) # Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome predLASSO_kime_swapped_lim <- predict(cvLASSO_kime_swapped_lim, s = cvLASSO_kime_swapped_lim$lambda.min, newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:(42*9)])) coefList_kime_swapped_lim <- coef(cvLASSO_kime_swapped_lim, s='lambda.min') coefList_kime_swapped_lim <- data.frame(coefList_kime_swapped_lim@Dimnames[[1]][coefList_kime_swapped_lim@i+1],coefList_kime_swapped_lim@x) names(coefList_kime_swapped_lim) <- c('Feature','EffectSize') arrange(coefList_kime_swapped_lim, -abs(EffectSize))[2:10, ] cor(Y, predLASSO_kime_swapped_lim[, 1]) # 0.86 ################################################################################ # Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome predLASSO_kime_swapped <- predict(cvLASSO_kime_swapped, s = cvLASSO_kime_swapped$lambda.min, newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , -387])) # testMSE_LASSO_kime_swapped <- # mean((predLASSO_kime_swapped - IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387])^2) # testMSE_LASSO_kime_swapped predLASSO_kime_swapped = predict(cvLASSO_kime_swapped, s = 3, newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , -387])) predLASSO_kime_swapped # Plot Regression Coefficients: create variable names for plotting betaHatLASSO_kime_swapped = as.double(coef(cvLASSO_kime_swapped, s=cvLASSO_kime_swapped$lambda.min)) #cvLASSO_kime_swapped$lambda.1se coefplot(betaHatLASSO_kime_swapped[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", main = "(Spacekime, Swapped-Phases) LASSO-Regularized Regression Coefficient Estimates", varnames = varNames[377:386]) varImp(cvLASSO_kime_swapped, lambda = cvLASSO_kime_swapped$lambda.min) coefList_kime_swapped <- coef(cvLASSO_kime_swapped, s=3) # 'lambda.min') coefList_kime_swapped <- data.frame(coefList_kime_swapped@Dimnames[[1]][coefList_kime_swapped@i+1], coefList_kime_swapped@x) names(coefList_kime_swapped) <- c('Feature','EffectSize') arrange(coefList_kime_swapped, -abs(EffectSize))[2:10, ] # Feature EffectSize #2 Feature_32_ArimaVec_6 3.3820076 #3 Feature_1_ArimaVec_3 2.2133139 #4 Feature_21_ArimaVec_4 1.5376447 #5 Feature_22_ArimaVec_3 1.0546605 #6 Feature_14_ArimaVec_5 0.7428693 #7 ED 0.6525794 #8 Feature_24_ArimaVec_5 0.5987113 #9 Feature_12_ArimaVec_5 0.3177650 #10 Feature_37_ArimaVec_6 0.1598574 # # ARIMA-spacetime: 4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff # ARIMA-spacekime_nill: 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA # ARIMA-spacekime_swapped: 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA # 9 ARIMA-derived vector includes: # (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, # 7=period, 8=non-seasonal Diff, 9=seasonal differences) coef(cvLASSO_kime_swapped, s = 3) %>% # "lambda.min") %>% broom::tidy() %>% filter(row != "(Intercept)") %>% top_n(100, wt = abs(value)) %>% ggplot(aes(value, reorder(row, value), color = value > 0)) + geom_point(show.legend = FALSE, aes(size = abs(value))) + ggtitle("(Spacekime, Swapped-Phases) Top 9 salient features (LASSO penalty)") + xlab("Effect-size") + ylab(NULL) # pack a 31*4 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, # IFT_SwappedPhase_FT_aggregate_arima_vector_Y, Y) validation_kime_swapped <- cbind(predLASSO_lim[, 1], predLASSO_kime[ , 1], predLASSO_kime_swapped[ , 1], Y) colnames(validation_kime_swapped) <- c("predLASSO (spacetime)", "predLASSO_IFT_NilPhase", "predLASSO_IFT_SwappedPhase", "Orig_Y") head(validation_kime_swapped); dim(validation_kime_swapped) # Prediction correlations: cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4]) # predLASSO_IFT_SwappedPhase OA rank vs. predLASSO_spacekime: 0.7 cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 3]) # predLASSO (spacetime) vs. predLASSO_IFT_SwappedPhase OA rank: 0.83 # Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^ linFit1_kime_swapped <- lm(validation_kime_swapped[ , 4] ~ predLASSO) plot(validation_kime_swapped[ , 4] ~ predLASSO, col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="predLASSO_spacekime Country Overall Ranking", ylab="predLASSO_IFT_SwappedPhase_FT_Y", main = sprintf("Spacetime Predicted (x) vs. Kime IFT_SwappedPhase_FT_Y (y) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4]))) abline(linFit1_kime_swapped, lwd=3, col="red") #abline(linFit1_kime, lwd=3, col="green") # Plot Spacetime LASSO forecasting # Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^ linFit1_spacetime <- lm(validation_kime_swapped[ , 1] ~ validation_kime_swapped[ , 4]) plot(validation_kime_swapped[ , 1] ~ validation_kime_swapped[ , 4], col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="Observed Country Overall Ranking", ylab="predLASSO_spacetime", main = sprintf("Spacetime Predicted (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 4]))) abline(linFit1_spacetime, lwd=3, col="red") # test with using swapped-phases LASSO estimates linFit1_spacekime <- lm(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4]) plot(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4], col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="Observed Country Overall Ranking", ylab="predLASSO_spacekime Swapped-Phases", main = sprintf("Spacekime Predicted, Swapped-Phases (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4]))) abline(linFit1_spacekime, lwd=3, col="red") # add Top_30_Ranking_Indicator validation_kime_swapped <- as.data.frame(cbind(validation_kime_swapped, ifelse (validation_kime_swapped[,4]<=30, 1, 0))) colnames(validation_kime_swapped)[5] <- "Top30Rank" head(validation_kime_swapped) library("ggrepel") # Spacetime LASSO modeling myPlotSpacetime <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y, y=`predLASSO (spacetime)`, label=rownames(validation_kime_swapped))) + geom_point() + # Color by groups # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) + geom_label_repel(aes(label = rownames(validation_kime_swapped), fill = factor(Top30Rank)), color = 'black', size = 5, point.padding = unit(0.3, "lines")) + # theme(legend.position = "bottom") + theme(legend.position = c(0.1, 0.9), legend.text = element_text(colour="black", size=12, face="bold"), legend.title = element_text(colour="black", size=14, face="bold")) + scale_fill_discrete(name = "Country Overall Ranking", labels = c("Below 30 Rank", "Top 30 Rank")) + labs(title=sprintf("Spacetime LASSO Prediction (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 4])), x ="Observed Overall Country Ranking (1 is 'best')", y = "Spacetime LASSO Rank Forecasting") myPlotSpacetime # NIL-PHASE KIME reconstruction myPlotNilPhase <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y, y=predLASSO_kime, label=rownames(validation_kime_swapped))) + geom_point() + # Color by groups # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) + geom_label_repel(aes(label = rownames(validation_kime_swapped), fill = factor(Top30Rank)), color = 'black', size = 5, point.padding = unit(0.3, "lines")) + # theme(legend.position = "bottom") + theme(legend.position = c(0.1, 0.9), legend.text = element_text(colour="black", size=12, face="bold"), legend.title = element_text(colour="black", size=14, face="bold")) + scale_fill_discrete(name = "Country Overall Ranking", labels = c("Below 30 Rank", "Top 30 Rank")) + labs(title=sprintf("Spacekime LASSO Predicted, Nil-Phases, (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 2], validation_kime_swapped[, 4])), x ="Observed Overall Country Ranking (1 is 'best')", y = "Spacekime LASSO Predicted, using Nil-Phases") myPlotNilPhase # SWAPPED PHASE KIME reconstruction myPlotSwappedPhase <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y, y=predLASSO_kime_swapped, label=rownames(validation_kime_swapped))) + geom_point() + # Color by groups # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) + geom_label_repel(aes(label = rownames(validation_kime_swapped), fill = factor(Top30Rank)), color = 'black', size = 5, point.padding = unit(0.3, "lines")) + # theme(legend.position = "bottom") + theme(legend.position = c(0.1, 0.9), legend.text = element_text(colour="black", size=12, face="bold"), legend.title = element_text(colour="black", size=14, face="bold")) + scale_fill_discrete(name = "Country Overall Ranking", labels = c("Below 30 Rank", "Top 30 Rank")) + labs(title=sprintf("Spacekime LASSO Predicted, Swapped-Phases, (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])), x ="Observed Overall Country Ranking (1 is 'best')", y = "Spacekime LASSO Predicted, using Swapped-Phases") myPlotSwappedPhase countryNames[11]<-"Germany" aggregateResults <- (rbind(cbind(as.character(countryNames), "predLASSO_spacetime", as.numeric(predLASSO)), cbind(as.character(countryNames), "predLASSO_lim", predLASSO_lim), cbind(as.character(countryNames), "predLASSO_nil", predLASSO_kime), cbind(as.character(countryNames), "predLASSO_swapped", predLASSO_kime_swapped), cbind(as.character(countryNames), "observed", Y) )) aggregateResults <- data.frame(aggregateResults[ , -3], as.numeric(aggregateResults[,3])) colnames(aggregateResults) <- c("country", "estimate_method", "ranking") ggplot(aggregateResults, aes(x=country, y=ranking, color=estimate_method)) + geom_point(aes(shape=estimate_method, color=estimate_method, size=estimate_method)) + geom_point(size = 5) + geom_line(data = aggregateResults[aggregateResults$estimate_method == "observed", ], aes(group = estimate_method), size=2, linetype = "dashed") + theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) + # theme(legend.position = "bottom") + # scale_shape_manual(values = as.factor(aggregateResults$estimate_method)) + theme(text = element_text(size = 15), legend.position = c(0.3, 0.85), axis.text=element_text(size=16), legend.text = element_text(colour="black", size=12, face="bold"), legend.title = element_text(colour="black", size=14, face="bold")) # + scale_fill_discrete( # name="Country Overall Ranking", # breaks=c("predLASSO_spacetime", "predLASSO_lim", "predLASSO_nil", "predLASSO_swapped", "observed"), # labels=c(sprintf("predLASSO_spacetime LASSO Predicted (386), cor=%.02f", cor(predLASSO, Y)), # sprintf("predLASSO_lim LASSO Predicted (378), cor=%.02f", cor(predLASSO_lim, Y)), # sprintf("predLASSO_nil (spacekime) LASSO Predicted, cor=%.02f", cor(predLASSO_kime, Y)), # sprintf("predLASSO_swapped (spacekime) LASSO Predicted, cor=%.02f", cor(predLASSO_kime_swapped, Y)), # "observed")) ``` ### Core redesign Generic Functions ```{r message=F, error=F, warning=F} # Plotting the coefficients coef_plot <- function(betahat, varn, plotname) { betahat<-betahat[-1] P <- coefplot(betahat[which(betahat!=0)], sd = rep(0, length(betahat[which(betahat!=0)])), pch=0, cex.pts = 3, col="red", main = plotname, varnames = varn[which(betahat!=0)]) return(P) } # Plotting the coefficients for those two methods findfeatures <- function(lassobeta, ridgebeta=NULL) { lassobeta<-lassobeta[-1] feat1 <- which(lassobeta!=0) features <- feat1 if (!is.null(ridgebeta)) { ridgebeta<-ridgebeta[-1] feat2 <- order(abs(ridgebeta),decreasing = TRUE)[1:10] features <- union(feat1, feat2) } return(features) } varImp <- function(object, lambda = NULL, ...) { ## skipping a few lines beta <- predict(object, s = lambda, type = "coef") if(is.list(beta)) { out <- do.call("cbind", lapply(beta, function(x) x[,1])) out <- as.data.frame(out) s <- rowSums(out) out <- out[which(s)!=0,,drop=FALSE] } else { out<-data.frame(Overall = beta[,1]) out<-out[which(out!=0),,drop=FALSE] } out <- abs(out[rownames(out) != "(Intercept)",,drop = FALSE]) out } ``` #### Weak-signal Analytics Using only the 378 ARIMA signatures for the prediction (out of the total of 386 features). ##### Space-Time Analytics ```{r message=F, error=F, warning=F} # 1. LASSO regression/feature extraction library(glmnet) library(arm) library(knitr) # subset test data Y = aggregate_arima_vector_country_ranking_df$OA X = aggregate_arima_vector_country_ranking_df[ , 1:378] # remove columns containing NAs X = X[ , colSums(is.na(X)) == 0]; dim(X) # [1] 31 378 #### 10-fold cross validation: for the LASSO library("glmnet") library(doParallel) registerDoParallel(6) set.seed(4321) cvLASSO_lim = cv.glmnet(data.matrix(X[ , 1:(42*9)]), Y, alpha = 1, parallel=TRUE) plot(cvLASSO_lim) mtext("CV LASSO (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5) # Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome predLASSO_lim <- predict(cvLASSO_lim, s = 3, # cvLASSO_lim$lambda.min, newx = data.matrix(X[ , 1:(42*9)])) coefList_lim <- coef(cvLASSO_lim, s=3) # 'lambda.min') coefList_lim <- data.frame(coefList_lim@Dimnames[[1]][coefList_lim@i+1],coefList_lim@x) names(coefList_lim) <- c('Feature','EffectSize') arrange(coefList_lim, -abs(EffectSize))[2:10, ] cor(Y, predLASSO_lim[, 1]) # 0.84 ################################################################################ varImp(cvLASSO_lim, lambda = cvLASSO_lim$lambda.min) #2 Feature_1_ArimaVec_8 -2.3864299 #3 Feature_19_ArimaVec_8 2.0871310 #4 Feature_16_ArimaVec_3 2.0465254 #5 Feature_13_ArimaVec_8 -1.7348553 #6 Feature_15_ArimaVec_4 -1.4588173 #7 Feature_22_ArimaVec_4 -1.1068801 #8 Feature_25_ArimaVec_5 0.9336800 #9 Feature_35_ArimaVec_4 -0.9276244 #10 Feature_25_ArimaVec_4 -0.8486434 #coefList_lim <- coef(cvLASSO_lim, s='lambda.min') #coefList_lim <- data.frame(coefList_lim@Dimnames[[1]][coefList_lim@i+1], coefList_lim@x) #names(coefList_lim) <- c('Feature','EffectSize') #arrange(coefList_lim, -abs(EffectSize))[2:10, ] # # 9 ARIMA-derived vector includes: # (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, # 7=period, 8=non-seasonal Diff, 9=seasonal differences) # [1] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels" # [2] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" # [3] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Tertiary education (levels 5-8)" # [4] "Active population by sex, age and educational attainment level, Females, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)" # [5] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, All ISCED 2011 levels" # [6] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" # [7] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Tertiary education (levels 5-8)" # [8] "Active population by sex, age and educational attainment level, Males, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)" # [9] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, All ISCED 2011 levels" #[10] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" #[11] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Tertiary education (levels 5-8)" #[12] "Active population by sex, age and educational attainment level, Total, From 15 to 64 years, Upper secondary and post-secondary non-tertiary education (levels 3 and 4)" #[13] "All ISCED 2011 levels " # [14] "All ISCED 2011 levels, Females" # [15] "All ISCED 2011 levels, Males" # [16] "Capital transfers, payable" # [17] "Capital transfers, receivable" # [18] "Compensation of employees, payable" # [19] "Current taxes on income, wealth, etc., receivable" #[20] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, All ISCED 2011 levels " # [21] "Employment by sex, age and educational attainment level, Females, From 15 to 64 years, Less than primary, primary and lower secondary education (levels 0-2)" # [22] "Other current transfers, payable" # [23] "Other current transfers, receivable" # [24] "Property income, payable" # [25] "Property income, receivable" # [26] "Savings, gross" # [27] "Subsidies, payable" # [28] "Taxes on production and imports, receivable" # [29] "Total general government expenditure" # [30] "Total general government revenue" # [31] "Unemployment , Females, From 15-64 years, Total" # [32] "Unemployment , Males, From 15-64 years" # [33] "Unemployment , Males, From 15-64 years, from 1 to 2 months" # [34] "Unemployment , Males, From 15-64 years, from 3 to 5 months" # [35] "Unemployment , Males, From 15-64 years, from 6 to 11 months" # [36] "Unemployment , Total, From 15-64 years, From 1 to 2 months" # [37] "Unemployment , Total, From 15-64 years, From 12 to 17 months" # [38] "Unemployment , Total, From 15-64 years, From 3 to 5 months" # [39] "Unemployment , Total, From 15-64 years, From 6 to 11 months" # [40] "Unemployment , Total, From 15-64 years, Less than 1 month" # [41] "Unemployment by sex, age, duration. DurationNA not started" # [42] "VAT, receivable" coef(cvLASSO_lim, s = 3) %>% # "lambda.min" broom::tidy() %>% filter(row != "(Intercept)") %>% top_n(100, wt = abs(value)) %>% ggplot(aes(value, reorder(row, value), color = value > 0)) + geom_point(show.legend = FALSE, aes(size = abs(value))) + ggtitle("Top 9 salient features (LASSO penalty)") + xlab("Effect-size") + ylab(NULL) validation_lim <- data.frame(matrix(NA, nrow = dim(predLASSO_lim)[1], ncol=2), row.names=countryNames) validation_lim [ , 1] <- Y; validation_lim[ , 2] <- predLASSO_lim[, 1] colnames(validation_lim) <- c("Orig_Y", "LASSO") dim(validation_lim); head(validation_lim) # add Top_30_Ranking_Indicator validation_lim <- as.data.frame(cbind(validation_lim, ifelse (validation_lim[, 1]<=30, 1, 0))) colnames(validation_lim)[3] <- "Top30Rank" head(validation_lim) # Prediction correlations: cor(validation_lim[ , 1], validation_lim[, 2]) # Y=observed OA rank vs. LASSO-pred 0.98 (lim) 0.84 # Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^ linFit_lim <- lm(validation_lim[ , 1] ~ validation_lim[, 2]) plot(validation_lim[ , 1] ~ validation_lim[, 2], col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="Observed Country Overall Ranking", ylab="LASSO (42*9 +8) param model", main = sprintf("Observed (X) vs. LASSO-Predicted (Y) Overall Country Ranking, cor=%.02f", cor(validation_lim[ , 1], validation_lim[, 2]))) abline(linFit_lim, lwd=3, col="red") # Plot myPlot <- ggplot(as.data.frame(validation_lim), aes(x=validation_lim[ , 1], y=validation_lim[ , 2], label=rownames(validation_lim))) + geom_smooth(method='lm') + geom_point() + # Color by groups # geom_text(aes(color=factor(rownames(validation_lim)))) + geom_label_repel(aes(label = rownames(validation_lim), fill = factor(Top30Rank)), color = 'black', size = 5, point.padding = unit(0.3, "lines")) + # theme(legend.position = "bottom") + theme(legend.position = c(0.1, 0.9), legend.text = element_text(colour="black", size=12, face="bold"), legend.title = element_text(colour="black", size=14, face="bold")) + scale_fill_discrete(name = "Country Overall Ranking", labels = c("Below 30 Rank", "Top 30 Rank")) + labs(title=sprintf("Spacetime LASSO Predicted (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_lim[ , 1], validation_lim[, 2])), x ="Observed Overall Country Ranking (1 is 'best')", y = "Spacetime LASSO Predicted") myPlot ``` ##### Space-Kime Analytics **Nil-Phase Synthesis** and LASSO model estimation ... ```{r message=F, error=F, warning=F} # Generic function to Transform Data ={all predictors (X) and outcome (Y)} to k-space (Fourier domain): kSpaceTransform(data, inverse = FALSE, reconPhases = NULL) # ForwardFT (rawData, FALSE, NULL) # InverseFT(magnitudes, TRUE, reconPhasesToUse) or InverseFT(FT_data, TRUE, NULL) # DATA # subset test data aggregate_arima_vector_country_ranking_df <- as.data.frame(apply( aggregate_arima_vector_country_ranking_df[ , colSums(is.na(aggregate_arima_vector_country_ranking_df)) == 0], 2, as.numeric)) Y = aggregate_arima_vector_country_ranking_df$OA X = aggregate_arima_vector_country_ranking_df[ , 1:378] # remove columns containing NAs # X = X[ , colSums(is.na(X)) == 0]; dim(X) # [1] 31 386 length(Y); dim(X) FT_aggregate_arima_vector_country_ranking_df <- kSpaceTransform(aggregate_arima_vector_country_ranking_df, inverse = FALSE, reconPhases = NULL) ## Kime-Phase Distributions # Examine the Kime-direction Distributions of the Phases for all *Belgium* features (predictors + outcome). Define a generic function that plots the Phase distributions. # plotPhaseDistributions(dataFT, dataColnames) plotPhaseDistributions(FT_aggregate_arima_vector_country_ranking_df, colnames(aggregate_arima_vector_country_ranking_df), size=4, cex=0.1) IFT_FT_aggregate_arima_vector_country_ranking_df <- kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, TRUE, FT_aggregate_arima_vector_country_ranking_df$phases) # Check IFT(FT) == I: # ifelse(aggregate_arima_vector_country_ranking_df[5,4] - # Re(IFT_FT_aggregate_arima_vector_country_ranking_df[5,4]) < 0.001, "Perfect Synthesis", "Problems!!!") ############################################## # Nil-Phase Synthesis and LASSO model estimation # 1. Nil-Phase data synthesis (reconstruction) temp_Data <- aggregate_arima_vector_country_ranking_df nilPhase_FT_aggregate_arima_vector <- array(complex(real=0, imaginary=0), c(dim(temp_Data)[1], dim(temp_Data)[2])) dim(nilPhase_FT_aggregate_arima_vector) # ; head(nilPhase_FT_aggregate_arima_vector) # [1] 31 387 IFT_NilPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2])) # Invert back to spacetime the # FT_aggregate_arima_vector_country_ranking_df$magnitudes[ , i] signal with nil-phase IFT_NilPhase_FT_aggregate_arima_vector <- Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, TRUE, nilPhase_FT_aggregate_arima_vector)) colnames(IFT_NilPhase_FT_aggregate_arima_vector) <- colnames(aggregate_arima_vector_country_ranking_df) rownames(IFT_NilPhase_FT_aggregate_arima_vector) <- rownames(aggregate_arima_vector_country_ranking_df) dim(IFT_NilPhase_FT_aggregate_arima_vector) dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes) colnames(IFT_NilPhase_FT_aggregate_arima_vector) # IFT_NilPhase_FT_aggregate_arima_vector[1:5, 1:4]; temp_Data[1:5, 1:4] # 2. Perform LASSO modeling on IFT_NilPhase_FT_aggregate_arima_vector; # report param estimates and quality metrics AIC/BIC # library(forecast) set.seed(123) cvLASSO_nil_kime = cv.glmnet(data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:378]), # IFT_NilPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE) Y, alpha = 1, parallel=TRUE) plot(cvLASSO_nil_kime) mtext("(Spacekime, Nil-phase) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5) # Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome predLASSO_nil_kime <- predict(cvLASSO_nil_kime, s = cvLASSO_nil_kime$lambda.min, newx = data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:378])); predLASSO_nil_kime # testMSE_LASSO_nil_kime <- mean((predLASSO_nil_kime - IFT_NilPhase_FT_aggregate_arima_vector[ , 387])^2) # testMSE_LASSO_nil_kime # Plot Regression Coefficients: create variable names for plotting library("arm") # par(mar=c(2, 13, 1, 1)) # extra large left margin # par(mar=c(5,5,5,5)) # varNames <- colnames(X); varNames; length(varNames) #betaHatLASSO_kime = as.double(coef(cvLASSO_kime, s=cvLASSO_kime$lambda.min)) #cvLASSO_kime$lambda.1se # #coefplot(betaHatLASSO_kime[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", # main = "(Spacekime) LASSO-Regularized Regression Coefficient Estimates", # varnames = varNames[377:386]) varImp(cvLASSO_nil_kime, lambda = cvLASSO_nil_kime$lambda.min) coefList_nil_kime <- coef(cvLASSO_nil_kime, s='lambda.min') coefList_nil_kime <- data.frame(coefList_nil_kime@Dimnames[[1]][coefList_nil_kime@i+1], coefList_nil_kime@x) names(coefList_nil_kime) <- c('Feature','EffectSize') arrange(coefList_nil_kime, -abs(EffectSize))[1:9, ] # Feature EffectSize #1 (Intercept) 26.385520159 #2 Feature_12_ArimaVec_8 -9.312528495 #3 Feature_11_ArimaVec_4 8.561417371 #4 Feature_12_ArimaVec_4 -5.220797416 #5 Feature_30_ArimaVec_4 2.623218791 #6 Feature_26_ArimaVec_6 1.927773213 #7 Feature_39_ArimaVec_5 -1.534741402 #8 Feature_34_ArimaVec_5 -1.171720008 #9 Feature_37_ArimaVec_2 0.004213823 # ARIMA-spacetime: 4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff # ARIMA-spacekimeNil: 2=forecast_avg, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 8=non-seasonal Diff #9 ARIMA-derived vector includes: # (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, # 7=period, 8=non-seasonal Diff, 9=seasonal differences) coef(cvLASSO_nil_kime, s = "lambda.min") %>% broom::tidy() %>% filter(row != "(Intercept)") %>% top_n(100, wt = abs(value)) %>% ggplot(aes(value, reorder(row, value), color = value > 0)) + geom_point(show.legend = FALSE, aes(size = abs(value))) + ggtitle("(Spacekime) Top 9 salient features (LASSO penalty)") + xlab("Effect-size") + ylab(NULL) # pack a 31*3 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, Y) validation_nil_kime <- cbind(predLASSO_nil_kime[, 1], IFT_NilPhase_FT_aggregate_arima_vector[ , 387], Y) colnames(validation_nil_kime) <- c("predLASSO_kime", "IFT_NilPhase_FT_Y", "Orig_Y") rownames(validation_nil_kime)[11] <- "Germany" head(validation_nil_kime) validation_nil_kime <- as.data.frame(cbind(validation_nil_kime, ifelse (validation_nil_kime[,3]<=30, 1, 0))) colnames(validation_nil_kime)[4] <- "Top30Rank" head(validation_nil_kime) # Prediction correlations: # cor(validation_nil_kime[ , 1], validation_nil_kime[, 2]) # Y=predLASSO_kime OA rank vs. kime_LASSO_pred: 0.99 cor(validation_nil_kime[ , 1], validation_nil_kime[, 3]) # Y=predLASSO_kime OA rank vs. Orig_Y: 0.64 # Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^ linFit1_nil_kime <- lm(predLASSO_nil_kime ~ validation_nil_kime[ , 3]) plot(predLASSO_nil_kime ~ validation_nil_kime[ , 3], col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="Observed Country Overall Ranking", ylab="IFT_NilPhase predLASSO_kime", main = sprintf("Observed (x) vs. IFT_NilPhase Predicted (y) Overall Country Ranking, cor=%.02f", cor(validation_nil_kime[ , 1], validation_nil_kime[, 3]))) abline(linFit1_kime, lwd=3, col="red") # abline(linFit1, lwd=3, col="green") # Spacetime LASSO modeling myPlotNilPhase <- ggplot(as.data.frame(validation_nil_kime), aes(x=Orig_Y, y=predLASSO_nil_kime, label=rownames(validation_nil_kime))) + geom_smooth(method='lm') + geom_point() + # Color by groups # geom_text(aes(color=factor(rownames(validation_nil_kime)))) + geom_label_repel(aes(label = rownames(validation_nil_kime), fill = factor(Top30Rank)), color = 'black', size = 5, point.padding = unit(0.3, "lines")) + # theme(legend.position = "bottom") + theme(legend.position = c(0.1, 0.9), legend.text = element_text(colour="black", size=12, face="bold"), legend.title = element_text(colour="black", size=14, face="bold")) + scale_fill_discrete(name = "Country Overall Ranking", labels = c("Below 30 Rank", "Top 30 Rank")) + labs(title=sprintf("Spacekime LASSO Predicted, Nil-Phases (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_nil_kime[ , 1], validation_nil_kime[, 3])), x ="Observed Overall Country Ranking (1 is 'best')", y = "Spacekime LASSO Predicted, using Nil-Phases") myPlotNilPhase ``` **Swapped Feature Phases** and then synthesize the data (reconstruction) ```{r message=F, error=F, warning=F} # temp_Data <- aggregate_arima_vector_country_ranking_df swappedPhase_FT_aggregate_arima_vector <- FT_aggregate_arima_vector_country_ranking_df$phases dim(swappedPhase_FT_aggregate_arima_vector) # ; head(swappedPhase_FT_aggregate_arima_vector) # [1] 31 387 IFT_SwappedPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2])) set.seed(12345) # sample randomly Phase-columns for each of the 131 covariates (X) swappedPhase_FT_aggregate_arima_vector1 <- as.data.frame(cbind( swappedPhase_FT_aggregate_arima_vector[ , sample(ncol(swappedPhase_FT_aggregate_arima_vector[ , 1:378]))], # mix ARIMA signature phases swappedPhase_FT_aggregate_arima_vector[ , sample(ncol(swappedPhase_FT_aggregate_arima_vector[ , 379:386]))],# mix the meta-data phases swappedPhase_FT_aggregate_arima_vector[ , 387])) # add correct Outcome phase swappedPhase_FT_aggregate_arima_vector <- swappedPhase_FT_aggregate_arima_vector1 colnames(swappedPhase_FT_aggregate_arima_vector) <- colnames(temp_Data) colnames(swappedPhase_FT_aggregate_arima_vector); dim(swappedPhase_FT_aggregate_arima_vector) # 31 387 # Invert back to spacetime the # FT_aggregate_arima_vector$magnitudes[ , i] signal with swapped-X-phases (Y-phase is fixed) IFT_SwappedPhase_FT_aggregate_arima_vector <- Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, TRUE, swappedPhase_FT_aggregate_arima_vector)) colnames(IFT_SwappedPhase_FT_aggregate_arima_vector) <- colnames(aggregate_arima_vector_country_ranking_df) rownames(IFT_SwappedPhase_FT_aggregate_arima_vector) <- rownames(aggregate_arima_vector_country_ranking_df) dim(IFT_SwappedPhase_FT_aggregate_arima_vector) dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes) colnames(IFT_SwappedPhase_FT_aggregate_arima_vector) # IFT_SwappedPhase_FT_aggregate_arima_vector[1:5, 1:4]; temp_Data[1:5, 1:4] # 2. Perform LASSO modeling on IFT_SwappedPhase_FT_aggregate_arima_vector; # report param estimates and quality metrics AIC/BIC set.seed(12) cvLASSO_kime_swapped = cv.glmnet(data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:378]), # IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE) Y, alpha = 1, parallel=TRUE) plot(cvLASSO_kime_swapped) mtext("(Spacekime, Swapped-Phases) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5) # Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome predLASSO_kime_swapped <- predict(cvLASSO_kime_swapped, s = 3, # cvLASSO_kime_swapped$lambda.min, newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:378])) # testMSE_LASSO_kime_swapped <- # mean((predLASSO_kime_swapped - IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387])^2) # testMSE_LASSO_kime_swapped predLASSO_kime_swapped # Plot Regression Coefficients: create variable names for plotting betaHatLASSO_kime_swapped = as.double(coef(cvLASSO_kime_swapped, s=3)) # cvLASSO_kime_swapped$lambda.min)) #cvLASSO_kime_swapped$lambda.1se #coefplot(betaHatLASSO_kime_swapped[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", # main = "(Spacekime, Swapped-Phases) LASSO-Regularized Regression Coefficient Estimates", # varnames = varNames[377:386]) varImp(cvLASSO_kime_swapped, lambda = 3) #cvLASSO_kime_swapped$lambda.min) coefList_kime_swapped <- coef(cvLASSO_kime_swapped, s=3) # 'lambda.min') coefList_kime_swapped <- data.frame(coefList_kime_swapped@Dimnames[[1]][coefList_kime_swapped@i+1], coefList_kime_swapped@x) names(coefList_kime_swapped) <- c('Feature','EffectSize') arrange(coefList_kime_swapped, -abs(EffectSize))[2:10, ] # Feature EffectSize #2 Feature_3_ArimaVec_8 5.4414240889 #3 Feature_24_ArimaVec_5 -4.9895032906 #4 Feature_41_ArimaVec_6 1.7580440109 #5 Feature_7_ArimaVec_6 -1.6317407164 #6 Feature_41_ArimaVec_3 -0.4666980893 #7 Feature_42_ArimaVec_3 0.4100416326 #8 Feature_6_ArimaVec_3 -0.2052325091 #9 Feature_7_ArimaVec_1 -0.0007922646 #10 Feature_24_ArimaVec_1 -0.0002003192 # # ARIMA-spacetime: 4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff # ARIMA-spacekime_nill: 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA # ARIMA-spacekime_swapped: 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA # 9 ARIMA-derived vector includes: # (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, # 7=period, 8=non-seasonal Diff, 9=seasonal differences) coef(cvLASSO_kime_swapped, s = 3) %>% # "lambda.min") %>% broom::tidy() %>% filter(row != "(Intercept)") %>% top_n(100, wt = abs(value)) %>% ggplot(aes(value, reorder(row, value), color = value > 0)) + geom_point(show.legend = FALSE, aes(size = abs(value))) + ggtitle("(Spacekime, Swapped-Phases) Top 9 salient features (LASSO penalty)") + xlab("Effect-size") + ylab(NULL) # pack a 31*4 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, # IFT_SwappedPhase_FT_aggregate_arima_vector_Y, Y) validation_kime_swapped <- cbind(predLASSO_lim[, 1], predLASSO_nil_kime[, 1], predLASSO_kime_swapped[ , 1], Y) colnames(validation_kime_swapped) <- c("predLASSO (spacetime)", "predLASSO_IFT_NilPhase", "predLASSO_IFT_SwappedPhase", "Orig_Y") head(validation_kime_swapped); dim(validation_kime_swapped) # Prediction correlations: cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4]) # predLASSO_IFT_SwappedPhase OA rank vs. predLASSO_spacekime: 0.7 cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 3]) # predLASSO (spacetime) vs. predLASSO_IFT_SwappedPhase OA rank: 0.83 # Plot observed Y (Overall Country ranking), x-axis vs. Kime-Swapped LASSO (9-parameters) predicted Y^ linFit1_kime_swapped <- lm(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4]) plot(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4], col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="predLASSO_IFT_SwappedPhase_FT_Y", ylab="predLASSO_spacekime_swapped Country Overall Ranking", main = sprintf("Observed (x) vs. Kime IFT_SwappedPhase_FT_Y (y) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4]))) abline(linFit1_kime_swapped, lwd=3, col="red") #abline(linFit1_kime, lwd=3, col="green") # Plot Spacetime LASSO forecasting # Plot observed Y (Overall Country ranking), x-axis vs. LASSO (9-parameters) predicted Y^, y-axis linFit1_spacetime <- lm(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4]) plot(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4], col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="Observed Country Overall Ranking", ylab="predLASSO_spacetime", main = sprintf("Predicted (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4]))) abline(linFit1_spacetime, lwd=3, col="red") # add Top_30_Ranking_Indicator validation_kime_swapped <- as.data.frame(cbind(validation_kime_swapped, ifelse (validation_kime_swapped[,4]<=30, 1, 0))) colnames(validation_kime_swapped)[5] <- "Top30Rank" rownames(validation_kime_swapped)[11] <- "Germany" head(validation_kime_swapped) # Spacetime LASSO modeling myPlotSwappedPhase <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y, y=validation_kime_swapped[, 3], label=rownames(validation_kime_swapped))) + geom_smooth(method='lm') + geom_point() + # Color by groups # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) + geom_label_repel(aes(label = rownames(validation_kime_swapped), fill = factor(Top30Rank)), color = 'black', size = 5, point.padding = unit(0.3, "lines")) + # theme(legend.position = "bottom") + theme(legend.position = c(0.1, 0.9), legend.text = element_text(colour="black", size=12, face="bold"), legend.title = element_text(colour="black", size=14, face="bold")) + scale_fill_discrete(name = "Country Overall Ranking", labels = c("Below 30 Rank", "Top 30 Rank")) + labs(title=sprintf("Spacekime LASSO Predicted, Swapped-Phases (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])), x ="Observed Overall Country Ranking (1 is 'best')", y = "Spacekime LASSO Predicted, using Swapped-Phases") myPlotSwappedPhase ``` ### Stronger Signal Analytics Using all 386 features (378 ARIMA signatures + 8 meta-data). ##### Space-Time Analytics ```{r message=F, error=F, warning=F} # 1. LASSO regression/feature extraction library(glmnet) library(arm) library(knitr) # subset test data Y = aggregate_arima_vector_country_ranking_df$OA X = aggregate_arima_vector_country_ranking_df[ , 1:386] # remove columns containing NAs X = X[ , colSums(is.na(X)) == 0]; dim(X) # [1] 31 378 #### 10-fold cross validation: for the LASSO set.seed(4321) cvLASSO_lim_all = cv.glmnet(data.matrix(X), Y, alpha = 1, parallel=TRUE) plot(cvLASSO_lim) mtext("CV LASSO (using only Timeseries data): Number of Nonzero (Active) Coefficients", side=3, line=2.5) # Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome predLASSO_lim_all <- predict(cvLASSO_lim_all, s = 1.1, # cvLASSO_lim$lambda.min, newx = data.matrix(X)) coefList_lim_all <- coef(cvLASSO_lim_all, s='lambda.min') coefList_lim_all <- data.frame(coefList_lim_all@Dimnames[[1]][coefList_lim_all@i+1],coefList_lim_all@x) names(coefList_lim_all) <- c('Feature','EffectSize') arrange(coefList_lim_all, -abs(EffectSize))[2:10, ] cor(Y, predLASSO_lim_all[, 1]) # 0.9974121 varImp(cvLASSO_lim_all, lambda = 1.1) # cvLASSO_lim_all$lambda.min) #Feature_1_ArimaVec_8 2.7518241 #Feature_9_ArimaVec_4 0.2662136 #Feature_9_ArimaVec_8 1.0871240 #Feature_20_ArimaVec_8 1.6851990 #Feature_25_ArimaVec_5 0.5113345 #IncomeGroup 1.1787811 #ED 0.7508295 #QOL 0.2057181 #PE 0.5131427 coef(cvLASSO_lim_all, s = 1.1) %>% #"lambda.min") %>% broom::tidy() %>% filter(row != "(Intercept)") %>% top_n(100, wt = abs(value)) %>% ggplot(aes(value, reorder(row, value), color = value > 0)) + geom_point(show.legend = FALSE, aes(size = abs(value))) + ggtitle("Top 9 salient features (LASSO penalty)") + xlab("Effect-size") + ylab(NULL) countryNames[11] <- "Germany" validation_lim_all <- data.frame(matrix(NA, nrow = dim(predLASSO_lim_all)[1], ncol=2), row.names=countryNames) validation_lim_all [ , 1] <- Y; validation_lim_all[ , 2] <- predLASSO_lim_all[, 1] colnames(validation_lim_all) <- c("Orig_Y", "LASSO") dim(validation_lim_all); head(validation_lim_all) # add Top_30_Ranking_Indicator validation_lim_all <- as.data.frame(cbind(validation_lim_all, ifelse (validation_lim_all[, 1]<=30, 1, 0))) colnames(validation_lim_all)[3] <- "Top30Rank" head(validation_lim_all) # Prediction correlations: cor(validation_lim_all[ , 1], validation_lim_all[, 2]) # Y=observed OA rank vs. LASSO-pred 0.98 (lim) 0.84 # Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^ linFit_lim_all <- lm(validation_lim_all[ , 1] ~ validation_lim_all[, 2]) plot(validation_lim_all[ , 1] ~ validation_lim_all[, 2], col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="Observed Country Overall Ranking", ylab="LASSO (42*9 +8) param model", main = sprintf("Observed (X) vs. LASSO-Predicted (Y) Overall Country Ranking, cor=%.02f", cor(validation_lim_all[ , 1], validation_lim_all[, 2]))) abline(linFit_lim_all, lwd=3, col="red") # Plot myPlot_all <- ggplot(as.data.frame(validation_lim_all), aes(x=validation_lim_all[ , 1], y=validation_lim_all[ , 2], label=rownames(validation_lim_all))) + geom_smooth(method='lm') + geom_point() + # Color by groups # geom_text(aes(color=factor(rownames(validation_lim_all)))) + geom_label_repel(aes(label = rownames(validation_lim_all), fill = factor(Top30Rank)), color = 'black', size = 5, point.padding = unit(0.3, "lines")) + # theme(legend.position = "bottom") + theme(legend.position = c(0.1, 0.9), legend.text = element_text(colour="black", size=12, face="bold"), legend.title = element_text(colour="black", size=14, face="bold")) + scale_fill_discrete(name = "Country Overall Ranking", labels = c("Below 30 Rank", "Top 30 Rank")) + labs(title=sprintf("Spacetime LASSO Predicted (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_lim_all[ , 1], validation_lim_all[, 2])), x ="Observed Overall Country Ranking (1 is 'best')", y = "Spacetime LASSO Predicted") myPlot_all ``` ##### Space-Kime Analytics **Nil-Phase Synthesis** and LASSO model estimation ... ```{r message=F, error=F, warning=F} library(glmnet) # Generic function to Transform Data ={all predictors (X) and outcome (Y)} to k-space (Fourier domain): kSpaceTransform(data, inverse = FALSE, reconPhases = NULL) # ForwardFT (rawData, FALSE, NULL) # InverseFT(magnitudes, TRUE, reconPhasesToUse) or InverseFT(FT_data, TRUE, NULL) # DATA # subset test data Y = aggregate_arima_vector_country_ranking_df$OA X = aggregate_arima_vector_country_ranking_df[ , 1:386] # remove columns containing NAs # X = X[ , colSums(is.na(X)) == 0]; dim(X) # [1] 31 386 length(Y); dim(X) FT_aggregate_arima_vector_country_ranking_df <- kSpaceTransform(aggregate_arima_vector_country_ranking_df, inverse = FALSE, reconPhases = NULL) ## Kime-Phase Distributions # Examine the Kime-direction Distributions of the Phases for all *Belgium* features (predictors + outcome). Define a generic function that plots the Phase distributions. # plotPhaseDistributions(dataFT, dataColnames) plotPhaseDistributions(FT_aggregate_arima_vector_country_ranking_df, colnames(aggregate_arima_vector_country_ranking_df), size=4, cex=0.1) IFT_FT_aggregate_arima_vector_country_ranking_df <- kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, TRUE, FT_aggregate_arima_vector_country_ranking_df$phases) # Check IFT(FT) == I: # ifelse(aggregate_arima_vector_country_ranking_df[5,4] - # Re(IFT_FT_aggregate_arima_vector_country_ranking_df[5,4]) < 0.001, "Perfect Synthesis", "Problems!!!") ############################################## # Nil-Phase Synthesis and LASSO model estimation # 1. Nil-Phase data synthesis (reconstruction) temp_Data <- aggregate_arima_vector_country_ranking_df nilPhase_FT_aggregate_arima_vector <- array(complex(real=0, imaginary=0), c(dim(temp_Data)[1], dim(temp_Data)[2])) dim(nilPhase_FT_aggregate_arima_vector) # ; head(nilPhase_FT_aggregate_arima_vector) # [1] 31 387 IFT_NilPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2])) # Invert back to spacetime the # FT_aggregate_arima_vector_country_ranking_df$magnitudes[ , i] signal with nil-phase IFT_NilPhase_FT_aggregate_arima_vector <- Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, TRUE, nilPhase_FT_aggregate_arima_vector)) colnames(IFT_NilPhase_FT_aggregate_arima_vector) <- colnames(aggregate_arima_vector_country_ranking_df) rownames(IFT_NilPhase_FT_aggregate_arima_vector) <- rownames(aggregate_arima_vector_country_ranking_df) dim(IFT_NilPhase_FT_aggregate_arima_vector) dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes) colnames(IFT_NilPhase_FT_aggregate_arima_vector) # IFT_NilPhase_FT_aggregate_arima_vector[1:5, 1:4]; temp_Data[1:5, 1:4] # 2. Perform LASSO modeling on IFT_NilPhase_FT_aggregate_arima_vector; # report param estimates and quality metrics AIC/BIC # library(forecast) set.seed(123) cvLASSO_nil_kime_all = cv.glmnet(data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:386]), # IFT_NilPhase_FT_aggregate_arima_vector[ , 387], alpha = 1, parallel=TRUE) Y, alpha = 1, parallel=TRUE) plot(cvLASSO_nil_kime_all) mtext("(Spacekime, Nil-phase) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5) # Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome predLASSO_nil_kime_all <- predict(cvLASSO_nil_kime_all, s = exp(-1/4), # cvLASSO_nil_kime$lambda.min, newx = data.matrix(IFT_NilPhase_FT_aggregate_arima_vector[ , 1:386])); predLASSO_nil_kime_all # testMSE_LASSO_nil_kime <- mean((predLASSO_nil_kime - IFT_NilPhase_FT_aggregate_arima_vector[ , 387])^2) # testMSE_LASSO_nil_kime varImp(cvLASSO_nil_kime_all, lambda = exp(-1/4)) # cvLASSO_nil_kime_all$lambda.min) coefList_nil_kime_all <- coef(cvLASSO_nil_kime_all, s=exp(-1/4)) # 'lambda.min') coefList_nil_kime_all <- data.frame(coefList_nil_kime_all@Dimnames[[1]][coefList_nil_kime_all@i+1], coefList_nil_kime_all@x) names(coefList_nil_kime_all) <- c('Feature','EffectSize') arrange(coefList_nil_kime_all, -abs(EffectSize))[1:9, ] # Feature EffectSize #1 (Intercept) 26.385520159 #Feature_2_ArimaVec_6 0.07190025 #Feature_6_ArimaVec_6 0.49799326 #Feature_11_ArimaVec_4 8.45132661 #Feature_12_ArimaVec_4 5.38002499 #Feature_12_ArimaVec_8 10.37633956 #Feature_26_ArimaVec_6 2.06530937 #Feature_30_ArimaVec_4 3.29579474 #Feature_34_ArimaVec_5 0.96507673 #Feature_37_ArimaVec_2 0.01033978 #Feature_39_ArimaVec_5 2.08659578 # ARIMA-spacetime: 4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff # ARIMA-spacekimeNil: 2=forecast_avg, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, 8=non-seasonal Diff #9 ARIMA-derived vector includes: # (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, # 7=period, 8=non-seasonal Diff, 9=seasonal differences) coef(cvLASSO_nil_kime_all, s = exp(-1/4)) %>% # "lambda.min") %>% broom::tidy() %>% filter(row != "(Intercept)") %>% top_n(100, wt = abs(value)) %>% ggplot(aes(value, reorder(row, value), color = value > 0)) + geom_point(show.legend = FALSE, aes(size = abs(value))) + ggtitle("(Spacekime) Top 9 salient features (LASSO penalty)") + xlab("Effect-size") + ylab(NULL) # pack a 31*3 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, Y) validation_nil_kime_all <- cbind(predLASSO_nil_kime_all[, 1], IFT_NilPhase_FT_aggregate_arima_vector[ , 387], Y) colnames(validation_nil_kime_all) <- c("predLASSO_kime", "IFT_NilPhase_FT_Y", "Orig_Y") rownames(validation_nil_kime_all)[11] <- "Germany" head(validation_nil_kime_all) validation_nil_kime_all <- as.data.frame(cbind(validation_nil_kime_all, ifelse (validation_nil_kime_all[,3]<=30, 1, 0))) colnames(validation_nil_kime_all)[4] <- "Top30Rank" head(validation_nil_kime_all) # Prediction correlations: # cor(validation_nil_kime[ , 1], validation_nil_kime[, 2]) # Y=predLASSO_kime OA rank vs. kime_LASSO_pred: 0.99 cor(validation_nil_kime_all[ , 1], validation_nil_kime_all[, 3]) # Y=predLASSO_kime OA rank vs. Orig_Y: 0.64 # Plot observed Y (Overall Country ranking) vs. LASSO (9-parameters) predicted Y^ linFit1_nil_kime_all <- lm(predLASSO_nil_kime_all ~ validation_nil_kime_all[ , 3]) plot(predLASSO_nil_kime_all ~ validation_nil_kime_all[ , 3], col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="Observed Country Overall Ranking", ylab="IFT_NilPhase predLASSO_kime", main = sprintf("Observed (x) vs. IFT_NilPhase Predicted (y) Overall Country Ranking, cor=%.02f", cor(validation_nil_kime_all[ , 1], validation_nil_kime_all[, 3]))) abline(linFit1_nil_kime_all, lwd=3, col="red") # abline(linFit1, lwd=3, col="green") # Spacetime LASSO modeling myPlotNilPhase_all <- ggplot(as.data.frame(validation_nil_kime_all), aes(x=Orig_Y, y=predLASSO_nil_kime_all, label=rownames(validation_nil_kime_all))) + geom_smooth(method='lm') + geom_point() + # Color by groups # geom_text(aes(color=factor(rownames(validation_nil_kime)))) + geom_label_repel(aes(label = rownames(validation_nil_kime_all), fill = factor(Top30Rank)), color = 'black', size = 5, point.padding = unit(0.3, "lines")) + # theme(legend.position = "bottom") + theme(legend.position = c(0.1, 0.9), legend.text = element_text(colour="black", size=12, face="bold"), legend.title = element_text(colour="black", size=14, face="bold")) + scale_fill_discrete(name = "Country Overall Ranking", labels = c("Below 30 Rank", "Top 30 Rank")) + labs(title=sprintf("Spacekime LASSO Predicted, Nil-Phases (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_nil_kime_all[ , 1], validation_nil_kime_all[, 3])), x ="Observed Overall Country Ranking (1 is 'best')", y = "Spacekime LASSO Predicted, using Nil-Phases") myPlotNilPhase_all ``` **Swapped Feature Phases** and then synthesize the data (reconstruction) ```{r message=F, error=F, warning=F} # temp_Data <- aggregate_arima_vector_country_ranking_df swappedPhase_FT_aggregate_arima_vector <- FT_aggregate_arima_vector_country_ranking_df$phases dim(swappedPhase_FT_aggregate_arima_vector) # ; head(swappedPhase_FT_aggregate_arima_vector) # [1] 31 387 IFT_SwappedPhase_FT_aggregate_arima_vector <- array(complex(), c(dim(temp_Data)[1], dim(temp_Data)[2])) set.seed(1234) # sample randomly Phase-columns for each of the 131 covariates (X) swappedPhase_FT_aggregate_arima_vector1 <- as.data.frame(cbind( swappedPhase_FT_aggregate_arima_vector[ , sample(ncol(swappedPhase_FT_aggregate_arima_vector[ , 1:378]))], # mix ARIMA signature phases swappedPhase_FT_aggregate_arima_vector[ , sample(ncol(swappedPhase_FT_aggregate_arima_vector[ , 379:386]))],# mix the meta-data phases swappedPhase_FT_aggregate_arima_vector[ , 387])) # add correct Outcome phase swappedPhase_FT_aggregate_arima_vector <- swappedPhase_FT_aggregate_arima_vector1 colnames(swappedPhase_FT_aggregate_arima_vector) <- colnames(temp_Data) colnames(swappedPhase_FT_aggregate_arima_vector); dim(swappedPhase_FT_aggregate_arima_vector) # 31 387 # Invert back to spacetime the # FT_aggregate_arima_vector$magnitudes[ , i] signal with swapped-X-phases (Y-phase is fixed) IFT_SwappedPhase_FT_aggregate_arima_vector <- Re(kSpaceTransform(FT_aggregate_arima_vector_country_ranking_df$magnitudes, TRUE, swappedPhase_FT_aggregate_arima_vector)) # Save IFT_SwappedPhase_FT_aggregate_arima_vector out for PCA/t-SNE SpaceKime modeling #options(digits = 2) #write.table(format(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:386]), # #scientific=FALSE), #, digits=2), # fileEncoding = "UTF-16LE", append = FALSE, quote = FALSE, sep = "\t", # eol = "\n", na = "NA", dec = ".", row.names = FALSE, col.names = FALSE, # "C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_Econ_TensorData_31Countries_By_386Features_SpaceKime_SwappedPhase.txt") colnames(IFT_SwappedPhase_FT_aggregate_arima_vector) <- colnames(aggregate_arima_vector_country_ranking_df) rownames(IFT_SwappedPhase_FT_aggregate_arima_vector) <- rownames(aggregate_arima_vector_country_ranking_df) dim(IFT_SwappedPhase_FT_aggregate_arima_vector) dim(FT_aggregate_arima_vector_country_ranking_df$magnitudes) colnames(IFT_SwappedPhase_FT_aggregate_arima_vector) # IFT_SwappedPhase_FT_aggregate_arima_vector[1:5, 1:4]; temp_Data[1:5, 1:4] # 2. Perform LASSO modeling on IFT_SwappedPhase_FT_aggregate_arima_vector; # report param estimates and quality metrics AIC/BIC set.seed(12) cvLASSO_kime_swapped = cv.glmnet(data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:378]), Y, alpha = 1, parallel=TRUE) plot(cvLASSO_kime_swapped) mtext("(Spacekime, Swapped-Phases) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5) # Identify top predictors and forecast the Y=Overall (OA) Country ranking outcome predLASSO_kime_swapped <- predict(cvLASSO_kime_swapped, s = cvLASSO_kime_swapped$lambda.min, newx = data.matrix(IFT_SwappedPhase_FT_aggregate_arima_vector[ , 1:378])) # testMSE_LASSO_kime_swapped <- # mean((predLASSO_kime_swapped - IFT_SwappedPhase_FT_aggregate_arima_vector[ , 387])^2) # testMSE_LASSO_kime_swapped predLASSO_kime_swapped # Plot Regression Coefficients: create variable names for plotting betaHatLASSO_kime_swapped = as.double(coef(cvLASSO_kime_swapped, s=cvLASSO_kime_swapped$lambda.min)) #cvLASSO_kime_swapped$lambda.1se #coefplot(betaHatLASSO_kime_swapped[377:386], sd = rep(0, 10), pch=0, cex.pts = 3, col="red", # main = "(Spacekime, Swapped-Phases) LASSO-Regularized Regression Coefficient Estimates", # varnames = varNames[377:386]) varImp(cvLASSO_kime_swapped, lambda = cvLASSO_kime_swapped$lambda.min) coefList_kime_swapped <- coef(cvLASSO_kime_swapped, s='lambda.min') coefList_kime_swapped <- data.frame(coefList_kime_swapped@Dimnames[[1]][coefList_kime_swapped@i+1], coefList_kime_swapped@x) names(coefList_kime_swapped) <- c('Feature','EffectSize') arrange(coefList_kime_swapped, -abs(EffectSize))[2:10, ] # Feature EffectSize #2 Feature_2_ArimaVec_8 1.484856e+15 #3 Feature_42_ArimaVec_7 1.970862e+14 #4 Feature_23_ArimaVec_7 -2.467246e+13 #5 Feature_37_ArimaVec_5 -2.983216e+00 #6 Feature_34_ArimaVec_4 -1.382639e+00 #7 Feature_36_ArimaVec_3 -1.198157e+00 #8 Feature_6_ArimaVec_3 1.106294e-01 #9 Feature_38_ArimaVec_2 -1.058259e-02 #10 Feature_38_ArimaVec_1 -1.124584e-03 # # ARIMA-spacetime: 4=non-seasonal MA, 5=seasonal AR, 8=non-seasonal Diff # ARIMA-spacekime_nill: 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA # ARIMA-spacekime_swapped: 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA # 9 ARIMA-derived vector includes: # (1=ts_avg, 2=forecast_avg, 3=non-seasonal AR, 4=non-seasonal MA, 5=seasonal AR, 6=seasonal MA, # 7=period, 8=non-seasonal Diff, 9=seasonal differences) coef(cvLASSO_kime_swapped, s = "lambda.min") %>% broom::tidy() %>% filter(row != "(Intercept)") %>% top_n(100, wt = abs(value)) %>% ggplot(aes(value, reorder(row, value), color = value > 0)) + geom_point(show.legend = FALSE, aes(size = abs(value))) + ggtitle("(Spacekime, Swapped-Phases) Top 9 salient features (LASSO penalty)") + xlab("Effect-size") + ylab(NULL) # pack a 31*4 DF with (predLASSO_kime, IFT_NilPhase_FT_aggregate_arima_vector_Y, # IFT_SwappedPhase_FT_aggregate_arima_vector_Y, Y) validation_kime_swapped <- cbind(predLASSO[, 1], predLASSO_nil_kime_all[, 1], predLASSO_kime_swapped[ , 1], Y) colnames(validation_kime_swapped) <- c("predLASSO (spacetime)", "predLASSO_IFT_NilPhase", "predLASSO_IFT_SwappedPhase", "Orig_Y") head(validation_kime_swapped); dim(validation_kime_swapped) # Prediction correlations: cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4]) # predLASSO_IFT_SwappedPhase OA rank vs. predLASSO_spacekime: 0.7 cor(validation_kime_swapped[ , 1], validation_kime_swapped[, 3]) # predLASSO (spacetime) vs. predLASSO_IFT_SwappedPhase OA rank: 0.83 # Plot observed Y (Overall Country ranking), x-axis vs. Kime-Swapped LASSO (9-parameters) predicted Y^ linFit1_kime_swapped <- lm(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4]) plot(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4], col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="predLASSO_IFT_SwappedPhase_FT_Y", ylab="predLASSO_spacekime_swapped Country Overall Ranking", main = sprintf("Observed (x) vs. Kime IFT_SwappedPhase_FT_Y (y) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4]))) abline(linFit1_kime_swapped, lwd=3, col="red") #abline(linFit1_kime, lwd=3, col="green") # Plot Spacetime LASSO forecasting # Plot observed Y (Overall Country ranking), x-axis vs. LASSO (9-parameters) predicted Y^, y-axis linFit1_spacetime <- lm(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4]) plot(validation_kime_swapped[ , 3] ~ validation_kime_swapped[ , 4], col="blue", xaxt='n', yaxt='n', pch = 16, cex=3, xlab="Observed Country Overall Ranking", ylab="predLASSO_spacetime", main = sprintf("Predicted (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4]))) abline(linFit1_spacetime, lwd=3, col="red") # add Top_30_Ranking_Indicator validation_kime_swapped <- as.data.frame(cbind(validation_kime_swapped, ifelse (validation_kime_swapped[,4]<=30, 1, 0))) colnames(validation_kime_swapped)[5] <- "Top30Rank" rownames(validation_kime_swapped)[11] <- "Germany" head(validation_kime_swapped) # Write out the Binary top-30/Not-Top-30 labels df_top30 <- cbind(countryNames, validation_kime_swapped[ , 5]) colnames(df_top30) <- c("Country", "Top-30-Label") write.table(df_top30, fileEncoding = "UTF-16LE", append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", row.names = FALSE, col.names = TRUE, "C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_Econ_Labels_31Countries_Top_30_BinaryOutcome.txt") # Spacetime LASSO modeling myPlotSwappedPhase <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y, y=validation_kime_swapped[, 3], label=rownames(validation_kime_swapped))) + geom_smooth(method='lm') + geom_point() + # Color by groups # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) + geom_label_repel(aes(label = rownames(validation_kime_swapped), fill = factor(Top30Rank)), color = 'black', size = 5, point.padding = unit(0.3, "lines")) + # theme(legend.position = "bottom") + theme(legend.position = c(0.1, 0.9), legend.text = element_text(colour="black", size=12, face="bold"), legend.title = element_text(colour="black", size=14, face="bold")) + scale_fill_discrete(name = "Country Overall Ranking", labels = c("Below 30 Rank", "Top 30 Rank")) + labs(title=sprintf("Spacekime LASSO Predicted, Swapped-Phases (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 3], validation_kime_swapped[, 4])), x ="Observed Overall Country Ranking (1 is 'best')", y = "Spacekime LASSO Predicted, using Swapped-Phases") myPlotSwappedPhase ``` #### Overall Plots ```{r message=F, error=F, warning=F} myPlotNilPhase <- ggplot(as.data.frame(validation_kime_swapped), aes(x=Orig_Y, y=predLASSO_kime, label=rownames(validation_kime_swapped))) + geom_smooth(method='lm') + geom_point() + # Color by groups # geom_text(aes(color=factor(rownames(validation_kime_swapped)))) + geom_label_repel(aes(label = rownames(validation_kime_swapped), fill = factor(Top30Rank)), color = 'black', size = 5, point.padding = unit(0.3, "lines")) + # theme(legend.position = "bottom") + theme(legend.position = c(0.1, 0.9), legend.text = element_text(colour="black", size=12, face="bold"), legend.title = element_text(colour="black", size=14, face="bold")) + scale_fill_discrete(name = "Country Overall Ranking", labels = c("Below 30 Rank", "Top 30 Rank")) + labs(title=sprintf("Spacekime LASSO Predicted, Nil-Phases, (y) vs. Observed (x) Overall Country Ranking, cor=%.02f", cor(validation_kime_swapped[ , 2], validation_kime_swapped[, 4])), x ="Observed Overall Country Ranking (1 is 'best')", y = "Spacekime LASSO Predicted, using Nil-Phases") myPlotNilPhase countryNames[11]<-"Germany" aggregateResults <- (rbind(cbind(as.character(countryNames), "predLASSO_spacetime_386", as.numeric(predLASSO)), cbind(as.character(countryNames), "predLASSO_spacetime_378", predLASSO_lim), cbind(as.character(countryNames), "predLASSO_nil_kime_386", predLASSO_nil_kime_all), cbind(as.character(countryNames), "predLASSO_nil_kime_378", predLASSO_nil_kime), cbind(as.character(countryNames), "predLASSO_swapped_kime_386", predLASSO_kime_swapped), cbind(as.character(countryNames), "predLASSO_swapped_kime_378", predLASSO_kime_swapped_lim), cbind(as.character(countryNames), "observed", Y) )) aggregateResults <- data.frame(aggregateResults[ , -3], as.numeric(aggregateResults[,3])) colnames(aggregateResults) <- c("country", "estimate_method", "ranking") ggplot(aggregateResults, aes(x=country, y=ranking, color=estimate_method)) + geom_point(aes(shape=estimate_method, color=estimate_method, size=estimate_method)) + geom_point(size = 5) + geom_line(data = aggregateResults[aggregateResults$estimate_method == "observed", ], aes(group = estimate_method), size=2, linetype = "dashed") + theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) + # theme(legend.position = "bottom") + # scale_shape_manual(values = as.factor(aggregateResults$estimate_method)) + # scale_color_gradientn(colours = rainbow(7)) + theme(text = element_text(size = 15), legend.position = c(0.3, 0.85), axis.text=element_text(size=16), legend.text = element_text(colour="black", size=12, face="bold"), legend.title = element_text(colour="black", size=14, face="bold")) library(plotly) plot_ly(aggregateResults) %>% add_markers(x = ~country, y = ~ranking, type = "scatter", color = ~estimate_method, colors = c("black", "red", "blue", "green", "purple", "orange", "yellow"), mode = "markers", marker = list(size = ~ranking, opacity=~(1-(ranking-1)/49), line = list(color = "black", width = 2))) %>% layout(title="Spacekime Analytics - Country Ranking using Different Phase Estimators", legend = list(orientation = "h", # show legend horizontally xanchor = "center", # use center of legend as anchor x = 0)) # put legend in center of x-axis) ``` ### Unsupervised clustering Use [hierarchical, k-means and spectral clustering](https://dspa.predictive.space/) to generate derived-computed phenotypes of countries. Do these derived labels relate (correspond to) the overall (OA) country ranking? ```{r message=F, error=F, warning=F} load("C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_Econ_SpaceKime.RData") # View(aggregate_arima_vector_country_ranking_df) dim(aggregate_arima_vector_country_ranking_df) # 31(countries) 387(features) # Features = country-index + 386 features (378 time-series derivatives + 8 meta-data features) ``` #### Spacetime analytics - **1.1 Lasso features selection** ```{r warning=F, error=F} eudata <- aggregate_arima_vector_country_ranking_df colnames(eudata) <- c("country",colnames(eudata[,-1])) eudata <- eudata[ , -ncol(eudata)] Y<-aggregate_arima_vector_country_ranking_df$OA # Complete data 386 features (378 + 8) X<-eudata[,-ncol(eudata)]; dim(X) # TS-derivative features only (378) X378 <- X[, -c(379:386)]; dim(X378) countryinfo<-as.character(X[,1]) countryinfo[11]<-"Germany" X<-X[,-1] keeplist<-NULL for (i in 1:ncol(X)) { if(FALSE %in% (X[,i]==mean(X[,i]))) {keeplist<-c(keeplist,i)} } X<-X[,keeplist]; dim(X) # Reduced to 378 features #countryinfo<-as.character(X378[,1]) #countryinfo[11]<-"Germany" #X378<-X378[,-1] #keeplist<-NULL #for (i in 1:ncol(X378)) { # if(FALSE %in% (X378[,i]==mean(X378[,i]))) {keeplist<-c(keeplist,i)} #} #X378<-X378[,keeplist]; dim(X378) library(glmnet) fitLASSO <- glmnet(as.matrix(X), Y, alpha = 1) library(doParallel) registerDoParallel(5) #cross-validation cvLASSO = cv.glmnet(data.matrix(X), Y, alpha = 1, parallel=TRUE) # fitLASSO <- glmnet(as.matrix(X378), Y, alpha = 1) #library(doParallel) #registerDoParallel(5) #cross-validation #cvLASSO = cv.glmnet(data.matrix(X378), Y, alpha = 1, parallel=TRUE) # To choose features we like to have based on lasso chooselambda <- function(cvlasso, option, k=NULL) { lambmat<-cbind(cvlasso$glmnet.fit$df,cvlasso$glmnet.fit$lambda) result<-tapply(lambmat[,2],lambmat[,1],max) kresult<-result[which(names(result)==as.factor(k))] if(option==1) {return(result)} else if (option==2) {return(kresult)} else (return("Not a valid option")) } showfeatures <- function(object, lambda, k ,...) { lam<-lambda[which(names(lambda)==as.factor(k))] beta <- predict(object, s = lam, type = "coef") if(is.list(beta)) { out <- do.call("cbind", lapply(beta, function(x) x[,1])) out <- as.data.frame(out) s <- rowSums(out) out <- out[which(s)!=0,,drop=FALSE] } else {out<-data.frame(Overall = beta[,1]) out<-out[which(out!=0),,drop=FALSE] } out <- abs(out[rownames(out) != "(Intercept)",,drop = FALSE]) out } ``` - ** 1.2 Comparison of different ML algorithms of different feature numbers** ```{r message=F, error=F, warning=F} #test training data setup randchoose <- function(matr) { leng<-nrow(matr) se<-seq(1:leng) sam<-sample(se,as.integer(leng*0.6)) return(sam) } eusample<-X eusample$Rank<-as.factor(ifelse(Y<30, 1, 0)) set.seed(1234) eutrain<-eusample[randchoose(eusample), ] set.seed(1234) eutest<-eusample[-randchoose(eusample), ] eusample378 <- X378 eusample378$Rank <- as.factor(ifelse(Y<30, 1, 0)) set.seed(1234) eutrain378 <- eusample378[randchoose(eusample378), ] set.seed(1234) eutest378 <- eusample378[-randchoose(eusample378), ] # Load Libraries library(e1071) library("randomForest") library(ada); library(adabag) library(caret) library(kernlab) library(cluster) library(ipred) library(ggplot2) MLcomp <- function(fitlas, cvlas, trn, test, option=1) { allfeat<-as.numeric(names(chooselambda(cvlasso = cvlas, option = 1))) allfeat<-allfeat[which(allfeat>4)] trainlist<-as.list(NULL) for (i in 1:length(allfeat)) { trainlist[[i]]<-trn[,which(colnames(trn) %in% c(row.names(showfeatures(fitlas, chooselambda(cvlas = cvlas,1), allfeat[i])), "Rank"))] } resultframe<-data.frame(origin=rep(NA,length(allfeat))) rownames(resultframe)<-allfeat resultframe$Decision_tree_bagging<-rep(NA,length(allfeat)) for (i in 1:length(allfeat)) { set.seed(1234) eubag<-ipred::bagging(Rank~.,data = trainlist[[i]],nbagg=100) bagtest<-predict(eubag, eutest) bagagg<-bagtest==eutest$Rank accuracy<-prop.table(table(bagagg))[c("TRUE")] resultframe$Decision_tree_bagging[i]<-accuracy } resultframe$Random_forest<-rep(NA,length(allfeat)) for (i in 1:length(allfeat)) { set.seed(1234) eurf<-randomForest(Rank~.,data=trainlist[[i]]) rftest<-predict(eurf,eutest) rfagg<-rftest==eutest$Rank accuracy<-prop.table(table(rfagg))[c("TRUE")] resultframe$Random_forest[i]<-accuracy } resultframe$Decision_tree_adaboost<-rep(NA,length(allfeat)) for (i in 1:length(allfeat)) { set.seed(1234) enada<-ada(Rank~.,data = trainlist[[i]],iter=50) adatest<-predict(enada,eutest) adaagg<-adatest==eutest$Rank accuracy<-prop.table(table(adaagg))[c("TRUE")] resultframe$Decision_tree_adaboost[i]<-accuracy } resultframe$GLM<-rep(NA,length(allfeat)) for (i in 1:length(allfeat)) { euglm<-glm(Rank~.,data = trainlist[[i]],family = "binomial") glmtest<-predict(euglm,eutest) glmtest<-ifelse(glmtest<0,0,1) glmagg<-glmtest==eutest$Rank accuracy<-prop.table(table(glmagg))[c("TRUE")] resultframe$GLM[i]<-accuracy } resultframe$SVM_best_Gamma_Cost<-rep(NA,length(allfeat)) for (i in 1:length(allfeat)) { set.seed(1234) svmtune<-tune.svm(Rank~.,data = trainlist[[i]],gamma = 10^(-6:1),cost = 10^(-10:10)) svmed<-svm(Rank~.,data=trainlist[[i]],gamma=svmtune$best.parameters[1],cost=svmtune$best.parameters[2]) svmtest<-predict(svmed,eutest) svmagg<-svmtest==eutest$Rank accuracy<-prop.table(table(svmagg))[c("TRUE")] resultframe$SVM_best_Gamma_Cost[i]<-accuracy } resultframe$origin<-NULL if(option==1){return(resultframe)} } resultframe <- MLcomp(fitLASSO, cvLASSO, eutrain, eutest, 1) resultframe_386_ST <- resultframe # View(resultframe_386_ST) # resultframe_378_ST <- MLcomp(fitLASSO, cvLASSO, eutrain378, eutest378, 1) # Display results resultframe$features<-as.factor(as.numeric(rownames(resultframe))) ppframe<-data.frame(NULL) for (i in 1:5) { FM <- data.frame(resultframe[,i], resultframe$features, Methods<-rep(colnames(resultframe)[i], nrow(resultframe))) ppframe<-rbind(ppframe, FM) } colnames(ppframe)<-c("Accuracy", "Features", "Methods") ggplot(ppframe, aes(x=Features, y=Accuracy, colour=Methods, group=Methods, shape=Methods))+ geom_line(position=position_dodge(0.2), lwd=2)+ ylim(0.2, 1.0) + geom_point(size=5, position=position_dodge(0.2))+ theme(legend.position="top", legend.text=element_text(size=16))+ ggtitle("Spacetime (386 features): Compare ML Forecasting Results")+ theme( axis.text=element_text(size=16), plot.title = element_text(size=18, face="bold.italic"), axis.title.x = element_text(size=14, face="bold"), axis.title.y = element_text(size=14, face="bold")) # spacetime (ST) 378_ST resultframe_378_ST$features<-as.factor(as.numeric(rownames(resultframe_378_ST))) ppframe_378_ST<-data.frame(NULL) for (i in 1:5) { FM_378_ST <- data.frame(resultframe_378_ST[,i], resultframe_378_ST$features, Methods<-rep(colnames(resultframe_378_ST)[i], nrow(resultframe_378_ST))) ppframe_378_ST<-rbind(ppframe_378_ST, FM_378_ST) } colnames(ppframe_378_ST)<-c("Accuracy", "Features", "Methods") ggplot(ppframe_378_ST, aes(x=Features, y=Accuracy, colour=Methods, group=Methods, shape=Methods))+ geom_line(position=position_dodge(0.2), lwd=2)+ ylim(0.2, 1.0) + geom_point(size=5, position=position_dodge(0.2))+ theme(legend.position="top", legend.text=element_text(size=16))+ ggtitle("Spacetime (386 features): Compare ML Forecasting Results")+ theme( axis.text=element_text(size=16), plot.title = element_text(size=18, face="bold.italic"), axis.title.x = element_text(size=14, face="bold"), axis.title.y = element_text(size=14, face="bold")) ``` - **1.3 Clustering** ```{r message=F, error=F, warning=F} showfeatures(fitLASSO, chooselambda(cvLASSO,1), 10) feat_5 <- predict(fitLASSO, s = chooselambda(cvLASSO,2,10), newx = data.matrix(X)) df1 <- as.data.frame(rbind(as.numeric(feat_5),Y), row.names = c("Predicted Rank","OA Rank")) colnames(df1) <- countryNames df1 # View(t(df1)) # Clustering cluster5 <- X[, which(colnames(X) %in% row.names(showfeatures(fitLASSO, chooselambda(cvLASSO,1), 10)))] rownames(cluster5) <- countryNames # countryinfo #1. hierarchical clustering scaled_cluster5 <- scale(cluster5) ##deal with NAN values #scaled_country<-scaled_country[,which(is.nan(scaled_country[1,])==FALSE)] dis_SC5 <- dist(scaled_cluster5) H_clust_SC5 <- hclust(dis_SC5) library("factoextra") library("FactoMineR") H_clust_SC5 <- eclust(scaled_cluster5, k=5, "hclust") fviz_dend(H_clust_SC5, rect = TRUE, cex=0.5) # fviz_dend(H_clust_SC5, lwd=2, rect = TRUE) # ST 378 cluster5_378_ST <- X378[, which(colnames(X378) %in% row.names(showfeatures(fitLASSO, chooselambda(cvLASSO,1), 10)))] rownames(cluster5_378_ST) <- countryNames # countryinfo #1. hierarchical clustering scaled_cluster5_378_ST <- scale(cluster5_378_ST) dis_SC5_378_ST <- dist(scaled_cluster5_378_ST) H_clust_SC5_378_ST <- hclust(dis_SC5_378_ST) H_clust_SC5_378_ST <- eclust(scaled_cluster5_378_ST, k=5, "hclust") fviz_dend(H_clust_SC5_378_ST,rect = TRUE, cex=0.5) ``` #### Spacekime - Nil-Phase - ** 2.1 Lasso features selection** - ** 2.2 Comparison of different ML algorithms of different feature numbers** - ** 2.3 Clustering** #### Spacekime - Swapped-Phase - ** 3.1 Lasso features selection** ```{r warning=F, error=F} dim(IFT_SwappedPhase_FT_aggregate_arima_vector) # [1] 31 387 eudata_SwappedPhase <- IFT_SwappedPhase_FT_aggregate_arima_vector colnames(eudata_SwappedPhase) <- c("country", colnames(eudata_SwappedPhase[,-1])) eudata_SwappedPhase <- as.data.frame(eudata_SwappedPhase[ , -ncol(eudata_SwappedPhase)]) Y <- as.data.frame(IFT_SwappedPhase_FT_aggregate_arima_vector)$OA # Complete data 386 features (378 + 8) X <- eudata_SwappedPhase countryinfo<-as.character(X[,1]) countryinfo[11]<-"Germany" X<-X[,-1] keeplist<-NULL for (i in 1:ncol(X)) { if(FALSE %in% (X[,i]==mean(X[,i]))) {keeplist<-c(keeplist,i)} } X<-X[,keeplist]; dim(X) # 31 343 # Reduced to 378 features # TS-derivative features only (378) # X378 <- X[, -c(379:386)]; dim(X378) #countryinfo<-as.character(X378[,1]) #countryinfo[11]<-"Germany" #X378<-X378[,-1] #keeplist<-NULL #for (i in 1:ncol(X378)) { # if(FALSE %in% (X378[,i]==mean(X378[,i]))) {keeplist<-c(keeplist,i)} #} #X378<-X378[,keeplist]; dim(X378) library(glmnet) fitLASSO_X <- glmnet(as.matrix(X), Y, alpha = 1) library(doParallel) registerDoParallel(5) #cross-validation cvLASSO_X = cv.glmnet(data.matrix(X), Y, alpha = 1, parallel=TRUE) # fitLASSO_X <- glmnet(as.matrix(X378), Y, alpha = 1) #library(doParallel) #registerDoParallel(5) #cross-validation #cvLASSO_X = cv.glmnet(data.matrix(X378), Y, alpha = 1, parallel=TRUE) # To choose features we like to have based on lasso chooselambda <- function(cvlasso, option, k=NULL) { lambmat<-cbind(cvlasso$glmnet.fit$df,cvlasso$glmnet.fit$lambda) result<-tapply(lambmat[,2],lambmat[,1],max) kresult<-result[which(names(result)==as.factor(k))] if(option==1) {return(result)} else if (option==2) {return(kresult)} else (return("Not a valid option")) } showfeatures <- function(object, lambda, k ,...) { lam<-lambda[which(names(lambda)==as.factor(k))] beta <- predict(object, s = lam, type = "coef") if(is.list(beta)) { out <- do.call("cbind", lapply(beta, function(x) x[,1])) out <- as.data.frame(out) s <- rowSums(out) out <- out[which(s)!=0,,drop=FALSE] } else {out<-data.frame(Overall = beta[,1]) out<-out[which(out!=0),,drop=FALSE] } out <- abs(out[rownames(out) != "(Intercept)",,drop = FALSE]) out } ``` - ** 3.2 Comparison of different ML algorithms of different feature numbers** ```{r message=F, error=F, warning=F} #test training data setup randchoose <- function(matr) { leng<-nrow(matr) se<-seq(1:leng) sam<-sample(se,as.integer(leng*0.6)) return(sam) } Xsample <- X Xsample$Rank <- as.factor(ifelse(Y<30, 1, 0)) set.seed(1234) Xtrain <- Xsample[randchoose(Xsample), ] set.seed(1234) Xtest <- Xsample[-randchoose(Xsample), ] #Xsample378 <- X378 #Xsample378$Rank <- as.factor(ifelse(Y<30, 1, 0)) #set.seed(1234) #Xtrain378 <- Xsample378[randchoose(Xsample378), ] #set.seed(1234) #Xtest378 <- Xsample378[-randchoose(Xsample378), ] # Load Libraries library(e1071) library("randomForest") library(ada); library(adabag) library(caret) library(kernlab) library(cluster) library(ipred) library(ggplot2) # resultXframe <- MLcomp(fitLASSO, cvLASSO, Xtrain, Xtest, 1) MLcompX <- function(fitlas, cvlas, trn, test, option=1) { allfeat<-as.numeric(names(chooselambda(cvlasso = cvlas, option = 1))) allfeat<-allfeat[which(allfeat>4)] trainlist<-as.list(NULL) for (i in 1:length(allfeat)) { trainlist[[i]]<-trn[,which(colnames(trn) %in% c(row.names(showfeatures(fitlas, chooselambda(cvlas = cvlas,1), allfeat[i])), "Rank"))] } resultXframe<-data.frame(origin=rep(NA,length(allfeat))) rownames(resultXframe)<-allfeat resultXframe$Decision_tree_bagging<-rep(NA,length(allfeat)) for (i in 1:length(allfeat)) { #ERROR HANDLING possibleError <- tryCatch( function () { set.seed(1234) Xbag<-ipred::bagging(Rank~ . ,data = trainlist[[i]], nbagg=100, control=rpart.control(minsplit=2, cp=0.1, xval=10)) bagtest<-predict(Xbag, Xtest) bagagg<-bagtest==Xtest$Rank accuracy<-prop.table(table(bagagg))[c("TRUE")] resultXframe$Decision_tree_bagging[i]<-accuracy }, error=function(e) e ) if(inherits(possibleError, "error")) next # print(i) } resultXframe$Random_forest<-rep(NA,length(allfeat)) for (i in 1:length(allfeat)) { set.seed(1234) Xrf<-randomForest(Rank~.,data=trainlist[[i]]) rftest<-predict(Xrf,test) rfagg<-rftest==test$Rank accuracy<-prop.table(table(rfagg))[c("TRUE")] resultXframe$Random_forest[i]<-accuracy } resultXframe$Decision_tree_adaboost<-rep(NA,length(allfeat)) for (i in 1:length(allfeat)) { set.seed(1234) Xada<-ada(Rank~.,data = trainlist[[i]],iter=50) adatest<-predict(Xada,test) adaagg<-adatest==test$Rank accuracy<-prop.table(table(adaagg))[c("TRUE")] resultXframe$Decision_tree_adaboost[i]<-accuracy } resultXframe$GLM<-rep(NA,length(allfeat)) for (i in 1:length(allfeat)) { euglm<-glm(Rank~.,data = trainlist[[i]],family = "binomial") glmtest<-predict(euglm,test) glmtest<-ifelse(glmtest<0,0,1) glmagg<-glmtest==test$Rank accuracy<-prop.table(table(glmagg))[c("TRUE")] resultXframe$GLM[i]<-accuracy } resultXframe$SVM_best_Gamma_Cost<-rep(NA,length(allfeat)) for (i in 1:length(allfeat)) { set.seed(1234) svmtune<-tune.svm(Rank~.,data = trainlist[[i]],gamma = 10^(-6:1),cost = 10^(-10:10)) svmed<-svm(Rank~.,data=trainlist[[i]],gamma=svmtune$best.parameters[1],cost=svmtune$best.parameters[2]) svmtest<-predict(svmed,test) svmagg<-svmtest==test$Rank accuracy<-prop.table(table(svmagg))[c("TRUE")] resultXframe$SVM_best_Gamma_Cost[i]<-accuracy } resultXframe$origin<-NULL if(option==1){return(resultXframe)} } resultXframe <- MLcompX(fitLASSO_X, cvLASSO_X, Xtrain, Xtest, 1) resultXframe_386_SK_Swapped <- resultXframe # View(resultXframe_386_SK_Swapped) # resultXframe_378_SK_Swapped <- MLcompX(fitLASSO_X, cvLASSO_X, Xtrain378, Xtest378, 1) # Display results resultXframe$features<-as.factor(as.numeric(rownames(resultXframe))) ppframeX<-data.frame(NULL) for (i in 1:5) { FM <- data.frame(resultXframe[,i], resultXframe$features, Methods<-rep(colnames(resultXframe)[i], nrow(resultXframe))) ppframeX<-rbind(ppframeX, FM) } colnames(ppframeX)<-c("Accuracy", "Features", "Methods") ggplot(ppframeX, aes(x=Features, y=Accuracy, colour=Methods, group=Methods, shape=Methods))+ geom_line(position=position_dodge(0.2), lwd=2)+ ylim(0.2, 1.0) + geom_point(size=5, position=position_dodge(0.2))+ theme(legend.position="top", legend.text=element_text(size=16))+ ggtitle("Spacekime Swapped-Phases (386 features): Compare ML Forecasting Results")+ theme( axis.text=element_text(size=16), plot.title = element_text(size=18, face="bold.italic"), axis.title.x = element_text(size=14, face="bold"), axis.title.y = element_text(size=14, face="bold")) # spacetime (ST) 378_ST resultframe_378_ST$features<-as.factor(as.numeric(rownames(resultframe_378_ST))) ppframe_378_ST<-data.frame(NULL) for (i in 1:5) { FM_378_ST <- data.frame(resultframe_378_ST[,i], resultframe_378_ST$features, Methods<-rep(colnames(resultframe_378_ST)[i], nrow(resultframe_378_ST))) ppframe_378_ST<-rbind(ppframe_378_ST, FM_378_ST) } colnames(ppframe_378_ST)<-c("Accuracy", "Features", "Methods") ggplot(ppframe_378_ST, aes(x=Features, y=Accuracy, colour=Methods, group=Methods, shape=Methods))+ geom_line(position=position_dodge(0.2), lwd=2)+ ylim(0.2, 1.0) + geom_point(size=5, position=position_dodge(0.2))+ theme(legend.position="top", legend.text=element_text(size=16))+ ggtitle("Spacetime (386 features): Compare ML Forecasting Results")+ theme( axis.text=element_text(size=16), plot.title = element_text(size=18, face="bold.italic"), axis.title.x = element_text(size=14, face="bold"), axis.title.y = element_text(size=14, face="bold")) ##################### for resultXframe_378_SK_Swapped resultXframe_378_SK_Swapped$features<-as.factor(as.numeric(rownames(resultXframe_378_SK_Swapped))) ppframeX<-data.frame(NULL) for (i in 1:5) { FM <- data.frame(resultXframe_378_SK_Swapped[, i], resultXframe_378_SK_Swapped$features, Methods<-rep(colnames(resultXframe_378_SK_Swapped)[i], nrow(resultXframe_378_SK_Swapped))) ppframeX<-rbind(ppframeX, FM) } colnames(ppframeX)<-c("Accuracy", "Features", "Methods") ggplot(ppframeX, aes(x=Features, y=Accuracy, colour=Methods, group=Methods, shape=Methods))+ geom_line(position=position_dodge(0.2), lwd=2)+ ylim(0.2, 1.0) + geom_point(size=5, position=position_dodge(0.2))+ theme(legend.position="top", legend.text=element_text(size=16))+ ggtitle("Spacekime Swapped-Phases (386 features): Compare ML Forecasting Results")+ theme( axis.text=element_text(size=16), plot.title = element_text(size=18, face="bold.italic"), axis.title.x = element_text(size=14, face="bold"), axis.title.y = element_text(size=14, face="bold")) ``` - ** 3.3 Clustering** ```{r message=F, error=F, warning=F} showfeatures(fitLASSO_X, chooselambda(cvLASSO_X, 1), 10) feat_5 <- predict(fitLASSO_X, s = chooselambda(cvLASSO_X, 2, 10), newx = data.matrix(X)) df1 <- as.data.frame(rbind(as.numeric(feat_5), Y), row.names = c("Predicted Rank","OA Rank")) colnames(df1) <- countryNames df1 # View(t(df1)) # Clustering cluster5 <- X[, which(colnames(X) %in% row.names(showfeatures(fitLASSO_X, chooselambda(cvLASSO_X, 1), 10)))] rownames(cluster5) <- countryNames # countryinfo #1. hierarchical clustering scaled_cluster5 <- scale(cluster5) ##deal with NAN values #scaled_country<-scaled_country[,which(is.nan(scaled_country[1,])==FALSE)] dis_SC5 <- dist(scaled_cluster5) H_clust_SC5 <- hclust(dis_SC5) library("factoextra") library("FactoMineR") H_clust_SC5 <- eclust(scaled_cluster5, k=5, "hclust") fviz_dend(H_clust_SC5, rect = TRUE, cex=0.5) # fviz_dend(H_clust_SC5, lwd=2, rect = TRUE) # ST 378 cluster5_378_SK <- X378[, which(colnames(X378) %in% row.names(showfeatures(fitLASSO_X, chooselambda(cvLASSO_X, 1), 10)))] rownames(cluster5_378_SK) <- countryNames # countryinfo #1. hierarchical clustering scaled_cluster5_378_SK <- scale(cluster5_378_SK) dis_SC5_378_SK <- dist(scaled_cluster5_378_SK) H_clust_SC5_378_SK <- hclust(dis_SC5_378_SK) H_clust_SC5_378_SK <- eclust(scaled_cluster5_378_SK, k=5, "hclust") fviz_dend(H_clust_SC5_378_SK,rect = TRUE, cex=0.5) ``` # Save Workspace ```{r message=F, error=F, warning=F} # Save this entire Computed Workspace as an image: # save.image("C:/Users/dinov/Desktop/Ivo.dir/Research/UMichigan/Publications_Books/2018/others/4D_Time_Space_Book_Ideas/ARIMAX_EU_DataAnalytics/EU_Econ_SpaceKime.RData") ```
SOCR Resource Visitor number Web Analytics SOCR Email