SOCR ≫ TCIU Website ≫ TCIU GitHub ≫

1 Data Preprocessing

#super sample the dataset
cleardata <- function(mat) {
  for (i in 1:ncol(mat)) {
    mat[is.na(mat[,i]),i]<-mean(mat[,i],na.rm = T) + rnorm(sum(is.na(mat[,i])),sd = sd(mat[,i],na.rm = T))
  }
  return(mat)
}
# 1. Find all "Common" freatures (highly-observed and congruent Econ indicators)
countryNames <- unique(time_series$country); length(countryNames); # countryNames
# initialize 3D array of DF's that will store the data for each of hte countries into a 2D frame
countryData <- list()  # countryData[[listID==Country]][1-time-72, 1-feature-197]

for (i in 1:length(countryNames)) {
  countryData[[i]] <- filter(time_series, country == countryNames[i])
}
# Check countryData[[2]][2, 3] == Belgium[2,3]

list_of_dfs_CommonFeatures <- list()  # list of data for supersampled countries 360 * 197

# 2. General function that ensures the XReg predictors for ALL 31 EU countries are homologous
completeHomologousX_features <- function (list_of_dfs) {
  # delete features that are missing at all time points
  for (j in 1:length(list_of_dfs)) {
    print(paste0("Pre-processing Country: ...", countryNames[j], "... "))
    data = list_of_dfs[[j]]
    data = data[ , colSums(is.na(data)) != nrow(data)]
    data = select(data, -time, -country)
    DataMatrix = as.matrix(data)
    DataMatrix = cleardata(DataMatrix)
    DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove features with only 1 value
    DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove features with all values=0
    # Supersample 72 --*5--> 360 timepoints 
    #DataMatrix = splinecreate(DataMatrix)
    DataSuperSample = as.data.frame(DataMatrix) # super-Sample the data
    # remove some of features  
    #DataSuperSample = DataSuperSample[, -c(50:80)]; dim(X)  # 360 167
    # ensure full-rank design matrix, DataSuperSample
    DataSuperSample <- 
      DataSuperSample[ , qr(DataSuperSample)$pivot[seq_len(qr(DataSuperSample)$rank)]]
    print(paste0("dim()=(", dim(DataSuperSample)[1], ",", dim(DataSuperSample)[2], ") ..."))
    # update the current DF/Country
    list_of_dfs_CommonFeatures[[j]] <- DataSuperSample
  }

  # Identify All Xreg features that are homologous (same feature columns) across All 31 countries
  # Identify Common Columns (freatures)
  comCol <- Reduce(intersect, lapply(list_of_dfs_CommonFeatures, colnames))
  list_of_dfs_CommonFeatures <- lapply(list_of_dfs_CommonFeatures, function(x) x[comCol])

  for (j in 1:length(list_of_dfs_CommonFeatures)) {
    list_of_dfs_CommonFeatures[[j]] <- subset(list_of_dfs_CommonFeatures[[j]], select = comCol)
    print(paste0("dim(", countryNames[j], ")=(", dim(list_of_dfs_CommonFeatures[[j]])[1], 
                 ",", dim(list_of_dfs_CommonFeatures[[j]])[2], ")!"))  # 72 * 197
  }
  return(list_of_dfs_CommonFeatures)
}
# Test completeHomologousX_features: dim(AllCountries)=(360,42)!
list_of_dfs_CommonFeatures <- completeHomologousX_features(countryData); 
length(list_of_dfs_CommonFeatures); dim(list_of_dfs_CommonFeatures[[1]]) # Austria data matrix 360*42

2 Laplace Transform

# create the LT
NuLT = function(datax, datay, inputz, k = 3, fitwarning = FALSE, mirror = FALSE, range = 2*pi) {
  
  datax = as.numeric(datax)
  datay = as.numeric(datay)

  n = length(datax)
  x1 = n/(n+0.5)*((datax-min(datax))/(max(datax)-min(datax)))*range
  
  if(mirror){
    x1 = c(x1,rev(2*range - x1))/2
    n = 2*n
    datay = c(datay, rev(datay))
    #plot(x1, datay)
  }
  
  #generate the coefficients in indefinite integral of t^n*exp(-zt)
  coef = 1;
  coefm = as.matrix(coef)
  for(i in 1:k){
    coefm = cbind(coefm,0)
    coef = c(coef*i,1)
    coefm = rbind(coefm,coef)
  }
  # these coefficients ordered by ^0, ^1, ^2, ... in column format
  
  # compute 1, z, z^2...,z^k
  zz = cbind(1,inputz)
  zt = inputz
  for (i in 2:k){
    zt = zt*inputz
    zz = cbind(zz,zt)
  }
  zd = zt*inputz
  
  # compute 1, x, x^2...,x^k
  tx = x1;
  xm = cbind(1,x1)
  for (i in 2:k){
    tx = tx*x1
    xm = cbind(xm,tx)
  }
  
  
  # sum over intervals
  result = 0*inputz
  ii = 1
  while(ii+k<=n)
  {
    A = xm[seq(ii,ii+k),c(0:k+1)]
    b = datay[seq(ii,ii+k)]
    # polyfit might be faster when using polynomial basis, while matrix inverse, `solve()`,
    # is the more general approach that works for any function basis
    polyc = as.numeric(solve(A,b))

  
    #ordered by ^0, ^1, ^2, ... in column format
    
    # Enter a new function variable qualityCheck=FALSE
    # check fit quality; this step can be skipped for speed/efficiency
    # if (qualityCheck) { .... }
    
    if (fitwarning){
      xx = seq(A[1,2],A[k+1,2],length.out = 100);
      yy = polyval(rev(polyc),xx)
      if(max(abs(yy-mean(b)))>2*max(abs(b-mean(b)))){
        print(c("Warning: Poor Fit at ",ii,", Largest Deviation is",max(abs(yy-mean(b)))))
        print(c("Spline Polynomial is", polyc),3);
        #print(c(polyval(rev(polyc),A[,2]),b))
        plot(xx, yy, main="Polynomial fit", ylab="", type="l", col="blue")
        lines(A[,2],b, col="red")
        legend("topleft",c("fit","data"),fill=c("blue","red"))
        print(" ")
      }
    }
    
    # Use vector/matrix operations to avoid looping, 
    # some of the expressions look weird
    # May need to actually compare the efficiency/speed of
    # vector based vs. standard numeric calculations
    
    m1 = t(t(polyc*coefm)*A[1,])
    m11 = as.numeric(tapply(m1, col(m1)-row(m1), sum))[0:k+1] 
    
    m2 = t(t(polyc*coefm)*A[k+1,])
    m22 = as.numeric(tapply(m2, col(m2)-row(m2), sum))[0:k+1]
    
    intgl = (exp(-inputz*A[1,2])*colSums(t(zz)*m11)-
               exp(-inputz*A[k+1,2])*colSums(t(zz)*m22))/zd
    result = result+intgl
    ii=ii+k
  }
  
  # Computations over the last interval
  if(ii<n){
    nk = n-ii;
    A = xm[seq(ii,ii+nk),c(0:nk+1)]
    b = datay[seq(ii,ii+nk)]
    nc = as.numeric(solve(A,b))
    nc = c(nc,seq(0,0,length.out = k-nk))
    
    A = xm[seq(ii,ii+nk),]
    m1 = t(t(nc*coefm)*A[1,])
    m11 = as.numeric(tapply(m1, col(m1)-row(m1), sum))[0:k+1]
    
    m2 = t(t(nc*coefm)*A[nk+1,])
    m22 = as.numeric(tapply(m2, col(m2)-row(m2), sum))[0:k+1]
    
    # cc = colSums(coefm*polyc)
    intgl = (exp(-inputz*A[1,2])*colSums(t(zz)*m11)-
               exp(-inputz*A[nk+1,2])*colSums(t(zz)*m22))/zd
    result = result+intgl
  }
  #offset = 0.05*pi
  #result = result + datay[n]*(exp(-2*pi*inputz)-exp(-(2*pi+offset)*inputz))/inputz
  return(result)
}

3 Kimesurface of GDP for 30 EU countries

4 Inverse Laplace Transform