SOCR ≫ TCIU Website ≫ TCIU GitHub ≫

4 Fit the ARIMA model for each country

4.1 Belgium

Extract Belgium data.

Manually clean (preprocess) the Belgium data and fit ARIMAX model \[Y = BelguimSuperSample\$'Unemployment,\ Females,\ From\ 15-64\ years,\ Total',\]

\[X = XReg\ (all\ other\ covariates\ -starts\_with('Unemployment')).\]

#super sample the dataset
cleardata <- function(mat) {
  for (i in 1:ncol(mat)) {
    mat[is.na(mat[,i]),i]<-mean(mat[,i],na.rm = T) + rnorm(sum(is.na(mat[,i])),sd = sd(mat[,i],na.rm = T))
  }
  return(mat)
}

#use spline regression model to expand the dataset
splinecreate <- function(mat) {
  res<-NULL
  for (i in 1:ncol(mat)) {
    sp<-smooth.spline(seq(1:72),mat[,i])
    spresult<-predict(sp,seq(from=0,by=1/5,length.out = 360))
    spfeat<-spresult$y+rnorm(360,sd=sd(mat[,i]))
    res<-cbind(res,spfeat)
  }
  colnames(res)<-colnames(mat)
  return(res)
}
BelguimMatrix = as.matrix(belgium)
BelguimMatrix = cleardata(BelguimMatrix)
BelguimMatrix = splinecreate(BelguimMatrix)
BelguimSuperSample = as.data.frame(BelguimMatrix)

#############################
# Outcome Features to examine, predict, forecast, spacekime-analyze
#   "Gross domestic product at market prices"
#   "Unemployment , Females, From 15-64 years, Total"
#   "Capital transfers, payable"       
#   "Capital transfers, receivable"
#   "Debt securities"
#   "Government consolidated gross debt"
#   ... View(colnames(BelguimMatrix))
############################# 

################################################
#    "Unemployment , Females, From 15-64 years, Total"
Y = dplyr::select(BelguimSuperSample, "Unemployment , Females, From 15-64 years, Total"); dim(Y)
X = dplyr::select(BelguimSuperSample, -starts_with("Unemployment")); dim(X)
fitArimaX = auto.arima(Y, xreg=data.matrix(X[,-112])); fitArimaX$arma

################################################
#    "Government consolidated gross debt"
#Y = select(BelguimSuperSample, "Government consolidated gross debt"); dim(Y)
#X = select(BelguimSuperSample, -matches("debt|Debt")); dim(X)
# X_scale <- scale(X); Matrix::rankMatrix(X_scale); any(is.na(X_scale)); any(is.infinite(X_scale))
#     remove columns with infinite or missing value  
# X_scale_1 <- X_scale[ , !is.infinite(colSums(X_scale)) & !is.na(colSums(X_scale))]
#X_1 <- X[ , !is.infinite(colSums(X)) & !is.na(colSums(X))]; dim(X_1); Matrix::rankMatrix(X_1)
#X_2 <- X_1[, qr(X_1)$pivot[seq_len(qr(X_1)$rank)]]; dim(X_2)
# 
#fitArimaX = auto.arima(Y, xreg=X_2, method = "CSS", # "SANN", method = "CSS", optim.method = "BFGS"
#          optim.control=list(maxit = 20000, temp = 20), optim.method = "BFGS")
#fitArimaX; View(sort(fitArimaX$coef)[1:10]); fitArimaX$arma

################################################
#   "Gross domestic product at market prices"
Y = select(BelguimSuperSample, "Gross domestic product at market prices"); dim(Y)
X = select(BelguimSuperSample, -matches("debt|Debt")); dim(X)  # 360 167
X <- X[, qr(X)$pivot[seq_len(qr(X)$rank)]]; dim(X)
ts_Y <- ts(Y, start=c(2000,1), end=c(2017, 20), frequency = 20); length(ts_Y)
set.seed(1234)
fitArimaX = auto.arima(ts_Y, xreg=data.matrix(X[ , -c(50:60)])); fitArimaX$arma
# 5  0  2  0 20  0  0
# sigma^2 estimated as 57.04:  log likelihood=-1132.78 AIC=2593.55   AICc=2871.09   BIC=3230.88
pred_arimaX_5_0_2_Y_Belgium_train300_Belgium_test60 <- 
predict(fitArimaX, n.ahead = 60, newxreg = data.matrix(X[301:360 , -c(50:60)]))$pred

plot(forecast(fitArimaX, xreg = data.matrix(X[301:360 , -c(50:60)])),         # ARIMA forecast
     include=120, lwd=4, lty=3, xlab="Time", ylab="GDP", ylim=c(50, 150),
     main = "ARIMAX Analytics (Train: 2000-2017; Test: 2018-2020) GDP Forecasting\n
      based on fitting ARIMAX Models on raw (spline interpolated) Belgium data")
 lines(pred_arimaX_5_0_2_Y_Belgium_train300_Belgium_test60, col = "red", lwd = 4, lty=3) 
 legend("topleft", bty="n", legend=c("Belgium Training Data (2000-2017)", 
                        "ARIMAX(5,0,2)-model GDP Forecasting (2018-2020)",
                        "ARIMAX(5,0,2)-model GDP Forecasting (2018-2020)"),
       col=c("black", "blue", "red"), 
       lty=c(3,3,3), lwd=c(4,4,4), cex=1.2, x.intersp=1.5, y.intersp=0.6)
text(2015, 60, expression(atop(paste("Training Region (2000-2017)"), 
                paste(Model(Unempl) %->% "ARIMAX(p, q, r) ;  ", 
                      XReg %==% X[i], " ", i %in% {1 : 167}))), cex=1.5)
text(2019.5, 60, expression(atop(paste("Validation Region (2018-2020)"), 
        paste(hat(Unempl) %<-% "ARIMAX(5,0 ,2); ", 
              XReg %==% X[i], " ", i %in% {1 : 167}))), cex=1.5)
  • GDP Description: GDP (gross domestic product) is an indicator for a nation?s economic situation. It reflects the total value of all goods and services produced less the value of goods and services used for intermediate consumption in their production. Expressing GDP in PPS (purchasing power standards) eliminates differences in price levels between countries, and calculations on a per head basis allows for the comparison of economies significantly different in absolute size.

GDP unit of measure represents the Current prices, euro per capita. The volume index of GDP per capita in Purchasing Power Standards (PPS) is intended for cross-country comparisons rather than for temporal comparisons. GDP per capita when expressed in PPS eliminates the differences in price levels between countries allowing meaningful volume comparisons of GDP between countries. Expressed in relation to the European Union (EU27 GDP = 100), a country with an index that is higher than \(100\) or lower than \(100\) correspond to that this country’s level of GDP per head being higher or lower than the EU average, respectively.

4.2 Bulgaria

Define a new function that (1) cleans the data, and (2) fits the ARIMA model estimating the seasonal and non-seasonal time-series parameters \((p,d,q)\) and the effect-sizes (\(\beta\)’s) for the exogenous regression features (\(X\)).

#write a function of clean the data and fit the ARIMA model
Fit_ARIMA <- function(countryData=Belgium, start=2000, end=2017, frequency=20,
                     feature="Unemployment , Females, From 15-64 years, Total")
{
  #delete features that are missing at all time points
  countryData = countryData[, colSums(is.na(countryData)) != nrow(countryData)]
  countryData = select(countryData, -time, -country)
  DataMatrix = as.matrix(countryData)
  DataMatrix = cleardata(DataMatrix)
  DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove feature that only has one value
  DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove feature that all the values are 0
  DataMatrix = splinecreate(DataMatrix)
  DataSuperSample = as.data.frame(DataMatrix)
  if (feature=="Unemployment , Females, From 15-64 years, Total") {
      Y = select(DataSuperSample, "Unemployment , Females, From 15-64 years, Total")
      X = select(DataSuperSample, -starts_with("Unemployment"))
  } else if (feature=="Gross domestic product at market prices") {
    Y = select(DataSuperSample, "Gross domestic product at market prices"); dim(Y)
    X = select(DataSuperSample, -matches("debt|Debt")); dim(X)  # 360 167
    print(paste0("dim(X)=(", dim(X)[1], ",", dim(X)[2], ");  ",
                 " dim(Y)=(", dim(Y)[1], ",", dim(Y)[2], ") ..."))
    X <- X[, qr(X)$pivot[seq_len(qr(X)$rank)]]; dim(X)  # ensure full-rank design matrix, X
  }
  else {
    print(paste0("This feature ", feature, " is not imlemented yet! Exiting Fit_ARIMA() method ...")) 
    return(NULL)
  }
  ts_Y <- ts(Y, start=c(start, 1), end=c(end, frequency), frequency = frequency); length(ts_Y)
  set.seed(1234)
  fitArimaX = auto.arima(ts_Y, xreg=data.matrix(X))
  return(fitArimaX)
}
Bulgaria = filter(time_series,country == "Bulgaria")
BulgariaARIMA = Fit_ARIMA(countryData=Bulgaria, start=2000, end=2017, frequency=20, 
                     feature="Gross domestic product at market prices")
BulgariaARIMA$arma

# Extend the Fit-ARIMA method to ensure testing-training modeling/assessment for 2 countries works
preprocess_ARIMA <- function(countryData=Belgium, start=2000, end=2017, frequency=20,
                     feature="Unemployment , Females, From 15-64 years, Total")
{
  #delete features that are missing at all time points
  countryData = countryData[, colSums(is.na(countryData)) != nrow(countryData)]
  countryData = select(countryData, -time, -country)
  DataMatrix = as.matrix(countryData)
  DataMatrix = cleardata(DataMatrix)
  DataMatrix = DataMatrix[ , colSums(is.na(DataMatrix)) == 0] # remove features with only 1 value
  DataMatrix = DataMatrix[ , colSums(DataMatrix) != 0] # remove features with all values=0
  DataMatrix = splinecreate(DataMatrix)
  DataSuperSample = as.data.frame(DataMatrix) # super-Sample the data
  print(paste0("Processing feature: ...", feature, "... "))
      
  if (feature=="Unemployment , Females, From 15-64 years, Total") {
      Y = select(DataSuperSample, "Unemployment , Females, From 15-64 years, Total")
      X = select(DataSuperSample, -starts_with("Unemployment"))
  } else if (feature=="Gross domestic product at market prices") {
      Y = select(DataSuperSample, "Gross domestic product at market prices"); dim(Y)
      X = select(DataSuperSample, -matches("debt|Debt")); 
      X <- X [, -c(50:80)]; dim(X)  # 360 167
  } else {
    print(paste0("This feature: ...", feature, "... is not imlemented yet! Exiting preprocess_ARIMA() method ...")) 
    return(NULL)
  }
  
  # reduce the number of observations (matrix rows) to specified timerange
  len_1 <- (end + 1 - start) * frequency; print(paste0("dim(X)[1]=", len_1))
  X <- X[1:len_1 , qr(X[1:len_1 , ])$pivot[seq_len(qr(X[1:len_1 , ])$rank)]]; dim(X)  
  # ensure full-rank design matrix, X
  Y <- as.data.frame(Y[1:len_1 , ])
  print(paste0("dim(X)=(", dim(X)[1], ",", dim(X)[2], ");  ",         # 300 136
                 " dim(Y)=(", dim(Y)[1], ",", dim(Y)[2], ") ..."))    # 300 1
  return(list("X"=X, "Y"=Y))
}

# Outcome Variable to be modelled, as a timeseries: 2000 - 2017 (18 years, Quarterly measures)
# Spline interpolation *5;   2000-01 - 2014-20 (300 observations for training): 60 observations (2015-2017) for Testing

Belgium  <- filter(time_series, country == "Belgium")
Bulgaria <- filter(time_series, country == "Bulgaria")
Netherlands <- filter(time_series, country == "Netherlands")
# Test preprocess_ARIMA()
#preprocess_Belgium <- preprocess_ARIMA(countryData=Belgium, start=2000, end=2014,
#                frequency=20, feature="Gross domestic product at market prices")
#preprocess_Bulgaria <- preprocess_ARIMA(countryData=Bulgaria, start=2000, end=2014,
#                frequency=20, feature="Gross domestic product at market prices")


# General function that ensures the XReg predictors for 2 countries are homologous
homologousX_features <- function (X_Country1, X_Country2){
  # Check if the Belgium and Bilgaria Xreg are homologous (same feature columns)
  common_cols <- intersect(colnames(X_Country1), colnames(X_Country2))
  X_Country1 <- subset(X_Country1, select = common_cols)
  X_Country2 <- subset(X_Country2, select = common_cols)
  print(paste0("dim(X1)=(", dim(X_Country1)[1], ",", dim(X_Country1)[2], ");  ", # 300 131
              " dim(X2)=(", dim(X_Country2)[1], ",", dim(X_Country2)[2], ")!"))  # 300 131
  return(list("X_Country1"=X_Country1, "X_Country2"=X_Country2))
}
# Test homologousX_features
# homoFeat <- homologousX_features(preprocess_Belgium$X, preprocess_Bulgaria$X)
# X_Belgium  <- homoFeat$X_Country1
# X_Bulgaria <- homoFeat$X_Country2

fit_ARIMA <- function(country1Data=Belgium, country2Data=Bulgaria,