SOCR ≫ TCIU Website ≫ TCIU GitHub ≫

1 Exegeneous Feature Time-series analysis

library(forecast)
library(glmnet)
library(arm)
library(knitr) 
library(doParallel)

1.1 Figure 6.7

We can use the UCI ML Air Quality Dataset to demonstrate the effect of kime-direction on the analysis of the longitudinal data. The Air Quality data consists of \(9358\) hourly-averaged responses from an array of 5 sensors embedded in an Air Quality Chemical Multisensor Device. These measurements were obtained in a significantly polluted area during a one year period (March 2004 to February 2005). The features include Concentrations for CO, Non Metanic Hydrocarbons, Benzene, Total Nitrogen Oxides (NOx), and Nitrogen Dioxide (NO2).

The attributes in the CSV file include:

  • Date (DD/MM/YYYY)
  • Time (HH.MM.SS)
  • True hourly averaged concentration CO in mg/m^3 (reference analyzer)
  • PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted)
  • True hourly averaged overall Non Metanic Hydro-carbons concentration in microg/m^3 (reference analyzer)
  • True hourly averaged Benzene concentration in microg/m^3 (reference analyzer)
  • PT08.S2 (Titania) hourly averaged sensor response (nominally NMHC targeted)
  • True hourly averaged NOx concentration in ppb (reference analyzer)
  • PT08.S3 (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)
  • True hourly averaged NO2 concentration in microg/m^3 (reference analyzer)
  • PT08.S4 (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)
  • PT08.S5 (indium oxide) hourly averaged sensor response (nominally O3 targeted)
  • Temperature in ?C
  • Relative Humidity (%)
  • AH Absolute Humidity
aqi_data <- read.csv("https://umich.instructure.com/files/8208336/download?download_frd=1")
summary(aqi_data)
##      Date               Time               CO.GT.         PT08.S1.CO.  
##  Length:9471        Length:9471        Min.   :-200.00   Min.   :-200  
##  Class :character   Class :character   1st Qu.:   0.60   1st Qu.: 921  
##  Mode  :character   Mode  :character   Median :   1.50   Median :1053  
##                                        Mean   : -34.21   Mean   :1049  
##                                        3rd Qu.:   2.60   3rd Qu.:1221  
##                                        Max.   :  11.90   Max.   :2040  
##                                        NA's   :114       NA's   :114   
##     NMHC.GT.         C6H6.GT.        PT08.S2.NMHC.       NOx.GT.      
##  Min.   :-200.0   Min.   :-200.000   Min.   :-200.0   Min.   :-200.0  
##  1st Qu.:-200.0   1st Qu.:   4.000   1st Qu.: 711.0   1st Qu.:  50.0  
##  Median :-200.0   Median :   7.900   Median : 895.0   Median : 141.0  
##  Mean   :-159.1   Mean   :   1.866   Mean   : 894.6   Mean   : 168.6  
##  3rd Qu.:-200.0   3rd Qu.:  13.600   3rd Qu.:1105.0   3rd Qu.: 284.0  
##  Max.   :1189.0   Max.   :  63.700   Max.   :2214.0   Max.   :1479.0  
##  NA's   :114      NA's   :114        NA's   :114      NA's   :114     
##   PT08.S3.NOx.     NO2.GT.         PT08.S4.NO2.   PT08.S5.O3.    
##  Min.   :-200   Min.   :-200.00   Min.   :-200   Min.   :-200.0  
##  1st Qu.: 637   1st Qu.:  53.00   1st Qu.:1185   1st Qu.: 700.0  
##  Median : 794   Median :  96.00   Median :1446   Median : 942.0  
##  Mean   : 795   Mean   :  58.15   Mean   :1391   Mean   : 975.1  
##  3rd Qu.: 960   3rd Qu.: 133.00   3rd Qu.:1662   3rd Qu.:1255.0  
##  Max.   :2683   Max.   : 340.00   Max.   :2775   Max.   :2523.0  
##  NA's   :114    NA's   :114       NA's   :114    NA's   :114     
##        T                  RH                AH               X          
##  Min.   :-200.000   Min.   :-200.00   Min.   :-200.0000   Mode:logical  
##  1st Qu.:  10.900   1st Qu.:  34.10   1st Qu.:   0.6923   NA's:9471     
##  Median :  17.200   Median :  48.60   Median :   0.9768                 
##  Mean   :   9.778   Mean   :  39.49   Mean   :  -6.8376                 
##  3rd Qu.:  24.100   3rd Qu.:  61.90   3rd Qu.:   1.2962                 
##  Max.   :  44.600   Max.   :  88.70   Max.   :   2.2310                 
##  NA's   :114        NA's   :114       NA's   :114                       
##    X.1         
##  Mode:logical  
##  NA's:9471     
##                
##                
##                
##                
## 
aqi_data.ts <- ts(aqi_data, start=c(2004,3), freq=24) # hourly sampling rate

# set up training and testing time-periods
alltrain.ts <- window(aqi_data.ts, end=c(2004,3))
allvalid.ts <- window(aqi_data.ts, start=c(2005,1))





# Estimate the ARIMAX model

fitArimaX <- auto.arima(aqi_data$CO.GT., xreg= aqi_data[ , 
                                  c("PT08.S1.CO.", "NMHC.GT.", "C6H6.GT.", "PT08.S2.NMHC.",
                                    "NOx.GT.", "PT08.S3.NOx.", "NO2.GT.", "PT08.S4.NO2.",
                                    "PT08.S5.O3.", "T", "RH", "AH")] %>% as.matrix()
                          )
fitArimaX
## Series: aqi_data$CO.GT. 
## Regression with ARIMA(1,0,1) errors 
## 
## Coefficients:
##          ar1      ma1  intercept  PT08.S1.CO.  NMHC.GT.  C6H6.GT.
##       0.9758  -0.5825   -40.4356       0.0056    0.0452   -2.9162
## s.e.  0.0025   0.0099    22.5593       0.0120    0.0071    0.5165
##       PT08.S2.NMHC.  NOx.GT.  PT08.S3.NOx.  NO2.GT.  PT08.S4.NO2.  PT08.S5.O3.
##              0.1017   0.0363       -0.0157   0.0091       -0.0196      -0.0091
## s.e.         0.0188   0.0057        0.0073   0.0098        0.0095       0.0044
##             T       RH      AH
##       -0.3853  -0.2147  3.4773
## s.e.   0.3318   0.1219  0.6934
## 
## sigma^2 estimated as 1202:  log likelihood=-46447.71
## AIC=92927.43   AICc=92927.49   BIC=93041.73
# Predict prospective CO concentration
pred_length <- 24*30 # 1 month forward forecasting
predArrivals <- predict(fitArimaX, n.ahead = pred_length, 
                        newxreg = aqi_data[c((9471-pred_length+1):9471), c(4:15)])
#plot(predArrivals$pred, main="Forward time-series predictions (fitArimaX)")
plot(forecast(fitArimaX, 
              xreg = as.matrix( aqi_data[c((9471-pred_length+1):9471), c(4:15)]) ) )

We can first explore the time-course harmonics of the data.

aqi_data[is.na(aqi_data)] <- 0
y     <- aqi_data$PT08.S1.CO.
t     <- 1:dim(aqi_data)[1]
# range <- diff(range(y))

# Compute the spectral decomposition (harmonics)
ff_harmonics = function(x=NULL, n=NULL, up=10L, plot=TRUE, add=F, main=NULL, ...) {
  # The discrete Fourier transformation
  dff = fft(x)
  # time
  t = seq(from = 1, to = length(x))
  
  # Upsampled time
  nt = seq(from = 1, to = length(x)+1-1/up, by = 1/up)
 
  #New spectrum
  ndff = array(data = 0