--- title: "Spacekime Analytics (Time Complexity and Inferential Uncertainty)" subtitle: "Exegeneous Feature Time-series analysis" author: "SOCR Team " date: "`r format(Sys.time(),'%m/%d/%Y')`" output: html_document: theme: spacelab highlight: tango includes: before_body: TCIU_header.html toc: true number_sections: true toc_depth: 2 toc_float: collapsed: false smooth_scroll: true code_folding: hide --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, warings = FALSE) ``` ```{r kime, message=F, warning=F} # Get this figure: fig <- get_figure("MattSundquist", 4064) # Get this figure's data: data <- get_figure("MattSundquist", 4064)$data # Add data to this figure: p <- add_trace(p, x=c(4, 5), y=c(4, 5), kwargs=list(filename="Klein bottle", fileopt="extend")) # Get y data of first trace: y1 <- get_figure("MattSundquist", 4064)$data[[1]]$y library(plotly) ``` # Background The function `fftshift()` is useful for visualizing the Fourier transform with the zero-frequency component in the middle of the spectrum. Its inverse counterpart, `ifftshift()`, is needed to rearrange again the indices appropriately after the IFT is employed, so that the image is correctly reconstructed in spacetime. The FT only computes half of the frequency spectrum corresponding to the non-negative (positive and zero if the `length(f)` is odd) frequencies in order to save computation time. To preserve the dimensions of the output $\hat{f}=FT(f)$, the second half of the frequency spectrum (the complex conjugate of the first half) is just added at the end of this vector. In a 1D setting, the result of `fft()` is: $0\ 1\ 2\ 3\ ...\ (freq\ bins > 0)\ ... {\frac{Fs}{2}}$ and $-{\frac{Fs}{2}}\ ... \ (freq\ bins < 0)\ ...\ -3\ -2\ -1.$ where $F_s$ is the frequency sample. The `fftshift` method sets the zero-frequency component in the center of the array, i.e., it just shifts (offsets) the second part with the negative frequency bins to the beginning and the first part to the end of the resulting FT vector, or matrix. Thus, the shifted discrete FT can be *nicely* plotted in the center covering the frequency spectrum from $-{\frac{Fs}{2}}$ on the left to ${\frac{Fs}{2}}$ on the right. This is not necessary, but is used for better visualization aesthetics. To synthesize back the correct image, after using `fftshift` on the FT signal, we always have to undo that re-indexing by using `ifftshift()` on the inverse-FT. ```{r message=F, warning=F} # FFT SHIFT fftshift <- function(img_ff, dim = -1) { rows <- dim(img_ff)[1] cols <- dim(img_ff)[2] # planes <- dim(img_ff)[3] swap_up_down <- function(img_ff) { rows_half <- ceiling(rows/2) return(rbind(img_ff[((rows_half+1):rows), (1:cols)], img_ff[(1:rows_half), (1:cols)])) } swap_left_right <- function(img_ff) { cols_half <- ceiling(cols/2) return(cbind(img_ff[1:rows, ((cols_half+1):cols)], img_ff[1:rows, 1:cols_half])) } #swap_side2side <- function(img_ff) { # planes_half <- ceiling(planes/2) # return(cbind(img_ff[1:rows, 1:cols, ((planes_half+1):planes)], img_ff[1:rows, 1:cols, 1:planes_half])) #} if (dim == -1) { img_ff <- swap_up_down(img_ff) return(swap_left_right(img_ff)) } else if (dim == 1) { return(swap_up_down(img_ff)) } else if (dim == 2) { return(swap_left_right(img_ff)) } else if (dim == 3) { # Use the `abind` package to bind along any dimension a pair of multi-dimensional arrays # install.packages("abind") library(abind) planes <- dim(img_ff)[3] rows_half <- ceiling(rows/2) cols_half <- ceiling(cols/2) planes_half <- ceiling(planes/2) img_ff <- abind(img_ff[((rows_half+1):rows), (1:cols), (1:planes)], img_ff[(1:rows_half), (1:cols), (1:planes)], along=1) img_ff <- abind(img_ff[1:rows, ((cols_half+1):cols), (1:planes)], img_ff[1:rows, 1:cols_half, (1:planes)], along=2) img_ff <- abind(img_ff[1:rows, 1:cols, ((planes_half+1):planes)], img_ff[1:rows, 1:cols, 1:planes_half], along=3) return(img_ff) } else { stop("Invalid dimension parameter") } } ifftshift <- function(img_ff, dim = -1) { rows <- dim(img_ff)[1] cols <- dim(img_ff)[2] swap_up_down <- function(img_ff) { rows_half <- floor(rows/2) return(rbind(img_ff[((rows_half+1):rows), (1:cols)], img_ff[(1:rows_half), (1:cols)])) } swap_left_right <- function(img_ff) { cols_half <- floor(cols/2) return(cbind(img_ff[1:rows, ((cols_half+1):cols)], img_ff[1:rows, 1:cols_half])) } if (dim == -1) { img_ff <- swap_left_right(img_ff) return(swap_up_down(img_ff)) } else if (dim == 1) { return(swap_up_down(img_ff)) } else if (dim == 2) { return(swap_left_right(img_ff)) } else if (dim == 3) { # Use the `abind` package to bind along any dimension a pair of multi-dimensional arrays # install.packages("abind") library(abind) planes <- dim(img_ff)[3] rows_half <- floor(rows/2) cols_half <- floor(cols/2) planes_half <- floor(planes/2) img_ff <- abind(img_ff[1:rows, 1:cols, ((planes_half+1):planes)], img_ff[1:rows, 1:cols, 1:planes_half], along=3) img_ff <- abind(img_ff[1:rows, ((cols_half+1):cols), (1:planes)], img_ff[1:rows, 1:cols_half, (1:planes)], along=2) img_ff <- abind(img_ff[((rows_half+1):rows), (1:cols), (1:planes)], img_ff[(1:rows_half), (1:cols), (1:planes)], along=1) return(img_ff) } else { stop("Invalid dimension parameter") } } ``` # Exegeneous Feature Time-series analysis We can use the [UCI ML Air Quality Dataset](https://archive.ics.uci.edu/ml/datasets/Air+quality) to demonstrate the effect of kime-direction on the analysis of the longitudinal data. The *Air Quality data* consists of $9358$ hourly-averaged responses from an array of 5 sensors embedded in an Air Quality Chemical Multisensor Device. These measurements were obtained in a significantly polluted area during a one year period (March 2004 to February 2005). The features include Concentrations for CO, Non Metanic Hydrocarbons, Benzene, Total Nitrogen Oxides (NOx), and Nitrogen Dioxide (NO2). The attributes in the CSV file include: * Date (DD/MM/YYYY) * Time (HH.MM.SS) * True hourly averaged concentration CO in mg/m^3 (reference analyzer) * PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted) * True hourly averaged overall Non Metanic Hydro-carbons concentration in microg/m^3 (reference analyzer) * True hourly averaged Benzene concentration in microg/m^3 (reference analyzer) * PT08.S2 (Titania) hourly averaged sensor response (nominally NMHC targeted) * True hourly averaged NOx concentration in ppb (reference analyzer) * PT08.S3 (tungsten oxide) hourly averaged sensor response (nominally NOx targeted) * True hourly averaged NO2 concentration in microg/m^3 (reference analyzer) * PT08.S4 (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted) * PT08.S5 (indium oxide) hourly averaged sensor response (nominally O3 targeted) * Temperature in ?C * Relative Humidity (%) * AH Absolute Humidity ```{r} aqi_data <- read.csv("https://umich.instructure.com/files/8208336/download?download_frd=1") summary(aqi_data) aqi_data.ts <- ts(aqi_data, start=c(2004,3), freq=24) # hourly sampling rate dateTime <- as.POSIXct(paste(aqi_data$Date, aqi_data$Time), format="%m/%d/%Y %H:%M:%S") # set up training and testing time-periods alltrain.ts <- window(aqi_data.ts, end=c(2004,3)) allvalid.ts <- window(aqi_data.ts, start=c(2005,1)) # Estimate the ARIMAX model library(forecast) fitArimaX <- auto.arima(aqi_data$CO.GT., xreg= as.matrix(aqi_data[ , c("PT08.S1.CO.", "NMHC.GT.", "C6H6.GT.", "PT08.S2.NMHC.", "NOx.GT.", "PT08.S3.NOx.", "NO2.GT.", "PT08.S4.NO2.", "PT08.S5.O3.", "T", "RH", "AH")])) fitArimaX # Predict prospective CO concentration pred_length <- 24*30 # 1 month forward forecasting predArrivals <- predict(fitArimaX, n.ahead = pred_length, newxreg = aqi_data[c((9471-pred_length+1):9471), c(4:15)]) #plot(predArrivals$pred, main="Forward time-series predictions (fitArimaX)") plot(forecast(fitArimaX, xreg = as.matrix(aqi_data[c((9471-pred_length+1):9471), c(4:15)]))) library(plotly) library(magrittr) # generate date sequence arimaForecast <- forecast(fitArimaX, xreg = as.matrix(aqi_data[c((9471-pred_length+1):9471), c(4:15)])) str(arimaForecast ) plot_ly(x=~dateTime, y=~arimaForecast$fitted, name="ARIMA Model", type = "scatter", mode='lines+markers') # Figure 6.9 ================================================ # aqi_data[is.na(aqi_data)] <- 0 # y <- aqi_data$PT08.S1.CO. # t <- 1:dim(aqi_data)[1] # # range <- diff(range(y)) # # # Compute the spectral decomposition (harmonics) # ff_harmonicsPlotLy = function(x=NULL, n=NULL, up=10L, plot=TRUE, add=F, main=NULL, ...) { # # The discrete Fourier transformation # dff = fft(x) # # time # t = seq(from = 1, to = length(x)) # # # Upsampled time # nt = seq(from = 1, to = length(x)+1-1/up, by = 1/up) # # #New spectrum # ndff = array(data = 0, dim = c(length(nt), 1L)) # ndff[1] = dff[1] # mean, DC component # if(n != 0){ # ndff[2:(n+1)] = dff[2:(n+1)] # positive frequencies come first # ndff[length(ndff):(length(ndff) - n + 1)] = dff[length(x):(length(x) - n + 1)] # negative frequencies # } # # # Invert the FT # indff = fft(ndff/length(y), inverse = TRUE) # idff = fft(dff/length(y), inverse = TRUE) # if(plot){ # plot(x = dateTime, y = x, pch = 16L, xlab = "Time", ylab = "Measurement", # col = rgb(red = 0.5, green = 0.5, blue = 0.5, alpha = 0.5), # main = ifelse(is.null(main), paste(n, "harmonics"), main)) # lines(y = Mod(idff), x = dateTime, col = adjustcolor(1L, alpha = 0.5)) # # plot_ly(x=~dateTime, y=~x, name="ARIMA Model", type = "scatter", mode='lines+markers') %>% # add_trace(x=~dateTime, y =~ Mod(idff), # name="First 3 Harmonics Model", # type = "scatter", mode='lines+markers') %>% # layout(title = "Air Quality with First 3 Harmonics Model") # } # ret = data.frame(time = dateTime, y = Mod(indff)) # return(ret) # } # # # Apply ff_harmonics to the timeseries as x, specifying the number of harmonics (n) and # # the upsampling (so we plot points in time beside the original ones) # result = ff_harmonicsPlotLy(x = y, n = 12L, up = 100L, col = 2L, lwd=3, cex=2) ``` We can first explore the time-course harmonics of the data. ```{r} aqi_data[is.na(aqi_data)] <- 0 y <- aqi_data$PT08.S1.CO. t <- 1:dim(aqi_data)[1] # range <- diff(range(y)) # Compute the spectral decomposition (harmonics) ff_harmonics = function(x=NULL, n=NULL, up=10L, plot=TRUE, add=F, main=NULL, ...) { # The discrete Fourier transformation dff = fft(x) # time t = seq(from = 1, to = length(x)) # Upsampled time nt = seq(from = 1, to = length(x)+1-1/up, by = 1/up) #New spectrum ndff = array(data = 0, dim = c(length(nt), 1L)) ndff[1] = dff[1] # mean, DC component if(n != 0){ ndff[2:(n+1)] = dff[2:(n+1)] # positive frequencies come first ndff[length(ndff):(length(ndff) - n + 1)] = dff[length(x):(length(x) - n + 1)] # negative frequencies } # Invert the FT indff = fft(ndff/length(y), inverse = TRUE) idff = fft(dff/length(y), inverse = TRUE) if(plot){ if(!add){ plot(x = t, y = x, pch = 16L, xlab = "Time", ylab = "Measurement", col = rgb(red = 0.5, green = 0.5, blue = 0.5, alpha = 0.5), main = ifelse(is.null(main), paste(n, "harmonics"), main)) lines(y = Mod(idff), x = t, col = adjustcolor(1L, alpha = 0.5)) } lines(y = Mod(indff), x = nt, ...) } ret = data.frame(time = nt, y = Mod(indff)) return(ret) } # Apply ff_harmonics to the timeseries as x, specifying the number of harmonics (n) and # the upsampling (so we plot points in time beside the original ones) result = ff_harmonics(x = y, n = 12L, up = 100L, col = 2L, lwd=3, cex=2) # We can add the fourth-to-twelveth harmonics and look at their sum (as a series difference, 12-3) add4to12_harmonics = ff_harmonics(x = y, n = 12L, up = 10L, col = 2L, plot = FALSE) add4to12_harmonics$y <- add4to12_harmonics$y - ff_harmonics(x = y, n = 3L, up = 10L, plot = T, col = 2L, lwd=3)$y plot(add4to12_harmonics, pch = 16L, xlab = "Time", ylab = "Measurement", main = "Sum of all harmonics up to order 12", type = "l", col = 2) # Harmonics plot of multiple frequencies (waves) in different colors colors = rainbow(14, alpha = 0.6) # ff_harmonics(x = y, n = 28L, up = 100L, col = colors[1], add=T) for(i in 1:14){ ad = ifelse(i == 1, FALSE, TRUE) ff_harmonics(x = y, n = i, up = 100L, col = colors[i], add = ad, lwd= 3, main = "All waves up to 14th harmonic") } ``` ## Kime-Phases Circular distribution plots ```{r} # install.packages("circular") library("circular") set.seed(1234) x <- rvonmises(n=1000, mu=circular(pi/5), kappa=3) y <- rvonmises(n=1000, mu=circular(-pi/3), kappa=5) z <- rvonmises(n=1000, mu=circular(0), kappa=10) resx <- density(x, bw=25) res <- plot(resx, points.plot=TRUE, xlim=c(-1.5,1), ylim=c(-1.1, 1.5), offset=1.1, shrink=1.2, lwd=3) resy <- density(y, bw=25) lines(resy, points.plot=TRUE, col=2, points.col=2, plot.info=res, offset=1.1, shrink=1.45, lwd=3) resz <- density(z, bw=25) lines(resz, points.plot=TRUE, col=3, points.col=3, plot.info=res, offset=1.1, shrink=1.2, lwd=3) ``` There is a key difference between spacekime data analytics and spacetime data modeling and inference. This contrast is based on the fact that in spacetime, statistical results are obtained by aggregating repeated (IID) dataset samples or measuring identical replicate cohorts under identical conditions. In spacekime, reliable inference can be made on a single sample, if the kime direction angles are known. Indeed the latter are generally no observable, however, they can be estimated, inferred, or approximated. As the FT and IFT are linear functionals, addition, averaging and multiplication by constants are preserved by the forward and inverse Fourier transforms. Therefore, if we have a number of phase estimates in k-space, we can aggregate these (e.g., by averaging them) and use the resulting *assemblage phase* to synthesize the data in spacekime. If the composite phases are indeed representative of the process kime orientation, then the reconstructed spacekime inference is expected to be valid even if we used a single sample. In a way, spacekime inference allows a dual representation of the [central limit theorem](http://www.amstat.org/publications/jse/v16n2/dinov.html), which guarantees the convergence of sample averages to their corresponding population mean counterparts. In light of this *analytics duality*, we can now perform a [traditional ARIMA modeling](http://www.socr.umich.edu/people/dinov/courses/DSPA_notes/18_BigLongitudinalDataAnalysis.html#17_autoregressive_integrated_moving_average_extended_(arimax)_model) of `CO` concentration (*outcome*: `PT08.S1.CO.`) based on several covariates, e.g., *predictors*: `NMHC.GT.`, `C6H6.GT.`, `PT08.S2.NMHC.`, `NOx.GT.`, `PT08.S3.NOx.`, `NO2.GT.`, `PT08.S4.NO2.`, `PT08.S5.O3.`, `T`, `RH`, and `AH`. ```{r} dim(aqi_data) epochs_aqi_data <- as.matrix(aqi_data[c(1:9000) , -c(1:3, 16:17)]) is.matrix(epochs_aqi_data); dim(epochs_aqi_data) dim(epochs_aqi_data) <- c(9, 1000, 12) dim(epochs_aqi_data); identical(epochs_aqi_data[9, 1000, 12], aqi_data[9000, 12+3]) epochs_aqi_data_1 <- epochs_aqi_data[1, , ]; dim(epochs_aqi_data_1) # 1D timeseries FFT SHIFT fftshift1D <- function(img_ff) { rows <- length(img_ff) rows_half <- ceiling(rows/2) return(append(img_ff[(rows_half+1):rows], img_ff[1:rows_half])) } # 1. Transform all 9 signals to k-space (Fourier domain) x1 <- c(1:1000) FT_epochs_aqi_data <- array(complex(), c(9, 1000, 12)) mag_FT_epochs_aqi_data <- array(complex(), c(9, 1000, 12)) phase_FT_epochs_aqi_data <- array(complex(), c(9, 1000, 12)) for (i in 1:9) { FT_epochs_aqi_data[i, , ] <- fft(epochs_aqi_data[i, , ]) X2 <- FT_epochs_aqi_data[i, , ] # plot(fftshift1D(log(Re(X2)+2)), main = "log(fftshift1D(Re(FFT(timeseries))))") mag_FT_epochs_aqi_data[i, , ] <- sqrt(Re(X2)^2+Im(X2)^2); # plot(log(fftshift1D(Re(X2_mag))), main = "log(Magnitude(FFT(timeseries)))") phase_FT_epochs_aqi_data[i, , ] <- atan2(Im(X2), Re(X2)); # plot(fftshift1D(X2_phase), main = "Shift(Phase(FFT(timeseries)))") } ### Test the process to confirm calculations # X2<-FT_epochs_aqi_data[1,,];X2_mag<-mag_FT_epochs_aqi_data[1,,];X2_phase<-phase_FT_epochs_aqi_data[1,,] # Real2 = X2_mag * cos(X2_phase) # Imaginary2 = X2_mag * sin(X2_phase) # man_hat_X2 = Re(fft(Real2 + 1i*Imaginary2, inverse = T)/length(X2)) # ifelse(abs(man_hat_X2[5,10] - epochs_aqi_data[1, 5, 10]) < 0.001, "Perfect Syntesis", "Problems!!!") ####### # 2. Invert back to spacetime the epochs_aqi_data_1 signal with nil phase Real = mag_FT_epochs_aqi_data[1, , ] * cos(0) # cos(phase_FT_epochs_aqi_data[1, , ]) Imaginary = mag_FT_epochs_aqi_data[1, , ] * sin(0) # sin(phase_FT_epochs_aqi_data[1, , ]) ift_NilPhase_X2mag = Re(fft(Real+1i*Imaginary, inverse = T)/length(FT_epochs_aqi_data[1,,])) # display(ift_NilPhase_X2mag, method = "raster") # dim(ift_NilPhase_X2mag); View(ift_NilPhase_X2mag); # compare to View(epochs_aqi_data[1, , ]) # 3. Perform ARIMAX modeling of ift_NilPhase_X2mag; report (p,d,q) params and quality metrics AIC/BIC library(forecast) fitArimaX_nil <- auto.arima(ift_NilPhase_X2mag[ , 1], xreg= ift_NilPhase_X2mag[ , 2:12]) fitArimaX_nil # Regression with ARIMA(2,0,1) errors #Coefficients: # ar1 ar2 ma1 intercept xreg1 xreg2 xreg3 xreg4 xreg5 xreg6 xreg7 xreg8 # 1.1141 -0.1457 -0.7892 503.3455 -0.4028 0.1366 -0.5146 1.0961 1.2195 1.3063 1.2087 1.1491 #s.e. 0.2064 0.1571 0.1821 73.4212 0.1087 0.1123 0.1072 0.1059 0.1029 0.1564 0.1023 0.1101 # xreg9 xreg10 xreg11 # -0.4823 0.0315 -0.4640 #s.e. 0.1049 0.1125 0.1076 # sigma^2 estimated as 30448: log likelihood=-6573.68 AIC=13179.36 AICc=13179.91 BIC=13257.88 # Predict prospective CO concentration pred_length <- 24*7 # 1 week forward forecasting predArrivals <- predict(fitArimaX_nil, n.ahead = pred_length, newxreg = ift_NilPhase_X2mag[800:1000, 2:12]) #plot(predArrivals$pred, main="Forward time-series predictions (fitArimaX)") plot(forecast(fitArimaX_nil, xreg = ift_NilPhase_X2mag[800:1000, 2:12]), main = "ARIMAX(2,0,1) Model Forecasting (1,001:1,200)", ylim = c(500, 6000)) lines(c(1001:1200), epochs_aqi_data[2, c(1:200), 1], col = "red", lwd = 2, lty=2) # overlay original data to show Nil-Phase effects on reconstruction legend("top", bty="n", legend=c("Prediction via Nil-Phase Reconstruction", "Real Timeseries (CO)"), col=c("blue", "red"), lty=c(1,2), lwd=c(2,2), cex=0.9, x.intersp=0.5) # 4. Compute the *average phase* across the eight series 2:9 phase_Avg <- apply(phase_FT_epochs_aqi_data, c(2,3), mean); dim(phase_Avg); phase_Avg[1:5 , 1:5] # 5. Invert epochs_aqi_data_1 signal to spacetime using average-phase Real = mag_FT_epochs_aqi_data[1, , ] * cos(phase_Avg) Imaginary = mag_FT_epochs_aqi_data[1, , ] * sin(phase_Avg) ift_AvgPhase_X2mag = Re(fft(Real+1i*Imaginary, inverse = T)/length(FT_epochs_aqi_data[1,,])) # display(ift_AvgPhase_X2mag, method = "raster") # dim(ift_AvgPhase_X2mag); View(ift_AvgPhase_X2mag); # compare to View(epochs_aqi_data[1, , ]) # 6. Perform ARIMAX modeling on ift_AvgPhase_X2mag; report (p, d, q) parameters and quality metrics fitArimaX_avg <- auto.arima(ift_AvgPhase_X2mag[ , 1], xreg= ift_NilPhase_X2mag[ , 2:12]) fitArimaX_avg # ARIMA(2,0,3) # Coefficients: # ar1 ar2 ma1 ma2 ma3 intercept xreg1 xreg2 xreg3 xreg4 xreg5 xreg6 # 0.3295 0.2384 0.2673 -0.0061 0.1573 742.8001 0.5838 0.2809 -0.6497 1.2399 -0.0261 1.0818 # s.e. 0.1354 0.1150 0.1363 0.0650 0.0451 97.1676 0.1700 0.1776 0.1468 0.1695 0.1710 0.2394 # xreg7 xreg8 xreg9 xreg10 xreg11 # 0.2540 0.3065 -0.4052 0.3511 -0.4577 # s.e. 0.1706 0.1665 0.1450 0.1791 0.1709 # sigma^2 estimated as 82982: log likelihood=-7073.9 AIC=14183.8 AICc=14184.5 BIC=14272.14 # 7. Perform ARIMAX modeling on epochs_aqi_data[1,,]; report (p,d,q) parameters and quality metrics fitArimaX_orig <- auto.arima(epochs_aqi_data[1, , 1], xreg= epochs_aqi_data[1, , 2:12]) fitArimaX_orig # Regression with ARIMA(1,1,4) errors # Coefficients: # ar1 ma1 ma2 ma3 ma4 xreg1 xreg2 xreg3 xreg4 xreg5 xreg6 xreg7 # 0.2765 -0.8891 0.1268 0.0304 -0.1766 0.0804 6.1495 0.0986 0.0163 -0.0482 -0.0110 0.1833 #s.e. 0.1294 0.1272 0.0933 0.0450 0.0384 0.0213 1.6611 0.0554 0.0152 0.0207 0.0257 0.0274 # xreg8 xreg9 xreg10 xreg11 # 0.1765 6.5374 1.7939 -12.0697 #s.e. 0.0118 0.7141 0.2724 1.8905 # sigma^2 estimated as 2287: log likelihood=-5273.65 AIC=10581.29 AICc=10581.92 BIC=10664.71 # 8. Compare the analytics results from #3, #6, and #7 # Generate a table with results ### correlations cor_orig_obs <- format(cor(forecast(fitArimaX_orig, xreg = epochs_aqi_data[2, c(801:1000), 2:12])$mean, epochs_aqi_data[2, c(1:200), 1]), digits=3); cor_orig_obs cor_orig_nil <- format(cor(forecast(fitArimaX_orig, xreg = epochs_aqi_data[2, c(801:1000), 2:12])$mean, forecast(fitArimaX_nil, xreg = ift_NilPhase_X2mag[801:1000, 2:12])$mean), digits=3); cor_orig_nil cor_orig_avg <- format(cor(forecast(fitArimaX_orig, xreg = epochs_aqi_data[2, c(801:1000), 2:12])$mean, forecast(fitArimaX_avg, xreg = ift_AvgPhase_X2mag[801:1000, 2:12])$mean), digits=3); cor_orig_avg ### plots plot(forecast(fitArimaX_orig, xreg = epochs_aqi_data[1, 800:1000, 2:12]), main = sprintf("ARIMAX Model Forecasting (1,001:1,200): Corr(TrueObs,Orig)=%s", cor_orig_obs), xlim=c(800, 1200), ylim = c(500, 2000), col="black", lwd = 1, lty=1) #lines(c(1001:1200), forecast(fitArimaX_orig, xreg = epochs_aqi_data[2, c(801:1000), 2:12])$mean, # col = "black", lwd = 1, lty=1) # Original=True Phase reconstruction lines(c(1001:1200), forecast(fitArimaX_nil, xreg = ift_NilPhase_X2mag[801:1000, 2:12])$mean, col = "purple", lwd = 1, lty=1) lines(c(1001:1200), forecast(fitArimaX_avg, xreg = ift_AvgPhase_X2mag[801:1000, 2:12])$mean, col = "red", lwd = 1, lty=1) lines(c(1001:1200), epochs_aqi_data[2, c(1:200), 1], col = "green", lwd = 1, lty=1) legend("topleft", bty="n", legend=c( sprintf("Original (Correct Phases): Corr(Orig, TrueObs)=%s", cor_orig_obs), sprintf("Prediction via Nil-Phase Reconstruction: Corr(Orig, Nil)=%s", cor_orig_nil), sprintf("Prediction via Average-Phase Reconstruction: Corr(Orig, Avg)=%s", cor_orig_avg), sprintf("Real CO Timeseries (Epoch 2)"), "Orig (True Phase) ARIMA(1,1,4) Model Forecast"), col=c("black", "purple", "red", "green", "blue"), lty=c(1,1,1,1), lwd=c(2,2,2,2), cex=0.9, x.intersp=1, text.width=c(0.085,0.235,0.35, 0.3), xjust=0, yjust=0) #### Zoom in plot(forecast(fitArimaX_orig, xreg = epochs_aqi_data[1, 800:1000, 2:12]), main = sprintf("ARIMAX Model Forecasting (1,001:1,200): Corr(TrueObs,Orig)=%s", cor_orig_obs), xlim=c(950, 1050), ylim = c(500, 2000), col="black", lwd = 1, lty=1) #lines(c(1001:1200), forecast(fitArimaX_orig, xreg = epochs_aqi_data[2, c(801:1000), 2:12])$mean, # col = "black", lwd = 1, lty=1) # Original=True Phase reconstruction lines(c(1001:1200), forecast(fitArimaX_nil, xreg = ift_NilPhase_X2mag[801:1000, 2:12])$mean, col = "purple", lwd = 1, lty=1) lines(c(1001:1200), forecast(fitArimaX_avg, xreg = ift_AvgPhase_X2mag[801:1000, 2:12])$mean, col = "red", lwd = 1, lty=1) lines(c(1001:1200), epochs_aqi_data[2, c(1:200), 1], col = "green", lwd = 1, lty=1) legend("topleft", bty="n", legend=c( sprintf("Original (Correct Phases): Corr(Orig, TrueObs)=%s", cor_orig_obs), sprintf("Prediction via Nil-Phase Reconstruction: Corr(Orig, Nil)=%s", cor_orig_nil), sprintf("Prediction via Average-Phase Reconstruction: Corr(Orig, Avg)=%s", cor_orig_avg), sprintf("Real CO Timeseries (Epoch 2)"), "Orig (True Phase) ARIMA(1,1,4) Model Forecast"), col=c("black", "purple", "red", "green", "blue"), lty=c(1,1,1,1), lwd=c(2,2,2,2), cex=0.9, x.intersp=1, text.width=c(0.085,0.235,0.35, 0.3), xjust=0, yjust=0) ``` An alternative data analytic approach involves using the Fourier transform applied to the complete 2D data-matrix (rows=time, columns=features), inverting it back in spacetime, and investigating the effect of the timeseries analysis *with* and *without* using the correct kime-directions (phases). Knowing the kime-directions is expected to produce better analytical results (e.g., lower bias and lower dispersion). ```{r} dim(aqi_data) # 1. Transform the 2D matrix to k-space (Fourier domain) aqi_data1 <- aqi_data[ , 3:15] # remove string columns aqi_data_complete <- as.matrix(aqi_data1[complete.cases(aqi_data1), ]) dim(aqi_data_complete) # ; display(aqi_data_complete, method = "raster") FT_aqi_data <- fft(aqi_data_complete) X2 <- FT_aqi_data # display(FT_aqi_data, method = "raster") mag_FT_aqi_data <- sqrt(Re(X2)^2+Im(X2)^2) # plot(log(fftshift1D(Re(X2_mag))), main = "log(Magnitude(FFT(timeseries)))") phase_FT_aqi_data <- atan2(Im(X2), Re(X2)) ### Test the process to confirm calculations # X2<-FT_aqi_data; X2_mag <- mag_FT_aqi_data; X2_phase<-phase_FT_aqi_data # Real2 = X2_mag * cos(X2_phase) # Imaginary2 = X2_mag * sin(X2_phase) # man_hat_X2 = Re(fft(Real2 + 1i*Imaginary2, inverse = T)/length(X2)) # ifelse(abs(man_hat_X2[5,10] - aqi_data1[5, 10]) < 0.001, "Perfect Syntesis", "Problems!!!") ####### # 2. Invert back to spacetime the epochs_aqi_data_1 signal with nil phase Real = mag_FT_aqi_data * cos(0) # cos(phase_FT_aqi_data) Imaginary = mag_FT_aqi_data * sin(0) # sin(phase_FT_aqi_data) ift_NilPhase_X2mag = Re(fft(Real+1i*Imaginary, inverse = T)/length(FT_aqi_data)) # display(ift_NilPhase_X2mag, method = "raster") # dim(ift_NilPhase_X2mag); View(ift_NilPhase_X2mag); # compare to View(aqi_data1) summary(aqi_data_complete); summary(ift_NilPhase_X2mag, method = "raster") # 3. Perform 2D modeling of ift_NilPhase_X2mag, e.g., ### LASSO "CO.GT." ~ "PT08.S1.CO."+"NMHC.GT."+"C6H6.GT."+"PT08.S2.NMHC."+"NOx.GT."+"PT08.S3.NOx."+ "NO2.GT."+"PT08.S4.NO2."+"PT08.S5.O3."+"T"+"RH"+"AH" library(glmnet) library(arm) library(knitr) y <- ift_NilPhase_X2mag[ , 1] # CO as outcome variable X <- ift_NilPhase_X2mag[ , 2:13] # remaining features are predictors set.seed(1234) train = sample(1 : nrow(X), round((4/5) * nrow(X))) test = -train # subset training data yTrain = y[train] XTrain = X[train, ] # subset test data yTest = y[test] XTest = X[test, ] #### Model Estimation & Selection #### # Estimate models: glmnet automatically standardizes the predictors fitRidge = glmnet(XTrain, yTrain, alpha = 0) # Ridge Regression fitLASSO = glmnet(XTrain, yTrain, alpha = 1) # The LASSO ### Plot Solution Path #### LASSO plot(fitLASSO, xvar="lambda", label="TRUE") # add label to upper x-axis mtext("(Nil-Phase) LASSO regularizer: Number of Nonzero (Active) Coefficients", side=3, line=2.5) ### Plot Solution Path #### Ridge plot(fitRidge, xvar="lambda", label="TRUE") # add label to upper x-axis mtext("(Nil-Phase) Ridge regularizer: Number of Nonzero (Active) Coefficients", side=3, line=2.5) #### 10-fold cross validation ##### LASSO library("glmnet") library(doParallel) registerDoParallel(6) set.seed(1234) # set seed cvLASSO = cv.glmnet(XTrain, yTrain, alpha = 1, parallel=TRUE) # (10-fold) cross validation for the LASSO plot(cvLASSO); mtext("(Nil-Phase) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5) ### Predict and Report LASSO MSE predLASSO <- predict(cvLASSO, s = cvLASSO$lambda.1se, newx = XTest) testMSE_LASSO <- mean((predLASSO - yTest)^2); testMSE_LASSO # LASSO coefficient estimates betaHatLASSO = as.double(coef(fitLASSO, s = cvLASSO$lambda.1se)) # s is lambda coefplot(betaHatLASSO[2:12], sd = rep(0, 11), pch=1, col.pts = "red", cex.pts = 2) legend("bottomright", "(Nil-Phase) LASSO", col = "blue", bty = "o", cex = 1) # 4. Perform LASSO modeling on ift_TruePhase_X2mag X2<-FT_aqi_data; X2_mag <- mag_FT_aqi_data; X2_phase<-phase_FT_aqi_data Real2 = X2_mag * cos(X2_phase) Imaginary2 = X2_mag * sin(X2_phase) man_hat_X2 = Re(fft(Real2 + 1i*Imaginary2, inverse = T)/length(X2)) y3 <- man_hat_X2[ , 1] # CO as outcome variable X3 <- man_hat_X2[ , 2:13] # remaining features are predictors # subset training data y3Train = y3[train] X3Train = X3[train, ] # subset test data y3Test = y3[test] X3Test = X3[test, ] #### Model Estimation & Selection #### # Estimate models: glmnet automatically standardizes the predictors fitRidge3 = glmnet(X3Train, y3Train, alpha = 0) # Ridge Regression fitLASSO3 = glmnet(X3Train, y3Train, alpha = 1) # The LASSO ### Plot Solution Path #### LASSO plot(fitLASSO3, xvar="lambda", label="TRUE") # add label to upper x-axis mtext("(True-Phase) LASSO regularizer: Number of Nonzero (Active) Coefficients", side=3, line=2.5) ### Plot Solution Path #### Ridge plot(fitRidge3, xvar="lambda", label="TRUE") # add label to upper x-axis mtext("(True-Phase) Ridge regularizer: Number of Nonzero (Active) Coefficients", side=3, line=2.5) #### 10-fold cross validation ##### LASSO library("glmnet") library(doParallel) registerDoParallel(6) set.seed(1234) # set seed cvLASSO3 = cv.glmnet(X3Train, y3Train, alpha = 1, parallel=TRUE) # (10-fold) cross validation for the LASSO plot(cvLASSO3); mtext("(True-Phase) CV LASSO: Number of Nonzero (Active) Coefficients", side=3, line=2.5) ### Predict and Report LASSO MSE predLASSO3 <- predict(cvLASSO3, s = cvLASSO3$lambda.1se, newx = X3Test) testMSE_LASSO3 <- mean((predLASSO3 - y3Test)^2); testMSE_LASSO3 # LASSO coefficient estimates betaHatLASSO3 = as.double(coef(fitLASSO3, s = cvLASSO3$lambda.1se)) # s is lambda coefplot(betaHatLASSO3[2:12], sd = rep(0, 11), pch=1, col.pts = "red", cex.pts = 2) legend("bottomright", "(True-Phase)LASSO", col = "blue", bty = "o", cex = 1.5) # 5. Compare the analytics results from #3, #6, and #7 # Stop local cluster stopImplicitCluster() ```