I'm running a rolling regression very similar to the following code:
library(PerformanceAnalytics)
library(quantmod)
data(managers)
FL <- as.formula(Next(HAM1) ~ HAM1 + HAM2 + HAM3 + HAM4)
MyRegression <- function(df, FL) {
  df <- as.data.frame(df)
  model <- lm(FL, data = df[1:30, ])
  predict(model, newdata = df[31, ])
}
system.time(Result <- rollapply(managers, 31, FUN = "MyRegression", FL,
                                by.column = FALSE, align = "right", na.pad = TRUE))
I've got some extra processors, so I'm trying to find a way to parallelize the rolling window. If this was a non-rolling regression I could easily parallelize it using the apply family of functions...
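Something like the sketch below is the sort of thing I'm imagining: looping over the window end points by hand and farming them out with parallel::mclapply (untested, and the result would still need the NA padding/alignment that rollapply provides):

ends <- 31:nrow(managers)

library(parallel)
## Rough sketch of the idea: run each 31-row window through MyRegression
## in parallel instead of serially via rollapply. mclapply forks, so on
## Windows a parLapply() cluster would be needed instead.
Result_par <- unlist(mclapply(ends, function(i) {
  MyRegression(managers[(i - 30):i, ], FL)
}, mc.cores = max(1L, detectCores() - 1L)))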
The obvious one is to use lm.fit() instead of lm() so you don't incur all the overhead of processing the formula etc.
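To get a feel for how much that formula overhead matters, here is a quick throwaway comparison on one fixed 30-row window (the loop counts and the complete.cases() guard are arbitrary, purely for illustration):

## Illustrative only: repeated lm() calls (formula parsed and model frame
## rebuilt every time) versus lm.fit() on a prebuilt design matrix.
df <- as.data.frame(managers[1:31, c("HAM1", "HAM2", "HAM3", "HAM4")])
dd <- data.frame(y = df$HAM1[2:31], df[1:30, ])  # response is Next(HAM1)
dd <- dd[complete.cases(dd), ]                   # guard against any NAs
X  <- cbind(Intercept = 1, as.matrix(dd[, -1]))
y  <- dd$y

system.time(for (i in 1:2000) lm(y ~ HAM1 + HAM2 + HAM3 + HAM4, data = dd))
system.time(for (i in 1:2000) lm.fit(X, y))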
Update: so when I said obvious, what I meant to say was blindingly obvious but deceptively difficult to implement!
After a bit of fiddling around, I came up with this
library(PerformanceAnalytics)
library(quantmod)
data(managers)
The first stage is to realise that the model matrix can be prebuilt, so we do that and convert it back to a zoo object for use with rollapply():
mmat2 <- model.frame(Next(HAM1) ~ HAM1 + HAM2 + HAM3 + HAM4, data = managers,
                     na.action = na.pass)
mmat2 <- cbind.data.frame(mmat2[, 1], Intercept = 1, mmat2[, -1])
mmatZ <- as.zoo(mmat2)
Now we need a function that will employ lm.fit() to do the heavy lifting without having to create design matrices at each iteration:
MyRegression2 <- function(Z) {
  ## store the value we want to predict for
  pred <- Z[31, -1, drop = FALSE]
  ## get rid of any rows with NA in the training data
  Z <- Z[1:30, ][!rowSums(is.na(Z[1:30, ])) > 0, ]
  ## Next() would lag and leave NA in row 30 for the response,
  ## but we precomputed the model matrix, so drop the last row still in Z
  Z <- Z[-nrow(Z), ]
  ## fit the model
  fit <- lm.fit(Z[, -1, drop = FALSE], Z[, 1])
  ## get the things we need to predict, in case pivoting is turned on in lm.fit
  p <- fit$rank
  p1 <- seq_len(p)
  piv <- fit$qr$pivot[p1]
  ## model coefficients
  beta <- fit$coefficients
  ## this gives the predicted value for row 31 of the data passed in
  drop(pred[, piv, drop = FALSE] %*% beta[piv])
}
A comparison of timings:
> system.time(Result <- rollapply(managers, 31, FUN="MyRegression",FL,
+ by.column = FALSE, align = "right",
+ na.pad = TRUE))
user system elapsed
0.925 0.002 1.020
>
> system.time(Result2 <- rollapply(mmatZ, 31, FUN = MyRegression2,
+ by.column = FALSE, align = "right",
+ na.pad = TRUE))
user system elapsed
0.048 0.000 0.05
This affords a pretty reasonable improvement over the original. Now check that the resulting objects are the same:
> all.equal(Result, Result2)
[1] TRUE
Enjoy!
I wrote a package, rollRegres, that does this much faster. It is ~58 times faster than Gavin Simpson's answer. Here is an example:
# simulate data
set.seed(101)
n <- 10000
wdth <- 50
X <- matrix(rnorm(10 * n), n, 10)
y <- drop(X %*% runif(10)) + rnorm(n)
Z <- cbind(y, X)
# assign other function
lm_version <- function(Z, width = wdth) {
  pred <- Z[width + 1, -1, drop = FALSE]
  ## fit the model
  Z <- Z[-nrow(Z), ]
  fit <- lm.fit(Z[, -1, drop = FALSE], Z[, 1])
  ## get things we need to predict, in case pivoting is turned on in lm.fit
  p <- fit$rank
  p1 <- seq_len(p)
  piv <- fit$qr$pivot[p1]
  ## model coefficients
  beta <- fit$coefficients
  ## this gives the predicted value for the next obs
  drop(pred[, piv, drop = FALSE] %*% beta[piv])
}
# show that they yield the same
library(rollRegres) # the new package
library(zoo)
all.equal(
  rollapply(Z, wdth + 1, FUN = lm_version,
            by.column = FALSE, align = "right", fill = NA_real_),
  roll_regres.fit(
    x = X, y = y, width = wdth, do_compute = "1_step_forecasts"
  )$one_step_forecasts)
#R [1] TRUE
# benchmark
library(compiler)
lm_version <- cmpfun(lm_version)
microbenchmark::microbenchmark(
  newnew = roll_regres.fit(
    x = X, y = y, width = wdth, do_compute = "1_step_forecasts"),
  prev = rollapply(Z, wdth + 1, FUN = lm_version,
                   by.column = FALSE, align = "right", fill = NA_real_),
  times = 10)
#R Unit: milliseconds
#R expr min lq mean median uq max neval
#R newnew 10.27279 10.48929 10.91631 11.04139 11.13877 11.87121 10
#R prev 555.45898 565.02067 582.60309 582.22285 602.73091 605.39481 10
You can reduce the run time by updating a decomposition. This yields an O(p^2) cost at each iteration instead of O(np^2), where n is your window width and p is the number of coefficients. Below is code to compare the two. It would likely be much faster to do it in C++, but the LINPACK dchud and dchdd routines are not included with R, so you would have to write a package to do so. Further, I recall reading that you may do better with other implementations than the LINPACK dchud and dchdd for the update.
library(SamplerCompare) # for LINPACK `dchud` and `dchdd`
roll_forcast <- function(X, y, width){
  n <- nrow(X)
  p <- ncol(X)
  out <- rep(NA_real_, n)
  is_first <- TRUE
  i <- width
  while(i < n){
    if(is_first){
      is_first <- FALSE
      qr. <- qr(X[1:width, ])
      R <- qr.R(qr.)
      # Use X^T for the rest
      X <- t(X)
      XtY <- drop(tcrossprod(y[1:width], X[, 1:width]))
    } else {
      x_new <- X[, i]
      x_old <- X[, i - width]
      # update R
      R <- .Fortran(
        "dchud", R, p, p, x_new, 0., 0L, 0L,
        0., 0., numeric(p), numeric(p),
        PACKAGE = "SamplerCompare")[[1]]
      # downdate R
      R <- .Fortran(
        "dchdd", R, p, p, x_old, 0., 0L, 0L,
        0., 0., numeric(p), numeric(p), integer(1),
        PACKAGE = "SamplerCompare")[[1]]
      # update XtY
      XtY <- XtY + y[i] * x_new - y[i - width] * x_old
    }
    coef. <- .Internal(backsolve(R, XtY, p, TRUE, TRUE))
    coef. <- .Internal(backsolve(R, coef., p, TRUE, FALSE))
    i <- i + 1
    out[i] <- X[, i] %*% coef.
  }
  out
}
# simulate data
set.seed(101)
n <- 10000
wdth <- 50
X <- matrix(rnorm(10 * n), n, 10)
y <- drop(X %*% runif(10)) + rnorm(n)
Z <- cbind(y, X)
# assign other function
lm_version <- function(Z, width = wdth) {
  pred <- Z[width + 1, -1, drop = FALSE]
  ## fit the model
  Z <- Z[-nrow(Z), ]
  fit <- lm.fit(Z[, -1, drop = FALSE], Z[, 1])
  ## get things we need to predict, in case pivoting is turned on in lm.fit
  p <- fit$rank
  p1 <- seq_len(p)
  piv <- fit$qr$pivot[p1]
  ## model coefficients
  beta <- fit$coefficients
  ## this gives the predicted value for row width + 1 of the data passed in
  drop(pred[, piv, drop = FALSE] %*% beta[piv])
}
# show that they yield the same
library(zoo)
all.equal(
  rollapply(Z, wdth + 1, FUN = lm_version,
            by.column = FALSE, align = "right", fill = NA_real_),
  roll_forcast(X, y, wdth))
#R> [1] TRUE
# benchmark
library(compiler)
roll_forcast <- cmpfun(roll_forcast)
lm_version <- cmpfun(lm_version)
microbenchmark::microbenchmark(
  new = roll_forcast(X, y, wdth),
  prev = rollapply(Z, wdth + 1, FUN = lm_version,
                   by.column = FALSE, align = "right", fill = NA_real_),
  times = 10)
#R> Unit: milliseconds
#R> expr min lq mean median uq max neval cld
#R> new 113.7637 115.4498 129.6562 118.6540 122.4930 230.3414 10 a
#R> prev 639.6499 674.1677 682.1996 678.6195 686.8816 763.8034 10 b