I came across Joseph Owen code here for batch forecasting. I have a dataset containing close to ~19k rows but the issue is even with the batch forecasting methodology applied my code still runs super slow.
I need to evaluate the best model for which I am using MAPE as the evaluating criteria before doing the actual forecast. Below is the workable code snippet for the same. My question is how to optimize the below code to make it run in the acceptable time (under 2 mins)
fcnChooseETS <- function(Ts){
TsPositive <- ( min( as.numeric(Ts) ) > 0 ) # Check if all values of timeseries are positive or not
ModelsUsed <- c("ANN","MNN","ANA","AAN","AAA","MAA","MNM","MMN","MMM","MNA","MAN","MAM")
ModelsNonPositive <- c("ANN","ANA","AAN","AAA") # Multiplicative models cannot take non positive data
if( !TsPositive ){
ModelsUsed <- ModelsNonPositive
}
lAllModels <- lapply(ModelsUsed, function(M){
ets(Ts, damped = NULL, model = M)
})
vecResult <- sapply(lAllModels, function(M) accuracy(M)[2])
names(vecResult) <- ModelsUsed
min(vecResult)
}
fcnTrending <- function( dt){
Ts <- lapply(transpose(dt), ts , frequency = 12 , end = FeedDate)
fit <- lapply(Ts , fcnChooseETS )
}
The following script tests 3 different ways of fitting the models in the question. The first of them is a more idiomatic version of the code posted in the question, the next two fit the several models in parallel.
This script was saved in file so_62497397.R
and run as below.
#
# filename: so_62497397.R
# Test serial and two types of parallel execution of
# exponential smoothing time series fitting.
library(parallel)
library(foreach)
library(doParallel)
library(forecast)
fcnChooseETS <- function(Ts){
TsPositive <- ( min( as.numeric(Ts) ) > 0 ) # Check if all values of timeseries are positive or not
ModelsUsed <- c("ANN","MNN","ANA","AAN","AAA","MAA","MNM","MMN","MMM","MNA","MAN","MAM")
ModelsNonPositive <- c("ANN","ANA","AAN","AAA") # Multiplicative models cannot take non positive data
if( !TsPositive ){
ModelsUsed <- ModelsNonPositive
}
lAllModels <- lapply(ModelsUsed, function(M){
ets(Ts, damped = NULL, model = M)
})
vecResult <- sapply(lAllModels, function(M) accuracy(M)[2])
names(vecResult) <- ModelsUsed
vecResult[which.min(vecResult)]
}
fcnChooseETS2 <- function(Ts, Ncpus = 2){
TsPositive <- ( min( as.numeric(Ts) ) > 0 ) # Check if all values of timeseries are positive or not
ModelsUsed <- c("ANN","MNN","ANA","AAN","AAA","MAA","MNM","MMN","MMM","MNA","MAN","MAM")
ModelsNonPositive <- c("ANN","ANA","AAN","AAA") # Multiplicative models cannot take non positive data
if( !TsPositive ){
ModelsUsed <- ModelsNonPositive
}
vecResult <- mclapply(ModelsUsed, function(M){
fit <- ets(Ts, damped = NULL, model = M)
accuracy(fit)[2]
}, mc.cores = Ncpus)
vecResult <- unlist(vecResult)
names(vecResult) <- ModelsUsed
vecResult[which.min(vecResult)]
}
fcnChooseETS3 <- function(Ts, Ncpus = 2){
TsPositive <- ( min( as.numeric(Ts) ) > 0 ) # Check if all values of timeseries are positive or not
ModelsUsed <- c("ANN","MNN","ANA","AAN","AAA","MAA","MNM","MMN","MMM","MNA","MAN","MAM")
ModelsNonPositive <- c("ANN","ANA","AAN","AAA") # Multiplicative models cannot take non positive data
if( !TsPositive ){
ModelsUsed <- ModelsNonPositive
}
cl <- makeCluster(Ncpus)
clusterExport(cl, 'ts')
clusterEvalQ(cl, library(forecast))
vecResult <- parLapply(cl, ModelsUsed, function(M){
fit <- ets(Ts, damped = NULL, model = M)
accuracy(fit)[2]
})
stopCluster(cl)
vecResult <- unlist(vecResult)
names(vecResult) <- ModelsUsed
vecResult[which.min(vecResult)]
}
makeTestdata <- function(N){
n <- length(USAccDeaths)
m <- ceiling(log2(N/n))
x <- as.numeric(USAccDeaths)
for(i in seq_len(m)) x <- c(x, x)
L <- length(x)/12 - 1
x <- ts(x, start = 2000 - L, frequency = 12)
x
}
numCores <- detectCores()
cat("numCores:", numCores, "\n")
x <- makeTestdata(5e3)
t1 <- system.time(
res1 <- fcnChooseETS(x)
)
t2 <- system.time(
res2 <- fcnChooseETS2(x, Ncpus = numCores)
)
t3 <- system.time(
res3 <- fcnChooseETS3(x, Ncpus = numCores)
)
rbind(t.lapply = t1,
t.mclapply = t2,
t.parLapply = t3)
c(res1, res2, res3)
Run with Rscript
on
The timings show that mclapply
is the best option, though not much faster than parLapply
. Of the fitted models, the chosen using MAPE are all the same, as they should be.
rui@rui:~$ Rscript --vanilla so_62497397.R
#Loading required package: iterators
#Registered S3 method overwritten by 'quantmod':
# method from
# as.zoo.data.frame zoo
#numCores: 4
# user.self sys.self elapsed user.child sys.child
#t.lapply 56.505 0.063 57.389 0.000 0.00
#t.mclapply 0.039 0.024 33.983 30.506 0.26
#t.parLapply 0.040 0.012 36.317 0.001 0.00
# ANA ANA ANA
#263.0876 263.0876 263.0876
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With