Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Calculate rolling weighted average in data.table using frollsum

Tags:

r

data.table

The goal is to calculate the weighted average by group with a window of 3 rows and weights 3, 2, 1 in order of the most recent rows. This is similar to the question here, but the weights are not given by a column. Also, I really would like to use frollsum() because I'm working with a lot of data and need it to be performant.

I have a solution using frollapply():

library(data.table)

# Your data
set.seed(1)
DT <- data.table(group = rep(c(1, 2), each = 10), value = round(runif(n = 20, 1, 5)))

weights <- 1:3
k <- 3

weighted_average <- function(x) {
  sum(x * weights[1:length(x)]) / sum(weights[1:length(x)])
}

# Apply rolling weighted average
DT[, wtavg := shift(frollapply(value, k, weighted_average, align = "right", fill = NA)), 
   by = group]

DT 
#>     group value    wtavg
#>  1:     1     2       NA
#>  2:     1     2       NA
#>  3:     1     3       NA
#>  4:     1     5 2.500000
#>  5:     1     2 3.833333
#>  6:     1     5 3.166667
#>  7:     1     5 4.000000
#>  8:     1     4 4.500000
#>  9:     1     4 4.500000
#> 10:     1     1 4.166667
#> 11:     2     2       NA
#> 12:     2     2       NA
#> 13:     2     4       NA
#> 14:     2     3 3.000000
#> 15:     2     4 3.166667
#> 16:     2     3 3.666667
#> 17:     2     4 3.333333
#> 18:     2     5 3.666667
#> 19:     2     3 4.333333
#> 20:     2     4 3.833333

Created on 2023-11-27 with reprex v2.0.2

like image 287
Giovanni Colitti Avatar asked Oct 19 '25 11:10

Giovanni Colitti


2 Answers

Probably not the optimal way (I would look into Rcpp) but you could get a significant speed up using simply frollsum thrice:

shift((frollsum(value, 3) + frollsum(value, 2) + frollsum(value, 1)) / 6)

Note that frollsum(value, 1) could be replaced by value.

Another (seemingly) faster and simpler alternative:

(c(shift(value, 3) + 2 * shift(value, 2) + 3 * shift(value, 1)) / 6,

Benchmarking

set.seed(1)
n = 1000000
groups = 1:1000
DT <- data.table(group = rep(groups, each = n/length(groups)), value = round(runif(n = n, 1, 5)))


bench::mark(
  A = {
    DT[, shift(frollapply(value, k, weighted_average, align = "right", fill = NA)), 
   by = group]
  },
  B = {
    DT[, shift((frollsum(value, 3) + frollsum(value, 2) + value) / 6),
       by = group]
  }
)

#   expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
# 1 A             1.75s    1.75s     0.570    46.7MB     34.2     1    60      1.75s
# 2 B           80.38ms  85.72ms     9.24     77.4MB     12.9     5     7   541.13ms
like image 127
sindri_baldur Avatar answered Oct 21 '25 03:10

sindri_baldur


I did some more benchmarking using frollsum(), frollapply(), and RcppRoll::roll_meanr() at different lag lengths, number of groups, and number of observations. Each method was adapted to accept multiple variables.

RcppRoll::roll_meanr() is the clear winner.

# Using 12 threads
data.table::getDTthreads()
#> [1] 12

# Function to create data
make_data <- function(n, n_groups, seed = 1) {
  set.seed(seed)
  data.table::data.table(
    group = rep(seq(n_groups), each = n / n_groups), 
    value1 = round(runif(n = n, 1, 5)),
    value2 = round(runif(n = n, 1, 6))
  )  
}

# Used inside fapplysum() below
weighted_average <- function(x, weights) {
  sum(x * weights[1:length(x)]) / sum(weights[1:length(x)])
}

# Weighted average using RcppRoll::rollmeanr()
rcpp_roll <- function(dt, weights) {
  dt[, RcppRoll::roll_meanr(as.matrix(.SD), weights = weights) |> 
       as.data.frame() |> 
       data.table::shift(), 
     by = group, 
     .SDcols = c("value1", "value2")]
}

# Weighted average using frollsum()
fsum <- function(dt, weights) {
  dt[
    , 
    {
      x <- purrr::map(
        weights,
        ~data.table::frollsum(.SD, n = .x)
      )
      
      purrr::map(seq(ncol(.SD)), \(i) purrr::map(x, ~.x[[i]])) |> 
        purrr::map(~Reduce(`+`, .x) / sum(weights)) |>
        data.table::shift()
    }
    ,
    .SDcols = c("value1", "value2"), 
    by = group]
}

# Weighted average using sum() and frollapply()
fapplysum <- function(dt, weights) {
  dt[, data.table::frollapply(.SD, length(weights), weighted_average, weights = weights) |> 
       data.table::shift(), 
     .SDcols = c("value1", "value2"),
     by = group]
}

# Create benchmarking template
benchmark <- function(dt, weights, iter = 10) {
  bench::mark(
    filter_gc = FALSE,
    iterations = iter,
    rcpp_roll = rcpp_roll(dt, weights),
    fsum = fsum(dt, weights),
    fapplysum = fapplysum(dt, weights)
  )  
} 

# Create benchmarking data
combos <- expand.grid(
  n = c(1e3, 1e4, 5e4, 7.5e4, 1e5),
  n_groups = c(1, 10, 50),
  k = c(3, 10, 50)
)

combos$weights <-  sapply(combos$k, \(k) 1:k)

data <- purrr::map2(
  combos$n,
  combos$n_groups,
  make_data
)

# Benchmark rolling weighted average functions
results <- purrr::map2(
  .progress = TRUE,
  data,
  combos$weights,
  benchmark
)

# Plot results
library(ggplot2)

results2 <- dplyr::bind_rows(.id = "id", results) |> 
  dplyr::mutate(id = as.numeric(id)) |> 
  dplyr::select(id, expression, median) 


plot_data <- results2 |> 
  dplyr::inner_join(combos |> 
                      dplyr::select(-weights) |> 
                      dplyr::mutate(id = dplyr::row_number()), by = "id") |> 
  # average over different lag lengths and number of groups
  dplyr::summarise(
    .by = c("expression", "n"),
    time_avg = mean(as.numeric(median))
  ) |> 
  dplyr::mutate(expression = expression |> as.character() |> as.factor())

ggplot(data = plot_data) +
  geom_line(aes(x = n, y = time_avg, color = expression)) +
  ggtitle("Comparing execution time for different rolling weighted average functions") +
  ylab("Avg. Time (s)") +
  xlab("Number of Obs.")

Created on 2023-12-09 with reprex v2.0.2

like image 40
Giovanni Colitti Avatar answered Oct 21 '25 03:10

Giovanni Colitti