I want to get the rolling 7-day sum by ID. Suppose my data looks like this:
data<-as.data.frame(matrix(NA,42,3))
data$V1<-seq(as.Date("2014-05-01"),as.Date("2014-09-01"),by=3)
data$V2<-rep(1:6,7)
data$V3<-rep(c(1,2),21)
colnames(data)<-c("Date","USD","ID")
Date USD ID
1 2014-05-01 1 1
2 2014-05-04 2 2
3 2014-05-07 3 1
4 2014-05-10 4 2
5 2014-05-13 5 1
6 2014-05-16 6 2
7 2014-05-19 1 1
8 2014-05-22 2 2
9 2014-05-25 3 1
10 2014-05-28 4 2
How can I add a new column that will contain the rolling 7-day sum by ID?
If your data is big, you might want to check out this solution which uses data.table
. It is pretty fast. If you need more speed, you can always change mapply
to mcmapply
and use multiple cores.
#Load data.table and convert to data.table object
require(data.table)
setDT(data)[,ID2:=.GRP,by=c("ID")]
#Build reference table
Ref <- data[,list(Compare_Value=list(I(USD)),Compare_Date=list(I(Date))), by=c("ID2")]
#Use mapply to get last seven days of value by id
data[,Roll.Val := mapply(RD = Date,NUM=ID2, function(RD, NUM) {
d <- as.numeric(Ref$Compare_Date[[NUM]] - RD)
sum((d <= 0 & d >= -7)*Ref$Compare_Value[[NUM]])})]
1) Assuming you mean every successive overlapping 7 rows for that ID:
library(zoo)
transform(data, roll = ave(USD, ID, FUN = function(x) rollsumr(x, 7, fill = NA)))
2) If you really did mean 7 days and not 7 rows then try this:
library(zoo)
z <- read.zoo(data)
z0 <- merge(z, zoo(, seq(start(z), end(z), "day")), fill = 0) # expand to daily
roll <- function(x) rollsumr(x, 7, fill = NA)
transform(data, roll = ave(z0$USD, z0$ID, FUN = roll)[time(z)])
Updated Added (2) and made some improvements.
Dataset provided by OP does not expose the complexity of the task. In terms of addressing OP question so far only Mike's answer was the correct one.
In fact for a 8 rolling days, instead of 7 rolling days, due to d <= 0 & d >= -7
.zoo
solution by @G. Grothendieck is almost valid, only if merge
would be made to each group of ID
.
Below second data.table solution, this time valid results, using dev RcppRoll which allows na.rm=TRUE
.
And slightly formatted Mike's solution output.
data<-as.data.frame(matrix(NA,42,3))
data$V1<-seq(as.Date("2014-05-01"),as.Date("2014-09-01"),by=3)
data$V2<-rep(1:6,7)
data$V3<-rep(c(1,2),21)
colnames(data)<-c("Date","USD","ID")
library(microbenchmark)
library(RcppRoll) # install_github("kevinushey/RcppRoll")
library(data.table) # install_github("Rdatatable/data.table")
correct_jan_dt = function(n, partial=TRUE){
DT = as.data.table(data) # this can be speedup by setDT()
date.range = DT[,range(Date)]
all.dates = seq.Date(date.range[1],date.range[2],by=1)
setkey(DT,ID,Date)
r = DT[CJ(unique(ID),all.dates)][, c("roll") := as.integer(roll_sumr(USD, n, normalize = FALSE, na.rm = TRUE)), by="ID"][!is.na(USD)]
# This could be simplified when `partial` arg will be implemented in [kevinushey/RcppRoll](https://github.com/kevinushey/RcppRoll)
if(isTRUE(partial)){
r[is.na(roll), roll := cumsum(USD), by="ID"][]
}
return(r[order(Date,ID)])
}
correct_mike_dt = function(){
data = as.data.table(data)[,ID2:=.GRP,by=c("ID")]
#Build reference table
Ref <- data[,list(Compare_Value=list(I(USD)),Compare_Date=list(I(Date))), by=c("ID2")]
#Use mapply to get last seven days of value by id
data[, c("roll") := mapply(RD = Date,NUM=ID2, function(RD, NUM){
d <- as.numeric(Ref$Compare_Date[[NUM]] - RD)
sum((d <= 0 & d >= -7)*Ref$Compare_Value[[NUM]])})][,ID2:=NULL][]
}
identical(correct_mike_dt(), correct_jan_dt(n=8,partial=TRUE))
# [1] TRUE
microbenchmark(unit="relative", times=5L, correct_mike_dt(), correct_jan_dt(8))
# Unit: relative
# expr min lq mean median uq max neval
# correct_mike_dt() 274.0699 273.9892 267.2886 266.6009 266.2254 256.7296 5
# correct_jan_dt(8) 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 5
Looking forward for update from @Khashaa.
Timing on a bigger (still very tiny) dataset, 5439 rows:
library(zoo)
library(data.table)
library(dplyr)
library(RcppRoll)
library(microbenchmark)
data<-as.data.frame(matrix(NA,5439,3))
data$V1<-seq(as.Date("1970-01-01"),as.Date("2014-09-01"),by=3)
data$V2<-sample(1:6,5439,TRUE)
data$V3<-sample(c(1,2),5439,TRUE)
colnames(data)<-c("Date","USD","ID")
zoo_f = function(){
z <- read.zoo(data)
z0 <- merge(z, zoo(, seq(start(z), end(z), "day")), fill = 0) # expand to daily
roll <- function(x) rollsumr(x, 7, fill = NA)
transform(data, roll = ave(z0$USD, z0$ID, FUN = roll)[time(z)])
}
dt_f = function(){
DT = as.data.table(data) # this can be speedup by setDT()
date.range = DT[,range(Date)]
all.dates = seq.Date(date.range[1],date.range[2],by=1)
setkey(DT,Date)
DT[.(all.dates)
][order(Date), c("roll") := rowSums(setDT(shift(USD, 0:6, NA, "lag")),na.rm=FALSE), by="ID"
][!is.na(ID)]
}
dp_f = function(){
data %>% group_by(ID) %>%
mutate(roll=roll_sum(c(rep(NA,6), USD), 7))
}
dt2_f = function(){
# this can be speedup by setDT()
as.data.table(data)[, c("roll") := roll_sum(c(rep(NA,6), USD), 7), by="ID"][]
}
identical(as.data.table(zoo_f()),dt_f())
# [1] TRUE
identical(setDT(as.data.frame(dp_f())),dt_f())
# [1] TRUE
identical(dt2_f(),dt_f())
# [1] TRUE
microbenchmark(unit="relative", times=20L, zoo_f(), dt_f(), dp_f(), dt2_f())
# Unit: relative
# expr min lq mean median uq max neval
# zoo_f() 140.331889 141.891917 138.064126 139.381336 136.029019 137.730171 20
# dt_f() 14.917166 14.464199 15.210757 16.898931 16.543811 14.221987 20
# dp_f() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
# dt2_f() 1.536896 1.521983 1.500392 1.518641 1.629916 1.337903 20
Yet I'm not sure if my data.table code is already optimal.
Above functions did not answer OP question. Read the top of post for update. Mike's solution was the correct one.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With