Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Sum/count between two dates in R

Tags:

date

r

I have a data frame with two columns - one is the Start date the other is the End date. I need to get a new data frame with two columns - a Date column and a column which is the count of observations from the first data frame where the date is between Start and End.

Have:

# Five overlapping day intervals: Start runs 2017-01-01..05, End runs 2017-01-03..07.
dates <- data.frame(
  Start = seq(as.Date("2017/1/1"), by = "day", length.out = 5),
  End   = seq(as.Date("2017/1/3"), by = "day", length.out = 5)
)

 Start        End
2017-01-01 2017-01-03
2017-01-02 2017-01-04
2017-01-03 2017-01-05
2017-01-04 2017-01-06
2017-01-05 2017-01-07

Want:

Date         Count
2017-01-01     1
2017-01-02     2
2017-01-03     3
2017-01-04     3
2017-01-05     3
2017-01-06     2
2017-01-07     1

I could use for loops, but is there a better way to do this in R?

like image 506
dpl136 Avatar asked Dec 11 '22 02:12

dpl136


1 Answer

These base options avoid grouping.

The first option uses logic to avoid Map or reshaping. It is:

  1. Give me all the dates$Start options
  2. Combine with dates$End when the difference between the two isn't 0 (i.e., if it's the same date I shouldn't double count it).
  3. Combine with dates$Start plus sequence() of the offsets where, yet again, the difference between the two isn't 0.
# Expand every [Start, End] interval into the dates it covers -- without
# Map(), grouping, or reshaping -- then tabulate how many intervals cover
# each day. Coerce the difftime to integer once so rep()/sequence() get
# plain counts. Indexing date_diffs with the same mask as Start/End fixes
# a latent length mismatch in the original (which subset the data frame
# but reused the full-length date_diffs inside with()).
date_diffs <- as.integer(dates$End - dates$Start)
multi <- date_diffs > 0  # intervals longer than one day (don't double-count Start == End)
x <- c(dates[['Start']],                       # every interval contributes its Start
       dates[['End']][multi],                  # End only when it differs from Start
       # interior days: each Start repeated (diff - 1) times, offset by 1..(diff - 1)
       rep(dates[['Start']][multi], date_diffs[multi] - 1L) +
         sequence(date_diffs[multi] - 1L))
data.frame(table(x))

The second option Vectorize()s the seq.Date() function. Then it's simply a matter of combining the results.


#or: Vectorize() seq.Date over from/to. SIMPLIFY = FALSE keeps the result a
# list of Date vectors so do.call(c, ...) can concatenate them class-intact.
# (The stray empty argument in the original call -- Vectorize(seq.Date, , ...)
# -- silently passed a missing value into a later formal; removed.)
vec_seq <- Vectorize(seq.Date, vectorize.args = c("from", "to"), SIMPLIFY = FALSE)
# Build one Date sequence per interval (by = 1 day), concatenate them all,
# and tabulate how often each calendar day occurs -- i.e., how many
# intervals cover it.
table(do.call(c, vec_seq(dates[['Start']], dates[['End']], 1)))

        Var1 Freq
1 2017-01-01    1
2 2017-01-02    2
3 2017-01-03    3
4 2017-01-04    3
5 2017-01-05    3
6 2017-01-06    2
7 2017-01-07    1

Performance: For @akrun's base option, I wrapped the table() result in data.frame() so everyone was producing a data.frame of some sort.

# The original data set copied to make 1080 rows
Unit: milliseconds
             expr       min         lq       mean     median        uq       max neval
  tmfmnk_complete 1629.3048 1647.52845 1680.82496 1664.07245 1697.4511 1828.4093    20
    tmfmnk_lubrid 6882.1404 6959.20810 7018.69083 7002.27455 7064.0898 7276.2349    20
     M_M_data.tab  103.4166  103.99925  108.33817  107.95715  108.6591  134.9388    20
 Ronak_stack_base  131.0364  134.23370  137.46651  137.32235  141.4388  144.5971    20
      akrun_purrr  133.4917  136.89080  138.92787  138.44575  140.7778  147.4172    20
       akrun_base  130.4179  134.16735  137.21640  136.98225  140.1182  145.8873    20
        cole_base   15.4396   15.60345   16.42558   16.74245   17.1322   17.4201    20
     cole_vec_seq  114.7890  118.44795  130.69493  121.76430  124.0880  309.7418    20
  cole_dt_version   15.9107   15.98035   16.56220   16.11790   17.0216   18.8438    20

#Original data set copied to make  180 rows:

Unit: milliseconds
             expr       min         lq        mean     median         uq       max neval
  tmfmnk_complete  275.6845  279.06390  281.871350  281.95420  284.16025  287.5412    20
    tmfmnk_lubrid 1136.1109 1161.35915 1176.073630 1169.81685 1176.87130 1277.6444    20
     M_M_data.tab   19.0258   19.33070   19.766890   19.45450   19.63410   24.7390    20
 Ronak_stack_base   22.2327   22.56530   23.234095   22.85260   23.20790   27.0589    20
      akrun_purrr   27.8797   28.50225   29.146325   28.71840   29.11915   33.3277    20
       akrun_base   22.3477   22.61135   23.370780   22.81920   23.41300   28.6941    20
        cole_base    3.4258    3.50735    3.642605    3.62470    3.67595    3.9780    20
     cole_vec_seq   19.9366   20.08345   21.359275   20.17250   22.48055   25.7780    20
  cole_dt_version    3.9992    4.09905    4.207690    4.16135    4.28265    4.5052    20

# Original dataset copied to make 30 rows

Unit: milliseconds
             expr      min        lq       mean    median        uq      max neval
  tmfmnk_complete  51.2437  52.16495  54.524465  52.55520  56.19050  66.9461    20
    tmfmnk_lubrid 192.1206 196.99550 198.501640 197.64815 201.42050 203.7031    20
     M_M_data.tab   4.9511   5.05215   5.215670   5.19315   5.33075   5.7740    20
 Ronak_stack_base   4.3609   4.51110   4.995405   4.54885   4.79490   8.8183    20
      akrun_purrr  10.9024  10.96420  11.622235  11.07575  11.58300  15.7751    20
       akrun_base   4.4919   4.55905   4.843730   4.60825   4.73760   8.4334    20
        cole_base   1.4225   1.48635   1.738995   1.58685   1.60780   5.2324    20
     cole_vec_seq   4.0648   4.16095   4.318665   4.24445   4.48420   4.7344    20
  cole_dt_version   1.9733   2.06385   2.132040   2.13965   2.18945   2.3612    20

#Original 5 row dataset

Unit: milliseconds
             expr     min       lq      mean   median       uq     max neval
  tmfmnk_complete 14.7549 14.90780 15.463195 15.10195 15.62030 18.9115    20
    tmfmnk_lubrid 37.2571 37.58240 41.583090 38.18540 40.57435 86.6058    20
     M_M_data.tab  2.6235  2.85145  3.037975  2.90815  2.97045  5.3476    20
 Ronak_stack_base  1.3305  1.38490  1.465170  1.49175  1.53355  1.5978    20
      akrun_purrr  7.7036  7.86260  8.212875  7.98790  8.18055 11.7898    20
       akrun_base  1.4046  1.43715  1.501945  1.51890  1.56545  1.6176    20
        cole_base  1.0560  1.09905  1.169260  1.16010  1.21595  1.3601    20
     cole_vec_seq  1.3547  1.40685  1.452515  1.45645  1.51385  1.5328    20
  cole_dt_version  1.5662  1.70555  1.813365  1.78930  1.84720  2.5267    20

Code for reference:

# Packages exercised by the competing solutions benchmarked below.
library(data.table)
library(dplyr)
library(purrr)
library(tidyverse)
library(microbenchmark)
library(lubridate)

# Five overlapping [Start, End] day intervals (the question's example data).
dates <- data.frame(
  Start = seq(as.Date("2017/1/1"), by = "day", length.out = 5),
  End   = seq(as.Date("2017/1/3"), by = "day", length.out = 5)
)

# Scale the data up -- repeat this as many times as you want. The
# data.table copy is made once, AFTER scaling (the original also built it
# before scaling, a dead assignment that was immediately overwritten).
dates <- rbind(dates, dates, dates, dates, dates, dates)
dates_dt <- as.data.table(dates)

# Vectorized seq.Date; SIMPLIFY = FALSE keeps a list of Date vectors.
vec_seq <- Vectorize(seq.Date, vectorize.args = c("from", "to"), SIMPLIFY = FALSE)

# Benchmark every posted answer on the same data; 20 iterations each.
microbenchmark(
  # tidyr: reshape long, then complete() each row's date range and count.
  tmfmnk_complete = {
    dates %>%
      rowid_to_column() %>%
      gather(var, Date, -rowid) %>%
      group_by(rowid) %>%
      complete(Date = seq(min(Date), max(Date), by = "1 day")) %>%
      ungroup() %>%
      count(Date)
  }
  # lubridate: rowwise interval length, expand into a list-column, unnest, count.
  , tmfmnk_lubrid = {
    dates %>%
      rowwise() %>%
      mutate(Date = interval(Start, End)/days(1),
             Date = list(Start + days(0:Date))) %>%
      ungroup() %>%
      unnest() %>%
      count(Date)
  }
  # data.table: expand each row to its covered dates, then count by Date.
  , M_M_data.tab = {
    dates_dt[ ,.(Date = seq(Start, End, by = "day")), 
      by = 1:nrow(dates_dt)][,
                          .(count = .N), by = Date]
  }
  # base: Map seq() over Start/End pairs, concatenate, table, stack.
  , Ronak_stack_base = {
    stack(table(do.call(c, Map(seq, dates$Start, dates$End, by = "1 day"))))
  }
  # purrr: map2 seq() into a list-column, unnest, count.
  , akrun_purrr = {
    dates %>%
      transmute(Date = map2(Start, End, seq, by = "1 day")) %>%
      unnest(Date) %>%
      count(Date)
  }
  # base: do.call(Map, ...) over the unnamed columns, then table the dates.
  , akrun_base = {
    lst1 <- do.call(Map, c(f = seq,  unname(dates), by = "1 day"))
    data.frame(table(do.call(c, lst1)))
  }
  # base arithmetic (this answer's first option): Starts + Ends + interior days.
  , cole_base = {
    date_diffs <- dates$End - dates$Start
    x <- c(dates[['Start']],
           with(subset(dates, subset = date_diffs > 0)
                ,c(End, rep(Start, date_diffs - 1) + sequence(date_diffs-1))
           ))
    data.frame(table(x))
  }
  # Vectorize()d seq.Date (this answer's second option).
  , cole_vec_seq = {
    data.frame(table(do.call(c, vec_seq(dates[['Start']], dates[['End']], 1))))
  }
  # data.table flavor of the arithmetic approach.
  , cole_dt_version = {
    date_diffs <- dates$End - dates$Start
    dates_dt[date_diffs > 0, data.frame(table({diff_sub = End - Start -1; c(dates_dt[['Start']], End, rep(Start, diff_sub) + sequence(diff_sub))}))]
  }
, times = 20
  )
like image 184
Cole Avatar answered Dec 12 '22 14:12

Cole