I have a data frame with two columns: a Start date and an End date. I need a new data frame with two columns, Date and Count, where Count is the number of rows in the first data frame whose range (Start through End, inclusive) contains that Date.
Have:
dates <- data.frame(Start = seq(as.Date("2017/1/1"), by = "day", length.out = 5),
                    End   = seq(as.Date("2017/1/3"), by = "day", length.out = 5))
Start End
2017-01-01 2017-01-03
2017-01-02 2017-01-04
2017-01-03 2017-01-05
2017-01-04 2017-01-06
2017-01-05 2017-01-07
Want:
Date Count
2017-01-01 1
2017-01-02 2
2017-01-03 3
2017-01-04 3
2017-01-05 3
2017-01-06 2
2017-01-07 1
I could use for loops, but is there a better way to do this in R?
These base options avoid grouping.

The first option uses logic to avoid Map or reshaping. It concatenates dates$Start; dates$End wherever the difference between the two isn't 0 (i.e., if Start and End are the same date it shouldn't be double counted); and dates$Start plus a sequence of day offsets where, again, the difference isn't 0. Tabulating that combined vector gives the counts:

date_diffs <- dates$End - dates$Start
x <- c(dates[['Start']],
       with(subset(dates, subset = date_diffs > 0),
            c(End, rep(Start, date_diffs - 1) + sequence(date_diffs - 1))))
data.frame(table(x))
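To see the pieces at work on the example data: every End minus Start gap here is 2 days, so subset() keeps all rows, and the three components are the left endpoints, the right endpoints, and the interior dates. A quick illustration, not part of the timed code:

date_diffs <- dates$End - dates$Start                        # all equal to 2 days here
dates$Start                                                  # 2017-01-01 ... 2017-01-05 (left endpoints)
dates$End                                                    # 2017-01-03 ... 2017-01-07 (right endpoints)
rep(dates$Start, date_diffs - 1) + sequence(date_diffs - 1)  # 2017-01-02 ... 2017-01-06 (interior dates)
# Concatenating all 15 dates and tabulating reproduces the Count column shown above.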
The second option Vectorize()s the seq.Date() function; then it's simply a matter of combining the results.

# or
vec_seq <- Vectorize(seq.Date, vectorize.args = c("from", "to"), SIMPLIFY = FALSE)
data.frame(table(do.call(c, vec_seq(dates[['Start']], dates[['End']], 1))))
Var1 Freq
1 2017-01-01 1
2 2017-01-02 2
3 2017-01-03 3
4 2017-01-04 3
5 2017-01-05 3
6 2017-01-06 2
7 2017-01-07 1
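Both options return table()-style output: a factor Var1 plus an integer Freq. If the exact Date/Count layout from the question is needed, a small cleanup along these lines should do it (illustrative, reusing the vec_seq result):

res <- data.frame(table(do.call(c, vec_seq(dates[['Start']], dates[['End']], 1))))
names(res) <- c("Date", "Count")   # rename the default Var1/Freq columns
res$Date <- as.Date(res$Date)      # Var1 comes back as a factor; restore the Date class
res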
Performance:
For @akrun's base option, I wrapped the table() result in data.frame() so that every approach produced a data.frame of some sort.
# The original data set copied to make 1080 rows
Unit: milliseconds
expr min lq mean median uq max neval
tmfmnk_complete 1629.3048 1647.52845 1680.82496 1664.07245 1697.4511 1828.4093 20
tmfmnk_lubrid 6882.1404 6959.20810 7018.69083 7002.27455 7064.0898 7276.2349 20
M_M_data.tab 103.4166 103.99925 108.33817 107.95715 108.6591 134.9388 20
Ronak_stack_base 131.0364 134.23370 137.46651 137.32235 141.4388 144.5971 20
akrun_purrr 133.4917 136.89080 138.92787 138.44575 140.7778 147.4172 20
akrun_base 130.4179 134.16735 137.21640 136.98225 140.1182 145.8873 20
cole_base 15.4396 15.60345 16.42558 16.74245 17.1322 17.4201 20
cole_vec_seq 114.7890 118.44795 130.69493 121.76430 124.0880 309.7418 20
cole_dt_version 15.9107 15.98035 16.56220 16.11790 17.0216 18.8438 20
# Original data set copied to make 180 rows
Unit: milliseconds
expr min lq mean median uq max neval
tmfmnk_complete 275.6845 279.06390 281.871350 281.95420 284.16025 287.5412 20
tmfmnk_lubrid 1136.1109 1161.35915 1176.073630 1169.81685 1176.87130 1277.6444 20
M_M_data.tab 19.0258 19.33070 19.766890 19.45450 19.63410 24.7390 20
Ronak_stack_base 22.2327 22.56530 23.234095 22.85260 23.20790 27.0589 20
akrun_purrr 27.8797 28.50225 29.146325 28.71840 29.11915 33.3277 20
akrun_base 22.3477 22.61135 23.370780 22.81920 23.41300 28.6941 20
cole_base 3.4258 3.50735 3.642605 3.62470 3.67595 3.9780 20
cole_vec_seq 19.9366 20.08345 21.359275 20.17250 22.48055 25.7780 20
cole_dt_version 3.9992 4.09905 4.207690 4.16135 4.28265 4.5052 20
# Original dataset copied to make 30 rows
Unit: milliseconds
expr min lq mean median uq max neval
tmfmnk_complete 51.2437 52.16495 54.524465 52.55520 56.19050 66.9461 20
tmfmnk_lubrid 192.1206 196.99550 198.501640 197.64815 201.42050 203.7031 20
M_M_data.tab 4.9511 5.05215 5.215670 5.19315 5.33075 5.7740 20
Ronak_stack_base 4.3609 4.51110 4.995405 4.54885 4.79490 8.8183 20
akrun_purrr 10.9024 10.96420 11.622235 11.07575 11.58300 15.7751 20
akrun_base 4.4919 4.55905 4.843730 4.60825 4.73760 8.4334 20
cole_base 1.4225 1.48635 1.738995 1.58685 1.60780 5.2324 20
cole_vec_seq 4.0648 4.16095 4.318665 4.24445 4.48420 4.7344 20
cole_dt_version 1.9733 2.06385 2.132040 2.13965 2.18945 2.3612 20
# Original 5-row data set
Unit: milliseconds
expr min lq mean median uq max neval
tmfmnk_complete 14.7549 14.90780 15.463195 15.10195 15.62030 18.9115 20
tmfmnk_lubrid 37.2571 37.58240 41.583090 38.18540 40.57435 86.6058 20
M_M_data.tab 2.6235 2.85145 3.037975 2.90815 2.97045 5.3476 20
Ronak_stack_base 1.3305 1.38490 1.465170 1.49175 1.53355 1.5978 20
akrun_purrr 7.7036 7.86260 8.212875 7.98790 8.18055 11.7898 20
akrun_base 1.4046 1.43715 1.501945 1.51890 1.56545 1.6176 20
cole_base 1.0560 1.09905 1.169260 1.16010 1.21595 1.3601 20
cole_vec_seq 1.3547 1.40685 1.452515 1.45645 1.51385 1.5328 20
cole_dt_version 1.5662 1.70555 1.813365 1.78930 1.84720 2.5267 20
Code for reference:
library(data.table)
library(dplyr)
library(purrr)
library(tidyverse)
library(microbenchmark)
library(lubridate)
dates <- data.frame(Start = seq(as.Date("2017/1/1"), by = "day", length.out = 5),
                    End   = seq(as.Date("2017/1/3"), by = "day", length.out = 5))
dates <- rbind(dates, dates, dates, dates, dates, dates) # repeat this as many times as you want
dates_dt <- as.data.table(dates)
vec_seq <- Vectorize(seq.Date, vectorize.args = c("from", "to"), SIMPLIFY = FALSE)
microbenchmark(
tmfmnk_complete = {
dates %>%
rowid_to_column() %>%
gather(var, Date, -rowid) %>%
group_by(rowid) %>%
complete(Date = seq(min(Date), max(Date), by = "1 day")) %>%
ungroup() %>%
count(Date)
}
, tmfmnk_lubrid = {
dates %>%
rowwise() %>%
mutate(Date = interval(Start, End)/days(1),
Date = list(Start + days(0:Date))) %>%
ungroup() %>%
unnest() %>%
count(Date)
}
, M_M_data.tab = {
dates_dt[ ,.(Date = seq(Start, End, by = "day")),
by = 1:nrow(dates_dt)][,
.(count = .N), by = Date]
}
, Ronak_stack_base = {
stack(table(do.call(c, Map(seq, dates$Start, dates$End, by = "1 day"))))
}
, akrun_purrr = {
dates %>%
transmute(Date = map2(Start, End, seq, by = "1 day")) %>%
unnest(Date) %>%
count(Date)
}
, akrun_base = {
lst1 <- do.call(Map, c(f = seq, unname(dates), by = "1 day"))
data.frame(table(do.call(c, lst1)))
}
, cole_base = {
date_diffs <- dates$End - dates$Start
x <- c(dates[['Start']],
with(subset(dates, subset = date_diffs > 0),
     c(End, rep(Start, date_diffs - 1) + sequence(date_diffs - 1))))
data.frame(table(x))
}
, cole_vec_seq = {
data.frame(table(do.call(c, vec_seq(dates[['Start']], dates[['End']], 1))))
}
, cole_dt_version = {
date_diffs <- dates$End - dates$Start
dates_dt[date_diffs > 0,
         data.frame(table({
           diff_sub <- End - Start - 1
           c(dates_dt[['Start']], End, rep(Start, diff_sub) + sequence(diff_sub))
         }))]
}
, times = 20
)
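As a sanity check, one can confirm the approaches agree on the counts before timing them; for example, comparing the base Map approach with the data.table one (a sketch, assuming the dates and dates_dt objects defined above):

res_map <- data.frame(table(do.call(c, Map(seq, dates$Start, dates$End, by = "1 day"))))
res_dt  <- dates_dt[, .(Date = seq(Start, End, by = "day")),
                    by = 1:nrow(dates_dt)][, .(Count = .N), by = Date]
all.equal(as.integer(res_map$Freq), res_dt[order(Date)]$Count)   # should be TRUE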