df <- data.frame(group = c("a", "a", "b", "b"),
start = c("2017-05-01", "2019-04-03", "2011-03-03", "2014-05-07"),
end = c("2018-09-01", "2020-04-03", "2012-05-03", "2016-04-02"))
Let's say I have the following df:
group start end
1 a 2017-05-01 2018-09-01
2 a 2019-04-03 2020-04-03
3 b 2011-03-03 2012-05-03
4 b 2014-05-07 2016-04-02
I want to get it into this format, with each record split into starting date and 31/12 of that and subsequent years:
group start end
1 a 2017-05-01 2017-12-31
2 a 2018-01-01 2018-09-01
3 a 2019-04-03 2019-12-31
4 a 2020-01-01 2020-04-03
5 b 2011-03-03 2011-12-31
6 b 2012-01-01 2012-05-03
7 b 2014-05-07 2014-12-31
8 b 2015-01-01 2015-12-31
9 b 2016-01-01 2016-04-02
Any ideas on how to tackle this?
Edit:
My primary concerns are not the date ranges that are within the same year. However, as chinsoon12 pointed out, it would indeed be helpful if the approach could handle them as well, as for instance in this dataset:
df <- data.frame(group = c("a", "a", "b", "b", "c"),
start = c("2017-05-01", "2019-04-03", "2011-03-03", "2014-05-07", "2017-02-01"),
end = c("2018-09-01", "2020-04-03", "2012-05-03", "2016-04-02", "2017-04-05"))
The end result would leave the last line as it was:
group start end
1 a 2017-05-01 2017-12-31
2 a 2018-01-01 2018-09-01
3 a 2019-04-03 2019-12-31
4 a 2020-01-01 2020-04-03
5 b 2011-03-03 2011-12-31
6 b 2012-01-01 2012-05-03
7 b 2014-05-07 2014-12-31
8 b 2015-01-01 2015-12-31
9 b 2016-01-01 2016-04-02
10 c 2017-02-01 2017-04-05
A possible solution with data.table:
library(data.table)
setDT(df)
df[df[, rep(.I, 1 + year(end) - year(start))]
][, `:=` (start = pmax(start[1], as.Date(paste0(year(start[1]) + 0:(.N-1), '-01-01'))),
end = pmin(end[.N], as.Date(paste0(year(end[.N]) - (.N-1):0, '-12-31'))))
, by = .(group, rleid(start))][]
which gives:
group start end 1: a 2017-05-01 2017-12-31 2: a 2018-01-01 2018-09-01 3: a 2019-04-03 2019-12-31 4: a 2020-01-01 2020-04-03 5: b 2011-03-03 2011-12-31 6: b 2012-01-01 2012-05-03 7: b 2014-05-07 2014-12-31 8: b 2015-01-01 2015-12-31 9: b 2016-01-01 2016-04-02 10: c 2017-02-01 2017-04-05
Two alternative solutions with data.table:
# alternative 1:
df[, ri := rowid(group)
][df[, rep(.I, 1 + year(end) - year(start))]
][, `:=` (start = if (.N == 1) start else c(start[1], as.Date(paste0(year(start[1]) + 1:(.N-1), '-01-01') )),
end = if (.N == 1) end else c(as.Date(paste0(year(end[.N]) - (.N-1):1, '-12-31') ), end[.N]))
, by = .(group, ri)][, ri := NULL][]
# alternative 2:
df[, ri := rowid(group)
][df[, rep(.I, 1 + year(end) - year(start))]
][, `:=` (start = pmax(start[1], as.Date(paste0(year(start[1]) + 0:(.N-1), '-01-01'))),
end = pmin(end[.N], as.Date(paste0(year(end[.N]) - (.N-1):0, '-12-31'))))
, by = .(group, ri)][, ri := NULL][]
Used data:
df <- data.frame(group = c("a", "a", "b", "b", "c"),
start = c("2017-05-01", "2019-04-03", "2011-03-03", "2014-05-07", "2017-02-01"),
end = c("2018-09-01", "2020-04-03", "2012-05-03", "2016-04-02", "2017-04-05"))
df[2:3] <- lapply(df[2:3], as.Date)
library(tidyverse)
library(lubridate)
df%>%
mutate(end=as.Date(end),
start=as.Date(start),
diff=Map(":",0,1+year(end)-year(start)-1))%>%
unnest()%>%
mutate(end=pmin(end,as.Date(paste0(year(start)+diff,"-12-31"))),
start=pmax(start,as.Date(paste0(year(start)+diff,"-1-1"))),
diff=NULL)
A tibble: 9 x 3
group start end
<fct> <date> <date>
1 a 2017-05-02 2017-12-31
2 a 2018-01-01 2018-09-01
3 a 2019-04-03 2019-12-31
4 a 2020-01-01 2020-04-03
5 b 2020-01-01 2011-12-31
6 b 2012-01-01 2012-05-03
7 b 2014-05-07 2014-12-31
8 b 2015-01-01 2015-12-31
9 b 2015-01-01 2016-04-02
with the updated data just ran this exact function you will get:
group start end
1 a 2017-05-01 2017-12-31
2 a 2018-01-01 2018-09-01
3 a 2019-04-03 2019-12-31
4 a 2020-01-01 2020-04-03
5 b 2011-03-03 2011-12-31
6 b 2012-01-01 2012-05-03
7 b 2014-05-07 2014-12-31
8 b 2015-01-01 2015-12-31
9 b 2016-01-01 2016-04-02
10 c 2017-02-01 2017-04-05
Here's a no-tidyverse/no-data.table version:
df <- data.frame(group = c("a", "a", "b", "b"),
start = c("2017-05-01", "2019-04-03", "2011-03-03", "2014-05-07"),
end = c("2018-09-01", "2020-04-03", "2012-05-03", "2016-04-02"), stringsAsFactors=FALSE)
# added stringsAsFactors =FALSE to your df for sanity
# reformatting start and end as Date
df$start <- as.Date(df$start)
df$end <- as.Date(df$end)
dfs <- split(df, rownames(df))
# split the data frame by rows
res <- do.call(rbind, lapply(dfs, function(.){
s <- seq(from=.$start, to=.$end, by="day")
# sequence form df$start to df$end, by days
y <- format(s, "%Y")
# years of that sequence
s2 <- as.character(s)
# formatting s as character -- otherwise sapply will get rid of the
# Date class and the result will look as numeric
ys <- split(s2,y)
# split the sequence by years
data.frame(group=.$group, start=sapply(ys, head,1), end = sapply(ys, tail, 1), stringsAsFactors=FALSE)
# take the first and last element from each "sub-vector" of the split sequence
}))
rownames(res) <- NULL # kill the nasty rownames
res
group start end
1 a 2017-05-01 2017-12-31
2 a 2018-01-01 2018-09-01
3 a 2019-04-03 2019-12-31
4 a 2020-01-01 2020-04-03
5 b 2011-03-03 2011-12-31
6 b 2012-01-01 2012-05-03
7 b 2014-05-07 2014-12-31
8 b 2015-01-01 2015-12-31
9 b 2016-01-01 2016-04-02
Notice that the result has start
and end
columns as character
as it was in your original data frame.
I'm sorry for the way base R treats Date (and POSIXct) objects - you never know when they may lose their class and become simple numbers. Here I avoided this "feature" by treating dates as character, except when date operations were needed e.g in creating the sequence of days.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With