I have two datasets: df1 contains windows of time that represent peak activity by id. These are non-continuous time series, and there are multiple windows (events) per id, i.e. each id has multiple peak activity periods. Below is a reproducible example I've made up; it is not the real data. (NOTE: I updated the data according to the comments below.)
df1<-data.frame(start_date=seq(as.POSIXct("2014-09-04 00:00:00"), by = "hour", length.out = 10),
end_date=seq(as.POSIXct("2014-09-04 05:00:00"), by = "hour", length.out = 10),
values=runif(20,10,50),id=rep(seq(from=1,to=5,by=1),2))
df2 is a set of continuous time series of activity by id. I want to subset date.date for each entry/peak activity in df1 (by id).
date1<-data.frame(date=seq(as.POSIXct("2012-09-04 02:00:00"), by = "hour", length.out = 20), id=1)
date2<-data.frame(date=seq(as.POSIXct("2014-09-03 07:00:00"), by = "hour", length.out = 20),id=2)
date3<-data.frame(date=seq(as.POSIXct("2014-09-04 01:00:00"), by = "hour", length.out = 20),id=3)
df2<-data.frame(date=rbind(date1,date2,date3),values=runif(60,50,90))
The Goal: subset the continuous time series in df2 only between the start_date and end_date in df1 (by id), and keep the values field from each df. There is a somewhat similar question here, but in that case the time period was static and known. I am struggling with how to do this given the multiple events per id.
data.table has the function foverlaps, which does what you need. foverlaps stands for “fast overlap joins”. The function takes two data frames (data.tables in this case) and returns the join.
Both data.tables need start and end columns to calculate the overlaps. Since you only have one date column in df2, I’m just creating a dummy_end column with the same date as date.date in df2.
You can use the options by.x and by.y to indicate the start and end columns (a sketch using by.x is shown after the code below). However, you can also use keys to do so via setkey. The last two elements of the setkey must be the start and end columns. The advantage of using setkey is that you can add additional keys (before start and end) to filter the join further. In the present case, I’ll also set a key for id.
The [, dummy_end := NULL] at the end deletes the dummy_end column.
library(data.table)
dt1 <- data.table(df1)
dt2 <- data.table(df2)
setnames(dt2,"date.id","id") #change name to "id" for easier comparison
dt2[, dummy_end := date.date] # create dummy end date column
setkey(dt1, id, start_date, end_date)
setkey(dt2, id, date.date, dummy_end)
foverlaps(dt2, dt1, nomatch = NULL)[, dummy_end := NULL]
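As mentioned above, the by.x/by.y arguments are an alternative to keying dt2. Here is a minimal sketch of the same call written that way (dt1 still needs its key, since foverlaps requires the y table to be keyed):
# Sketch: same join without keying dt2, passing its join/interval columns via by.x.
# by.y defaults to key(dt1), i.e. c("id", "start_date", "end_date").
foverlaps(dt2, dt1,
          by.x = c("id", "date.date", "dummy_end"),
          nomatch = NULL)[, dummy_end := NULL]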
In terms of performance, foverlaps is marginally faster than dplyr for this particular problem (but still slower than base R). Indeed, you can see below that I reran Paul's microbenchmark to add data.table. However, I like the clean and simple data.table syntax.
DATA and Benchmark
library(dplyr)
library(microbenchmark)
library(data.table)
df1 <- data.frame(start_date=seq(as.POSIXct("2014-09-04 00:00:00"),
by = "hour", length.out = 10),
end_date=seq(as.POSIXct("2014-09-04 05:00:00"),
by = "hour", length.out = 10),
values=runif(20,10,50),id=rep(seq(from=1,to=5,by=1),2))
date1 <-data.frame(date = seq(as.POSIXct("2012-09-04 02:00:00"),
by = "hour",
length.out = 20), id = 1)
date2 <-data.frame(date = seq(as.POSIXct("2014-09-03 07:00:00"),
by = "hour",
length.out = 20),id = 2)
date3 <-data.frame(date = seq(as.POSIXct("2014-09-04 01:00:00"),
by = "hour", length.out = 20),id = 3)
df2 <-data.frame(date = rbind(date1,date2,date3), values = runif(60,50,90))
dt1 <- data.table(df1)
dt2 <- data.table(df2)
setnames(dt2,"date.id","id") #change name to "id" for easier comparison
dt2[, dummy_end := date.date] # create dummy end date column
setkey(dt1, id, start_date, end_date)
setkey(dt2, id, date.date, dummy_end)
dplyr2 <- function(df1, df2) {
df <- left_join(df1, df2, by = c("id" = "date.id")) %>%
group_by(id) %>%
filter(date.date >= start_date &
date.date <= end_date) %>%
select(start_date,
end_date,
x_values = values.x,
y_values = values.y,
id,
date.date) %>%
ungroup()
}
baseR2 <- function(df1, df2) {
df_bR <- merge(df1, df2, by.x = "id", by.y = "date.id")
df_bR <- subset(
df_bR,
subset = df_bR$date.date >= df_bR$start_date &
df_bR$date.date <= df_bR$end_date,
select = c(start_date, end_date, values.x, values.y, id, date.date)
)
}
data.table2 <- function(dt1, dt2) {
foverlaps(dt2, dt1, nomatch = NULL)[, dummy_end := NULL]
}
microbenchmark(baseR = baseR2(df1, df2),
dplyr = dplyr2(df1, df2),
data.table=data.table2(dt1, dt2),
times = 50)
Unit: milliseconds
expr min lq mean median uq max neval
baseR 1.2328 1.3973 1.632302 1.4713 1.5596 7.0549 50
dplyr 8.2126 8.6865 9.628708 8.8531 9.2621 19.5883 50
data.table 6.6931 7.3884 7.974340 7.9406 8.3973 11.0060 50
Your goal is not entirely clear to me, but this is my reading: if the time (ignoring the date) in date.date falls within start_date and end_date, you would like to subset by id.
Here's how I approached this:
library(dplyr)
df1<-data.frame(start_date=seq(as.POSIXct("2014-09-04 00:00:00"), by = "hour", length.out = 10),
end_date=seq(as.POSIXct("2014-09-04 05:00:00"), by = "hour", length.out = 10),
values=runif(20,10,50),id=rep(seq(from=1,to=5,by=1),2))
date1<-data.frame(date=seq(as.POSIXct("2012-10-01 00:00:00"), by = "hour", length.out = 20), id=1)
date2<-data.frame(date=seq(as.POSIXct("2014-10-01 07:00:00"), by = "hour", length.out = 20), id=2)
date3<-data.frame(date=seq(as.POSIXct("2015-10-01 01:00:00"), by = "hour", length.out = 20), id=3)
df2<-data.frame(date=rbind(date1,date2,date3),values=runif(60,50,90))
df <- left_join(df1, df2, by = c("id" = "date.id")) %>%
mutate(date.date.hms = strftime(date.date, format = "%H:%M:%S"),
start_date.hms = strftime(start_date, format = "%H:%M:%S"),
end_date.hms = strftime(end_date, format = "%H:%M:%S")) %>%
mutate(date.date.hms = as.POSIXct(date.date.hms, format="%H:%M:%S"),
start_date.hms = as.POSIXct(start_date.hms, format="%H:%M:%S"),
end_date.hms = as.POSIXct(end_date.hms, format="%H:%M:%S")) %>%
group_by(id) %>%
filter(date.date.hms >= start_date.hms & date.date.hms <= end_date.hms) %>%
select(start_date, end_date, x_values = values.x, y_values = values.y, id, date.date) %>%
ungroup()
This results in the following data frame:
> df
# A tibble: 62 x 6
start_date end_date x_values y_values id date.date
<dttm> <dttm> <dbl> <dbl> <dbl> <dttm>
1 2014-09-04 00:00:00 2014-09-04 05:00:00 31.5 77.5 1 2012-10-01 00:00:00
2 2014-09-04 00:00:00 2014-09-04 05:00:00 31.5 54.5 1 2012-10-01 01:00:00
3 2014-09-04 00:00:00 2014-09-04 05:00:00 31.5 70.3 1 2012-10-01 02:00:00
4 2014-09-04 00:00:00 2014-09-04 05:00:00 31.5 85.5 1 2012-10-01 03:00:00
5 2014-09-04 00:00:00 2014-09-04 05:00:00 31.5 82.2 1 2012-10-01 04:00:00
6 2014-09-04 00:00:00 2014-09-04 05:00:00 31.5 57.4 1 2012-10-01 05:00:00
7 2014-09-04 01:00:00 2014-09-04 06:00:00 37.0 78.8 2 2014-10-02 01:00:00
8 2014-09-04 01:00:00 2014-09-04 06:00:00 37.0 51.9 2 2014-10-02 02:00:00
9 2014-09-04 02:00:00 2014-09-04 07:00:00 34.1 85.8 3 2015-10-01 02:00:00
10 2014-09-04 02:00:00 2014-09-04 07:00:00 34.1 69.4 3 2015-10-01 03:00:00
My approach was to first join the DFs by id, then split the time information from the date (into the .hms columns) as a string and convert it back to a POSIXct object. This adds today's date to the time, but that's ok if I just want to apply a filter on the time (and not the date). This results in a DF where the records have a date.date TIME within start_date and end_date. It is now easy to subset by the id column.
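As a small, made-up illustration of what that round trip does to a single timestamp:
# Sketch: strftime() keeps only the clock time as a string; as.POSIXct() then
# parses it back, attaching today's date, so comparisons ignore the original date.
x <- as.POSIXct("2012-10-01 03:30:00")
strftime(x, format = "%H:%M:%S")                                   # "03:30:00"
as.POSIXct(strftime(x, format = "%H:%M:%S"), format = "%H:%M:%S")  # today's date + 03:30:00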
Is this what you are after?
UPDATE
LauraR explained that there is overlap in the dates in df1 & df2, and she updated df1 & df2 in her example. With that update I can rewrite the code without the conversions of POSIXct to character and vice versa. It appears that as.POSIXct is a slow operation.
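As a rough, self-contained sketch of that cost (exact numbers will vary by machine), compare a POSIXct -> character -> POSIXct round trip with a direct POSIXct comparison:
library(microbenchmark)
# Sketch: time the round-trip conversion used above versus a plain comparison.
x <- seq(as.POSIXct("2014-09-04 00:00:00"), by = "hour", length.out = 1000)
microbenchmark(
  round_trip = as.POSIXct(strftime(x, format = "%H:%M:%S"), format = "%H:%M:%S"),
  direct     = x >= as.POSIXct("2014-09-04 12:00:00"),
  times = 10
)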
I can now do the following with the code below:
library(dplyr)
library(microbenchmark)
df1 <- data.frame(start_date=seq(as.POSIXct("2014-09-04 00:00:00"), by = "hour", length.out = 10),
end_date=seq(as.POSIXct("2014-09-04 05:00:00"), by = "hour", length.out = 10),
values=runif(20,10,50),id=rep(seq(from=1,to=5,by=1),2))
date1 <-data.frame(date = seq(as.POSIXct("2012-09-04 02:00:00"),
by = "hour",
length.out = 20), id = 1)
date2 <-data.frame(date = seq(as.POSIXct("2014-09-03 07:00:00"),
by = "hour",
length.out = 20),id = 2)
date3 <-data.frame(date = seq(as.POSIXct("2014-09-04 01:00:00"),
by = "hour", length.out = 20), id = 3)
df2 <-data.frame(date = rbind(date1,date2,date3), values = runif(60,50,90))
dplyr2 <- function(df1, df2) {
df <- left_join(df1, df2, by = c("id" = "date.id")) %>%
group_by(id) %>%
filter(date.date >= start_date &
date.date <= end_date) %>%
select(start_date,
end_date,
x_values = values.x,
y_values = values.y,
id,
date.date) %>%
ungroup()
}
baseR2 <- function(df1, df2) {
df_bR <- merge(df1, df2, by.x = "id", by.y = "date.id")
df_bR <- subset(
df_bR,
subset = df_bR$date.date >= df_bR$start_date &
df_bR$date.date <= df_bR$end_date,
select = c(start_date, end_date, values.x, values.y, id, date.date)
)
}
data_baseR <- baseR2(df1, df2)
data_dplyr <- dplyr2(df1, df2)
microbenchmark(baseR = baseR2(df1, df2),
dplyr = dplyr2(df1, df2),
times = 5)
This code is a lot faster than before, and I am sure it will require less memory. Comparing dplyr and base R:
> data_baseR <- baseR2(df1, df2)
> microbenchmark(baseR = baseR2(df1, df2),
+ dplyr = dplyr2(df1, df2),
+ times = 5)
Unit: microseconds
expr min lq mean median uq max neval
baseR 897.5 905.3 1868.66 991.2 1041.0 5508.3 5
dplyr 5755.9 5970.2 6158.88 6277.4 6393.3 6397.6 5
shows that the baseR code runs a lot faster.
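Keep in mind these timings are on a tiny toy data set, where fixed per-call overhead dominates. As a quick sketch (df2_big is just a made-up name here), you could scale df2 up and re-run the benchmark to see whether the ranking holds on more rows:
# Sketch: replicate df2 100 times to get more rows per id, then re-benchmark.
df2_big <- do.call(rbind, replicate(100, df2, simplify = FALSE))
microbenchmark(baseR = baseR2(df1, df2_big),
               dplyr = dplyr2(df1, df2_big),
               times = 5)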