I have a long dataset with columns representing start and stop times, and I want to drop a row if it overlaps with another and has a higher priority (eg 1 is the highest priority). My example data is
library(tidyverse)
library(lubridate)
times_df <- tibble(start = as_datetime(c("2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45")),
stop = as_datetime(c("2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10")), priority = c(5,3,4,3,4))
The way I've come up with attacks the problem backwards by finding the overlaps with a higher priority value and then using an anti_join
to remove them from the original dataframe. This code doesn't work if there are three periods overlapping the same timepoint and I'm sure there's a more efficient and functional way to do this.
dropOverlaps <- function(df) {
drops <- df %>%
filter(stop > lead(start) | lag(stop) > start) %>%
mutate(group = ({seq(1, nrow(.)/2)} %>%
rep(each=2))) %>%
group_by(group) %>%
filter(priority == max(priority))
anti_join(df, drops)
}
dropOverlaps(times_df)
#> Joining, by = c("start", "stop", "priority")
#> # A tibble: 3 x 3
#> start stop priority
#> <dttm> <dttm> <dbl>
#> 1 2019-10-05 14:05:25 2019-10-05 14:19:20 5
#> 2 2019-10-05 17:30:20 2019-10-05 17:45:15 3
#> 3 2019-10-06 04:43:55 2019-10-06 04:59:00 3
Can anyone help me get the same output but with a cleaner function? Bonus if it can handle an input with three or more time periods that all overlap.
I've got a helper function that groups overlapping data/time data using the igraph package (it can include an overlap buffer, i.e. terminus are within 1 minute ...)
I used it to group your data based on intervals in lubridate, then do some data-wrangling to get only the top priority entry from overlapping times.
I'm not sure how well it will scale.
#### library ----
library(dplyr)
library(lubridate)
library(igraph)
#### data ----
times_df <- tibble(start = as_datetime(c("2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45")),
stop = as_datetime(c("2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10")), priority = c(5,3,4,3,4))
#### group_interval function ----
# buffer to take a form similar to: days(1), weeks(2), etc.
group_interval <- function(start, end, buffer = 0) {
dat <- tibble(rid = 1:length(start),
start = start,
end = end,
intervals = case_when(!is.na(start) & !is.na(end) ~ interval(start, end),
is.na(start) ~ interval(end, end),
is.na(end) ~ interval(start, start),
TRUE ~ interval(NA, NA)))
# apply buffer period to intervals
int_start(dat$intervals) <- int_start(dat$intervals) - buffer + seconds(0.01)
int_end(dat$intervals) <- int_end(dat$intervals) + buffer - seconds(0.01)
df_overlap <- bind_cols(
expand.grid(dat$rid, dat$rid), # make a 2 col table with every combination of id numbers
expand.grid(dat$intervals, dat$intervals)) %>% # make a combination of every interval
mutate(overlap = int_overlaps(.data$Var11, .data$Var21)) %>% # determine if intervals overlap
rename("row" = "Var1", "col" = "Var2")
# Find groups via graph theory See igraph package
dat_graph <- graph_from_data_frame(filter(df_overlap, overlap) %>% select(row, col))
groups <- components(dat_graph)$membership[df_overlap$row]
# create a 2 column df with row (index) and group number, arrange on row number and return distinct values
df_groups <- tibble(row = as.integer(names(groups)), group = groups) %>%
unique()
# returns
left_join(select(dat, rid), df_groups, by = c("rid" = "row"))$group
}
#### data munging ----
mutate(times_df, group = group_interval(start, stop)) %>%
group_by(group) %>%
top_n(1, desc(priority)) %>% # not sure why desc is needed, but top_n was giving the lower
ungroup() %>%
select(-group)
Which gives:
# A tibble: 3 x 3
start stop priority
<dttm> <dttm> <dbl>
1 2019-10-05 14:05:25 2019-10-05 14:19:20 5
2 2019-10-05 17:30:20 2019-10-05 17:45:15 3
3 2019-10-06 04:43:55 2019-10-06 04:59:00 3
Here is a data.table
solution using foverlaps
to detect the overlapping records (as already mentioned by @GenesRus).
The overlapping records are assigned to groups to filter the record with max. priority in the group. I added two more records to your example data, to show that this procedure is also working for three or more overlapping records:
Edit: I modified and translated @pgcudahy's solution to data.table
which gives even faster code:
library(data.table)
library(lubridate)
times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-06 04:53:47"
)
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-06 05:07:12"
)
),
priority = c(5, 3, 4, 3, 4, 5, 6)
)
resultDT <- setDT(times_df, key="start")[!(stop >= shift(start, type="lead", fill = TRUE) & priority > shift(priority, type="lead", fill = TRUE)) &
!(start <= shift(stop, type="lag", fill = FALSE) & priority > shift(priority, type="lag", fill = TRUE))]
# old approach ------------------------------------------------------------
# times_dt <- as.data.table(times_df)
# setkey(times_dt, start, stop)[, index := .I]
# overlaps_dt <- foverlaps(times_dt, times_dt, type = "any", which = TRUE)[xid != yid][, group := fifelse(xid > yid, yes = paste0(yid, "_", xid), no = paste0(xid, "_", yid))]
# overlaps_merged <- merge(times_dt, overlaps_dt, by.x = "index", by.y = "xid")[, .(delete_index = index[priority == max(priority)]), by = "group"]
# result_dt <- times_dt[!unique(overlaps_merged$delete_index)][, index := NULL]
For further details please see ?foverlaps
- There are some more useful features implemented to control what is considered a overlap such as maxgap
, minoverlap
or type
(any, within, start, end and equal).
Update - new benchmark
Unit: microseconds
expr min lq mean median uq max neval
Paul 25572.550 26105.2710 30183.930 26514.342 29614.272 153810.600 100
MKa 5100.447 5276.8350 6508.333 5401.275 5832.270 23137.879 100
pgcudahy 3330.243 3474.4345 4284.640 3556.802 3748.203 21241.260 100
ismirsehregal 711.084 913.3475 1144.829 1013.096 1433.427 2316.159 100
Benchmark code:
#### library ----
library(dplyr)
library(lubridate)
library(igraph)
library(data.table)
library(microbenchmark)
#### data ----
times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-06 04:53:47"
)
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-06 05:07:12"
)
),
priority = c(5, 3, 4, 3, 4, 5, 6)
)
times_tib <- as_tibble(times_df)
times_dt <- as.data.table(times_df)
#### group_interval function ----
# buffer to take a form similar to: days(1), weeks(2), etc.
group_interval <- function(start, end, buffer = 0) {
dat <- tibble(rid = 1:length(start),
start = start,
end = end,
intervals = case_when(!is.na(start) & !is.na(end) ~ interval(start, end),
is.na(start) ~ interval(end, end),
is.na(end) ~ interval(start, start),
TRUE ~ interval(NA, NA)))
# apply buffer period to intervals
int_start(dat$intervals) <- int_start(dat$intervals) - buffer + seconds(0.01)
int_end(dat$intervals) <- int_end(dat$intervals) + buffer - seconds(0.01)
df_overlap <- bind_cols(
expand.grid(dat$rid, dat$rid), # make a 2 col table with every combination of id numbers
expand.grid(dat$intervals, dat$intervals)) %>% # make a combination of every interval
mutate(overlap = int_overlaps(.data$Var11, .data$Var21)) %>% # determine if intervals overlap
rename("row" = "Var1", "col" = "Var2")
# Find groups via graph theory See igraph package
dat_graph <- graph_from_data_frame(filter(df_overlap, overlap) %>% select(row, col))
groups <- components(dat_graph)$membership[df_overlap$row]
# create a 2 column df with row (index) and group number, arrange on row number and return distinct values
df_groups <- tibble(row = as.integer(names(groups)), group = groups) %>%
unique()
# returns
left_join(select(dat, rid), df_groups, by = c("rid" = "row"))$group
}
#### benchmark ----
library(igraph)
library(data.table)
library(dplyr)
library(lubridate)
library(microbenchmark)
df_Paul <- df_MKa <- df_pgcudahy <- df_ismirsehregal <- times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-07 06:00:00",
"2019-10-07 06:10:00",
"2019-10-07 06:20:00",
"2019-10-08 06:00:00",
"2019-10-08 06:10:00",
"2019-10-08 06:20:00",
"2019-10-09 03:00:00",
"2019-10-09 03:10:00",
"2019-10-10 03:00:00",
"2019-10-10 03:10:00",
"2019-10-11 05:00:00",
"2019-10-11 05:00:00")
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-07 06:18:00",
"2019-10-07 06:28:00",
"2019-10-07 06:38:00",
"2019-10-08 06:18:00",
"2019-10-08 06:28:00",
"2019-10-08 06:38:00",
"2019-10-09 03:30:00",
"2019-10-09 03:20:00",
"2019-10-10 03:30:00",
"2019-10-10 03:20:00",
"2019-10-11 05:40:00",
"2019-10-11 05:40:00")
),
priority = c(5, 3, 4, 3, 4, 5, 4, 3, 4, 3, 4, 3, 1, 2, 2, 1, 3, 4)
)
benchmarks <- microbenchmark(Paul = {
group_interval <- function(start, end, buffer = 0) {
dat <- tibble(rid = 1:length(start),
start = start,
end = end,
intervals = case_when(!is.na(start) & !is.na(end) ~ interval(start, end),
is.na(start) ~ interval(end, end),
is.na(end) ~ interval(start, start),
TRUE ~ interval(NA, NA)))
int_start(dat$intervals) <- int_start(dat$intervals) - buffer + seconds(0.01)
int_end(dat$intervals) <- int_end(dat$intervals) + buffer - seconds(0.01)
df_overlap <- bind_cols(
expand.grid(dat$rid, dat$rid), # make a 2 col table with every combination of id numbers
expand.grid(dat$intervals, dat$intervals)) %>% # make a combination of every interval
mutate(overlap = int_overlaps(.data$Var11, .data$Var21)) %>% # determine if intervals overlap
rename("row" = "Var1", "col" = "Var2")
dat_graph <- graph_from_data_frame(filter(df_overlap, overlap) %>% select(row, col))
groups <- components(dat_graph)$membership[df_overlap$row]
df_groups <- tibble(row = as.integer(names(groups)), group = groups) %>%
unique()
left_join(select(dat, rid), df_groups, by = c("rid" = "row"))$group
}
times_tib <- as_tibble(df_Paul)
mutate(times_tib, group = group_interval(start, stop)) %>%
group_by(group) %>%
top_n(1, desc(priority)) %>%
ungroup() %>%
select(-group)
},
MKa = {
df_MKa$id <- 1:nrow(df_MKa)
# Create consolidated df which we will use to check if stop date is in between start and stop
my_df <- bind_rows(replicate(n = nrow(df_MKa), expr = df_MKa, simplify = FALSE))
my_df$stop_chk <- rep(df_MKa$stop, each = nrow(df_MKa))
# Flag if stop date sits in between start and stop
my_df$chk <- my_df$stop_chk >= my_df$start & my_df$stop_chk <= my_df$stop
my_df$chk_id <- df_MKa[match(my_df$stop_chk, df_MKa$stop), "id"]
# Using igrpah to cluster ids to create unique groups
# this will identify any overlapping groups
library(igraph)
g <- graph.data.frame(my_df[my_df$chk == TRUE, c("id", "chk_id")])
df_g <- data.frame(clusters(g)$membership)
df_g$chk_id <- row.names(df_g)
# copy the unique groups to the df
my_df$new_id <- df_g[match(my_df$chk_id, df_g$chk_id), "clusters.g..membership"]
my_df %>%
filter(chk == TRUE) %>%
arrange(priority) %>%
filter(!duplicated(new_id)) %>%
select(start, stop, priority) %>%
arrange(start)
}, pgcudahy = {
df_pgcudahy %>%
arrange(start) %>%
mutate(remove1 = ifelse((stop >= lead(start, default=FALSE)) &
(priority > lead(priority, default=(max(priority) + 1))), TRUE, FALSE)) %>%
mutate(remove2 = ifelse((start <= lag(stop, default=FALSE)) &
(priority > lag(priority, default=(max(priority) + 1))), TRUE, FALSE)) %>%
filter(remove1 == FALSE & remove2 == FALSE) %>%
select(1:3)
}, ismirsehregal = {
setDT(df_ismirsehregal, key="start")[!(stop >= shift(start, type="lead", fill = TRUE) & priority > shift(priority, type="lead", fill = TRUE)) &
!(start <= shift(stop, type="lag", fill = FALSE) & priority > shift(priority, type="lag", fill = TRUE))]
})
benchmarks
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With