R dplyr join by range or virtual column

Question

I want to join two tibbles by a range or a virtual column. but it seems the by - parameter just allow to handle chr oder vector(chr) of existing column names.

In my example I have a tibble d with a column value, and a tibble r with a from and a to column.

d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])

> d
# A tibble: 26 x 1
   value
   <dbl>
 1   1.0
 2   1.2
 3   1.4
 4   1.6
 5   1.8
 6   2.0
 7   2.2
 8   2.4
 9   2.6
10   2.8
# ... with 16 more rows

> r
# A tibble: 6 x 3
   from    to class
  <int> <dbl> <chr>
1     1     2     A
2     2     3     B
3     3     4     C
4     4     5     D
5     5     6     E
6     6   Inf     F

now I want to join the value column in d within the range of from and to in r:

d %>% inner_join(r, by = "value between from and to")     # >= and <

I can't find a way to do this so decided to join the floor of value in d with the from column in r

d %>% inner_join(r, by = c("floor(value)" = "from"))

of course i can create a second column to solve that:

d %>% 
  mutate(join_value = floor(value)) %>% 
  inner_join(r, by = c("join_value" = "from")) %>% 
  select(value, class)

# A tibble: 26 x 2
   value class
   <dbl> <chr>
 1   1.0     A
 2   1.2     A
 3   1.4     A
 4   1.6     A
 5   1.8     A
 6   2.0     B
 7   2.2     B
 8   2.4     B
 9   2.6     B
10   2.8     B
# ... with 16 more rows

but isn't there a more comfortable way?

Thanks

acylam · Accepted Answer

I don't think inequality joins is implemented in dplyr yet, or it ever will (see this discussion on Join on inequality constraints), but this is a good situation to use an SQL join:

library(tibble)
library(sqldf)

as.tibble(sqldf("select d.value, r.class from d
                join r on d.value >= r.'from' and 
                          d.value < r.'to'"))

Alternatively, if you want to integrate the join into your dplyr chain, you can use fuzzyjoin::fuzzy_join:

library(dplyr)
library(fuzzyjoin)

d %>%
  fuzzy_join(r, by = c("value" = "from", "value" = "to"), 
             match_fun = list(`>=`, `<`)) %>%
  select(value, class)

Result:

# A tibble: 31 x 2
   value class
   <dbl> <chr>
 1   1.0     A
 2   1.2     A
 3   1.4     A
 4   1.6     A
 5   1.8     A
 6   2.0     A
 7   2.0     B
 8   2.2     B
 9   2.4     B
10   2.6     B
# ... with 21 more rows

Notice I added single quotes around from and to since those are reserved words for the SQL language.

WiWeber · Answer

Ok thanks for advices, this was pretty interesting. I finally wrote a function range_join (inspired by @ycw's code) and compared all described solution in view of runtime.

I like fuzzy_join but with only 50k rows in d it needs more than 40sec. Thats too slow.

Here the result with 5k rows in d

library(dplyr)
library(fuzzyjoin)
library(sqldf)

#join by range by @WiWeber
range_join <- function(x, y, value, left, right){
  x_result <- tibble()
  for (y_ in split(y, 1:nrow(y)))
    x_result <-  x_result %>% bind_rows(x[x[[value]] >= y_[[left]] & x[[value]] < y_[[right]],] %>% cbind(y_))
  return(x_result)
}

#dynamic join by @ycw
dynamic_join <- function(d, r){
  d$type <- NA_character_
  for (r_ in split(r, r$type))
    d <- d %>% mutate(type = ifelse(value >= r_$from & value < r_$to, r_$type, type))
  return(d)
}

d <- tibble(value = seq(1,6, by = 0.001), join = TRUE)
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), type = LETTERS[seq(1,6)], join = TRUE)

# @useR sqldf - fast and intuitive but extra library with horrible code
start <- Sys.time()
d2 <- tbl_df(sqldf("select d.value, r.type from d
                join r on d.value >= r.'from' and 
                d.value < r.'to'"))
Sys.time() - start

# @useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
start <- Sys.time()
d2 <- d %>%
  fuzzy_join(r, by = c("value" = "from", "value" = "to"), match_fun = list(`>=`, `<`)) %>%
  select(value, type)
Sys.time() - start


# @jonathande4 cut pretty fast
start <- Sys.time()
d2 <- d
d2$type <- cut(d$value, unique(c(r$from, r$to)), r$type, right = FALSE)
Sys.time() - start

# @WiWeber floor
start <- Sys.time()
d2 <- d %>% 
  mutate(join_value = floor(value)) %>% 
  inner_join(r, by = c("join_value" = "from")) %>% 
  select(value, type)
Sys.time() - start

#  @WiWeber cross join - filter
start <- Sys.time()
d2 <- d %>%
  inner_join(r, by = "join") %>% 
  filter(value >= from, value < to) %>%
  select(value, type)
Sys.time() - start

# @hardik-gupta sapply
start <- Sys.time()
d2 <- d %>%
  mutate(
    type = unlist(sapply(value, function (x) r[which(x >= r$from & x < r$to), "type"]))
  ) %>% 
  select(value, type)
Sys.time() - start

# @ycw re-dynamic join
start <- Sys.time()
d2 <- d %>% dynamic_join(r)
Sys.time() - start

# @WiWeber range_join
start <- Sys.time()
d2 <- d %>% 
  range_join(r, "value", "from", "to") %>%
  select(value, type)
Sys.time() - start

Results:

# @useR sqldf - fast and intuitive but extra library with horrible code
Time difference of 0.06221986 secs

# @useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
Time difference of 4.765595 secs

# @jonathande4 cut pretty fast
Time difference of 0.004637003 secs

# @WiWeber floor
Time difference of 0.02223396 secs

# @WiWeber cross join - filter
Time difference of 0.0201931 secs

# @hardik-gupta sapply
Time difference of 5.166633 secs

# @ycw dynamic join
Time difference of 0.03124094 secs

# @WiWeber range_join
Time difference of 0.02691698 secs

greez WiWeber

detroyejr · Answer

You use the cut function to create a "class" in object d and then use a left join.

d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])

d[["class"]] <- cut(d[["value"]], c(0,2,3,4,5,6,Inf), c('A',"B", "C", "D", "E", "F"), right = FALSE)
d <- left_join(d, r)

To get the right buckets, you just need to work with the cut function to get what you want.

R dplyr join by range or virtual column

Tags:

r

dplyr

WiWeber

3 Answers

acylam

WiWeber

detroyejr

Recent Activity

Donate For Us

R dplyr join by range or virtual column

Tags:

r

dplyr

WiWeber

3 Answers

acylam

WiWeber

detroyejr

Related questions

Recent Activity

Donate For Us