fast partial match checking in R (or Python or Julia)

Tags: r, stringr

I have two datasets with names and I need to compare the names across them. I just need to keep the union of the two datasets based on the names. However, a name is still considered 'matched' if it is part of another name, even if it is not a full match, and vice versa. For example, "seb" should match "seb", but also "sebas". I am using str_detect(), but it is too slow. I am wondering if there is any way to speed up this process. I tried some other packages and functions, but nothing really improved the speed. I am open to any R or Python solution.
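To make the rule concrete, here is the substring check in both directions (a small illustration, using str_detect() with fixed() to compare literal strings):

library(stringr)

# A pair counts as a match if either string is a substring of the other
str_detect("sebas", fixed("seb"))  # TRUE: "seb" is contained in "sebas"
str_detect("seb", fixed("sebas"))  # FALSE in this direction
str_detect("seb", fixed("seb"))    # TRUE: exact matches also count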

Create two dummy datasets

library(dplyr)
library(stringr)

set.seed(1)

data_set_A <- tibble(name = unique(replicate(2000, paste(sample(letters, runif(1, 3, 10), replace = TRUE), collapse = "")))) %>% 
  mutate(ID_A = 1:n())
                    
set.seed(2)

data_set_B <- tibble(name_2 = unique(replicate(2000, paste(sample(letters, runif(1, 3, 10), replace = TRUE), collapse = "")))) %>% 
  mutate(ID_B = 1:n())

Test matching of full matches only

# This is almost instant
data_set_A %>%
  rowwise() %>%
  filter(any(name %in% data_set_B$name_2) | any(data_set_B$name_2 %in% name)) %>%
  ungroup()
# A tibble: 4 x 2
  name   ID_A
  <chr> <int>
1 vnt     112
2 fly     391
3 cug    1125
4 xgv    1280
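
As an aside, both rowwise conditions above test the same thing (an exact match in the other dataset), so the vectorized filter below should return the same four rows with no rowwise() at all:

# Equivalent vectorized form of the full-match check; no rowwise() needed
data_set_A %>%
  filter(name %in% data_set_B$name_2)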

Include partial matches (This is what I want to optimize)

This of course only gives me the subset of dataset A, but that is ok.

# This takes way too long
data_set_A %>%
  rowwise() %>%
  filter(any(str_detect(name, data_set_B$name_2)) | any(str_detect(data_set_B$name_2, name))) %>%
  ungroup()
# A tibble: 237 x 2
   name       ID_A
   <chr>     <int>
 1 wknrsauuj     2
 2 lyw           7
 3 igwsvrzpk    16
 4 zozxjpu      18
 5 cgn          22
 6 oqo          45
 7 gkritbe      47
 8 uuq          92
 9 lhwfyksz     94
10 tuw         100
# … with 227 more rows

Fuzzyjoin method

This also works, but it is equally slow:

bind_rows(
  fuzzyjoin::fuzzy_inner_join(
    data_set_A,
    data_set_B,
    by = c("name" = "name_2"),
    match_fun = stringr::str_detect
  ) %>%
    select(name, ID_A),
  fuzzyjoin::fuzzy_inner_join(
    data_set_B,
    data_set_A,
    by = c("name_2" = "name"),
    match_fun = stringr::str_detect
  ) %>%
    select(name, ID_A)
) %>%
  distinct()
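
A possible tweak here, in line with the fixed() advice in the answer below: give fuzzy_inner_join() a match_fun that treats the pattern side as a literal string, so no regex is compiled per comparison. A sketch for the first of the two joins:

# Same join as above, but comparing literal strings instead of regexes
fuzzyjoin::fuzzy_inner_join(
  data_set_A,
  data_set_B,
  by = c("name" = "name_2"),
  match_fun = function(x, y) stringr::str_detect(x, stringr::fixed(y))
) %>%
  select(name, ID_A)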

data.table solution

Unfortunately, this is not much faster:

library(data.table)

setDT(data_set_A)
setDT(data_set_B)

data_set_A[data_set_A[, .I[any(str_detect(name, data_set_B$name_2)) | 
                    any(str_detect(data_set_B$name_2, name))], by = .(ID_A)]$V1]

Asked by L Smeets

1 Answer

This is an R option aimed at reducing the number of times str_detect() is called (your example is slow because the function is called several thousand times, and because it does not use fixed() or fixed = TRUE, as jpiversen already pointed out). The approach is explained in the comments in the code.

This should scale reasonably well, and it should be more memory efficient than the current approach too, because it reduces the rowwise computations to an absolute minimum.
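
For context, fixed() tells stringr to treat the pattern as a literal string rather than a regular expression, which skips regex compilation on every call:

library(stringr)

# Same substring test, with and without the regex engine
str_detect("wknrsauuj", "rsa")         # regex match
str_detect("wknrsauuj", fixed("rsa"))  # literal match; avoids regex compilation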

Benchmarks:

n = 2000

# A tibble: 4 × 13
  expression             min   median `itr/sec` mem_alloc `gc/sec` n_itr
  <bch:expr>        <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int>
1 original()           6.67s    6.67s     0.150   31.95MB    0.300     1
2 using_fixed()     496.54ms 496.54ms     2.01    61.39MB    4.03      1
3 using_map_fixed() 493.35ms 493.35ms     2.03    60.27MB    6.08      1
4 andrew_fun()      167.78ms 167.78ms     5.96     1.59MB    0         1

n = 4000

Note: I am not sure if you need the answer to scale, but the approach of reducing the memory-intensive part does seem to do just that (although the time difference is negligible for n = 4000 with 1 iteration, IMO).

# A tibble: 4 × 13
  expression             min   median `itr/sec` mem_alloc `gc/sec` n_itr
  <bch:expr>        <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int>
1 original()          26.63s   26.63s    0.0376  122.33MB    0.150     1
2 using_fixed()        1.91s    1.91s    0.525   243.96MB    3.67      1
3 using_map_fixed()    1.87s    1.87s    0.534   236.62MB    3.20      1
4 andrew_fun()      674.36ms 674.36ms    1.48      7.59MB    0         1

Code w/ comments:

# Exclude the longest strings from the patterns below: since both datasets
# share the same maximum name length here, a maximal-length string can only
# match another string exactly, and exact matches are already covered by %in%
nchar_a = nchar(data_set_A$name)
nchar_b = nchar(data_set_B$name_2)

# Creating large patterns (excluding values w/ max number of characters)
pattern_a = str_c(unique(data_set_A$name[nchar_a != max(nchar_a, na.rm = TRUE)]), collapse = "|")
pattern_b = str_c(unique(data_set_B$name_2[nchar_b != max(nchar_b, na.rm = TRUE)]), collapse = "|")
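
# Note: collapsing the names with "|" builds one big alternation regex, so a
# single str_detect() call scans each string against every name at once.
# This assumes the names contain no regex metacharacters (true here, since
# the dummy names are lowercase letters only)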

# First checking using %in% 
idx_a = data_set_A$name %in% data_set_B$name_2

# Next, IDing when a(string) matches b(pattern) 
idx_a[!idx_a] = str_detect(data_set_A$name[!idx_a], pattern_b)

# IDing a(pattern) matches b(string) so we do not run every row of 
# a(as a pattern) against all of b
b_to_check = data_set_B$name_2[str_detect(data_set_B$name_2, pattern_a)]

# Using unmatched values of a as a pattern for the reduced set for b
idx_a[!idx_a] = vapply(data_set_A$name[!idx_a], function(name) {
  any(grepl(name, b_to_check, fixed = TRUE))
}, logical(1L), USE.NAMES = FALSE)

data_set_A[idx_a, ]
# A tibble: 237 × 2
   name       ID_A
   <chr>     <int>
 1 wknrsauuj     2
 2 lyw           7
 3 igwsvrzpk    16
 4 zozxjpu      18
 5 cgn          22
 6 oqo          45
 7 gkritbe      47
 8 uuq          92
 9 lhwfyksz     94
10 tuw         100
# … with 227 more rows
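
If you also need the matching subset of data_set_B (for the union mentioned in the question), the same indexing idea applies with the roles of A and B swapped; a sketch, reusing pattern_a and pattern_b from above:

# Mirror of the approach for data_set_B: exact matches first, then the
# collapsed A-pattern, then the remaining B names as literal patterns
idx_b = data_set_B$name_2 %in% data_set_A$name
idx_b[!idx_b] = str_detect(data_set_B$name_2[!idx_b], pattern_a)

a_to_check = data_set_A$name[str_detect(data_set_A$name, pattern_b)]
idx_b[!idx_b] = vapply(data_set_B$name_2[!idx_b], function(name_2) {
  any(grepl(name_2, a_to_check, fixed = TRUE))
}, logical(1L), USE.NAMES = FALSE)

data_set_B[idx_b, ]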

Reproducible R code for benchmarks

The following code is largely taken from jpiversen, who provided a great answer:

library(dplyr)
library(stringr)

n = 2000

set.seed(1)
data_set_A <- tibble(name = unique(replicate(n, paste(sample(letters, runif(1, 3, 10), replace = TRUE), collapse = "")))) %>% 
  mutate(ID_A = 1:n())

set.seed(2)
data_set_B <- tibble(name_2 = unique(replicate(n, paste(sample(letters, runif(1, 3, 10), replace = TRUE), collapse = "")))) %>% 
  mutate(ID_B = 1:n())


original <- function() {
  
  data_set_A %>%
    rowwise() %>%
    filter(any(str_detect(name, data_set_B$name_2)) | any(str_detect(data_set_B$name_2, name))) %>%
    ungroup()
  
}

using_fixed <- function() {
  
  data_set_A %>%
    rowwise() %>%
    filter(any(str_detect(name, fixed(data_set_B$name_2))) | any(str_detect(data_set_B$name_2, fixed(name)))) %>%
    ungroup()
  
}

using_map_fixed <- function() {
  
  logical_vec <- data_set_A$name %>% 
    purrr::map_lgl(
      ~any(stringr::str_detect(.x, fixed(data_set_B$name_2))) || 
        any(stringr::str_detect(data_set_B$name_2, fixed(.x)))
    )
  
  
  data_set_A[logical_vec, ]
  
}

andrew_fun = function() {
  
  nchar_a = nchar(data_set_A$name)
  nchar_b = nchar(data_set_B$name_2)
  
  pattern_a = str_c(unique(data_set_A$name[nchar_a != max(nchar_a, na.rm = TRUE)]), collapse = "|")
  pattern_b = str_c(unique(data_set_B$name_2[nchar_b != max(nchar_b, na.rm = TRUE)]), collapse = "|")
  
  idx_a = data_set_A$name %in% data_set_B$name_2
  
  idx_a[!idx_a] = str_detect(data_set_A$name[!idx_a], pattern_b)
  
  b_to_check = data_set_B$name_2[str_detect(data_set_B$name_2, pattern_a)]
  
  idx_a[!idx_a] = vapply(data_set_A$name[!idx_a], function(name) {
    any(grepl(name, b_to_check, fixed = TRUE))
  }, logical(1L), USE.NAMES = FALSE)
  
  data_set_A[idx_a, ]
  
}


bm = bench::mark(
  original(),
  using_fixed(),
  using_map_fixed(),
  andrew_fun(),
  iterations = 1
)
Answered by Andrew


