I have two dataset with names and I need to compare names in both datasets. I just need to keep the union of the two datasets based on the names. However, a name is still considered 'matched' if it is part of the another name even if it is not a full match and vice versa. For example, "seb" should match to "seb", but also to "sebas". I am using str_detect(), but it is too slow. I am wondering if there is any way to speed up this process. I tried some other packages and functions, but nothing really improved the speed. I am open for any R or Python solution.
library(dplyr)
library(stringr)
set.seed(1)
data_set_A <- tibble(name = unique(replicate(2000, paste(sample(letters, runif(1, 3, 10), replace = T), collapse = "")))) %>%
mutate(ID_A = 1:n())
set.seed(2)
data_set_B <- tibble(name_2 = unique(replicate(2000, paste(sample(letters, runif(1, 3, 10), replace = T), collapse = "")))) %>%
mutate(ID_B = 1:n())
# This is almost instant
data_set_A %>%
rowwise() %>%
filter(any(name %in% data_set_B$name_2) | any(data_set_B$name_2 %in% name)) %>%
ungroup()
# A tibble: 4 x 2
name ID_A
<chr> <int>
1 vnt 112
2 fly 391
3 cug 1125
4 xgv 1280
This of course only gives me the subset of dataset A, but that is ok.
# This takes way too long
data_set_A %>%
rowwise() %>%
filter(any(str_detect(name, data_set_B$name_2)) | any(str_detect(data_set_B$name_2, name))) %>%
ungroup()
A tibble: 237 x 2
name ID_A
<chr> <int>
1 wknrsauuj 2
2 lyw 7
3 igwsvrzpk 16
4 zozxjpu 18
5 cgn 22
6 oqo 45
7 gkritbe 47
8 uuq 92
9 lhwfyksz 94
10 tuw 100
This also works, but is equally slow
bind_rows(
fuzzyjoin::fuzzy_inner_join(
data_set_A,
data_set_B,
by = c("name" = "name_2"),
match_fun = stringr::str_detect
) %>%
select(name, ID_A),
fuzzyjoin::fuzzy_inner_join(
data_set_B,
data_set_A,
by = c("name_2" = "name"),
match_fun = stringr::str_detect
) %>%
select(name, ID_A)
) %>%
distinct()
not much faster unfortunately
library(data.table)
setDT(data_set_A)
setDT(data_set_B)
data_set_A[data_set_A[, .I[any(str_detect(name, data_set_B$name_2)) |
any(str_detect(data_set_B$name_2, name))], by = .(ID_A)]$V1]
This is an [r] option aimed at reducing the number of times you are calling str_detect() (i.e., your example is slow because the function is called several thousand times; and for not using fixed() or fixed = TRUE as jpiversen already pointed out). Answer explained in comments in the code; I will try to jump on tomorrow to explain a bit more.
This should scale reasonably well and be more memory efficient than the current approach too because reduces the rowwise computations to an absolute minimum.
Benchmarks:
n = 2000
# A tibble: 4 × 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int>
1 original() 6.67s 6.67s 0.150 31.95MB 0.300 1
2 using_fixed() 496.54ms 496.54ms 2.01 61.39MB 4.03 1
3 using_map_fixed() 493.35ms 493.35ms 2.03 60.27MB 6.08 1
4 andrew_fun() 167.78ms 167.78ms 5.96 1.59MB 0 1
n = 4000
Note: I am not sure if you need the answer to scale; but the approach of reducing the memory-intensive part does seem to do just that (although the time difference is negligible for n = 4000 for 1 iteration, IMO).
# A tibble: 4 × 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int>
1 original() 26.63s 26.63s 0.0376 122.33MB 0.150 1
2 using_fixed() 1.91s 1.91s 0.525 243.96MB 3.67 1
3 using_map_fixed() 1.87s 1.87s 0.534 236.62MB 3.20 1
4 andrew_fun() 674.36ms 674.36ms 1.48 7.59MB 0 1
Code w/ comments:
# This is so we do not retain the strings with the max number of
# characters in our pattern because we are checking with %in% already
nchar_a = nchar(data_set_A$name)
nchar_b = nchar(data_set_B$name_2)
# Creating large patterns (excluding values w/ max number of characters)
pattern_a = str_c(unique(data_set_A$name[nchar_a != max(nchar_a, na.rm = TRUE)]), collapse = "|")
pattern_b = str_c(unique(data_set_B$name_2[nchar_b != max(nchar_b, na.rm = TRUE)]), collapse = "|")
# First checking using %in%
idx_a = data_set_A$name %in% data_set_B$name_2
# Next, IDing when a(string) matches b(pattern)
idx_a[!idx_a] = str_detect(data_set_A$name[!idx_a], pattern_b)
# IDing a(pattern) matches b(string) so we do not run every row of
# a(as a pattern) against all of b
b_to_check = data_set_B$name_2[str_detect(data_set_B$name_2, pattern_a)]
# Using unmatched values of a as a pattern for the reduced set for b
idx_a[!idx_a] = vapply(data_set_A$name[!idx_a], function(name) {
any(grepl(name, b_to_check, fixed = TRUE))
}, logical(1L), USE.NAMES = FALSE)
data_set_A[idx_a, ]
# A tibble: 237 × 2
name ID_A
<chr> <int>
1 wknrsauuj 2
2 lyw 7
3 igwsvrzpk 16
4 zozxjpu 18
5 cgn 22
6 oqo 45
7 gkritbe 47
8 uuq 92
9 lhwfyksz 94
10 tuw 100
# … with 227 more rows
Reproducible R code for benchmarks
The following code is largely taken from jpiversen who provided a great answer:
library(dplyr)
library(stringr)
n = 2000
set.seed(1)
data_set_A <- tibble(name = unique(replicate(n, paste(sample(letters, runif(1, 3, 10), replace = T), collapse = "")))) %>%
mutate(ID_A = 1:n())
set.seed(2)
data_set_B <- tibble(name_2 = unique(replicate(n, paste(sample(letters, runif(1, 3, 10), replace = T), collapse = "")))) %>%
mutate(ID_B = 1:n())
original <- function() {
data_set_A %>%
rowwise() %>%
filter(any(str_detect(name, data_set_B$name_2)) | any(str_detect(data_set_B$name_2, name))) %>%
ungroup()
}
using_fixed <- function() {
data_set_A %>%
rowwise() %>%
filter(any(str_detect(name, fixed(data_set_B$name_2))) | any(str_detect(data_set_B$name_2, fixed(name)))) %>%
ungroup()
}
using_map_fixed <- function() {
logical_vec <- data_set_A$name %>%
purrr::map_lgl(
~any(stringr::str_detect(.x, fixed(data_set_B$name_2))) ||
any(stringr::str_detect(data_set_B$name_2, fixed(.x)))
)
data_set_A[logical_vec, ]
}
andrew_fun = function() {
nchar_a = nchar(data_set_A$name)
nchar_b = nchar(data_set_B$name_2)
pattern_a = str_c(unique(data_set_A$name[nchar_a != max(nchar_a, na.rm = TRUE)]), collapse = "|")
pattern_b = str_c(unique(data_set_B$name_2[nchar_b != max(nchar_b, na.rm = TRUE)]), collapse = "|")
idx_a = data_set_A$name %in% data_set_B$name_2
idx_a[!idx_a] = str_detect(data_set_A$name[!idx_a], pattern_b)
b_to_check = data_set_B$name_2[str_detect(data_set_B$name_2, pattern_a)]
idx_a[!idx_a] = vapply(data_set_A$name[!idx_a], function(name) {
any(grepl(name, b_to_check, fixed = TRUE))
}, logical(1L), USE.NAMES = FALSE)
data_set_A[idx_a, ]
}
bm = bench::mark(
original(),
using_fixed(),
using_map_fixed(),
andrew_fun(),
iterations = 1
)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With