For a list:
terms <- list(Item1 = c("a", "b", "c", "d"),
Item2 = c("a", "e", "f", "g"),
Item3 = c("b", "e", "h", "i"),
Item4 = c("j", "k"))
I would like to get the number of shared letters between each pair of items in the list. The expected output is therefore:
[,1] [,2] [,3] [,4]
[1,] 4 1 1 0
[2,] 1 4 1 0
[3,] 1 1 4 0
[4,] 0 0 0 2
From a previous StackOverflow answer, I found one possible solution:
overlapLength <- function(x, y) mapply(function(x, y)
length(intersect(x, y)), terms[x], terms[y])
s <- seq_along(terms)
outer(s, s, overlapLength)
But this is very slow for my list, which is very large (~9,000 items).
Is there a faster way to do this?
Thanks everyone for your input. I timed all answers with the first 100 items of my list.
> system.time(f_crossprod(go))
user system elapsed
0.024 0.001 0.025
> system.time(f_crossprod2(go))
user system elapsed
0.007 0.000 0.008
> system.time(f_mapply(go))
user system elapsed
2.018 0.032 2.059
> system.time(f_outer(go))
user system elapsed
1.950 0.016 1.979
> system.time(f_combn(go))
user system elapsed
1.056 0.005 1.062
> system.time(f_Rcpp(go))
user system elapsed
163.236 84.226 249.240
I then timed the outer
and Matrix::crossprod
solutions with the entire list of ~9,000 elements. The outer
solution ran in about 55 minutes. The Matrix::crossprod
solution ran in about 0.1 seconds!
It is possible I have made an error in implementation of the Rcpp function. However, @alexis_laz if you make your comment an answer I will accept it.
By the way, sorry I was not clear, I am not interested in the values on the diagonal.
We can use outer
outer(names(terms), names(terms), FUN = function(x,y)
lengths(Map(intersect, terms[x], terms[y])))
# [,1] [,2] [,3] [,4]
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2
Or more compactly
outer(terms, terms, FUN = function(...) lengths(Map(intersect, ...)))
# Item1 Item2 Item3 Item4
#Item1 4 1 1 0
#Item2 1 4 1 0
#Item3 1 1 4 0
#Item4 0 0 0 2
We could also implement this in Rcpp
. Below is the test1.cpp
file
#include <Rcpp.h>
#include <math.h>
using namespace Rcpp;
//[[Rcpp::export]]
List foo(List xs) {
List x(xs);
List x1 = Rcpp::clone(xs);
List y1 = Rcpp::clone(xs);
int n = x1.size();
NumericVector res;
for( int i=0; i<n; i++){
for(int j=0; j<n; j++){
CharacterVector xd = x1[i];
CharacterVector yd = y1[j];
res.push_back(intersect(xd, yd).length());
}
}
return wrap(res) ;
We call it in R
using
library(Rcpp)
sourceCpp("test1.cpp")
`dim<-`(unlist(foo(terms)), c(4, 4))
# [,1] [,2] [,3] [,4]
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2
In addition to the functions above, we included another version with a RcppEigen
implementation that was posted here
n <- 100
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
library(Matrix)
library(inline)
library(Rcpp)
alexis1 <- function() {crossprod(table(stack(terms1)))}
alexis2 <- function() {Matrix::crossprod(xtabs( ~ values + ind,
stack(terms1), sparse = TRUE)) }
akrun1 <- function(){outer(terms1, terms1, FUN = function(...) lengths(Map(intersect, ...)))}
akrun2 <- function() {`dim<-`(unlist(foo(terms1)), c(n, n))}
akrun3 <- function() {tbl <- table(stack(terms1))
funCPr(tbl, tbl)[[1]]}
db <- function() {do.call(rbind, lapply(1:length(terms1), function(i)
sapply(terms1, function(a)
sum(unlist(terms1[i]) %in% unlist(a)))))}
lmo <- function() { setNames(data.frame(t(combn(names(terms1), 2)),
combn(seq_along(terms1), 2,
function(x) length(intersect(terms1[[x[1]]], terms1[[x[2]]])))),
c("col1", "col2", "counts"))}
and the benchmark output for n at 100
are
library(microbenchmark)
microbenchmark(alexis1(), alexis2(), akrun1(), akrun2(),akrun3(), db(), lmo(),
unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 1.035975 1.032101 1.031239 1.010472 1.044217 1.129092 10 a
# alexis2() 3.896928 3.656585 3.461980 3.386301 3.335469 3.288161 10 a
# akrun1() 218.456708 207.099841 198.391784 189.356065 188.542712 214.415661 10 d
# akrun2() 84.239272 79.073087 88.594414 75.719853 78.277769 129.731990 10 b
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# db() 86.921164 82.201117 80.358097 75.113471 73.311414 105.761977 10 b
# lmo() 125.128109 123.203318 118.732911 113.271352 113.164333 138.075212 10 c
With a slightly higher n at 200
n <- 200
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(), alexis2(), akrun3(), db(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 1.117234 1.164198 1.181280 1.166070 1.230077 1.229899 10 a
# alexis2() 3.428904 3.425942 3.337112 3.379675 3.280729 3.164852 10 b
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# db() 219.971285 219.577403 207.793630 213.232359 196.122420 187.433635 10 c
With n set at 9000
n <- 9000
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(),alexis2(), akrun3(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 2.048708 2.021709 2.009396 2.085750 2.141060 1.767329 10 b
# alexis2() 3.520220 3.518339 3.419368 3.616512 3.515993 2.952927 10 c
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
Checking the output
res1 <- alexis1()
res2 <- akrun3()
res3 <- alexis2()
all.equal(res1, res2, check.attributes = FALSE)
#[1] TRUE
all.equal(res1, as.matrix(res3), check.attributes = FALSE)
#[1] TRUE
Based on the comments from @alexis_laz included 3 more functions to replace the table/stack
part to compare the efficiency for n at 9000
alexis3 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
crossprod(tab, tab)
}
alexis4 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = sparseMatrix(x = 1L, i = match(unlt, u),
j = rep(seq_along(terms1), lengths(terms1)), dimnames = list(u, names(terms1)))
Matrix::crossprod(tab, tab, sparse = TRUE)
}
akrun4 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
funCPr(tab, tab)[[1]]
}
and the benchmarks are
microbenchmark(alexis1(),alexis2(), alexis3(), alexis4(),
akrun3(), akrun4(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 2.1888254 2.2897883 2.204237 2.169618 2.162955 2.122552 10 b
# alexis2() 3.7651292 3.9178071 3.672550 3.616577 3.587886 3.426039 10 c
# alexis3() 2.1776887 2.2410663 2.197293 2.137106 2.192834 2.241645 10 b
# alexis4() 4.1640895 4.3431379 4.262192 4.187449 4.388335 4.172607 10 d
# akrun3() 1.0000000 1.0000000 1.000000 1.000000 1.000000 1.000000 10 a
# akrun4() 0.9364288 0.9692772 1.043292 1.063931 1.090301 1.171245 10 a
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With