Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Get all combinations of shared elements in a list

For a list:

terms <- list(Item1 = c("a", "b", "c", "d"),
              Item2 = c("a", "e", "f", "g"),
              Item3 = c("b", "e", "h", "i"),
              Item4 = c("j", "k"))

I would like to get the number of shared letters between each pair of items in the list. The expected output is therefore:

     [,1] [,2] [,3] [,4]
[1,]    4    1    1    0
[2,]    1    4    1    0
[3,]    1    1    4    0
[4,]    0    0    0    2

From a previous StackOverflow answer, I found one possible solution:

overlapLength <- function(x, y) mapply(function(x, y) 
  length(intersect(x, y)), terms[x], terms[y])
s <- seq_along(terms)
outer(s, s, overlapLength)

But this is very slow for my list, which is very large (~9,000 items).

Is there a faster way to do this?


Thanks everyone for your input. I timed all answers with the first 100 items of my list.

> system.time(f_crossprod(go))
   user  system elapsed 
  0.024   0.001   0.025 
> system.time(f_crossprod2(go))
   user  system elapsed 
  0.007   0.000   0.008 
> system.time(f_mapply(go))
   user  system elapsed 
  2.018   0.032   2.059 
> system.time(f_outer(go))
   user  system elapsed 
  1.950   0.016   1.979 
> system.time(f_combn(go))
   user  system elapsed 
  1.056   0.005   1.062 
> system.time(f_Rcpp(go))
   user  system elapsed 
163.236  84.226 249.240 

I then timed the outer and Matrix::crossprod solutions with the entire list of ~9,000 elements. The outer solution ran in about 55 minutes. The Matrix::crossprod solution ran in about 0.1 seconds!

It is possible I have made an error in implementation of the Rcpp function. However, @alexis_laz if you make your comment an answer I will accept it.

By the way, sorry I was not clear, I am not interested in the values on the diagonal.

like image 550
dentist_inedible Avatar asked Dec 03 '22 13:12

dentist_inedible


1 Answers

We can use outer

outer(names(terms), names(terms), FUN = function(x,y) 
              lengths(Map(intersect, terms[x], terms[y])))
#     [,1] [,2] [,3] [,4]
#[1,]    4    1    1    0
#[2,]    1    4    1    0
#[3,]    1    1    4    0
#[4,]    0    0    0    2

Or more compactly

outer(terms, terms, FUN = function(...) lengths(Map(intersect, ...)))
#      Item1 Item2 Item3 Item4
#Item1     4     1     1     0
#Item2     1     4     1     0
#Item3     1     1     4     0
#Item4     0     0     0     2

We could also implement this in Rcpp. Below is the test1.cpp file

#include <Rcpp.h>
#include <math.h>

using namespace Rcpp;
//[[Rcpp::export]]

List foo(List xs) {
    List x(xs);
    List x1 = Rcpp::clone(xs);
    List y1 = Rcpp::clone(xs);
    int n = x1.size();



    NumericVector res;


    for( int i=0; i<n; i++){
        for(int j=0; j<n; j++){
         CharacterVector xd = x1[i];
         CharacterVector yd = y1[j];

        res.push_back(intersect(xd, yd).length());
        }
    }
    return wrap(res) ;

We call it in R using

library(Rcpp)
sourceCpp("test1.cpp")
`dim<-`(unlist(foo(terms)), c(4, 4))
#     [,1] [,2] [,3] [,4]
#[1,]    4    1    1    0
#[2,]    1    4    1    0
#[3,]    1    1    4    0
#[4,]    0    0    0    2

Benchmarks

In addition to the functions above, we included another version with a RcppEigen implementation that was posted here

n <- 100
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10), 
         replace = TRUE)), paste0("Item", seq_len(n)))

library(Matrix)
library(inline)
library(Rcpp)

alexis1 <- function() {crossprod(table(stack(terms1)))}
alexis2 <-  function() {Matrix::crossprod(xtabs( ~ values + ind, 
            stack(terms1), sparse = TRUE)) }

akrun1 <- function(){outer(terms1, terms1, FUN = function(...) lengths(Map(intersect, ...)))}
akrun2 <- function() {`dim<-`(unlist(foo(terms1)), c(n, n))}
akrun3 <- function() {tbl <- table(stack(terms1))
                      funCPr(tbl, tbl)[[1]]}

db <- function() {do.call(rbind, lapply(1:length(terms1), function(i)
    sapply(terms1, function(a)
        sum(unlist(terms1[i]) %in% unlist(a)))))} 
lmo <- function() { setNames(data.frame(t(combn(names(terms1), 2)),
                      combn(seq_along(terms1), 2,
                            function(x) length(intersect(terms1[[x[1]]], terms1[[x[2]]])))),
         c("col1", "col2", "counts"))}

and the benchmark output for n at 100 are

library(microbenchmark)
microbenchmark(alexis1(), alexis2(),   akrun1(), akrun2(),akrun3(), db(), lmo(),
           unit = "relative", times = 10L)
#Unit: relative
#      expr        min         lq       mean     median         uq        max neval  cld
# alexis1()   1.035975   1.032101   1.031239   1.010472   1.044217   1.129092    10 a   
# alexis2()   3.896928   3.656585   3.461980   3.386301   3.335469   3.288161    10 a   
#  akrun1() 218.456708 207.099841 198.391784 189.356065 188.542712 214.415661    10    d
#  akrun2()  84.239272  79.073087  88.594414  75.719853  78.277769 129.731990    10  b  
#  akrun3()   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000    10 a   
#      db()  86.921164  82.201117  80.358097  75.113471  73.311414 105.761977    10  b  
#     lmo() 125.128109 123.203318 118.732911 113.271352 113.164333 138.075212    10   c 

With a slightly higher n at 200

n <- 200
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
      replace = TRUE)), paste0("Item", seq_len(n)))

microbenchmark(alexis1(), alexis2(), akrun3(), db(), unit = "relative", times = 10L)
#Unit: relative
#      expr        min         lq       mean     median         uq        max neval cld
# alexis1()   1.117234   1.164198   1.181280   1.166070   1.230077   1.229899    10 a  
# alexis2()   3.428904   3.425942   3.337112   3.379675   3.280729   3.164852    10  b 
#  akrun3()   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000    10 a  
#      db() 219.971285 219.577403 207.793630 213.232359 196.122420 187.433635    10   c

With n set at 9000

n <- 9000
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10), 
                replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(),alexis2(),  akrun3(), unit = "relative", times = 10L)
#Unit: relative
#     expr      min       lq     mean   median       uq      max neval cld
# alexis1() 2.048708 2.021709 2.009396 2.085750 2.141060 1.767329    10  b 
# alexis2() 3.520220 3.518339 3.419368 3.616512 3.515993 2.952927    10   c
#  akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10 a  

Checking the output

res1 <- alexis1()
res2 <- akrun3()
res3 <- alexis2()
all.equal(res1, res2, check.attributes = FALSE)
#[1] TRUE
all.equal(res1, as.matrix(res3), check.attributes = FALSE)
#[1] TRUE

Based on the comments from @alexis_laz included 3 more functions to replace the table/stack part to compare the efficiency for n at 9000

alexis3 <- function() {
    unlt = unlist(terms1, use.names = FALSE)
    u = unique(unlt)
    tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
    tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
    crossprod(tab, tab)
    }

alexis4 <- function() {
        unlt = unlist(terms1, use.names = FALSE)
        u = unique(unlt)

       tab = sparseMatrix(x = 1L, i = match(unlt, u),
           j = rep(seq_along(terms1), lengths(terms1)), dimnames = list(u, names(terms1)))

       Matrix::crossprod(tab, tab, sparse = TRUE)
       }

akrun4 <- function() {
        unlt = unlist(terms1, use.names = FALSE)
        u = unique(unlt)
        tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
        tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
        funCPr(tab, tab)[[1]]
      }

and the benchmarks are

microbenchmark(alexis1(),alexis2(), alexis3(), alexis4(),
         akrun3(), akrun4(),  unit = "relative", times = 10L)
#Unit: relative
#      expr       min        lq     mean   median       uq      max neval  cld
# alexis1() 2.1888254 2.2897883 2.204237 2.169618 2.162955 2.122552    10  b  
# alexis2() 3.7651292 3.9178071 3.672550 3.616577 3.587886 3.426039    10   c 
# alexis3() 2.1776887 2.2410663 2.197293 2.137106 2.192834 2.241645    10  b  
# alexis4() 4.1640895 4.3431379 4.262192 4.187449 4.388335 4.172607    10    d
#  akrun3() 1.0000000 1.0000000 1.000000 1.000000 1.000000 1.000000    10 a   
#  akrun4() 0.9364288 0.9692772 1.043292 1.063931 1.090301 1.171245    10 a   
like image 158
akrun Avatar answered Dec 19 '22 06:12

akrun