I have two data sets of at least 420,500 observations each, e.g.
dataset1 <- data.frame(col1=c("microsoft","apple","vmware","delta","microsoft"),
                     col2=paste0(c("a","b","c",4,"asd"),".exe"),
                     col3=rnorm(5))
dataset2 <- data.frame(col1=c("apple","cisco","proactive","dtex","microsoft"),
                     col2=paste0(c("a","b","c",4,"asd"),".exe"),
                     col3=rnorm(5))
> dataset1
       col1    col2 col3
1 microsoft   a.exe    2
2     apple   b.exe    1
3    vmware   c.exe    3
4     delta   4.exe    4
5 microsoft asd.exe    5
> dataset2
       col1    col2 col3
1     apple   a.exe    3
2     cisco   b.exe    4
3    vmware   d.exe    1
4     delta   5.exe    5
5 microsoft asd.exe    2
I would like to print all the observations in dataset1 that do not intersect one in dataset2 (comparing both col1 and col2 in each), which in this case would print everything except the last observation - observations 1 & 2 match on col2 but not col1 and observation 3 & 4 match on col1 but not col2, i.e.:
        col1  col2 col3 
1:     apple b.exe    1 
2:     delta 4.exe    4 
3: microsoft a.exe    2 
4:    vmware c.exe    3 
                You could use anti_join from dplyr
 library(dplyr)
 anti_join(df1, df2, by = c('col1', 'col2'))
 #      col1  col2       col3
 #1     delta 4.exe -0.5836272
 #2    vmware c.exe  0.4196231
 #3     apple b.exe  0.5365853
 #4 microsoft a.exe -0.5458808
 set.seed(24)
 df1 <- data.frame(col1 = c('microsoft', 'apple', 'vmware', 'delta', 
 'microsoft'), col2= c('a.exe', 'b.exe', 'c.exe', '4.exe', 'asd.exe'), 
    col3=rnorm(5), stringsAsFactors=FALSE)
 set.seed(22)
 df2 <- data.frame(col1 = c( 'apple', 'cisco', 'proactive', 'dtex', 
 'microsoft'), col2= c('a.exe', 'b.exe', 'c.exe', '4.exe', 'asd.exe'), 
  col3=rnorm(5), stringsAsFactors=FALSE)
                        data.table solution inspired by this:
library(data.table) #1.9.5+
setDT(dataset1,key=c("col1","col2"))
setDT(dataset2,key=key(dataset1))
dataset1[!dataset2]
        col1  col2 col3
1:     apple b.exe    1
2:     delta 4.exe    4
3: microsoft a.exe    2
4:    vmware c.exe    3
You could also try without keying:
library(data.table) #1.9.5+
setDT(dataset1); setDT(dataset2)
dataset1[!dataset2,on=c("col1","col2")]
                        If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With