Let's say I have this data.table (actual data is 25061 x 5862):
require(data.table)
df
# gene P1 P2 P3 P4 P5
# 1: gene1 0.111 0.319 0.151 NA -0.397
# 2: gene10 1.627 2.252 1.462 -1.339 -0.644
# 3: gene2 -1.766 -0.056 -0.369 1.910 0.981
# 4: gene3 -1.346 1.283 0.322 -0.465 0.403
# 5: gene4 -0.783 NA -0.005 1.761 0.066
# 6: gene5 0.386 -0.309 -0.886 -0.072 0.161
# 7: gene6 0.547 -0.144 -0.725 -0.133 1.059
# 8: gene7 0.785 -1.827 0.986 1.555 -0.798
# 9: gene8 -0.186 NA 0.401 0.900 -1.075
# 10: gene9 -0.177 1.497 -1.370 -1.628 -1.044
I'd like to know how, taking advantage of the data.table structure, I can efficiently compute, for each pair of genes, the number of columns in which neither gene has an NA. For example, for the pair gene1, gene2, I'd like the result to be 4 (gene1 has an NA in P4).
With base R, I do it this way:
calc_nonNA <- !is.na(df[, -1, with = FALSE])  # logical matrix: TRUE where a value is present
Effectifs <- calc_nonNA %*% t(calc_nonNA)
# or, as suggested by @DavidArenburg and @Khashaa, more efficiently:
Effectifs <- tcrossprod(calc_nonNA)
But, with a large df, it takes hours...
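For reference, here is a minimal sketch of that same tcrossprod approach with the logical-to-double coercion made explicit and the gene names carried into the result; whether pre-coercing buys anything at this scale is an assumption worth benchmarking, since tcrossprod coerces internally anyway:
m <- !is.na(as.matrix(df[, -1, with = FALSE]))  # TRUE where a value is present
storage.mode(m) <- "double"                     # one explicit logical -> double copy
rownames(m) <- df$gene                          # so the result carries gene names
Effectifs <- tcrossprod(m)                      # counts of shared non-NA columns
Effectifs["gene1", "gene2"]                     # 4 with the example data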
My desired output for the provided example is this:
gene1 gene10 gene2 gene3 gene4 gene5 gene6 gene7 gene8 gene9
gene1 4 4 4 4 3 4 4 4 3 4
gene10 4 5 5 5 4 5 5 5 4 5
gene2 4 5 5 5 4 5 5 5 4 5
gene3 4 5 5 5 4 5 5 5 4 5
gene4 3 4 4 4 4 4 4 4 4 4
gene5 4 5 5 5 4 5 5 5 4 5
gene6 4 5 5 5 4 5 5 5 4 5
gene7 4 5 5 5 4 5 5 5 4 5
gene8 3 4 4 4 4 4 4 4 4 4
gene9 4 5 5 5 4 5 5 5 4 5
data
df <- structure(list(gene = c("gene1", "gene10", "gene2", "gene3",
"gene4", "gene5", "gene6", "gene7", "gene8", "gene9"), P1 = c(0.111,
1.627, -1.766, -1.346, -0.783, 0.386, 0.547, 0.785, -0.186, -0.177
), P2 = c(0.319, 2.252, -0.056, 1.283, NA, -0.309, -0.144, -1.827,
NA, 1.497), P3 = c(0.151, 1.462, -0.369, 0.322, -0.005, -0.886,
-0.725, 0.986, 0.401, -1.37), P4 = c(NA, -1.339, 1.91, -0.465,
1.761, -0.072, -0.133, 1.555, 0.9, -1.628), P5 = c(-0.397, -0.644,
0.981, 0.403, 0.066, 0.161, 1.059, -0.798, -1.075, -1.044)), .Names = c("gene",
"P1", "P2", "P3", "P4", "P5"), class = c("data.table", "data.frame"
), row.names = c(NA, -10L))
setDT(df)  # restore the data.table's internal self-reference (dput's <pointer: ...> is not valid R)
Using dplyr: reshape the data from wide to long, then join it to itself and summarise. I'm not sure whether it is more efficient than your solution; benchmarking, anyone?
library(dplyr)
library(tidyr)
# reshape from wide to long, marking which values are present
x <- df %>%
  gather(key = P, value = value, -c(1)) %>%
  mutate(value = !is.na(value))
# result: self-join on sample P, then count pairs where both values are present
left_join(x, x, by = "P") %>%
  group_by(gene.x, gene.y) %>%
  summarise(N = sum(value.x & value.y)) %>%
  spread(gene.y, N)
EDIT: Shame, this dplyr solution fails for a bigger dataset (2600 x 600); it can't join to itself: internal vecseq reached physical limit, about 2^31 rows...
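One way to shrink that self-join (a sketch, not benchmarked): drop the NA rows before joining, so the cartesian product only contains pairs where both values are present and a plain count suffices. It still materialises one row per shared non-NA pair, so it delays rather than removes the size problem:
x2 <- df %>%
  gather(key = P, value = value, -gene) %>%
  filter(!is.na(value)) %>%       # keep only present values
  select(gene, P)
inner_join(x2, x2, by = "P") %>%  # rows exist only where both genes are non-NA
  count(gene.x, gene.y) %>%
  spread(gene.y, n, fill = 0)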
By the way, here is a benchmark of t() vs tcrossprod():
library(ggplot2)
library(microbenchmark)
op <- microbenchmark(
  BASE_t = {
    calc_nonNA <- !is.na(df[, -1, with = FALSE])
    calc_nonNA %*% t(calc_nonNA)
  },
  BASE_tcrossprod = {
    calc_nonNA <- !is.na(df[, -1, with = FALSE])
    tcrossprod(calc_nonNA)
  },
  times = 10
)
qplot(y = time, data = op, colour = expr) + scale_y_log10()
I tried this out with random data of 25061 x 5862 and it quickly chewed up 50 GB of RAM (including swap), so it is far less memory-efficient than tcrossprod; but if you have an obscene amount of memory then maybe (though probably not) this could be faster.
#work on a copy of the question's data (this answer refers to it as df1)
df1 <- copy(df)
#generate cross columns for all gene pairs
genes <- df1[, unique(gene)]
crossDT <- data.table(gene = rep(genes, length(genes)),
                      gene2 = rep(genes, each = length(genes)))
#create a data.table with one row for each combo
df2 <- merge(df1, crossDT, by = "gene")
setkey(df2, gene2)
setkey(df1, gene)
#make a data.table with a set of P columns for each gene
df3 <- df1[df2]
#find the middle column and then make name vectors
pivotcol <- match("i.gene", names(df3))
names1 <- names(df3)[2:(pivotcol - 1)]
names2 <- names(df3)[(pivotcol + 1):ncol(df3)]
names3 <- paste0("new", names1)
#make a third set of P columns where the new value is FALSE if either of the previous sets of P columns is NA
df3[, (names3) := lapply(seq_along(names1), function(x) !any(is.na(c(get(names1[x]), get(names2[x]))))), by = c("gene", "i.gene")]
#delete the first sets of P columns
df3[, c(names1, names2) := NULL]
#sum up the TRUE columns
df3[, Sum := rowSums(.SD), .SDcols = names3]
#delete the new set of P columns
df3[, (names3) := NULL]
#cast results to the desired shape
dcast.data.table(df3, gene ~ i.gene, value.var = "Sum")
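As a quick sanity check, the cast result can be compared against the base tcrossprod counts from the question:
#sanity check: the dcast output should match tcrossprod on the original data
res <- dcast.data.table(df3, gene ~ i.gene, value.var = "Sum")
m <- !is.na(as.matrix(df[, -1, with = FALSE]))
rownames(m) <- df$gene
expected <- tcrossprod(m)
got <- as.matrix(res[, -1, with = FALSE])
rownames(got) <- res$gene
all.equal(got[rownames(expected), rownames(expected)], expected,
          check.attributes = FALSE)  # should be TRUE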