Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Grouping elements from different data

Tags:

r

In my work I'm trying to find which genes usually occur together. I set up some experiments and am now trying to analyze the data. I already wrote a nice script for analyzing it, but it's still not enough.

What I want to do this time is analyze a couple of tables and establish which genes are usually together, i.e. in the same cluster.

That's my data:

First table:

    > dput(tbl_col_clu1[1:20,])
structure(list(`10` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), `20` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `52.5` = c(0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `81` = c(0, 0, 0, 0, 
0, 0, 0.64209043, 0, 0, 0, 0, 0, 0, 0, 0.636411741, 0.183490041, 
0, 0, 0, 0), `110` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), `140.5` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `189` = c(0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0.84958569, 0, 0, 0, 0, 0), `222.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 1, 0.37119221, 0, 0, 0, 1, 0, 0, 0, 0, 
0), `278` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0), `340` = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), `397` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `453.5` = c(0, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `529` = c(0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `580` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `630.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `683.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `735.5` = c(0, 
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `784` = c(0, 
0, 0, 0, 0, 0, 0, 0.399952462, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
0.959211661, 1), `832` = c(0, 0.1266780707, 0, 0, 0, 0, 0, 0.2132893016, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.959211661, 1), `882.5` = c(0, 
0.12667807, 0, 0, 0, 1, 0, 0.08480435, 0, 0, 0, 0, 0, 1, 0, 0, 
0, 0, 1, 0.70163097), `926.5` = c(0, 1, 0, 0, 0, 0, 0, 1, 0, 
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), `973` = c(0, 0.12621196, 0, 
0, 0, 0, 0, 0.11813646, 0, 0, 0, 1, 0, 0, 0.59389934, 1, 0, 0, 
0, 0), `1108` = c(0, 0.092444384, 0, 0, 0, 0, 0, 0.115758222, 
0, 0, 0, 0.925835779, 0, 0, 1, 0.303482426, 0.848464317, 0, 0, 
0), `1200` = c(0, 0.120055749, 0, 1, 0, 0, 0, 0.150055416, 0, 
0, 0, 0.558015841, 0, 0, 0.796949668, 0.276321753, 1, 0, 0, 0
), Clusters = structure(c(1L, 64L, 45L, 102L, 11L, 77L, 170L, 
55L, 59L, 316L, 316L, 98L, 90L, 77L, 232L, 178L, 101L, 50L, 51L, 
51L), .Label = c("10", "10,13,15", "10,15", "10,15,16", "10,20,21,22,23,24", 
"10,22,23,24", "11", "11,12,13,14,15", "11,12,13,14,15,16", "11,12,13,14,15,16,17", 
"12", "12,13", "12,13,14", "12,13,14,15", "12,13,14,15,16", "12,13,14,15,16,17", 
"12,13,14,15,16,17,18,19,20,21,22,23,24", "12,13,15", "12,13,17", 
"13", "13,14", "13,14,15", "13,14,15,16", "13,14,15,16,17", "13,15", 
"13,15,16,17", "14", "14,15", "14,15,16", "14,15,16,17", "14,15,16,17,18,19,20,21,22,23,24", 
"14,19", "15", "15,16", "15,16,17", "15,16,17,18,19,20,21,22,23,24", 
"15,16,17,19,20,21,22,23,24", "15,17", "15,17,24", "15,22,23,24", 
"15,23", "15,24", "16", "16,17", "17", "17,18,19,20", "17,18,19,20,21,22,23,24", 
"17,21,22,23,24", "18", "18,19", "18,19,20", "18,19,20,21", "18,19,20,21,22", 
"18,19,20,21,22,23", "18,19,20,21,22,23,24", "18,19,21", "18,19,22,23", 
"18,20", "19", "19,20", "19,20,21", "19,20,21,22", "19,20,21,22,23", 
"19,20,21,22,23,24", "19,20,22", "19,20,22,23", "19,20,22,23,24", 
"19,20,23", "19,21", "19,22", "19,23", "19,24", "2", "2,18,19,20", 
"2,19,20", "2,3,4", "20", "20,21", "20,21,22", "20,21,22,23", 
"20,21,22,23,24", "20,21,23", "20,22", "20,22,23", "20,22,23,24", 
"20,22,24", "20,23", "20,23,24", "20,24", "21", "21,22", "21,22,23", 
"21,22,23,24", "21,23,24", "21,24", "22", "22,23", "22,23,24", 
"22,24", "23", "23,24", "24", "3", "3,10", "3,18,19,20", "3,18,19,20,21,22,23,24", 
"3,19,20", "3,19,20,21", "3,19,20,22,23,24", "3,20,21,22,23,24", 
"3,20,22,23,24", "3,21,23,24", "3,22,23,24", "3,22,24", "3,23", 
"3,23,24", "3,24", "3,4", "3,4,10", "3,4,18,19", "3,4,18,19,20", 
"3,4,18,19,20,21,22,23", "3,4,18,19,20,21,22,23,24", "3,4,19,20,21", 
"3,4,21", "3,4,21,22,23", "3,4,21,22,23,24", "3,4,22,23", "3,4,22,23,24", 
"3,4,22,24", "3,4,23,24", "3,4,24", "3,4,5", "3,4,5,10", "3,4,5,10,23,24", 
"3,4,5,20", "3,4,5,22,23,24", "3,4,5,23,24", "3,4,5,24", "3,4,5,6", 
"3,4,5,6,10", "3,4,5,6,20,22,23,24", "3,4,5,6,7", "3,4,5,6,7,10", 
"3,4,5,6,7,24", "3,4,5,6,7,8", "3,4,5,6,7,8,10", "3,4,5,6,7,8,10,13", 
"3,4,5,6,7,8,10,22,23,24", "3,4,5,6,7,8,12", "3,4,5,6,7,8,15", 
"3,4,5,6,7,8,18,19,20,21,22,23,24", "3,4,5,6,7,8,22,23,24", "3,4,5,6,7,8,9,10", 
"3,4,5,6,7,8,9,10,11,12", "3,4,5,6,7,8,9,10,11,12,13,14,15", 
"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24", 
"3,4,5,6,7,8,9,10,11,14,15", "3,4,5,6,7,8,9,10,19,20,21,22,23,24", 
"3,4,5,6,7,8,9,10,22,23,24", "3,4,6", "3,4,6,7,20,21,22,23,24", 
"3,4,7", "3,4,7,8", "3,5,6,7,8", "3,5,8", "3,7", "3,7,19,20,22,23", 
"4", "4,10", "4,10,24", "4,18,19,20", "4,19,20", "4,20,21,22", 
"4,20,21,22,23,24", "4,20,22,23,24", "4,22,23,24", "4,23,24", 
"4,24", "4,5", "4,5,10", "4,5,10,21", "4,5,10,23,24", "4,5,19,20,21,22,23", 
"4,5,19,20,22,23,24", "4,5,20,21,22,23,24", "4,5,20,22,23,24", 
"4,5,22,23,24", "4,5,24", "4,5,6", "4,5,6,10", "4,5,6,10,20,22,23,24", 
"4,5,6,19", "4,5,6,22,23,24", "4,5,6,7", "4,5,6,7,10", "4,5,6,7,19,20,21,22,23,24", 
"4,5,6,7,22,23,24", "4,5,6,7,8", "4,5,6,7,8,10", "4,5,6,7,8,10,19,20,21,22,23,24", 
"4,5,6,7,8,10,20,21,22,23,24", "4,5,6,7,8,10,21,22,23,24", "4,5,6,7,8,10,22,23,24", 
"4,5,6,7,8,10,23,24", "4,5,6,7,8,15", "4,5,6,7,8,17,18,19,20,21,22,23,24", 
"4,5,6,7,8,19,20", "4,5,6,7,8,19,20,21,22,23,24", "4,5,6,7,8,20,21,22,23,24", 
"4,5,6,7,8,21,22,23,24", "4,5,6,7,8,22,23,24", "4,5,6,7,8,9,10", 
"4,5,6,7,8,9,10,11,12", "4,5,6,7,8,9,10,11,12,13,14,15", "4,5,6,7,8,9,10,11,12,13,14,15,16,17", 
"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18", "4,5,6,7,8,9,10,12,13", 
"4,5,6,7,8,9,14,15,16", "4,5,7,9", "4,5,8,22", "4,6", "4,6,7,22,23,24", 
"4,6,7,23,24", "4,6,7,8,15,17", "4,6,7,8,23,24", "4,7", "4,7,20,21", 
"4,7,21,22,23,24", "4,7,8", "4,7,8,22,23,24", "5", "5,10", "5,17", 
"5,18,19,20,21,22,23", "5,19,20,21,22,23,24", "5,20", "5,22,23,24", 
"5,24", "5,6", "5,6,10", "5,6,7", "5,6,7,10", "5,6,7,10,19", 
"5,6,7,22,23,24", "5,6,7,8", "5,6,7,8,10", "5,6,7,8,10,15", "5,6,7,8,10,22,23,24", 
"5,6,7,8,15", "5,6,7,8,18,19,20,21,22,23,24", "5,6,7,8,21,22,23,24", 
"5,6,7,8,22,23,24", "5,6,7,8,9", "5,6,7,8,9,10", "5,6,7,8,9,10,11,12,13", 
"5,6,7,8,9,10,11,12,13,14,15", "5,6,7,8,9,12", "5,6,7,8,9,13", 
"5,7", "5,7,8", "5,8", "6", "6,10", "6,21,22,23", "6,22", "6,22,23,24", 
"6,7", "6,7,10,17", "6,7,22,23,24", "6,7,23,24", "6,7,24", "6,7,8", 
"6,7,8,10", "6,7,8,13,14,15,16,17", "6,7,8,15", "6,7,8,19,20", 
"6,7,8,20,21,22,23,24", "6,7,8,21,22,23,24", "6,7,8,23,24", "6,7,8,9", 
"6,7,8,9,10", "6,7,8,9,10,11,12", "6,7,8,9,10,11,12,13,14,15,16,17", 
"6,7,8,9,10,15,16", "6,7,8,9,10,18,19,20,21,22,23,24", "6,7,8,9,15", 
"6,8", "7", "7,15", "7,15,17", "7,16,18,21", "7,17", "7,19,20", 
"7,19,20,21,22", "7,20,21,22,23,24", "7,20,22,23,24", "7,22,23,24", 
"7,24", "7,8", "7,8,10", "7,8,10,22,23,24", "7,8,13,15", "7,8,14", 
"7,8,15", "7,8,15,16", "7,8,15,23", "7,8,20", "7,8,22", "7,8,23", 
"7,8,9", "7,8,9,10", "7,8,9,13", "7,8,9,15,16,17", "8", "8,10", 
"8,15", "8,17", "8,22", "8,24", "8,9", "8,9,10", "9", "9,10,11,12,13,14,15,16,17"
), class = "factor")), .Names = c("10", "20", "52.5", "81", "110", 
"140.5", "189", "222.5", "278", "340", "397", "453.5", "529", 
"580", "630.5", "683.5", "735.5", "784", "832", "882.5", "926.5", 
"973", "1108", "1200", "Clusters"), row.names = c("at1g01050.1", 
"at1g01080.1", "at1g01090.1", "at1g01220.1", "at1g01320.2", "at1g01420.1", 
"at1g01710.1", "at1g01800.1", "at1g01920.2", "at1g01940.1", "at1g01960.1", 
"at1g02020.2", "at1g02100.2", "at1g02140.1", "at1g02150.1", "at1g02500.2", 
"at1g02560.1", "at1g02880.3", "at1g02920.1", "at1g02930.2"), class = "data.frame")

Second table:

> dput(tbl_col_clu2[1:20,])
structure(list(`10` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), `20` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `52.5` = c(0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `81` = c(0, 0, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `110` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `140.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `189` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `222.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `278` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `340` = c(0, 
0, 0, 0, 0, 0, 0.583163048, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 
1, 0.218194067), `397` = c(0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 
0, 0.63953839, 0, 1, 0, 0, 0, 1), `453.5` = c(0, 0.66069369, 
0, 0, 0, 1, 0.57541627, 1, 1, 0, 0, 0, 1, 0.64615661, 0, 0.45209671, 
0, 0, 0, 0.17022498), `529` = c(0, 0.521435654, 0, 0, 1, 0, 0.175996209, 
0, 0, 0, 1, 0, 0, 0, 0, 0.886059888, 0, 0, 0, 0.17022498), `580` = c(0, 
0.437291195, 0, 0, 1, 0, 0.20731698, 0, 0, 0, 1, 0, 0, 0, 0, 
0.719755907, 0, 0, 0, 0.033248127), `630.5` = c(0, 0.52204783, 
0, 0, 0, 0, 0.48815538, 0, 0, 0, 0, 1, 0, 0, 0, 0.82709638, 0, 
0, 0, 0.09539534), `683.5` = c(0, 0.52429838, 0, 0, 0, 0, 0.59605685, 
0, 0, 0, 0, 0, 0, 0, 0, 0.27845748, 0.28224351, 0, 0, 0), `735.5` = c(1, 
0.3768651, 0, 1, 0, 0, 0.51381348, 0, 0, 0, 0, 0, 0, 0, 0, 0.39914361, 
0.22206677, 0, 0, 0), `784` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 1, 0, 0, 0), `832` = c(0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.16189002, 0, 0, 0), `882.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `926.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), `973` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.86100786, 0, 0, 0, 0, 
0), `1108` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0), `1200` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), Clusters = structure(c(168L, 32L, 246L, 
168L, 81L, 44L, 8L, 44L, 27L, 318L, 81L, 132L, 15L, 3L, 219L, 
32L, 156L, 318L, 1L, 6L), .Label = c("10", "10,11", "10,11,12", 
"10,11,12,13", "10,11,12,13,14", "10,11,12,13,14,15", "10,11,12,13,14,15,16", 
"10,11,12,13,14,15,16,17", "10,11,12,13,14,15,16,17,18,19", "10,11,12,13,14,15,16,17,18,19,20", 
"10,11,12,13,14,15,16,17,18,19,20,21", "10,11,12,13,14,16", "10,11,12,13,15,16,17,18,19,20,21", 
"10,11,12,13,19", "10,12", "10,12,13", "10,12,13,14", "10,12,13,14,15", 
"10,12,13,14,15,16,17", "10,12,13,15", "10,12,21", "10,13", "10,13,14", 
"10,17,18", "10,20", "11", "11,12", "11,12,13", "11,12,13,14", 
"11,12,13,14,15", "11,12,13,14,15,16", "11,12,13,14,15,16,17", 
"11,12,13,14,15,16,17,18,19", "11,12,13,14,15,16,17,18,19,20", 
"11,12,13,14,15,16,17,18,19,20,21,22,23", "11,12,13,14,15,16,17,18,19,20,21,22,23,24", 
"11,12,13,14,15,16,17,18,19,21,22", "11,12,13,14,15,16,18", "11,12,13,17,18,19", 
"11,12,14", "11,13", "11,13,14,15,16", "11,15", "12", "12,13", 
"12,13,14", "12,13,14,15", "12,13,14,15,16", "12,13,14,15,16,17", 
"12,13,14,15,16,17,18", "12,13,14,15,16,17,18,19", "12,13,14,15,16,17,18,19,20", 
"12,13,14,15,16,17,18,19,20,21", "12,13,14,15,16,17,18,19,20,21,22", 
"12,13,14,15,16,17,18,19,20,21,22,23", "12,13,14,15,16,17,18,19,20,21,22,23,24", 
"12,13,14,15,16,17,18,19,23,24", "12,13,14,15,16,17,19", "12,13,14,15,16,17,19,20,21", 
"12,13,14,15,16,17,21", "12,13,14,15,16,18", "12,13,14,15,17", 
"12,13,14,16,17,19", "12,13,14,18", "12,13,15", "12,13,16", "12,13,16,17,18,19", 
"12,13,16,19", "12,13,17", "12,13,21,22,23", "12,14", "12,14,15", 
"12,14,15,16", "12,14,15,17,19", "12,15", "12,15,16,17", "12,16,17", 
"12,20", "12,21,23", "13", "13,14", "13,14,15", "13,14,15,16", 
"13,14,15,16,17", "13,14,15,16,17,18", "13,14,15,16,17,18,19", 
"13,14,15,16,17,18,19,20", "13,14,15,16,17,18,19,20,21", "13,14,15,16,17,18,19,20,21,22", 
"13,14,15,16,17,18,19,20,21,22,23", "13,14,15,16,17,18,19,20,21,22,23,24", 
"13,14,15,16,17,18,19,21", "13,14,15,16,17,18,19,21,22,23", "13,14,15,16,17,19", 
"13,14,15,16,17,21", "13,14,15,16,18,23", "13,14,17", "13,14,19,20,21,22,23", 
"13,14,23,24", "13,15", "13,15,16", "13,15,16,18,19", "13,15,17", 
"13,16,17", "13,17", "13,17,19", "13,19", "13,21", "14", "14,15", 
"14,15,16", "14,15,16,17", "14,15,16,17,18", "14,15,16,17,18,19", 
"14,15,16,17,18,19,20", "14,15,16,17,18,19,20,21", "14,15,16,17,18,19,20,21,22", 
"14,15,16,17,18,19,20,21,22,23", "14,15,16,17,18,19,20,21,22,23,24", 
"14,15,16,17,18,19,20,22,23,24", "14,15,16,17,19", "14,15,16,17,19,20", 
"14,15,16,17,19,20,21", "14,15,16,17,22", "14,15,16,19", "14,15,17", 
"14,15,19", "14,17", "14,17,18,19", "14,19", "14,21", "15", "15,16", 
"15,16,17", "15,16,17,18", "15,16,17,18,19", "15,16,17,18,19,20", 
"15,16,17,18,19,20,21", "15,16,17,18,19,20,21,22,23", "15,16,17,18,19,20,21,22,23,24", 
"15,16,17,19", "15,16,17,19,20,21", "15,16,17,19,24", "15,16,17,20,21", 
"15,16,17,21", "15,16,17,23", "15,16,18,19", "15,16,19,20", "15,17", 
"15,18,19,20", "15,18,19,20,21", "15,19", "16", "16,17", "16,17,18", 
"16,17,18,19", "16,17,18,19,20", "16,17,18,19,20,21", "16,17,18,19,20,21,22", 
"16,17,18,19,20,21,22,23", "16,17,18,19,20,21,22,23,24", "16,17,19", 
"16,17,19,20", "16,17,19,20,21", "16,17,19,21", "16,17,23", "16,19", 
"17", "17,18", "17,18,19", "17,18,19,20", "17,18,19,20,21", "17,18,19,20,21,22", 
"17,18,19,20,21,22,23", "17,18,19,20,21,22,23,24", "17,18,19,21", 
"17,19", "17,19,20", "17,19,20,21", "17,19,20,21,22,23,24", "17,19,23", 
"17,20,21", "17,20,21,23", "17,21,22", "17,23", "17,24", "18", 
"18,19", "18,19,20", "18,19,20,21", "18,19,20,21,22", "18,19,20,21,22,23", 
"18,19,20,21,22,23,24", "18,19,20,21,23", "18,20", "19", "19,20", 
"19,20,21", "19,20,21,22", "19,20,21,22,23", "19,20,21,22,23,24", 
"19,20,21,23,24", "19,20,22", "19,21", "19,22", "19,23", "2", 
"2,17", "2,3,4,5,6", "2,3,4,5,6,7", "20", "20,21", "20,21,22", 
"20,21,22,23", "20,21,22,23,24", "20,21,23", "20,21,23,24", "21", 
"21,22", "21,22,23", "21,22,23,24", "21,23", "22", "22,23", "22,23,24", 
"23", "23,24", "24", "3", "3,23,24", "3,4", "3,4,23,24", "3,4,5", 
"3,4,5,6", "3,4,5,6,13,14,15,16,17,18,19,20,21,22,23,24", "3,4,5,6,7", 
"3,4,5,6,7,8,9", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24", 
"3,4,5,6,7,8,9,20,21,22,23,24", "3,4,5,6,7,8,9,21,22,23,24", 
"3,4,5,6,8,9", "3,4,5,7,8,9,15,16,17,18,19,20,21,22,23", "3,4,6,12,13,14,15,16,17,18,19,20,21,22,23,24", 
"3,8,9,10,11,12,13,14,15,16,17,18,19,20", "4", "4,17,18,19,20,21,22,23,24", 
"4,19,20,21,22,23,24", "4,21", "4,22,23,24", "4,5,17,18,19,20,21,22,23,24", 
"4,5,21,22,23,24", "4,5,6", "4,5,6,22,23,24", "4,5,6,7,8,9", 
"4,5,6,7,8,9,10", "4,5,6,7,8,9,10,15,16,17,18,19,20,21,22,23,24", 
"4,5,6,7,8,9,12,13,14,15,16,17,18,19,20,21,22,23,24", "4,5,6,7,8,9,13", 
"4,5,6,7,8,9,14,15,16,17,18,19,20,21,22,23,24", "4,5,6,7,8,9,17,18,19,20,21,22,23,24", 
"4,5,6,7,8,9,19,20,21,22,23,24", "4,5,6,7,8,9,19,23,24", "4,5,6,7,8,9,23,24", 
"4,5,7,8,9", "4,8,9,12,13,14,15,16,17,18,19,20,21,22,23,24", 
"4,8,9,23,24", "5", "5,22,23", "5,6", "5,6,15,16,17,18,19,20,21,22,23,24", 
"5,6,19,20,21,22,23,24", "5,6,24", "5,6,7", "5,6,7,8", "5,6,7,8,19,20,21,22,23,24", 
"5,6,7,8,9", "5,6,7,8,9,10,11,12,13", "5,6,7,8,9,10,11,12,13,14,15,16,17", 
"5,6,7,8,9,15,23,24", "5,6,9", "5,7", "5,8,9", "6", "6,15,16,17,18,19,20,21,22,23,24", 
"6,19,20,21,22,23,24", "6,20,21,22,23,24", "6,21,22,23,24", "6,7", 
"6,7,8", "6,7,8,9", "6,7,8,9,15,16,17,18,19,20,21,22,23,24", 
"6,7,8,9,23,24", "6,7,9", "6,8,15,16,17,18,19,20,21,22,23", "6,8,9", 
"6,9", "7", "7,14,24", "7,8,9", "7,8,9,10,11,12,13,14,15", "7,8,9,20,21,22,23,24", 
"7,8,9,23,24", "7,9", "7,9,10", "8", "8,19,20,21", "8,19,20,21,22,23,24", 
"8,9", "8,9,10,11,12,13,14,15,16,17", "8,9,10,17,18,19,20,21,22", 
"8,9,12,13,14,15,16,17,18,19", "8,9,14,15,16,17,18,19,20,21,22,23,24", 
"8,9,15,16,17,18,19,20,21,22", "8,9,19", "8,9,19,20,21,22,23", 
"8,9,21,22", "9", "9,10", "9,10,11,12,13,14", "9,10,11,12,13,14,15,16", 
"9,10,11,12,13,14,15,16,17", "9,10,11,12,13,14,15,16,17,18,19", 
"9,10,11,12,13,14,15,16,17,18,19,20,21", "9,10,11,12,13,14,15,16,17,18,19,20,21,22,23", 
"9,10,11,12,13,14,15,16,17,19", "9,12", "9,12,13", "9,12,13,14", 
"9,13", "9,13,14,15", "9,13,14,15,16,17", "9,13,14,15,18", "9,14", 
"9,14,15,16", "9,15", "9,15,16,17", "9,16", "9,16,17,18,19,21,22", 
"9,16,17,19", "9,17", "9,17,18", "9,19", "9,19,20", "9,19,20,21", 
"9,19,21", "9,20", "9,20,21", "9,20,21,22", "9,21", "9,22", "9,23"
), class = "factor")), .Names = c("10", "20", "52.5", "81", "110", 
"140.5", "189", "222.5", "278", "340", "397", "453.5", "529", 
"580", "630.5", "683.5", "735.5", "784", "832", "882.5", "926.5", 
"973", "1108", "1200", "Clusters"), row.names = c("at1g01050.1", 
"at1g01080.1", "at1g01090.1", "at1g01220.1", "at1g01420.1", "at1g01470.1", 
"at1g01800.1", "at1g01910.5", "at1g01920.2", "at1g01980.1", "at1g02020.2", 
"at1g02100.2", "at1g02130.1", "at1g02140.1", "at1g02150.1", "at1g02500.2", 
"at1g02560.1", "at1g02780.1", "at1g02880.3", "at1g02920.1"), class = "data.frame")

Third Table:

> dput(tbl_col_clu3[1:20,])
structure(list(`10` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), `33.95` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `58.66` = c(0, 0, 0, 0, 0.328143363, 
0.552139556, 0.495919686, 0, 0, 0, 0, 0, 0, 0, 0, 0.416266322, 
0.886125103, 1, 1, 0), `84.42` = c(0, 0, 0, 0, 1, 1, 0, 0, 0, 
0, 0, 0.327004551, 0, 0, 0, 0.956778355, 1, 0.175277617, 0.240402438, 
0), `110.21` = c(0, 0, 0, 0, 0, 0.151581882, 0, 0, 0, 0, 0, 1, 
0, 0, 1, 0, 0.091367379, 0.029316359, 0, 0), `134.16` = c(0.190968551, 
0, 0, 0, 0, 0.164736594, 0, 0, 0, 0, 0, 0.650199285, 0, 0, 0, 
0, 0.097800974, 0.007393484, 0, 0), `164.69` = c(0.5342874459, 
0, 0.3619993464, 0, 0, 0.1891527151, 0, 0, 0, 0, 0, 0.4926963182, 
0, 0, 0, 0, 0, 0, 0, 0), `199.1` = c(0.866134859, 0, 0.405387979, 
0, 0, 0.274468991, 0, 0, 0, 0, 0, 0.352737127, 0.170514318, 0, 
0, 0, 0, 0, 0, 0), `234.35` = c(1, 0, 0.446118481, 0, 0, 0.338427523, 
0, 0, 0, 0, 0, 0.204601923, 0.343919727, 0, 0, 0, 0, 0, 0, 0), 
    `257.19` = c(0.732231652, 0, 0.666653103, 0, 0, 0.403078017, 
    0, 0, 0, 0, 0, 0.315665123, 1, 0, 0, 0, 0, 0, 0, 0), `361.84` = c(0.660960044, 
    0, 1, 0, 0, 0.202578329, 0, 0, 0, 0, 0, 0.320183046, 0.424361453, 
    0, 0, 0, 0, 0, 0, 0), `432.74` = c(0.47961801, 0, 0.48323321, 
    0, 0, 0.25926071, 0, 0, 0, 0, 0, 0.36362413, 0.43039587, 
    0, 0, 0, 0, 0, 0, 0), `506.34` = c(0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0.22943212, 0.19354376, 0, 0, 0, 0, 0, 0, 0), `581.46` = c(0, 
    0.52783556, 0, 1, 0, 0, 0, 0.64407392, 0, 0.70701938, 0, 
    0.2596209, 0.29757967, 0, 0, 0, 0, 0, 0, 0), `651.71` = c(0, 
    0.32678969, 0, 0.36428195, 0, 0, 0, 0.64951761, 0, 0.80866933, 
    1, 0.18614028, 0.21567888, 0.32813633, 0, 0, 0, 0, 0, 0), 
    `732.59` = c(0, 0.229023369, 0, 0.312832425, 0, 0, 0, 0.696041374, 
    0, 0.590471454, 0, 0.108699479, 0.187935709, 0.275177957, 
    0, 0, 0, 0, 0, 0.243080694), `817.56` = c(0, 0.25668583, 
    0, 0.4003249, 0, 0, 0, 0.53376606, 0, 0.85524485, 0, 0.22539659, 
    0.27977127, 0.55089774, 0, 0, 0, 0, 0, 1), `896.24` = c(0, 
    0.31675535, 0, 0.50882005, 0, 0, 0, 0.74705458, 0.12936306, 
    1, 0, 0.1949139, 0.21957859, 0.75063327, 0, 0, 0, 0, 0, 0.63346358
    ), `971.77` = c(0, 0.27811949, 0, 0.48419038, 0, 0, 0, 0.8563439, 
    0.39897143, 0.84491933, 0, 0.13935282, 0.17670128, 0.84111004, 
    0, 0, 0, 0, 0, 0), `1038.91` = c(0, 1, 0, 0.52506752, 0, 
    0, 0, 1, 1, 0.85617714, 0, 0.13507463, 0, 1, 0, 0, 0, 0, 
    0, 0), Clusters = structure(c(222L, 88L, 237L, 88L, 145L, 
    155L, 143L, 88L, 122L, 88L, 97L, 180L, 260L, 102L, 186L, 
    145L, 149L, 149L, 145L, 106L), .Label = c("10", "10,11", 
    "10,11,12", "10,11,12,13", "10,11,12,13,14", "10,11,12,13,14,15", 
    "10,11,12,13,14,15,16", "10,11,12,13,14,15,16,17,18", "10,11,12,13,14,15,16,17,18,19", 
    "10,11,12,13,14,15,16,17,18,19,20", "10,11,12,14", "10,11,12,14,15", 
    "10,11,12,14,15,16", "10,11,12,14,15,16,17,18", "10,11,12,14,15,16,17,18,19", 
    "10,11,12,14,15,16,17,18,19,20", "10,11,12,14,15,17,18,19", 
    "10,11,12,15,16,17", "10,11,14", "10,11,15", "10,11,15,16,17", 
    "10,11,16", "10,11,17", "10,11,20", "10,12", "10,14,15,16", 
    "10,14,15,16,17,18,19", "10,15", "10,15,16", "10,15,16,18", 
    "10,16,19", "10,18,19,20", "10,19", "10,19,20", "10,20", 
    "11", "11,12", "11,12,13", "11,12,13,14", "11,12,13,14,15", 
    "11,12,13,14,15,16", "11,12,13,14,15,16,17,18", "11,12,13,14,15,16,17,18,19", 
    "11,12,13,14,15,16,17,18,19,20", "11,12,13,14,15,16,18,19", 
    "11,12,14,15", "11,12,14,15,16,17", "11,12,14,15,16,17,18", 
    "11,12,14,15,16,17,18,19", "11,12,14,15,16,17,18,19,20", 
    "11,12,18", "11,12,19", "11,12,20", "12", "12,13", "12,13,14", 
    "12,13,14,15", "12,13,14,15,16", "12,13,14,15,16,17,18", 
    "12,13,14,15,16,17,18,19,20", "12,14", "12,14,15", "12,14,15,16", 
    "12,14,15,16,17", "12,14,15,16,17,18", "12,14,15,16,17,18,19", 
    "12,14,15,16,17,18,19,20", "12,14,15,16,20", "12,14,15,18,19,20", 
    "12,15", "12,16", "12,16,17,18", "12,18,19,20", "12,19,20", 
    "12,20", "13", "13,14", "13,14,15", "13,14,15,16,17,18,19,20", 
    "13,16", "13,20", "14", "14,15", "14,15,16", "14,15,16,17", 
    "14,15,16,17,18", "14,15,16,17,18,19", "14,15,16,17,18,19,20", 
    "14,15,16,18", "14,15,17", "14,15,18", "14,16", "14,16,17", 
    "14,16,17,18,19,20", "14,18,19,20", "14,19", "15", "15,16", 
    "15,16,17", "15,16,17,18", "15,16,17,18,19", "15,16,17,18,19,20", 
    "15,20", "16", "16,17", "16,17,18", "16,17,18,19", "16,17,18,19,20", 
    "16,17,18,20", "16,17,19", "16,18,19,20", "16,19,20", "17", 
    "17,18", "17,18,19", "17,18,19,20", "17,18,20", "17,19,20", 
    "17,20", "18", "18,19", "18,19,20", "19", "19,20", "2", "2,19,20", 
    "2,3", "2,3,4", "2,3,4,5", "2,3,4,5,11", "2,3,4,5,6", "2,3,4,5,6,7,8", 
    "2,3,4,5,6,7,8,11,12", "2,3,4,5,6,7,8,9", "2,3,4,5,6,7,8,9,10", 
    "2,3,4,5,6,7,8,9,10,11", "2,3,4,5,6,7,8,9,10,11,12", "2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", 
    "2,4", "2,5", "2,5,6,7", "20", "3", "3,18", "3,4", "3,4,10", 
    "3,4,20", "3,4,5", "3,4,5,6", "3,4,5,6,7", "3,4,5,6,7,8", 
    "3,4,5,6,7,8,9", "3,4,5,6,7,8,9,10", "3,4,5,6,7,8,9,10,11", 
    "3,4,5,6,7,8,9,10,11,12", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17", 
    "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", 
    "3,4,8", "3,4,8,9", "3,5", "3,7", "3,9", "4", "4,5", "4,5,12,13", 
    "4,5,16", "4,5,6", "4,5,6,16,17,18,19,20", "4,5,6,20", "4,5,6,7", 
    "4,5,6,7,8", "4,5,6,7,8,10,11", "4,5,6,7,8,9", "4,5,6,7,8,9,10", 
    "4,5,6,7,8,9,10,11", "4,5,6,7,8,9,10,11,12", "4,5,6,7,8,9,10,11,12,13,14,15", 
    "4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19", "4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", 
    "4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,20", "4,5,6,7,8,9,16,17", 
    "4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "4,6,7", "4,7,13", 
    "5", "5,11,12,14,15,16,17,18,19", "5,14", "5,14,15,16", "5,16,19", 
    "5,17,18,19,20", "5,18", "5,6", "5,6,7", "5,6,7,10", "5,6,7,8", 
    "5,6,7,8,10", "5,6,7,8,9", "5,6,7,8,9,10", "5,6,7,8,9,10,11", 
    "5,6,7,8,9,10,11,12", "5,6,7,8,9,10,11,12,13", "5,6,7,8,9,10,11,12,13,14", 
    "5,6,7,8,9,10,11,12,13,14,15,16", "5,6,7,8,9,10,11,12,13,14,15,16,17,18", 
    "5,6,7,8,9,10,11,12,13,14,15,16,17,18,19", "5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", 
    "5,6,7,8,9,16,17,18,19,20", "5,6,8", "5,7,8,9,10", "5,7,8,9,10,14,15,16,17,18", 
    "5,8", "6", "6,7", "6,7,16", "6,7,8", "6,7,8,10,11,12,15,16,17,18", 
    "6,7,8,19", "6,7,8,9", "6,7,8,9,10", "6,7,8,9,10,11", "6,7,8,9,10,11,12", 
    "6,7,8,9,10,11,12,13,14", "6,7,8,9,10,11,12,13,14,15,16,17", 
    "6,7,8,9,10,11,12,13,14,15,16,17,18,19", "6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", 
    "6,7,8,9,10,11,12,14,15,16", "6,7,8,9,10,18,19", "7", "7,10,11,14,15", 
    "7,12", "7,8", "7,8,12", "7,8,9", "7,8,9,10", "7,8,9,10,11", 
    "7,8,9,10,11,12", "7,8,9,10,11,12,13", "7,8,9,10,11,12,13,14,15,16", 
    "7,8,9,10,11,12,13,14,15,16,17,18", "7,8,9,10,11,12,13,14,15,16,17,18,19", 
    "7,8,9,10,11,12,13,14,15,16,17,18,19,20", "7,8,9,10,11,12,14,15,16,17,18,19", 
    "7,8,9,10,11,12,14,15,16,17,18,19,20", "7,8,9,10,12,15,16,17,18", 
    "7,9,10,11,12,13,14,15,16,17,18,19,20", "8", "8,10", "8,10,20", 
    "8,14,15,16,17,18,19,20", "8,16,17", "8,9", "8,9,10", "8,9,10,11", 
    "8,9,10,11,12", "8,9,10,11,12,13,14", "8,9,10,11,12,13,14,15", 
    "8,9,10,11,12,13,14,15,16", "8,9,10,11,12,13,14,15,16,17,18", 
    "8,9,10,11,12,13,14,15,16,17,18,19", "8,9,10,11,12,13,14,15,16,17,18,19,20", 
    "8,9,10,11,12,14,15,16", "8,9,10,11,12,14,15,16,17,18,19,20", 
    "8,9,10,14,15,16,17,18,19,20", "8,9,17", "9", "9,10", "9,10,11", 
    "9,10,11,12", "9,10,11,12,13,14,15,16,17", "9,10,11,12,13,14,15,16,17,18", 
    "9,10,11,12,13,14,15,16,17,18,19", "9,10,11,12,13,14,15,16,17,18,19,20", 
    "9,10,11,12,14,15,16", "9,10,11,12,14,15,16,17,18", "9,10,11,12,14,15,16,17,18,19", 
    "9,10,11,12,14,15,16,17,18,19,20", "9,10,11,12,16,17,18,19,20", 
    "9,10,11,14,15,16,17", "9,10,12,14,15,16,17", "9,10,14,15", 
    "9,11,12", "9,11,12,14", "9,12,14", "9,20"), class = "factor")), .Names = c("10", 
"33.95", "58.66", "84.42", "110.21", "134.16", "164.69", "199.1", 
"234.35", "257.19", "361.84", "432.74", "506.34", "581.46", "651.71", 
"732.59", "817.56", "896.24", "971.77", "1038.91", "Clusters"
), row.names = c("at1g01050.1", "at1g01080.1", "at1g01090.1", 
"at1g01320.2", "at1g01470.1", "at1g01800.1", "at1g01910.5", "at1g01960.1", 
"at1g01980.1", "at1g02150.1", "at1g02470.1", "at1g02500.2", "at1g02560.1", 
"at1g02780.1", "at1g02816.1", "at1g02880.2", "at1g02920.1", "at1g02930.2", 
"at1g03030.1", "at1g03090.2"), class = "data.frame")

The last column (Clusters) and the row.names are what matter here. The Clusters column says in which columns we can find any abundance for that gene. It doesn't matter to me exactly which cluster a gene is in, only which genes come together with it.

Let's use an example:

Those genes belong to the same cluster (cluster 5) in data1.
at1g09640.1
at1g07250.1
at1g08200.1
at1g09300.2    ##
at1g09490.2    ## Those
at1g09760.1    ##
at1g09780.1

If we analyze another data set (data2), we can see that some of those genes can be found together again. Maybe it's a different cluster (cluster 20) or so, but they are together, and that's what matters most to me.

at1g02880.3
at1g01220.1
at1g09300.2   ## 
at1g09490.2   ## Those
at1g09760.1   ## 
at1g02130.1

I have about 15 similar data sets, and I would like to be able to ask R: show me the genes which can be found together in 15 of 15 data sets, or 13 of 15 data sets, and so on.

Any ideas ?

like image 766
Shaxi Liver Avatar asked Nov 01 '22 19:11

Shaxi Liver


1 Answers

First, you need to turn those comma-delimited lists into columns; it is much easier to work with them that way. Then, you want to find which genes have matching columns. Finally, you can aggregate to get totals of how many genes match other genes.

Note that you will have both orders of genes, as well as genes matched with themselves. Also, the "Clusters" column will tell you how many times they were in the same exact set of clusters.

This will run in O(n^2) time, meaning that doubling the number of genes analyzed will quadruple the time. My quick timing tests estimate it would take 15 hours on my computer to do 15 data frames of 2300 rows.

library(plyr)

# Tables to compare; each one is processed independently and the per-table
# results are combined afterwards.
frame_list <- list(
  tbl_col_clu1,
  tbl_col_clu2,
  tbl_col_clu3
)

turn_numbers_into_columns <- function(x) {
  # Adds one indicator column per cluster number listed in x$Clusters.
  #
  # Args:
  #   x: A data.frame with a `Clusters` column holding a comma-delimited list
  #      of cluster numbers (e.g. "12,13"). Only the first row's list is read,
  #      so the function is meant to be applied one row at a time.
  #
  # Returns:
  #   x with one additional column per listed cluster number, each set to 1.

  # as.character() makes factor input work (strsplit() only accepts character
  # vectors); unique() keeps a repeated number such as "3,3,4" from producing
  # duplicate column subscripts in the assignment below.
  cluster_ids <- unique(strsplit(as.character(x$Clusters), ",")[[1]])
  x[, cluster_ids] <- 1
  x
}

get_comparison <- function(current_table) {
  # Builds the pairwise gene-vs-gene comparison frame for one input table.
  #
  # Args:
  #   current_table: A data.frame whose row names are gene identifiers and
  #     which has a `Clusters` column of comma-delimited cluster numbers.
  #
  # Returns:
  #   A data.frame with one row per ordered pair of genes (including a gene
  #   paired with itself). `gene` and `gene2` identify the pair; the other
  #   columns hold the element-wise result of comparing the two genes' rows.

  # Keep only the gene name and its cluster list (as plain character).
  gene_clusters <- data.frame(
    gene = row.names(current_table),
    Clusters = as.character(current_table$Clusters),
    stringsAsFactors = FALSE
  )

  # Expand each gene's cluster list into per-cluster columns, row by row.
  membership <- adply(gene_clusters, 1, turn_numbers_into_columns)

  # Compare two single-gene rows column by column, then label the pair.
  compare_pair <- function(left, right) {
    matches <- as.data.frame(left == right)
    matches$gene <- left$gene
    matches$gene2 <- right$gene
    matches
  }

  # Compare one gene against every gene in the table.
  compare_against_all <- function(left) {
    ddply(membership, "gene", function(right) compare_pair(left, right))
  }

  # This is the slow part: every gene is compared with every gene.
  ddply(membership, "gene", compare_against_all)
}

# Run the pairwise comparison on every table and stack the results.
combined_frame <- ldply(frame_list, get_comparison)

# For every (gene, gene2) pair, sum each comparison column across all tables.
# NA entries are ignored by the sum. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
sum_frame <- aggregate(
  combined_frame[, !(names(combined_frame) %in% c("gene", "gene2"))],
  by = combined_frame[, c("gene", "gene2")],
  FUN = sum,
  na.rm = TRUE)

# Open the totals in the interactive data viewer.
View(sum_frame)

If you had consistently the same set of genes and groupings, you could turn everything into arrays, which run faster than data frames, cutting your time by a factor of about six. The part that runs very slowly would be replaced with something like this. It returns 3-dimensional arrays that you could add together.

# Array-based replacement for the slow pairwise comparison: every slice along
# the first dimension of split_f is compared with every other slice.
comparison_frame <- aaply(split_f, 1, function(x) {
    # Print the slice currently being processed (presumably as a progress
    # indicator).
    print(x)
    aaply(split_f, 1, function(y) {
      # Element-wise match of the two slices, stored as a 1 x length(x) array.
      array(x == y, c(1, length(x)))
    })
  })
like image 93
waternova Avatar answered Nov 15 '22 07:11

waternova