Start with the data:
> dput(Data1)
structure(list(X1 = structure(c(17L, 14L, 20L, 16L, 1L, 2L, 3L,
4L, 15L, 8L, 9L, 10L, 11L, 12L, 13L, 21L, 22L, 23L, 18L, 19L,
5L, 6L, 7L), .Label = c("Astra_1", "Astra_2", "Astra_3", "Astra_4",
"Audi_1", "Audi_2", "Audi_3", "BMW_1", "BMW_2", "BMW_3", "BMW_4",
"BMW_5", "Fiat_1", "Mazda_2", "Mercedes_1", "Nexia_1", "Porsche_1",
"Scania_1", "Scania_2", "Tico_1", "VW_1", "VW_2", "VW_3"), class = "factor"),
X2 = structure(c(2L, 3L, 10L, 7L, 8L, 12L, 9L, 14L, 11L,
4L, 5L, 6L, 15L, 13L, 4L, 5L, 9L, 14L, 11L, 1L, 3L, 10L,
16L), .Label = c("Astra_1", "Astra_3", "Astra_4", "Audi_1",
"Audi_2", "Audi_3", "BMW_1", "BMW_2", "Mazda_2", "Mercedes_1",
"Nexia_1", "Porsche_1", "Scania_2", "Tico_1", "VW_2", "VW_3"
), class = "factor"), AUC_1 = c(5860133.702, 1296009.939,
333123.4932, 250348.9407, 1376193.334, 4080502.863, 3777603.233,
3503973.487, 99101538.62, 231873.8462, 87258.75465, 147430.9913,
1028986.892, 1451482.832, 8136.72382, 25311.41683, 131352.7137,
565410.8186, 30196.23792, 70184.82268, 2526321.019, 381643.2138,
819687.9824), AUC_2 = c(4849720.322, 928980.4715, 320547.6185,
223287.2029, 1340641.323, 4720329.699, 4369150.434, 3371021.243,
108591253.3, 266489.7601, 85384.84604, 165726.7626, 1052130.559,
1470876.65, 9499.927679, 49309.74984, 138482.765, 444600.7911,
25132.73714, 55453.67019, 2038911.81, 422559.3293, 1445477.433
), ratio = c(1.20834467, 1.395088463, 1.03923247, 1.121196994,
1.02651866, 0.864452935, 0.864608186, 1.039439753, 0.91261069,
0.87010415, 1.021946618, 0.889602795, 0.978003046, 0.98681479,
0.856503765, 0.513314647, 0.948513078, 1.271726974, 1.201470327,
1.265647926, 1.2390536, 0.90317072, 0.567070757), Country = structure(c(1L,
1L, 2L, 3L, 5L, 1L, 5L, 1L, 4L, 7L, 4L, 7L, 7L, 7L, 6L, 6L,
6L, 6L, 8L, 8L, 6L, 6L, 7L), .Label = c("France", "Germany",
"Italy", "Norway", "Poland", "Spain", "Sweden", "Ukraine"
), class = "factor"), Comp = structure(c(3L, 5L, 16L, 9L,
8L, 9L, 12L, 14L, 4L, 15L, 11L, 14L, 16L, 17L, 10L, 10L,
12L, 13L, 1L, 2L, 5L, 6L, 7L), .Label = c("11,12", "12,13",
"12,13,14", "14,15", "14,15,16", "15,16,17", "16,17,18",
"2,3", "2,3,4", "3,4", "3,4,5", "4,5,6", "5,6", "5,6,7",
"5,6,7,8", "6,7,8", "7,8,9"), class = "factor")), .Names = c("X1",
"X2", "AUC_1", "AUC_2", "ratio", "Country", "Comp"), class = "data.frame", row.names = c(NA,
-23L))
Head of the data look like that:
X1 X2 AUC_1 AUC_2 ratio Country Comp
1 Porsche_1 Astra_3 5860133.7 4849720.3 1.2083447 France 12,13,14
2 Mazda_2 Astra_4 1296009.9 928980.5 1.3950885 France 14,15,16
3 Tico_1 Mercedes_1 333123.5 320547.6 1.0392325 Germany 6,7,8
4 Nexia_1 BMW_1 250348.9 223287.2 1.1211970 Italy 2,3,4
5 Astra_1 BMW_2 1376193.3 1340641.3 1.0265187 Poland 2,3
6 Astra_2 Porsche_1 4080502.9 4720329.7 0.8644529 France 2,3,4
Now we are going to focus on two last columns: Country
and Comp
. I would like to extract all rows which contains the same country and than compare if any of the numbers in column Comp
is the same the strings from X1 and X2 should be stored together - possibly in the separate vectors or in the matrix. It's possible that one row may belong to different "clusters"/"vectors".
Example of desired output. That's just an example and the clustering is completly random. Any method for visualization of the output is acceptable.
Country 1 2 3 4 5 6
1 France Astra_3 Scania_2 Tico_1 NA NA NA
2 Poland Astra_4 Mazda_2 VW_3 Tico_2 NA NA
3 Sweden Mercedes_1 BMW_1 BMW_2 Audi_1 VW_3 NA
4 Norway BMW_1 Astra_1 Scania_2 Audi_3 NA NA
Assuming dat
is your data.
library(data.table)
library(stringr)
setDT(dat)
dat[, `:=`(X1 = as.character(X1), X2 = as.character(X2),
Comp = str_split(as.character(Comp), ","))]
dat[, lapply(.SD, unlist), by = 1:nrow(dat)
][, .(X = paste(sort(unique(c(X1, X2))), collapse = ",")), by = .(Country, Comp)
][, .(SharedComp = paste(Comp, collapse = ",")), by = .(Country, X)] -> result
head(result)
Country X SharedComp
1: France Astra_3,Porsche_1 12,13
2: France Astra_3,Astra_4,Mazda_2,Porsche_1 14
3: France Astra_4,Mazda_2 15,16
4: Germany Mercedes_1,Tico_1 6,7,8
5: Italy BMW_1,Nexia_1 2,3,4
6: Poland Astra_1,BMW_2 2,3
If you want output to look more like in your question, it's necessary to do some reshaping.
dcast(result[, .(Country, SharedComp, X = str_split(X, ","))
][, lapply(.SD, unlist), by = 1:nrow(result)
][, i := seq_len(.N), by = nrow],
nrow + Country ~ i, value.var = "X")
nrow Country 1 2 3 4 5 6 7 8
1: 1 France Astra_3 Porsche_1 NA NA NA NA NA NA
2: 2 France Astra_3 Astra_4 Mazda_2 Porsche_1 NA NA NA NA
3: 3 France Astra_4 Mazda_2 NA NA NA NA NA NA
4: 4 Germany Mercedes_1 Tico_1 NA NA NA NA NA NA
5: 5 Italy BMW_1 Nexia_1 NA NA NA NA NA NA
6: 6 Poland Astra_1 BMW_2 NA NA NA NA NA NA
---
11: 11 Sweden Audi_1 Audi_3 BMW_1 BMW_3 NA NA NA NA
12: 12 Sweden Audi_1 Audi_3 BMW_1 BMW_3 BMW_4 VW_2 NA NA
13: 13 Sweden Audi_1 Audi_3 BMW_1 BMW_3 BMW_4 BMW_5 Scania_2 VW_2
---
25: 25 Spain Audi_2 Mercedes_1 NA NA NA NA NA NA
26: 26 Sweden Audi_3 VW_3 NA NA NA NA NA NA
nrow Country 1 2 3 4 5 6 7 8
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With