If fullfiles two rules store the names in the vector

Question

Start with the data:

> dput(Data1)
structure(list(X1 = structure(c(17L, 14L, 20L, 16L, 1L, 2L, 3L, 
4L, 15L, 8L, 9L, 10L, 11L, 12L, 13L, 21L, 22L, 23L, 18L, 19L, 
5L, 6L, 7L), .Label = c("Astra_1", "Astra_2", "Astra_3", "Astra_4", 
"Audi_1", "Audi_2", "Audi_3", "BMW_1", "BMW_2", "BMW_3", "BMW_4", 
"BMW_5", "Fiat_1", "Mazda_2", "Mercedes_1", "Nexia_1", "Porsche_1", 
"Scania_1", "Scania_2", "Tico_1", "VW_1", "VW_2", "VW_3"), class = "factor"), 
    X2 = structure(c(2L, 3L, 10L, 7L, 8L, 12L, 9L, 14L, 11L, 
    4L, 5L, 6L, 15L, 13L, 4L, 5L, 9L, 14L, 11L, 1L, 3L, 10L, 
    16L), .Label = c("Astra_1", "Astra_3", "Astra_4", "Audi_1", 
    "Audi_2", "Audi_3", "BMW_1", "BMW_2", "Mazda_2", "Mercedes_1", 
    "Nexia_1", "Porsche_1", "Scania_2", "Tico_1", "VW_2", "VW_3"
    ), class = "factor"), AUC_1 = c(5860133.702, 1296009.939, 
    333123.4932, 250348.9407, 1376193.334, 4080502.863, 3777603.233, 
    3503973.487, 99101538.62, 231873.8462, 87258.75465, 147430.9913, 
    1028986.892, 1451482.832, 8136.72382, 25311.41683, 131352.7137, 
    565410.8186, 30196.23792, 70184.82268, 2526321.019, 381643.2138, 
    819687.9824), AUC_2 = c(4849720.322, 928980.4715, 320547.6185, 
    223287.2029, 1340641.323, 4720329.699, 4369150.434, 3371021.243, 
    108591253.3, 266489.7601, 85384.84604, 165726.7626, 1052130.559, 
    1470876.65, 9499.927679, 49309.74984, 138482.765, 444600.7911, 
    25132.73714, 55453.67019, 2038911.81, 422559.3293, 1445477.433
    ), ratio = c(1.20834467, 1.395088463, 1.03923247, 1.121196994, 
    1.02651866, 0.864452935, 0.864608186, 1.039439753, 0.91261069, 
    0.87010415, 1.021946618, 0.889602795, 0.978003046, 0.98681479, 
    0.856503765, 0.513314647, 0.948513078, 1.271726974, 1.201470327, 
    1.265647926, 1.2390536, 0.90317072, 0.567070757), Country = structure(c(1L, 
    1L, 2L, 3L, 5L, 1L, 5L, 1L, 4L, 7L, 4L, 7L, 7L, 7L, 6L, 6L, 
    6L, 6L, 8L, 8L, 6L, 6L, 7L), .Label = c("France", "Germany", 
    "Italy", "Norway", "Poland", "Spain", "Sweden", "Ukraine"
    ), class = "factor"), Comp = structure(c(3L, 5L, 16L, 9L, 
    8L, 9L, 12L, 14L, 4L, 15L, 11L, 14L, 16L, 17L, 10L, 10L, 
    12L, 13L, 1L, 2L, 5L, 6L, 7L), .Label = c("11,12", "12,13", 
    "12,13,14", "14,15", "14,15,16", "15,16,17", "16,17,18", 
    "2,3", "2,3,4", "3,4", "3,4,5", "4,5,6", "5,6", "5,6,7", 
    "5,6,7,8", "6,7,8", "7,8,9"), class = "factor")), .Names = c("X1", 
"X2", "AUC_1", "AUC_2", "ratio", "Country", "Comp"), class = "data.frame", row.names = c(NA, 
-23L))

Head of the data look like that:

         X1         X2     AUC_1     AUC_2     ratio Country     Comp
1 Porsche_1    Astra_3 5860133.7 4849720.3 1.2083447  France 12,13,14
2   Mazda_2    Astra_4 1296009.9  928980.5 1.3950885  France 14,15,16
3    Tico_1 Mercedes_1  333123.5  320547.6 1.0392325 Germany    6,7,8
4   Nexia_1      BMW_1  250348.9  223287.2 1.1211970   Italy    2,3,4
5   Astra_1      BMW_2 1376193.3 1340641.3 1.0265187  Poland      2,3
6   Astra_2  Porsche_1 4080502.9 4720329.7 0.8644529  France    2,3,4

Now we are going to focus on two last columns: Country and Comp. I would like to extract all rows which contains the same country and than compare if any of the numbers in column Comp is the same the strings from X1 and X2 should be stored together - possibly in the separate vectors or in the matrix. It's possible that one row may belong to different "clusters"/"vectors".

Example of desired output. That's just an example and the clustering is completly random. Any method for visualization of the output is acceptable.

    Country         1        2        3         4        5       6
1    France    Astra_3  Scania_2   Tico_1       NA       NA       NA 
2    Poland    Astra_4   Mazda_2   VW_3       Tico_2     NA       NA
3    Sweden Mercedes_1    BMW_1    BMW_2      Audi_1    VW_3      NA  
4    Norway      BMW_1   Astra_1  Scania_2    Audi_3     NA       NA

inscaven · Accepted Answer

Assuming dat is your data.

library(data.table)
library(stringr)

setDT(dat)

dat[, `:=`(X1 = as.character(X1), X2 = as.character(X2), 
           Comp = str_split(as.character(Comp), ","))]

dat[, lapply(.SD, unlist), by = 1:nrow(dat)
    ][, .(X = paste(sort(unique(c(X1, X2))), collapse = ",")), by = .(Country, Comp)
      ][, .(SharedComp = paste(Comp, collapse = ",")), by = .(Country, X)] -> result

head(result)

   Country                                 X SharedComp
1:  France                 Astra_3,Porsche_1      12,13
2:  France Astra_3,Astra_4,Mazda_2,Porsche_1         14
3:  France                   Astra_4,Mazda_2      15,16
4: Germany                 Mercedes_1,Tico_1      6,7,8
5:   Italy                     BMW_1,Nexia_1      2,3,4
6:  Poland                     Astra_1,BMW_2        2,3

If you want output to look more like in your question, it's necessary to do some reshaping.

dcast(result[, .(Country, SharedComp, X = str_split(X, ","))
             ][, lapply(.SD, unlist), by = 1:nrow(result)
               ][, i := seq_len(.N), by = nrow], 
      nrow + Country ~ i, value.var = "X")

   nrow Country          1          2        3          4        5     6        7    8
1:    1  France    Astra_3  Porsche_1       NA         NA       NA    NA       NA   NA
2:    2  France    Astra_3    Astra_4  Mazda_2  Porsche_1       NA    NA       NA   NA
3:    3  France    Astra_4    Mazda_2       NA         NA       NA    NA       NA   NA
4:    4 Germany Mercedes_1     Tico_1       NA         NA       NA    NA       NA   NA
5:    5   Italy      BMW_1    Nexia_1       NA         NA       NA    NA       NA   NA
6:    6  Poland    Astra_1      BMW_2       NA         NA       NA    NA       NA   NA
---
11:  11  Sweden     Audi_1     Audi_3    BMW_1      BMW_3       NA    NA       NA   NA
12:  12  Sweden     Audi_1     Audi_3    BMW_1      BMW_3    BMW_4  VW_2       NA   NA
13:  13  Sweden     Audi_1     Audi_3    BMW_1      BMW_3    BMW_4 BMW_5 Scania_2 VW_2
---
25:  25   Spain     Audi_2 Mercedes_1       NA         NA       NA    NA       NA   NA
26:  26  Sweden     Audi_3       VW_3       NA         NA       NA    NA       NA   NA
   nrow Country          1          2        3          4        5     6        7    8

If fullfiles two rules store the names in the vector

Tags:

r

Shaxi Liver

1 Answers

inscaven

Recent Activity

Donate For Us

If fullfiles two rules store the names in the vector

Tags:

r

Shaxi Liver

1 Answers

inscaven

Related questions

Recent Activity

Donate For Us