Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

reshape/melt an asymmetric matrix according to a rowKey

I want to reshape/melt an asymmetric matrix such that any two rows are summed across columns when both column cells are non-zero according to a rowKey. I've tried various option but none that have worked. I'm looking for a general solution that works for large asymmetric matrices.

#Dummy data    
set.seed(123)    

mat <- matrix(rbinom(20,100,0.01),4,5,dimnames=list(LETTERS[1:4],letters[1:5]))

mat
  a b c d e
A 0 3 1 1 0
B 2 0 1 1 0
C 1 1 3 0 0
D 2 2 1 2 3

rowKey <- c("A"="N1","B"="N1","C"="N2","D"="N2")

#Desired output

    N1 N2 N3 value
1   A  C  a     0
2   A  C  b     4
3   A  C  c     4
4   A  C  d     0
5   A  C  e     0
6   B  C  a     3
7   B  C  b     0
8   B  C  c     4
9   B  C  d     0
10  B  C  e     0
11  A  D  a     0
12  A  D  b     5
13  A  D  c     2
14  A  D  d     3
15  A  D  e     0
16  B  D  a     4
17  B  D  b     0
18  B  D  c     2
19  B  D  d     3
20  B  D  e     0

Any pointer is highly appreciated!

like image 376
jO. Avatar asked Nov 02 '17 20:11

jO.


3 Answers

temp = expand.grid(c(split(names(rowKey), rowKey), list(N3 = colnames(mat))))

temp2 = sapply(1:nrow(temp), function(i)
    mat[row.names(mat) == temp$N1[i] | row.names(mat) == temp$N2[i],
        colnames(mat) == temp$N3[i]])

temp$value = colSums(temp2) * (colSums(temp2 > 0) == nrow(temp2))
temp
#   N1 N2 N3 value
#1   A  C  a     0
#2   B  C  a     3
#3   A  D  a     0
#4   B  D  a     4
#5   A  C  b     4
#6   B  C  b     0
#7   A  D  b     5
#8   B  D  b     0
#9   A  C  c     4
#10  B  C  c     4
#11  A  D  c     2
#12  B  D  c     2
#13  A  C  d     0
#14  B  C  d     0
#15  A  D  d     3
#16  B  D  d     3
#17  A  C  e     0
#18  B  C  e     0
#19  A  D  e     0
#20  B  D  e     0
like image 150
d.b Avatar answered Nov 01 '22 15:11

d.b


Here is a longer method with your data:

set.seed(123)
mat <- matrix(rbinom(20,100,0.01),4,5,dimnames=list(LETTERS[1:4],letters[1:5]))

rowKey <- c("A"="N1","B"="N1","C"="N2","D"="N2")

Split the matrix by rowKey:

> n1 <- names(which(rowKey=="N1"))
> mat[n1,]
  a b c d e
A 0 3 1 1 0
B 2 0 1 1 0
> n2 <- names(which(rowKey=="N2"))
> mat[n2,]
  a b c d e
C 1 1 3 0 0
D 2 2 1 2 3

Convert data then into molten data frames.

> library(reshape2)
> mmat1 <- melt(mat[n1,])
> mmat1
   Var1 Var2 value
1     A    a     0
2     B    a     2
3     A    b     3
4     B    b     0
5     A    c     1
6     B    c     1
7     A    d     1
8     B    d     1
9     A    e     0
10    B    e     0
> mmat2 <- melt(mat[n2,])
> mmat2
   Var1 Var2 value
1     C    a     1
2     D    a     2
3     C    b     1
4     D    b     2
5     C    c     3
6     D    c     1
7     C    d     0
8     D    d     2
9     C    e     0
10    D    e     3

Then merge the data frames by column name, beware of column names in merging

> colnames(mmat1) <- c("N1","N3","Val1")
> colnames(mmat2) <- c("N2","N3","Val2")
> mmat12 <- merge(mmat1,mmat2)

Now we may compose the final matrix with the conditions

> res <- cbind(mmat12[c('N1','N2','N3')],mmat12['Val1']+mmat12['Val2'])
> res[(mmat12['Val1'] == 0)|(mmat12['Val2'] == 0),4] <- 0
> res[with(res, order(N1,N2,N3)),]
   N1 N2 N3 Val1
1   A  C  a    0
5   A  C  b    4
9   A  C  c    4
13  A  C  d    0
17  A  C  e    0
2   A  D  a    0
6   A  D  b    5
10  A  D  c    2
14  A  D  d    3
18  A  D  e    0
3   B  C  a    3
7   B  C  b    0
11  B  C  c    4
15  B  C  d    0
19  B  C  e    0
4   B  D  a    4
8   B  D  b    0
12  B  D  c    2
16  B  D  d    3
20  B  D  e    0
like image 39
Heikki Avatar answered Nov 01 '22 14:11

Heikki


Here's some data.frame-centered options using the tidyverse for manipulations:

library(tidyverse)
set.seed(123)    

mat <- matrix(rbinom(20, 100, 0.01), 4, 5, 
              dimnames = list(LETTERS[1:4], letters[1:5]))
rowKey <- c("A" = "N1", "B" = "N1", "C" = "N2", "D" = "N2")

output1 <- mat %>% 
    as.data.frame() %>% 
    rownames_to_column('N1') %>% 
    gather(N3, value, -N1) %>%    # reshape to long form
    crossing(N2 = .$N1) %>%    # add combinations of rowname values
    filter(N1 != N2, rowKey[N1] != rowKey[N2]) %>%    # drop unwanted combinations
    mutate(value = na_if(value, 0),    # change 0 values to NA so sum will be 0
           # sort rowname values to make group ID column for aggregation
           id = map2_chr(N1, N2, ~toString(sort(c(.x, .y))))) %>% 
    group_by(id, N3) %>% 
    summarise(N1 = min(N1),    # get alpabetically first rowname for N1
              N2 = max(N2),    # and last for N2
              value = coalesce(sum(value), 0L)) %>%    # sum and replace NAs with 0s again
    # clean up
    ungroup() %>% 
    select(N1, N2, N3, value) %>% 
    arrange(N2)

output1
#> # A tibble: 20 x 4
#>       N1    N2    N3 value
#>    <chr> <chr> <chr> <int>
#>  1     A     C     a     0
#>  2     A     C     b     4
#>  3     A     C     c     4
#>  4     A     C     d     0
#>  5     A     C     e     0
#>  6     B     C     a     3
#>  7     B     C     b     0
#>  8     B     C     c     4
#>  9     B     C     d     0
#> 10     B     C     e     0
#> 11     A     D     a     0
#> 12     A     D     b     5
#> 13     A     D     c     2
#> 14     A     D     d     3
#> 15     A     D     e     0
#> 16     B     D     a     4
#> 17     B     D     b     0
#> 18     B     D     c     2
#> 19     B     D     d     3
#> 20     B     D     e     0

Like expand.grid, tidyr::crossing expands more than necessary (e.g. A/A combinations), which may slow things down at scale. A combn-based approach may be faster, if more annoying to write.

Splitting and using a self-join is a more direct way to create the combinations through adding columns instead of rows. It requires some light list gymnastics, either with split:

output2 <- mat %>% 
    as.data.frame() %>% 
    rownames_to_column('N') %>% 
    gather(N3, value, -N) %>% 
    mutate(key = rowKey[N],    # add column with key
           value = na_if(value, 0)) %>% 
    split(.$key) %>%    # split list by key
    # join list elements to add N1/N2 and value combinations
    reduce(full_join, by = 'N3', suffix = sub('N', '', names(.))) %>% 
    transmute(N1, N2, N3, 
              value = coalesce(value1 + value2, 0L)) %>%
    arrange(N2, N1)

all_equal(output1, output2)
#> [1] TRUE

...or tidyr::nest:

output3 <- mat %>% 
    as.data.frame() %>% 
    rownames_to_column('N') %>% 
    gather(N3, value, -N) %>% 
    mutate(key = rowKey[N], 
           value = na_if(value, 0)) %>% 
    nest(-key) %>%    # store all but key column as nested data frame
    # join nested data frames by N3 to get N1/N2 and value combinations
    { reduce(.$data, full_join, by = 'N3', suffix = sub('N', '', .$key)) } %>% 
    transmute(N1, N2, N3, 
              value = coalesce(value1 + value2, 0L)) %>% 
    arrange(N2, N1)

all_equal(output1, output3)
#> [1] TRUE

The reduce calls could be replaced by purrr::invoke/do.call as reduce only calls full_join once, but reducing over a join is a common idiom and may make the approach easier to scale.

like image 35
alistaire Avatar answered Nov 01 '22 15:11

alistaire