I want to reshape/melt an asymmetric matrix such that any two rows are summed across columns when both column cells are non-zero according to a rowKey. I've tried various option but none that have worked. I'm looking for a general solution that works for large asymmetric matrices.
#Dummy data
set.seed(123)
mat <- matrix(rbinom(20,100,0.01),4,5,dimnames=list(LETTERS[1:4],letters[1:5]))
mat
a b c d e
A 0 3 1 1 0
B 2 0 1 1 0
C 1 1 3 0 0
D 2 2 1 2 3
rowKey <- c("A"="N1","B"="N1","C"="N2","D"="N2")
#Desired output
N1 N2 N3 value
1 A C a 0
2 A C b 4
3 A C c 4
4 A C d 0
5 A C e 0
6 B C a 3
7 B C b 0
8 B C c 4
9 B C d 0
10 B C e 0
11 A D a 0
12 A D b 5
13 A D c 2
14 A D d 3
15 A D e 0
16 B D a 4
17 B D b 0
18 B D c 2
19 B D d 3
20 B D e 0
Any pointer is highly appreciated!
temp = expand.grid(c(split(names(rowKey), rowKey), list(N3 = colnames(mat))))
temp2 = sapply(1:nrow(temp), function(i)
mat[row.names(mat) == temp$N1[i] | row.names(mat) == temp$N2[i],
colnames(mat) == temp$N3[i]])
temp$value = colSums(temp2) * (colSums(temp2 > 0) == nrow(temp2))
temp
# N1 N2 N3 value
#1 A C a 0
#2 B C a 3
#3 A D a 0
#4 B D a 4
#5 A C b 4
#6 B C b 0
#7 A D b 5
#8 B D b 0
#9 A C c 4
#10 B C c 4
#11 A D c 2
#12 B D c 2
#13 A C d 0
#14 B C d 0
#15 A D d 3
#16 B D d 3
#17 A C e 0
#18 B C e 0
#19 A D e 0
#20 B D e 0
Here is a longer method with your data:
set.seed(123)
mat <- matrix(rbinom(20,100,0.01),4,5,dimnames=list(LETTERS[1:4],letters[1:5]))
rowKey <- c("A"="N1","B"="N1","C"="N2","D"="N2")
Split the matrix by rowKey:
> n1 <- names(which(rowKey=="N1"))
> mat[n1,]
a b c d e
A 0 3 1 1 0
B 2 0 1 1 0
> n2 <- names(which(rowKey=="N2"))
> mat[n2,]
a b c d e
C 1 1 3 0 0
D 2 2 1 2 3
Convert data then into molten data frames.
> library(reshape2)
> mmat1 <- melt(mat[n1,])
> mmat1
Var1 Var2 value
1 A a 0
2 B a 2
3 A b 3
4 B b 0
5 A c 1
6 B c 1
7 A d 1
8 B d 1
9 A e 0
10 B e 0
> mmat2 <- melt(mat[n2,])
> mmat2
Var1 Var2 value
1 C a 1
2 D a 2
3 C b 1
4 D b 2
5 C c 3
6 D c 1
7 C d 0
8 D d 2
9 C e 0
10 D e 3
Then merge the data frames by column name, beware of column names in merging
> colnames(mmat1) <- c("N1","N3","Val1")
> colnames(mmat2) <- c("N2","N3","Val2")
> mmat12 <- merge(mmat1,mmat2)
Now we may compose the final matrix with the conditions
> res <- cbind(mmat12[c('N1','N2','N3')],mmat12['Val1']+mmat12['Val2'])
> res[(mmat12['Val1'] == 0)|(mmat12['Val2'] == 0),4] <- 0
> res[with(res, order(N1,N2,N3)),]
N1 N2 N3 Val1
1 A C a 0
5 A C b 4
9 A C c 4
13 A C d 0
17 A C e 0
2 A D a 0
6 A D b 5
10 A D c 2
14 A D d 3
18 A D e 0
3 B C a 3
7 B C b 0
11 B C c 4
15 B C d 0
19 B C e 0
4 B D a 4
8 B D b 0
12 B D c 2
16 B D d 3
20 B D e 0
Here's some data.frame-centered options using the tidyverse for manipulations:
library(tidyverse)
set.seed(123)
mat <- matrix(rbinom(20, 100, 0.01), 4, 5,
dimnames = list(LETTERS[1:4], letters[1:5]))
rowKey <- c("A" = "N1", "B" = "N1", "C" = "N2", "D" = "N2")
output1 <- mat %>%
as.data.frame() %>%
rownames_to_column('N1') %>%
gather(N3, value, -N1) %>% # reshape to long form
crossing(N2 = .$N1) %>% # add combinations of rowname values
filter(N1 != N2, rowKey[N1] != rowKey[N2]) %>% # drop unwanted combinations
mutate(value = na_if(value, 0), # change 0 values to NA so sum will be 0
# sort rowname values to make group ID column for aggregation
id = map2_chr(N1, N2, ~toString(sort(c(.x, .y))))) %>%
group_by(id, N3) %>%
summarise(N1 = min(N1), # get alpabetically first rowname for N1
N2 = max(N2), # and last for N2
value = coalesce(sum(value), 0L)) %>% # sum and replace NAs with 0s again
# clean up
ungroup() %>%
select(N1, N2, N3, value) %>%
arrange(N2)
output1
#> # A tibble: 20 x 4
#> N1 N2 N3 value
#> <chr> <chr> <chr> <int>
#> 1 A C a 0
#> 2 A C b 4
#> 3 A C c 4
#> 4 A C d 0
#> 5 A C e 0
#> 6 B C a 3
#> 7 B C b 0
#> 8 B C c 4
#> 9 B C d 0
#> 10 B C e 0
#> 11 A D a 0
#> 12 A D b 5
#> 13 A D c 2
#> 14 A D d 3
#> 15 A D e 0
#> 16 B D a 4
#> 17 B D b 0
#> 18 B D c 2
#> 19 B D d 3
#> 20 B D e 0
Like expand.grid
, tidyr::crossing
expands more than necessary (e.g. A
/A
combinations), which may slow things down at scale. A combn
-based approach may be faster, if more annoying to write.
Splitting and using a self-join is a more direct way to create the combinations through adding columns instead of rows. It requires some light list gymnastics, either with split
:
output2 <- mat %>%
as.data.frame() %>%
rownames_to_column('N') %>%
gather(N3, value, -N) %>%
mutate(key = rowKey[N], # add column with key
value = na_if(value, 0)) %>%
split(.$key) %>% # split list by key
# join list elements to add N1/N2 and value combinations
reduce(full_join, by = 'N3', suffix = sub('N', '', names(.))) %>%
transmute(N1, N2, N3,
value = coalesce(value1 + value2, 0L)) %>%
arrange(N2, N1)
all_equal(output1, output2)
#> [1] TRUE
...or tidyr::nest
:
output3 <- mat %>%
as.data.frame() %>%
rownames_to_column('N') %>%
gather(N3, value, -N) %>%
mutate(key = rowKey[N],
value = na_if(value, 0)) %>%
nest(-key) %>% # store all but key column as nested data frame
# join nested data frames by N3 to get N1/N2 and value combinations
{ reduce(.$data, full_join, by = 'N3', suffix = sub('N', '', .$key)) } %>%
transmute(N1, N2, N3,
value = coalesce(value1 + value2, 0L)) %>%
arrange(N2, N1)
all_equal(output1, output3)
#> [1] TRUE
The reduce
calls could be replaced by purrr::invoke
/do.call
as reduce
only calls full_join
once, but reducing over a join is a common idiom and may make the approach easier to scale.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With