Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Transform complex data frame

Tags:

dataframe

r

I have a data frame whose 1st column (weights) contains a list (of data frames?):

> head(data$weights)

# Example input: a plain named list with one element per sample (A373R11,
# A373R13, A373R3, A373R5, A373R9, A512R19). Each element is a one-row
# data.frame whose row name is the sample ID and whose 27 numeric columns
# (Signature.1A ... Signature.U2) hold that sample's score per signature.
# The leading "> " is the R console prompt carried over from the original
# paste; drop it before running this statement.
> data <- structure(list(A373R11 = structure(list(Signature.1A = 0, Signature.1B = 0, 
    Signature.2 = 0, Signature.3 = 0.151631702143023, Signature.4 = 0.149799882118262, 
    Signature.5 = 0, Signature.6 = 0, Signature.7 = 0.0634912587993959, 
    Signature.8 = 0, Signature.9 = 0.173189155080817, Signature.10 = 0, 
    Signature.11 = 0, Signature.12 = 0, Signature.13 = 0, Signature.14 = 0, 
    Signature.15 = 0, Signature.16 = 0, Signature.17 = 0, Signature.18 = 0, 
    Signature.19 = 0, Signature.20 = 0, Signature.21 = 0.0905517653558877, 
    Signature.R1 = 0, Signature.R2 = 0, Signature.R3 = 0, Signature.U1 = 0.155590748898003, 
    Signature.U2 = 0.145955461287919), .Names = c("Signature.1A", 
"Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
"Signature.5", "Signature.6", "Signature.7", "Signature.8", "Signature.9", 
"Signature.10", "Signature.11", "Signature.12", "Signature.13", 
"Signature.14", "Signature.15", "Signature.16", "Signature.17", 
"Signature.18", "Signature.19", "Signature.20", "Signature.21", 
"Signature.R1", "Signature.R2", "Signature.R3", "Signature.U1", 
"Signature.U2"), row.names = "A373R11", class = "data.frame"), 
    A373R13 = structure(list(Signature.1A = 0, Signature.1B = 0, 
        Signature.2 = 0, Signature.3 = 0.221014874027829, Signature.4 = 0, 
        Signature.5 = 0, Signature.6 = 0, Signature.7 = 0, Signature.8 = 0.279252211893692, 
        Signature.9 = 0, Signature.10 = 0, Signature.11 = 0, 
        Signature.12 = 0, Signature.13 = 0, Signature.14 = 0, 
        Signature.15 = 0, Signature.16 = 0, Signature.17 = 0, 
        Signature.18 = 0, Signature.19 = 0.115216422668955, Signature.20 = 0, 
        Signature.21 = 0, Signature.R1 = 0, Signature.R2 = 0, 
        Signature.R3 = 0.0636987713225648, Signature.U1 = 0.108875099907467, 
        Signature.U2 = 0), .Names = c("Signature.1A", "Signature.1B", 
    "Signature.2", "Signature.3", "Signature.4", "Signature.5", 
    "Signature.6", "Signature.7", "Signature.8", "Signature.9", 
    "Signature.10", "Signature.11", "Signature.12", "Signature.13", 
    "Signature.14", "Signature.15", "Signature.16", "Signature.17", 
    "Signature.18", "Signature.19", "Signature.20", "Signature.21", 
    "Signature.R1", "Signature.R2", "Signature.R3", "Signature.U1", 
    "Signature.U2"), row.names = "A373R13", class = "data.frame"), 
    A373R3 = structure(list(Signature.1A = 0, Signature.1B = 0, 
        Signature.2 = 0, Signature.3 = 0.0795605471131758, Signature.4 = 0.0973130562439999, 
        Signature.5 = 0, Signature.6 = 0, Signature.7 = 0, Signature.8 = 0.249674548796242, 
        Signature.9 = 0.0725013504411567, Signature.10 = 0, Signature.11 = 0.064665155855146, 
        Signature.12 = 0, Signature.13 = 0, Signature.14 = 0, 
        Signature.15 = 0, Signature.16 = 0, Signature.17 = 0, 
        Signature.18 = 0, Signature.19 = 0, Signature.20 = 0, 
        Signature.21 = 0, Signature.R1 = 0, Signature.R2 = 0, 
        Signature.R3 = 0.0703546703126821, Signature.U1 = 0.21753544296676, 
        Signature.U2 = 0.0739201832004727), .Names = c("Signature.1A", 
    "Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
    "Signature.5", "Signature.6", "Signature.7", "Signature.8", 
    "Signature.9", "Signature.10", "Signature.11", "Signature.12", 
    "Signature.13", "Signature.14", "Signature.15", "Signature.16", 
    "Signature.17", "Signature.18", "Signature.19", "Signature.20", 
    "Signature.21", "Signature.R1", "Signature.R2", "Signature.R3", 
    "Signature.U1", "Signature.U2"), row.names = "A373R3", class = "data.frame"), 
    A373R5 = structure(list(Signature.1A = 0, Signature.1B = 0, 
        Signature.2 = 0, Signature.3 = 0.113996509522102, Signature.4 = 0.114874220936966, 
        Signature.5 = 0.142056872670519, Signature.6 = 0, Signature.7 = 0, 
        Signature.8 = 0.208376707959741, Signature.9 = 0.0744527503782136, 
        Signature.10 = 0, Signature.11 = 0, Signature.12 = 0, 
        Signature.13 = 0, Signature.14 = 0, Signature.15 = 0.0771902641012979, 
        Signature.16 = 0, Signature.17 = 0, Signature.18 = 0, 
        Signature.19 = 0, Signature.20 = 0, Signature.21 = 0, 
        Signature.R1 = 0, Signature.R2 = 0, Signature.R3 = 0, 
        Signature.U1 = 0.0673567355607731, Signature.U2 = 0), .Names = c("Signature.1A", 
    "Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
    "Signature.5", "Signature.6", "Signature.7", "Signature.8", 
    "Signature.9", "Signature.10", "Signature.11", "Signature.12", 
    "Signature.13", "Signature.14", "Signature.15", "Signature.16", 
    "Signature.17", "Signature.18", "Signature.19", "Signature.20", 
    "Signature.21", "Signature.R1", "Signature.R2", "Signature.R3", 
    "Signature.U1", "Signature.U2"), row.names = "A373R5", class = "data.frame"), 
    A373R9 = structure(list(Signature.1A = 0, Signature.1B = 0, 
        Signature.2 = 0, Signature.3 = 0.116847300193985, Signature.4 = 0, 
        Signature.5 = 0.21624751052703, Signature.6 = 0, Signature.7 = 0, 
        Signature.8 = 0.252498230882402, Signature.9 = 0, Signature.10 = 0, 
        Signature.11 = 0.119495912880994, Signature.12 = 0, Signature.13 = 0, 
        Signature.14 = 0, Signature.15 = 0, Signature.16 = 0, 
        Signature.17 = 0, Signature.18 = 0, Signature.19 = 0, 
        Signature.20 = 0, Signature.21 = 0, Signature.R1 = 0, 
        Signature.R2 = 0, Signature.R3 = 0.0725549911220892, 
        Signature.U1 = 0, Signature.U2 = 0), .Names = c("Signature.1A", 
    "Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
    "Signature.5", "Signature.6", "Signature.7", "Signature.8", 
    "Signature.9", "Signature.10", "Signature.11", "Signature.12", 
    "Signature.13", "Signature.14", "Signature.15", "Signature.16", 
    "Signature.17", "Signature.18", "Signature.19", "Signature.20", 
    "Signature.21", "Signature.R1", "Signature.R2", "Signature.R3", 
    "Signature.U1", "Signature.U2"), row.names = "A373R9", class = "data.frame"), 
    A512R19 = structure(list(Signature.1A = 0.109490572493859, 
        Signature.1B = 0, Signature.2 = 0, Signature.3 = 0, Signature.4 = 0.22010156823306, 
        Signature.5 = 0, Signature.6 = 0, Signature.7 = 0, Signature.8 = 0, 
        Signature.9 = 0, Signature.10 = 0, Signature.11 = 0, 
        Signature.12 = 0, Signature.13 = 0, Signature.14 = 0, 
        Signature.15 = 0, Signature.16 = 0, Signature.17 = 0, 
        Signature.18 = 0, Signature.19 = 0, Signature.20 = 0, 
        Signature.21 = 0, Signature.R1 = 0, Signature.R2 = 0, 
        Signature.R3 = 0.150943894106973, Signature.U1 = 0.248556502648564, 
        Signature.U2 = 0.119306892617062), .Names = c("Signature.1A", 
    "Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
    "Signature.5", "Signature.6", "Signature.7", "Signature.8", 
    "Signature.9", "Signature.10", "Signature.11", "Signature.12", 
    "Signature.13", "Signature.14", "Signature.15", "Signature.16", 
    "Signature.17", "Signature.18", "Signature.19", "Signature.20", 
    "Signature.21", "Signature.R1", "Signature.R2", "Signature.R3", 
    "Signature.U1", "Signature.U2"), row.names = "A512R19", class = "data.frame")), .Names = c("A373R11", 
"A373R13", "A373R3", "A373R5", "A373R9", "A512R19"))

Here, each row contains a sample, and each column contains a score for a particular signature:

> data[1]

$A373R11
        Signature.1A Signature.1B Signature.2 Signature.3 Signature.4 Signature.5 Signature.6 Signature.7 Signature.8 Signature.9 Signature.10 Signature.11
A373R11            0            0           0   0.1516317   0.1497999           0           0  0.06349126           0   0.1731892            0            0
        Signature.12 Signature.13 Signature.14 Signature.15 Signature.16 Signature.17 Signature.18 Signature.19 Signature.20 Signature.21 Signature.R1 Signature.R2
A373R11            0            0            0            0            0            0            0            0            0   0.09055177            0            0
        Signature.R3 Signature.U1 Signature.U2
A373R11            0    0.1555907    0.1459555

I would like to transform this into a data frame with the following structure:

sample  signature  score
A373R11  Signature.1A  0
A373R11  Signature.1B  0
[...]
A373R13  Signature.1A  0
A373R13  Signature.1B  0
[...]

Can anyone point me in the right direction?

like image 409
fugu Avatar asked Aug 18 '17 16:08

fugu


People also ask

How do I convert multiple columns to a single column in R?

To combine multiple data frame columns into a single column, use the unite() function from the tidyr package.


2 Answers

Two approaches:

1) with the data.table-package

Using:

library(data.table)

# rbindlist() stacks the list of per-sample data frames into one table and
# stores each list element's name in a new "sample" column (idcol).
# melt() then reshapes wide -> long: one row per sample/signature pair, with
# the former column name in "signature" and its value in "score".
# `id.vars` is written out in full instead of the abbreviated `id`, so the
# call does not depend on partial argument matching.
melt(
  rbindlist(data, idcol = "sample"),
  id.vars = "sample",
  variable.name = "signature",
  value.name = "score"
)

gives:

      sample    signature      score
  1: A373R11 Signature.1A 0.00000000
  2: A373R13 Signature.1A 0.00000000
  3:  A373R3 Signature.1A 0.00000000
  4:  A373R5 Signature.1A 0.00000000
  5:  A373R9 Signature.1A 0.00000000
 ---                                
158: A373R13 Signature.U2 0.00000000
159:  A373R3 Signature.U2 0.07392018
160:  A373R5 Signature.U2 0.00000000
161:  A373R9 Signature.U2 0.00000000
162: A512R19 Signature.U2 0.11930689

2) with base R

Using:

# Stack the one-row data frames held in the list `data` (the object defined in
# the question) into a single wide data frame. The row names of the result
# are used below as the sample IDs.
dat2 <- do.call(rbind, data)

# Reshape wide -> long: every column of dat2 is a measurement, so all column
# indices go into `varying` and the column names become the values of the new
# "signature" column; the scores land in "score" and the row names in "sample".
# seq_len() is used instead of 1:ncol() so a zero-column input cannot yield
# the descending sequence c(1, 0).
reshape(
  dat2,
  idvar = "sample", ids = row.names(dat2),
  varying = list(seq_len(ncol(dat2))), times = colnames(dat2),
  timevar = "signature", v.names = "score",
  new.row.names = NULL, direction = "long"
)

gives:

                        signature      score  sample
A373R11.Signature.1A Signature.1A 0.00000000 A373R11
A373R13.Signature.1A Signature.1A 0.00000000 A373R13
A373R3.Signature.1A  Signature.1A 0.00000000  A373R3
A373R5.Signature.1A  Signature.1A 0.00000000  A373R5
A373R9.Signature.1A  Signature.1A 0.00000000  A373R9

.....

A373R13.Signature.U2 Signature.U2 0.00000000 A373R13
A373R3.Signature.U2  Signature.U2 0.07392018  A373R3
A373R5.Signature.U2  Signature.U2 0.00000000  A373R5
A373R9.Signature.U2  Signature.U2 0.00000000  A373R9
A512R19.Signature.U2 Signature.U2 0.11930689 A512R19

NOTE:

It is better not to give your data the same name as a function. See ?data.

like image 173
Jaap Avatar answered Nov 15 '22 08:11

Jaap


A tidyverse solution, where we first join all the data.frames together, then use gather to reshape them as desired:

library(dplyr)
library(tidyr)

# Stack the list of data frames into one, tagging every row with the name of
# the list element it came from (stored in `sample`), then collapse all other
# columns into signature/score pairs.
# NOTE(review): gather() is superseded in current tidyr in favour of
# pivot_longer(); switching would change the row order of the result, so
# confirm the printed output before migrating.
bind_rows(data, .id = "sample") %>%
  gather(key = "signature", value = "score", -sample)

Gives:

     sample    signature      score
1   A373R11 Signature.1A 0.00000000
2   A373R13 Signature.1A 0.00000000
3    A373R3 Signature.1A 0.00000000
4    A373R5 Signature.1A 0.00000000
5    A373R9 Signature.1A 0.00000000
6   A512R19 Signature.1A 0.10949057
7   A373R11 Signature.1B 0.00000000
8   A373R13 Signature.1B 0.00000000
9    A373R3 Signature.1B 0.00000000
10   A373R5 Signature.1B 0.00000000
....

Can be written as a one-liner without pipes as:

# Same reshape as a nested call: bind_rows() is evaluated first, gather() second.
gather(bind_rows(data, .id = "sample"), key = "signature", value = "score", -sample)
like image 42
Axeman Avatar answered Nov 15 '22 09:11

Axeman