Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Converting a deeply nested list to a dataframe

I have a deeply nested list I would like to convert to a data frame. Here's what the structure looks like:

# Build a three-level nested list: time -> seed -> treatment -> results.
# Each leaf holds two single draws (*.val) and two vectors of eight draws
# (*.vec). The levels are walked in the order time, seed, treatment, and the
# four leaf entries are drawn left to right, so the random numbers are
# consumed in the same sequence as in a fully written-out literal.
# local() keeps the helpers out of the workspace; only `ls` is created.
ls <- local({
  time_keys <- c("10", "20")
  seed_keys <- c("123", "456")
  treatment_keys <- c("0.1", "0.2")

  # One leaf of results; list() evaluates its arguments left to right.
  draw_leaf <- function() {
    list(
      Gmax.val = rnorm(1),
      G2.val = rnorm(1),
      Gmax.vec = rnorm(8),
      G2.vec = rnorm(8)
    )
  }

  # lapply() over a self-named vector keeps the keys as the list names.
  nest_over <- function(keys, make_child) {
    lapply(setNames(nm = keys), function(key) make_child())
  }

  nest_over(time_keys, function() {
    nest_over(seed_keys, function() {
      nest_over(treatment_keys, draw_leaf)
    })
  })
})


> ls[['10']][['123']][['0.1']]
$Gmax.val
[1] -0.1982298

$G2.val
[1] -0.2761515

$Gmax.vec
[1] -0.4732736 -0.5714809 -0.1600405 -0.7138532  0.3503852 -0.7367241  0.3024992 -0.4931045

$G2.vec
[1] -0.2374231 -0.7927135 -0.9554769  0.8733201 -0.4126742  1.8689940  0.1576750 -0.2184344

The names at each nesting level are the values of a different variable. In this example:

ls[[]] = time; 10 or 20
ls[[]][[]] = seed; 123 or 456
ls[[]][[]][[]] = treatment; 0.1 or 0.2 

Ideally I'd like to have the names of sublists used as values in their own columns. I'd like the data frame to look like this:

#  time seed treatment  Gmax.val     G2.val    Gmax.vec     G2.vec
#1   10  123       0.1 0.1972457 -0.1224265  0.06121407  1.5102516
#2   10  123       0.1 0.1972457 -0.1224265 -2.53026477 -0.1320042
#3   10  123       0.1 0.1972457 -0.1224265  0.06648820 -0.2477285
#4   10  123       0.1 0.1972457 -0.1224265 -0.45594701 -0.8577670
#5   10  123       0.1 0.1972457 -0.1224265  0.90828911 -1.0710828
#6   10  123       0.1 0.1972457 -0.1224265  0.56427976  1.5086222

Thanks for the help.

like image 569
Nick O'Brien Avatar asked Sep 15 '20 05:09

Nick O'Brien


2 Answers

Another approach is to:

  1. Melt the nested list to a data.frame with rrapply() from the rrapply package (or similarly with reshape2::melt()).
  2. Reshape the data.frame to the required format using tidyr's pivot_wider() and unnest().
library(rrapply)
library(tidyverse)

# Melt the nested list `ls` to a long data frame (L1-L4 hold the names at
# each nesting level, `value` holds the leaves), then reshape it so that every
# leaf name becomes its own column.
rrapply(ls, how = "melt") %>%                                   ## melt to long df
  rename(time = L1, seed = L2, treatment = L3) %>%              ## name the nesting levels
  pivot_wider(names_from = "L4", values_from = value) %>%       ## one list column per leaf name
  unnest(cols = c(Gmax.val, G2.val, Gmax.vec, G2.vec))          ## expand list columns into rows

#> # A tibble: 64 x 7
#>    time  seed  treatment Gmax.val G2.val Gmax.vec  G2.vec
#>    <chr> <chr> <chr>        <dbl>  <dbl>    <dbl>   <dbl>
#>  1 10    123   0.1         -0.626  0.184   -0.836  1.51  
#>  2 10    123   0.1         -0.626  0.184    1.60   0.390 
#>  3 10    123   0.1         -0.626  0.184    0.330 -0.621 
#>  4 10    123   0.1         -0.626  0.184   -0.820 -2.21  
#>  5 10    123   0.1         -0.626  0.184    0.487  1.12  
#>  6 10    123   0.1         -0.626  0.184    0.738 -0.0449
#>  7 10    123   0.1         -0.626  0.184    0.576 -0.0162
#>  8 10    123   0.1         -0.626  0.184   -0.305  0.944 
#>  9 10    123   0.2          0.821  0.594    0.919 -0.478 
#> 10 10    123   0.2          0.821  0.594    0.782  0.418 
#> # … with 54 more rows

Or using data.table's dcast() to reshape the long table into wide format:

library(data.table)

# Melt the nested list to long form: L1-L4 carry the names at each nesting
# level and `value` is a list column holding the leaves.
long_dt <- as.data.table(rrapply(ls, how = "melt"))

# Spread the leaf names (L4) into columns, one row per L1/L2/L3 combination.
wide_dt <- dcast(long_dt, L1 + L2 + L3 ~ L4, value.var = "value")

# Unlist every list cell within its L1/L2/L3 group; the single *.val entries
# are recycled against the longer *.vec entries.
wide_dt <- wide_dt[
  ,
  lapply(.SD, unlist),
  by = .(L1, L2, L3),
  .SDcols = c("Gmax.val", "G2.val", "Gmax.vec", "G2.vec")
]

# Rename the key columns in place.
setnames(
  wide_dt,
  old = c("L1", "L2", "L3"),
  new = c("time", "seed", "treatment")
)

Some benchmarks

# Time the two reshaping approaches on the same nested list `ls`.
# Each expression is evaluated 25 times (`times = 25`).
microbenchmark::microbenchmark(
  # tidyr route: melt, pivot to wide, unnest the list columns, rename.
  tidyr = {
    rrapply(ls, how = "melt") %>%                            ## melt to long df
      pivot_wider(names_from = "L4") %>%                     ## reshape to wide df
      unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%      ## unnest list columns
      rename(time = L1, seed = L2, treatment = L3)
  },
  # data.table route: melt, dcast to wide, unlist per group, rename in place.
  data.table = {
    wide_dt <- dcast(as.data.table(rrapply(ls, how = "melt")), L1 + L2 + L3 ~ L4)
    wide_dt <- wide_dt[, lapply(.SD, unlist), by = list(L1, L2, L3), .SDcols = c("Gmax.val", "G2.val", "Gmax.vec", "G2.vec")]
    setnames(wide_dt, old = c("L1", "L2", "L3"), new = c("time", "seed", "treatment"))
    # Return the table so this expression also yields the final result.
    wide_dt
  },
  times = 25
)
#> Unit: milliseconds
#>        expr       min        lq      mean    median        uq       max neval
#>       tidyr 17.959197 20.072647 23.662698 21.278771 25.633581 40.593022    25
#>  data.table  2.061861  2.655782  2.966581  2.784425  2.988044  5.032524    25
like image 150
Joris C. Avatar answered Sep 21 '22 17:09

Joris C.


Here's a solution that uses some of the newer "rectangling" methods in tidyr. I'm posting this mainly as an exercise to gain and share some familiarity with these functions - my sense is that this approach could definitely be, well, tidied up a bit. Still, it's a nice way to play with swinging back and forth between wide/long list unpacking.

library(tidyverse)
# NOTE(review): the seed only affects the result if the nested list is
# generated after this call; presumably the printed output below was produced
# that way -- confirm before relying on the exact numbers.
set.seed(1L)

# `ls` is the nested list from the question (time -> seed -> treatment ->
# results). The pipeline alternates between widening one nesting level into
# columns and pivoting those columns back into rows, so that each level's
# names end up as values in their own column.
tibble(time = names(ls), data = ls) %>%
  # Level 2: one column per seed, then seeds back to rows. The value column is
  # temporarily called "treatment" because it holds the per-treatment lists.
  unnest_wider(data) %>%
  pivot_longer(-time, names_to = "seed", values_to = "treatment") %>%
  # Level 3: one column per treatment, then treatments back to rows.
  unnest_wider(treatment) %>%
  pivot_longer(-c(time, seed), names_to = "treatment", values_to = "g_data") %>%
  # Leaves: Gmax.val, G2.val, Gmax.vec and G2.vec become columns.
  unnest_wider(g_data) %>%
  # Tag each time/seed/treatment row so it stays identifiable after expansion.
  mutate(row_n = row_number()) %>%
  # Stack the two vector columns and expand them to one row per element.
  pivot_longer(c(Gmax.vec, G2.vec), names_to = "g", values_to = "g_val") %>%
  unnest_longer(g_val) %>%
  # Number the elements within each vector so that pivot_wider() can pair the
  # i-th Gmax.vec value with the i-th G2.vec value.
  group_by(row_n, time, seed, treatment, Gmax.val, G2.val, g) %>%
  mutate(sub_n = row_number()) %>%
  pivot_wider(names_from = g, values_from = g_val) %>%
  ungroup() %>%
  # Drop the helper indices.
  select(-row_n, -sub_n)

  # A tibble: 64 x 7
   time  seed  treatment Gmax.val G2.val Gmax.vec  G2.vec
   <chr> <chr> <chr>        <dbl>  <dbl>    <dbl>   <dbl>
 1 10    123   0.1         -0.626  0.184   -0.836  1.51  
 2 10    123   0.1         -0.626  0.184    1.60   0.390 
 3 10    123   0.1         -0.626  0.184    0.330 -0.621 
 4 10    123   0.1         -0.626  0.184   -0.820 -2.21  
 5 10    123   0.1         -0.626  0.184    0.487  1.12  
 6 10    123   0.1         -0.626  0.184    0.738 -0.0449
 7 10    123   0.1         -0.626  0.184    0.576 -0.0162
 8 10    123   0.1         -0.626  0.184   -0.305  0.944 
 9 10    123   0.2          0.821  0.594    0.919 -0.478 
10 10    123   0.2          0.821  0.594    0.782  0.418 
# … with 54 more rows
like image 23
andrew_reece Avatar answered Sep 20 '22 17:09

andrew_reece