I have a list of data.frames. For example
set.seed(1)
my_list <- list()
ids = c("a","b","c","d","e")
for(i in 1:5){
my_list[[i]] <- data.frame(id = ids, p = rnorm(length(ids)), m = rnorm(length(ids)), hp = runif(length(ids)), hm = runif(length(ids)), d = rnorm(length(ids)), a = rnorm(length(ids)))
}
What I want is to efficiently compute for each id (in the "id" column) the variance of the "p", "m", "d", and "a" columns over all data frames in the list. Ideally, this would return a data.frame like this (based on the values drawn above):
> result.df
id var_p var_m var_d var_a
1 a 0.2371569 1.7810729 0.08264279 0.5074250
2 b 0.1091675 0.2107997 1.15051229 1.1578691
3 c 0.5385789 0.7650123 0.44215343 0.3137903
4 d 1.0174542 0.7818498 0.06414317 0.6079849
5 e 0.7343667 1.2870542 1.41615858 0.7362462
Using my_list
library(plyr)
df = do.call(rbind, my_list)
out = ddply(df, .(id), colwise(var, c('p','m','d','a')))
#> out
# id p m d a
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
Or base R alternative, using the combination of lapply and apply
df = do.call(rbind, my_list)
df1 = do.call(rbind,
lapply(split(df, df$id),
function(x) apply(subset(x, select = c(p,m,d,a)), 2, var)))
out = transform(df1, id = row.names(df1))
#> out
# p m d a id
#a 0.2371569 1.7810729 0.08264279 0.5074250 a
#b 0.1091675 0.2107997 1.15051229 1.1578691 b
#c 0.5385789 0.7650123 0.44215343 0.3137903 c
#d 1.0174542 0.7818498 0.06414317 0.6079849 d
#e 0.7343667 1.2870542 1.41615858 0.7362462 e
Or using doBy
library(doBy)
df = do.call(rbind, my_list)
out = summaryBy( p + m + d + a ~ id , data = df, keep.names=TRUE, FUN = var)
#> out
# id p m d a
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
Or using sqldf
library(sqldf)
df = do.call(rbind, my_list)
out = sqldf("select id, variance(p), variance(m),
variance(d), variance(a) from df group by id")
#> out
# id variance(p) variance(m) variance(d) variance(a)
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
Here is a base R approach
dat <- do.call(rbind,my_list)
aggregate( cbind(p,m,d,a) ~ id, var, data=dat)
which gives
id p m d a
1 a 0.2371569 1.7810729 0.08264279 0.5074250
2 b 0.1091675 0.2107997 1.15051229 1.1578691
3 c 0.5385789 0.7650123 0.44215343 0.3137903
4 d 1.0174542 0.7818498 0.06414317 0.6079849
5 e 0.7343667 1.2870542 1.41615858 0.7362462
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With