Consider the following data frame:
set.seed(5678)
sub_df<- data.frame(clustersize= rep(1, 4),
lepsp= c("A", "B", "C", "D"),
dens= round(runif(4, c(0, 1)), 3),
db= sample(1:10, 4, replace=TRUE))
Let's say I wanted to run the following Bayes linear model which returns samples
, an mc.array
object:
library("rjags")
library("coda")
dataForJags <- list(dens=sub_df$dens, db=sub_df$db, N=length(sub_df$dens))
model<-"model{
for(i in 1:N){
dens[i] ~ dnorm(mu[i], tau)
# identity
mu[i] <- int + beta1*db[i]
}
tau ~ dgamma(0.1,0.1)
int ~ dnorm(0, 0.001)
beta1 ~ dnorm(0, 0.001)
}"
##compile
mod1 <- jags.model(textConnection(model),data= dataForJags,n.chains=2)
##samples returns a list of mcarray objects
samples<-jags.samples(model= mod1,variable.names=c("beta1",
"int","mu","tau"),n.iter=100000)
Given that samples$beta1[,,]
represents random samples from the posterior distribution of the parameters of the jags model, then to summarize, my next step would be to calculate the mean and the 95% credible intervals of the posterior distribution. So I would use:
coeff_output<- round(quantile(samples$beta1[,,],probs=c(0.5,0.025,0.975)),3)
Now, let's say my actual data frame has multiple levels of clustersize
.
set.seed(5672)
df<- data.frame(clustersize= c(rep(1, 4), rep(2,4), rep(3, 3)),
lepsp= c("A", "B", "C", "D", "B", "C", "D", "E", "A", "D", "F"),
dens= round(runif(11, c(0, 1)), 3),
db= sample(1:10, 11, replace=TRUE))
How would I run this model for each level of clustersize
separately and compile the output into a single result data frame using a forloop
or apply
function? For each level of clustersize
, the resulting mc.array
object samples
should be output to result_list
and the coeff_output
should be output to a data frame result_coeff
.
Below I calculate the output for each clustersize
separately, to produce the expected result list and data frame.
#clustersize==1
sub_df1<- data.frame(clustersize= rep(1, 4),
lepsp= c("A", "B", "C", "D"),
dens= round(runif(4, c(0, 1)), 3),
db= sample(1:10, 4, replace=TRUE))
dataForJags <- list(dens=sub_df$dens, db=sub_df$db, N=length(sub_df$dens))
model<-"model{
for(i in 1:N){
dens[i] ~ dnorm(mu[i], tau)
mu[i] <- int + beta1*db[i]
}
tau ~ dgamma(0.1,0.1)
int ~ dnorm(0, 0.001)
beta1 ~ dnorm(0, 0.001)
}"
mod1 <- jags.model(textConnection(model),data= dataForJags,n.chains=2)
samples1<-jags.samples(model= mod1,variable.names=c("beta1",
"int","mu","tau"),n.iter=100000)
coeff_output1<-
data.frame(as.list(round(quantile(samples1$beta1[,,],probs=c(0.5,0.025,0.975)),3)))
#clustersize==2
sub_df2<- data.frame(clustersize= rep(2,4),
lepsp= c( "B", "C", "D", "E"),
dens= round(runif(4, c(0, 1)), 3),
db= sample(1:10, 4, replace=TRUE))
dataForJags <- list(dens=sub_df$dens, db=sub_df$db, N=length(sub_df$dens))
model<-"model{
for(i in 1:N){
dens[i] ~ dnorm(mu[i], tau)
mu[i] <- int + beta1*db[i]
}
tau ~ dgamma(0.1,0.1)
int ~ dnorm(0, 0.001)
beta1 ~ dnorm(0, 0.001)
}"
mod1 <- jags.model(textConnection(model),data= dataForJags,n.chains=2)
samples2<-jags.samples(model= mod1,variable.names=c("beta1",
"int","mu","tau"),n.iter=100000)
coeff_output2<-
data.frame(as.list(round(quantile(samples2$beta1[,,],probs=c(0.5,0.025,0.975)),3)))
#clustersize==3
sub_df3<- data.frame(clustersize= rep(3, 3),
lepsp= c("A", "D", "F"),
dens= round(runif(3, c(0, 1)), 3),
db= sample(1:10, 3, replace=TRUE))
dataForJags <- list(dens=sub_df$dens, db=sub_df$db, N=length(sub_df$dens))
model<-"model{
for(i in 1:N){
dens[i] ~ dnorm(mu[i], tau)
mu[i] <- int + beta1*db[i]
}
tau ~ dgamma(0.1,0.1)
int ~ dnorm(0, 0.001)
beta1 ~ dnorm(0, 0.001)
}"
mod1 <- jags.model(textConnection(model),data= dataForJags,n.chains=2)
samples3<-jags.samples(model= mod1,variable.names=c("beta1",
"int","mu","tau"),n.iter=100000)
coeff_output3<-
data.frame(as.list(round(quantile(samples3$beta1[,,],probs=c(0.5,0.025,0.975)),3)))
Desired final output:
result_list<- list(samples1, samples2, samples3)
result_coeff<-rbind(coeff_output1, coeff_output2, coeff_output3)
Here is a link to the actual data frame. The solution should be able to process a large dataframe with clustersizes up to 600.
download.file("https://drive.google.com/file/d/1ZYIQtb_QHbYsInDGkta-5P2EJrFRDf22/view?usp=sharing",temp)
There are a few issues to consider here, which are caused by the scale of what you're trying to do. You are creating over 550 different jags.sample
objects with 100000 iterations each, and then trying to store all of them in a single list. On most machines, this will cause memory issues: the output is simply too large.
There are at least two ways that we can deal with this:
I've made a number of modifications to your code that should allow it to work with your actual dataset.
Creating Input Data:
In your original code, clustersize and db both have the data type numeric
, even though they only need to be integers. The numeric
type takes 8 bytes, while the integer
type only takes 4 bytes. If we coerce these two columns to the integer
type, we can actually reduce the memory size of the list of dataframes in the next step by about 30%.
library("tidyverse")
#### Load Raw Data ####
df <- read_csv("example.csv") %>%
select(-1) %>%
mutate(clustersize = as.integer(clustersize),
db = as.integer(db))
Initial JAGS Tuning
You are using far too many iterations for each of your chains; niter
= 100000 is extremely high. You should also be specifying a burn-in period using n.burn
, an adaptation period using n.adapt
, and a thinning parameter using thin
. The thinning parameter is especially important here - this directly reduces the number of iterations that we are saving from each chain. A thinning parameter of 50 means that we are only saving every 50th result.
There are post-hoc methods for selecting your thinning parameters, burn-in, and adaptation period, but that discussion is beyond the scope of SO. For some basic information on what all of these arguments do, there is an excellent answer here: https://stackoverflow.com/a/38875637/9598813. For now, I've provided values that will allow this code to run on your entire dataset, but I recommend that you carefully select the values that you use for your final analysis.
Using tidybayes
The following solution uses the tidybayes
package. This provides a clean output and allows us to neatly row-bind all of the coefficient summaries into a single dataframe. Note that we use coda.samples()
instead of jags.samples()
, because this provides a more universal MCMC object that we can pass to spread_draws()
. We also use dplyr::group_split()
which is slightly more computationally efficient than split()
.
library("rjags")
library("coda")
library("tidybayes")
set.seed(5672)
result <- df %>% group_split(clustersize) %>% map(~{
dataForJags <- list(dens=.x$dens, db=.x$db, N=length(.x$dens))
# Declare model structure
mod1 <- jags.model(textConnection(model),
data=dataForJags,
n.chains=2)
# samples returns a list of mcmc objects
samples<-coda.samples(model=mod1,
variable.names=c("beta1","int","mu","tau"),
n.burn=10000,
n.adapt=5000,
n.iter=25000,
thin=50
)
# Extract individual draws
samp <- spread_draws(samples, beta1)
# Summarize 95% credible intervals
coeff_output <- spread_draws(samples, beta1) %>%
median_qi(beta1)
list(samples = samp, coeff_output = coeff_output)
}) %>% transpose()
# List of sample objects
result$samples
# Dataframe of coefficient estimates and 95% credible intervals
result_coeff <- bind_rows(result$coeff_output, .id = "clustersize")
You can use map
from purrr
package and split
over the different clustersize
:
library(rjags)
library(coda)
library(purrr)
set.seed(5678)
set.seed(5672)
df<- data.frame(clustersize= c(rep(1, 4), rep(2,4), rep(3, 3)),
lepsp= c("A", "B", "C", "D", "B", "C", "D", "E", "A", "D", "F"),
dens= round(runif(11, c(0, 1)), 3),
db= sample(1:10, 11, replace=TRUE))
model<-"model{
for(i in 1:N){
dens[i] ~ dnorm(mu[i], tau)
# identity
mu[i] <- int + beta1*db[i]
}
tau ~ dgamma(0.1,0.1)
int ~ dnorm(0, 0.001)
beta1 ~ dnorm(0, 0.001)
}"
# split data for different clustersize and calculate result
result <- df %>% split(.$clustersize) %>% map(~{
dataForJags <- list(dens=.x$dens, db=.x$db, N=length(.x$dens))
##compile
mod1 <- jags.model(textConnection(model),data= dataForJags,n.chains=2)
##samples returns a list of mcarray objects
samples<-jags.samples(model= mod1,variable.names=c("beta1","int","mu","tau"),n.iter=100000)
coeff_output<- data.frame(as.list(round(quantile(samples$beta1[,,],probs=c(0.5,0.025,0.975)),3)))
list(samples = samples, coeff_output = coeff_output)
}) %>% transpose()
result$samples
result$coeff_output
Note the use of purrr::transpose
to transform the final result in a list
for samples and a list
for coefs as per you request.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With