
How to analyze Julia memory allocation and code coverage results

Tags: memory, julia

I am writing a package for Bayesian inference using Gibbs sampling. Since these methods are generally computationally expensive, I am very concerned about the performance of my code. In fact, speed was the reason I moved from Python to Julia.

After implementing a Dirichlet process mixture model, I analyzed the code using Coverage.jl and the --track-allocation=user command-line option.
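For reference, the analysis itself was roughly the following (a minimal sketch, assuming the standard src/ and test/runtests.jl package layout; the exact Coverage.jl calls may differ slightly between versions):

    using Coverage

    # Coverage counts come from running the tests with
    #   julia --code-coverage=user test/runtests.jl
    covered, total = get_summary(process_folder("src"))
    println("covered $covered of $total source lines")

    # Allocation counts come from running the tests with
    #   julia --track-allocation=user test/runtests.jl
    # analyze_malloc ranks source lines by bytes allocated
    malloc_rows = analyze_malloc(["src"])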

Here are the coverage results:

        - #=
        - DPM
        - 
        - Dirichlet Process Mixture Models
        - 
        - 25/08/2015
        - Adham Beyki, [email protected]
        - 
        - =#
        - 
        - type DPM{T}
        -   bayesian_component::T
        -   K::Int64
        -   aa::Float64
        -   a1::Float64
        -   a2::Float64
        -   K_hist::Vector{Int64}
        -   K_zz_dict::Dict{Int64, Vector{Int64}}
        - 
        -   DPM{T}(c::T, K::Int64, aa::Float64, a1::Float64, a2::Float64) = new(c, K, aa, a1, a2,
        -           Int64[], (Int64 => Vector{Int64})[])
        - end
        1 DPM{T}(c::T, K::Int64, aa::Real, a1::Real, a2::Real) = DPM{typeof(c)}(c, K, convert(Float64, aa),
        -           convert(Float64, a1), convert(Float64, a2))
        - 
        - function Base.show(io::IO, dpm::DPM)
        -   println(io, "Dirichlet Mixture Model with $(dpm.K) $(typeof(dpm.bayesian_component)) components")
        - end
        - 
        - function initialize_gibbs_sampler!(dpm::DPM, zz::Vector{Int64})
        -   # populates the cluster labels randomly
        1   zz[:] = rand(1:dpm.K, length(zz))
        - end
        - 
        - function DPM_sample_hyperparam(aa::Float64, a1::Float64, a2::Float64, K::Int64, NN::Int64, iters::Int64)
        - 
        -   # resampling concentration parameter based on Escobar and West 1995
      352   for n = 1:iters
     3504       eta = rand(Distributions.Beta(aa+1, NN))
     3504       rr = (a1+K-1) / (NN*(a2-log(NN)))
     3504       pi_eta = rr / (1+rr)
        - 
     3504       if rand() < pi_eta
        0           aa = rand(Distributions.Gamma(a1+K, 1/(a2-log(eta))))
        -       else
     3504           aa = rand(Distributions.Gamma(a1+K-1, 1/(a2-log(eta))))
        -       end
        -   end
      352   aa
        - end
        - 
        - function DPM_sample_pp{T1, T2}(
        -     bayesian_components::Vector{T1},
        -     xx::T2,
        -     nn::Vector{Float64},
        -     pp::Vector{Float64},
        -     aa::Float64)
        - 
  1760000   K = length(nn)
  1760000   @inbounds for kk = 1:K
 11384379     pp[kk] = log(nn[kk]) + logpredictive(bayesian_components[kk], xx)
        -   end
  1760000   pp[K+1] = log(aa) + logpredictive(bayesian_components[K+1], xx)
  1760000   normalize_pp!(pp, K+1)
  1760000   return sample(pp[1:K+1])
        - end
        - 
        - 
        - function collapsed_gibbs_sampler!{T1, T2}(
        -       dpm::DPM{T1},
        -       xx::Vector{T2},
        -       zz::Vector{Int64},
        -       n_burnins::Int64, n_lags::Int64, n_samples::Int64, n_internals::Int64; max_clusters::Int64=100)
        - 
        - 
        2   NN = length(xx)                                          # number of data points
        2   nn = zeros(Float64, dpm.K)                       # count array
        2   n_iterations = n_burnins + (n_samples)*(n_lags+1)
        2   bayesian_components = [deepcopy(dpm.bayesian_component) for k = 1:dpm.K+1]
        2   dpm.K_hist = zeros(Int64, n_iterations)
        2   pp = zeros(Float64, max_clusters)
        - 
        2   tic()
        2   for ii = 1:NN
    10000       kk = zz[ii]
    10000       additem(bayesian_components[kk], xx[ii])
    10000       nn[kk] += 1
        -   end
        2   dpm.K_hist[1] = dpm.K
        2   elapsed_time = toq()
        - 
        2   for iteration = 1:n_iterations
        - 
      352       println("iteration: $iteration, KK: $(dpm.K), KK mode: $(indmax(hist(dpm.K_hist,
        -                       0.5:maximum(dpm.K_hist)+0.5)[2])), elapsed time: $elapsed_time")
        - 
      352       tic()
      352       @inbounds for ii = 1:NN
  1760000           kk = zz[ii]
  1760000           nn[kk] -= 1
  1760000           delitem(bayesian_components[kk], xx[ii])
        - 
        -           # remove the cluster if empty
  1760000           if nn[kk] == 0
      166               println("\tcomponent $kk has become inactive")
      166               splice!(nn, kk)
      166               splice!(bayesian_components, kk)
      166               dpm.K -= 1
        - 
        -               # shifting the labels one cluster back
   830166               idx = find(x -> x>kk, zz)
      166               zz[idx] -= 1
        -           end
        - 
  1760000           kk = DPM_sample_pp(bayesian_components, xx[ii], nn, pp, dpm.aa)
        - 
  1760000           if kk == dpm.K+1
      171               println("\tcomponent $kk activated.")
      171               push!(bayesian_components, deepcopy(dpm.bayesian_component))
      171               push!(nn, 0)
      171               dpm.K += 1
        -           end
        - 
  1760000           zz[ii] = kk
  1760000           nn[kk] += 1
  1760000           additem(bayesian_components[kk], xx[ii])
        -       end
        - 
      352       dpm.aa = DPM_sample_hyperparam(dpm.aa, dpm.a1, dpm.a2, dpm.K, NN, n_internals)
      352       dpm.K_hist[iteration] = dpm.K
      352       dpm.K_zz_dict[dpm.K] = deepcopy(zz)
      352       elapsed_time = toq()
        -   end
        - end
        - 
        - function truncated_gibbs_sampler{T1, T2}(dpm::DPM{T1}, xx::Vector{T2}, zz::Vector{Int64},
        -   n_burnins::Int64, n_lags::Int64, n_samples::Int64, n_internals::Int64, K_truncation::Int64)
        - 
        -   NN = length(xx)                                             # number of data points
        -   nn = zeros(Int64, K_truncation)             # count array
        -   bayesian_components = [deepcopy(dpm.bayesian_component) for k = 1:K_truncation]
        -   n_iterations = n_burnins + (n_samples)*(n_lags+1)
        -   dpm.K_hist = zeros(Int64, n_iterations)
        -   states = (ASCIIString => Int64)[]
        -   n_states = 0
        - 
        -   tic()
        -   for ii = 1:NN
        -       kk = zz[ii]
        -       additem(bayesian_components[kk], xx[ii])
        -       nn[kk] += 1
        -   end
        -   dpm.K_hist[1] = dpm.K
        - 
        -   # constructing the sticks
        -   beta_VV = rand(Distributions.Beta(1.0, dpm.aa), K_truncation)
        -   beta_VV[end] = 1.0
        -   π = ones(Float64, K_truncation)
        -   π[2:end] = 1 - beta_VV[1:K_truncation-1]
        -   π = log(beta_VV) + log(cumprod(π))
        - 
        -   elapsed_time = toq()
        - 
        -   for iteration = 1:n_iterations
        - 
        -       println("iteration: $iteration, # active components: $(length(findn(nn)[1])), mode: $(indmax(hist(dpm.K_hist,
        -                       0.5:maximum(dpm.K_hist)+0.5)[2])), elapsed time: $elapsed_time \n", nn)
        - 
        -       tic()
        -       for ii = 1:NN
        -           kk = zz[ii]
        -           nn[kk] -= 1
        -           delitem(bayesian_components[kk], xx[ii])
        - 
        -           # resampling label
        -           pp = zeros(Float64, K_truncation)
        -           for kk = 1:K_truncation
        -               pp[kk] = π[kk] + logpredictive(bayesian_components[kk], xx[ii])
        -           end
        -           pp = exp(pp - maximum(pp))
        -           pp /= sum(pp)
        - 
        -           # sample from pp
        -           kk = sampleindex(pp)
        -           zz[ii] = kk
        -           nn[kk] += 1
        -           additem(bayesian_components[kk], xx[ii])
        - 
        -           for kk = 1:K_truncation-1
        -               gamma1 = 1 + nn[kk]
        -               gamma2 = dpm.aa + sum(nn[kk+1:end])
        -               beta_VV[kk] = rand(Distributions.Beta(gamma1, gamma2))
        -           end
        -           beta_VV[end] = 1.0
        -           π = ones(Float64, K_truncation)
        -           π[2:end] = 1 - beta_VV[1:K_truncation-1]
        -           π = log(beta_VV) + log(cumprod(π))
        - 
        -           # resampling concentration parameter based on Escobar and West 1995
        -           for internal_iters = 1:n_internals
        -                   eta = rand(Distributions.Beta(dpm.aa+1, NN))
        -               rr = (dpm.a1+dpm.K-1) / (NN*(dpm.a2-log(NN)))
        -               pi_eta = rr / (1+rr)
        - 
        -               if rand() < pi_eta
        -                   dpm.aa = rand(Distributions.Gamma(dpm.a1+dpm.K, 1/(dpm.a2-log(eta))))
        -               else
        -                   dpm.aa = rand(Distributions.Gamma(dpm.a1+dpm.K-1, 1/(dpm.a2-log(eta))))
        -               end
        -           end
        -       end
        - 
        -       nn_string = nn2string(nn)
        -       if !haskey(states, nn_string)
        -           n_states += 1
        -           states[nn_string] = n_states
        -       end
        -       dpm.K_hist[iteration] = states[nn_string]
        -       dpm.K_zz_dict[states[nn_string]] = deepcopy(zz)
        -       elapsed_time = toq()
        -   end
        -   return states
        - end
        - 
        - 
        - function posterior{T1, T2}(dpm::DPM{T1}, xx::Vector{T2}, K::Int64, K_truncation::Int64=0)
        2   n_components = 0
        1   if K_truncation == 0
        1       n_components = K
        -   else
        0       n_components = K_truncation
        -   end
        - 
        1   bayesian_components = [deepcopy(dpm.bayesian_component) for kk=1:n_components]
        1   zz = dpm.K_zz_dict[K]
        - 
        1   NN = length(xx)
        1   nn = zeros(Int64, n_components)
        - 
        1   for ii = 1:NN
     5000       kk = zz[ii]
     5000       additem(bayesian_components[kk], xx[ii])
     5000       nn[kk] += 1
        -   end
        - 
        1   return([posterior(bayesian_components[kk]) for kk=1:n_components], nn)
        - end
        - 

And here are the memory allocation results:

        - #=
        - DPM
        - 
        - Dirichlet Process Mixture Models
        - 
        - 25/08/2015
        - Adham Beyki, [email protected]
        - 
        - =#
        - 
        - type DPM{T}
        -   bayesian_component::T
        -   K::Int64
        -   aa::Float64
        -   a1::Float64
        -   a2::Float64
        -   K_hist::Vector{Int64}
        -   K_zz_dict::Dict{Int64, Vector{Int64}}
        - 
        -   DPM{T}(c::T, K::Int64, aa::Float64, a1::Float64, a2::Float64) = new(c, K, aa, a1, a2,
        -           Int64[], (Int64 => Vector{Int64})[])
        - end
        0 DPM{T}(c::T, K::Int64, aa::Real, a1::Real, a2::Real) = DPM{typeof(c)}(c, K, convert(Float64, aa),
        -           convert(Float64, a1), convert(Float64, a2))
        - 
        - function Base.show(io::IO, dpm::DPM)
        -   println(io, "Dirichlet Mixture Model with $(dpm.K) $(typeof(dpm.bayesian_component)) components")
        - end
        - 
        - function initialize_gibbs_sampler!(dpm::DPM, zz::Vector{Int64})
        -   # populates the cluster labels randomly
        0   zz[:] = rand(1:dpm.K, length(zz))
        - end
        - 
        - function DPM_sample_hyperparam(aa::Float64, a1::Float64, a2::Float64, K::Int64, NN::Int64, iters::Int64)
        - 
        -   # resampling concentration parameter based on Escobar and West 1995
        0   for n = 1:iters
        0       eta = rand(Distributions.Beta(aa+1, NN))
        0       rr = (a1+K-1) / (NN*(a2-log(NN)))
        0       pi_eta = rr / (1+rr)
        - 
        0       if rand() < pi_eta
        0           aa = rand(Distributions.Gamma(a1+K, 1/(a2-log(eta))))
        -       else
        0           aa = rand(Distributions.Gamma(a1+K-1, 1/(a2-log(eta))))
        -       end
        -   end
        0   aa
        - end
        - 
        - function DPM_sample_pp{T1, T2}(
        -     bayesian_components::Vector{T1},
        -     xx::T2,
        -     nn::Vector{Float64},
        -     pp::Vector{Float64},
        -     aa::Float64)
        - 
        0   K = length(nn)
        0   @inbounds for kk = 1:K
        0     pp[kk] = log(nn[kk]) + logpredictive(bayesian_components[kk], xx)
        -   end
        0   pp[K+1] = log(aa) + logpredictive(bayesian_components[K+1], xx)
        0   normalize_pp!(pp, K+1)
        0   return sample(pp[1:K+1])
        - end
        - 
        - 
        - function collapsed_gibbs_sampler!{T1, T2}(
        -       dpm::DPM{T1},
        -       xx::Vector{T2},
        -       zz::Vector{Int64},
        -       n_burnins::Int64, n_lags::Int64, n_samples::Int64, n_internals::Int64; max_clusters::Int64=100)
        - 
        - 
   191688   NN = length(xx)                                          # number of data points
       96   nn = zeros(Float64, dpm.K)                       # count array
        0   n_iterations = n_burnins + (n_samples)*(n_lags+1)
      384   bayesian_components = [deepcopy(dpm.bayesian_component) for k = 1:dpm.K+1]
     2864   dpm.K_hist = zeros(Int64, n_iterations)
      176   pp = zeros(Float64, max_clusters)
        - 
       48   tic()
        0   for ii = 1:NN
        0       kk = zz[ii]
        0       additem(bayesian_components[kk], xx[ii])
        0       nn[kk] += 1
        -   end
        0   dpm.K_hist[1] = dpm.K
        0   elapsed_time = toq()
        - 
        0   for iteration = 1:n_iterations
        - 
  5329296       println("iteration: $iteration, KK: $(dpm.K), KK mode: $(indmax(hist(dpm.K_hist,
        -                       0.5:maximum(dpm.K_hist)+0.5)[2])), elapsed time: $elapsed_time")
        - 
    16800       tic()
 28000000       @inbounds for ii = 1:NN
        0           kk = zz[ii]
        0           nn[kk] -= 1
        0           delitem(bayesian_components[kk], xx[ii])
        - 
        -           # remove the cluster if empty
        0           if nn[kk] == 0
   161880               println("\tcomponent $kk has become inactive")
        0               splice!(nn, kk)
        0               splice!(bayesian_components, kk)
        0               dpm.K -= 1
        - 
        -               # shifting the labels one cluster back
    69032               idx = find(x -> x>kk, zz)
    42944               zz[idx] -= 1
        -           end
        - 
        0           kk = DPM_sample_pp(bayesian_components, xx[ii], nn, pp, dpm.aa)
        - 
        0           if kk == dpm.K+1
   158976               println("\tcomponent $kk activated.")
    14144               push!(bayesian_components, deepcopy(dpm.bayesian_component))
     4872               push!(nn, 0)
        0               dpm.K += 1
        -           end
        - 
        0           zz[ii] = kk
        0           nn[kk] += 1
        0           additem(bayesian_components[kk], xx[ii])
        -       end
        - 
        0       dpm.aa = DPM_sample_hyperparam(dpm.aa, dpm.a1, dpm.a2, dpm.K, NN, n_internals)
        0       dpm.K_hist[iteration] = dpm.K
 14140000       dpm.K_zz_dict[dpm.K] = deepcopy(zz)
        0       elapsed_time = toq()
        -   end
        - end
        - 
        - function truncated_gibbs_sampler{T1, T2}(dpm::DPM{T1}, xx::Vector{T2}, zz::Vector{Int64},
        -   n_burnins::Int64, n_lags::Int64, n_samples::Int64, n_internals::Int64, K_truncation::Int64)
        - 
        -   NN = length(xx)                                             # number of data points
        -   nn = zeros(Int64, K_truncation)             # count array
        -   bayesian_components = [deepcopy(dpm.bayesian_component) for k = 1:K_truncation]
        -   n_iterations = n_burnins + (n_samples)*(n_lags+1)
        -   dpm.K_hist = zeros(Int64, n_iterations)
        -   states = (ASCIIString => Int64)[]
        -   n_states = 0
        - 
        -   tic()
        -   for ii = 1:NN
        -       kk = zz[ii]
        -       additem(bayesian_components[kk], xx[ii])
        -       nn[kk] += 1
        -   end
        -   dpm.K_hist[1] = dpm.K
        - 
        -   # constructing the sticks
        -   beta_VV = rand(Distributions.Beta(1.0, dpm.aa), K_truncation)
        -   beta_VV[end] = 1.0
        -   π = ones(Float64, K_truncation)
        -   π[2:end] = 1 - beta_VV[1:K_truncation-1]
        -   π = log(beta_VV) + log(cumprod(π))
        - 
        -   elapsed_time = toq()
        - 
        -   for iteration = 1:n_iterations
        - 
        -       println("iteration: $iteration, # active components: $(length(findn(nn)[1])), mode: $(indmax(hist(dpm.K_hist,
        -                       0.5:maximum(dpm.K_hist)+0.5)[2])), elapsed time: $elapsed_time \n", nn)
        - 
        -       tic()
        -       for ii = 1:NN
        -           kk = zz[ii]
        -           nn[kk] -= 1
        -           delitem(bayesian_components[kk], xx[ii])
        - 
        -           # resampling label
        -           pp = zeros(Float64, K_truncation)
        -           for kk = 1:K_truncation
        -               pp[kk] = π[kk] + logpredictive(bayesian_components[kk], xx[ii])
        -           end
        -           pp = exp(pp - maximum(pp))
        -           pp /= sum(pp)
        - 
        -           # sample from pp
        -           kk = sampleindex(pp)
        -           zz[ii] = kk
        -           nn[kk] += 1
        -           additem(bayesian_components[kk], xx[ii])
        - 
        -           for kk = 1:K_truncation-1
        -               gamma1 = 1 + nn[kk]
        -               gamma2 = dpm.aa + sum(nn[kk+1:end])
        -               beta_VV[kk] = rand(Distributions.Beta(gamma1, gamma2))
        -           end
        -           beta_VV[end] = 1.0
        -           π = ones(Float64, K_truncation)
        -           π[2:end] = 1 - beta_VV[1:K_truncation-1]
        -           π = log(beta_VV) + log(cumprod(π))
        - 
        -           # resampling concentration parameter based on Escobar and West 1995
        -           for internal_iters = 1:n_internals
        -                   eta = rand(Distributions.Beta(dpm.aa+1, NN))
        -               rr = (dpm.a1+dpm.K-1) / (NN*(dpm.a2-log(NN)))
        -               pi_eta = rr / (1+rr)
        - 
        -               if rand() < pi_eta
        -                   dpm.aa = rand(Distributions.Gamma(dpm.a1+dpm.K, 1/(dpm.a2-log(eta))))
        -               else
        -                   dpm.aa = rand(Distributions.Gamma(dpm.a1+dpm.K-1, 1/(dpm.a2-log(eta))))
        -               end
        -           end
        -       end
        - 
        -       nn_string = nn2string(nn)
        -       if !haskey(states, nn_string)
        -           n_states += 1
        -           states[nn_string] = n_states
        -       end
        -       dpm.K_hist[iteration] = states[nn_string]
        -       dpm.K_zz_dict[states[nn_string]] = deepcopy(zz)
        -       elapsed_time = toq()
        -   end
        -   return states
        - end
        - 
        - 
        - function posterior{T1, T2}(dpm::DPM{T1}, xx::Vector{T2}, K::Int64, K_truncation::Int64=0)
        0   n_components = 0
        0   if K_truncation == 0
        0       n_components = K
        -   else
        0       n_components = K_truncation
        -   end
        - 
        0   bayesian_components = [deepcopy(dpm.bayesian_component) for kk=1:n_components]
        0   zz = dpm.K_zz_dict[K]
        - 
        0   NN = length(xx)
        0   nn = zeros(Int64, n_components)
        - 
        0   for ii = 1:NN
        0       kk = zz[ii]
        0       additem(bayesian_components[kk], xx[ii])
        0       nn[kk] += 1
        -   end
        - 
        0   return([posterior(bayesian_components[kk]) for kk=1:n_components], nn)
        - end
        - 

What I don't understand is why, for instance, a simple assignment that only runs twice allocates 191688 units (I assume the unit is bytes, though I am not sure).

.cov:

    2   NN = length(xx)                  # number of data points

.mem:

   191688   NN = length(xx)              # number of data points

Or this one, which is worse:

.cov:

  352       @inbounds for ii = 1:NN

.mem:

  28000000      @inbounds for ii = 1:NN
asked Sep 11 '15 by Adham


1 Answer

The answer is briefly mentioned in the docs: "Under the user setting, the first line of any function directly called from the REPL will exhibit allocation due to events that happen in the REPL code itself."

Also possibly relevant: "More significantly, JIT-compilation also adds to allocation counts, because much of Julia's compiler is written in Julia (and compilation usually requires memory allocation). The recommended procedure is to force compilation by executing all the commands you want to analyze, then call Profile.clear_malloc_data() to reset all allocation counters."

The bottom line: that first line is being blamed for allocations that actually happen elsewhere (in REPL events and JIT compilation), because it is the first line to start reporting allocation again. And to answer the side question: yes, the numbers in the *.mem files are bytes.
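Concretely, a session might look like this (a minimal sketch; dpm_demo.jl is a placeholder for whatever script exercises the sampler):

    $ julia --track-allocation=user

    julia> include("dpm_demo.jl")        # first run: pays the JIT-compilation cost

    julia> Profile.clear_malloc_data()   # reset the allocation counters

    julia> include("dpm_demo.jl")        # second run: only these allocations are recorded

    julia> exit()                        # *.mem files are written when Julia exits

After that, the counts attributed to lines like NN = length(xx) should drop to something representative of the algorithm itself.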

answered Sep 28 '22 by tholy