So I'm trying to wrap my head around Julia's parallelization options. I'm modelling stochastic processes as Markov chains. Since the chains are independent replicates, the outer loops are independent - making the problem embarrassingly parallel.
I tried to implement both a @distributed and a @threads solution, both of which seem to run fine, but aren't any faster than the sequential.
Here's a simplified version of my code (sequential):
function dummy(steps = 10000, width = 100, chains = 4)
out_N = zeros(steps, width, chains)
initial = zeros(width)
for c = 1:chains
# print("c=$c\n")
N = zeros(steps, width)
state = copy(initial)
N[1,:] = state
for i = 1:steps
state = state + rand(width)
N[i,:] = state
end
out_N[:,:,c] = N
end
return out_N
end
What would be the correct way of parallelizing this problem to increase performance?
Here is the correct way to do it (at the time of writing this answer the other answer does not work - see my comment).
I will use slightly less complex example than in the question (however very similar).
using Random
const m = MersenneTwister(0);
function dothestuff!(out_N, N, ic, m)
out_N[:, ic] .= rand(m, N)
end
function dummy_base(m=m, N=100_000,c=256)
out_N = Array{Float64}(undef,N,c)
for ic in 1:c
dothestuff!(out_N, N, ic, m)
end
out_N
end
Testing:
julia> using BenchmarkTools; @btime dummy_base();
106.512 ms (514 allocations: 390.64 MiB)
#remember to run before starting Julia:
# set JULIA_NUM_THREADS=4
# OR (Linux)
# export JULIA_NUM_THREADS=4
using Random
const mt = MersenneTwister.(1:Threads.nthreads());
# required for older Julia versions, look still good in later versions :-)
function dothestuff!(out_N, N, ic, m)
out_N[:, ic] .= rand(m, N)
end
function dummy_threads(mt=mt, N=100_000,c=256)
out_N = Array{Float64}(undef,N,c)
Threads.@threads for ic in 1:c
dothestuff!(out_N, N, ic, mt[Threads.threadid()])
end
out_N
end
Let us test the performance:
julia> using BenchmarkTools; @btime dummy_threads();
46.775 ms (535 allocations: 390.65 MiB)
using Distributed
addprocs(4)
using Random, SharedArrays
@everywhere using Random, SharedArrays, Distributed
@everywhere Random.seed!(myid())
@everywhere function dothestuff!(out_N, N, ic)
out_N[:, ic] .= rand(N)
end
function dummy_distr(N=100_000,c=256)
out_N = SharedArray{Float64}(N,c)
@sync @distributed for ic in 1:c
dothestuff!(out_N, N, ic)
end
out_N
end
Performance (note that inter-process communication takes some time and hence for small computations threads will be usually better):
julia> using BenchmarkTools; @btime dummy_distr();
62.584 ms (1073 allocations: 45.48 KiB)
You can use @distributed macro, to run processes in parallel
@everywhere using Distributed, SharedArrays
addprocs(4)
@everywhere function inner_loop!(out_N, chain_number,steps,width)
N = zeros(steps, width)
state = zeros(width)
for i = 1:steps
state .+= rand(width)
N[i,:] .= state
end
out_N[:,:,chain_number] .= N
nothing
end
function dummy(steps = 10000, width = 100, chains = 4)
out_N = SharedArray{Float64}((steps, width, chains); pids = collect(1:4))
@sync for c = 1:chains
# print("c=$c\n")
@spawnat :any inner_loop!(out_N, c, steps,width)
end
sdata(out_N)
end
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With