I tried porting the NN code presented here to Julia, hoping for a speed increase in training the network. On my desktop, this proved to be the case.
However, on my MacBook, Python + numpy beats Julia by miles.
Training with the same parameters, Python is more than twice as fast as Julia (4.4s vs 10.6s for one epoch). Considering that Julia is faster than Python (by ~2s) on my desktop, it seems like there's some resource that Python/numpy is using on the Mac that Julia isn't. Even parallelizing the code only gets me down to ~6.6s (although this might be due to my lack of experience writing parallel code). I thought the problem might be that Julia's BLAS was slower than the vecLib library that numpy uses natively on macOS, but experimenting with different builds didn't get me much closer. I tried building both with USE_SYSTEM_BLAS = 1 and with MKL, of which MKL gave the faster result (the times posted above).
I'll post my version info for the laptop as well as my Julia implementation below for reference. I don't have access to the desktop at this time, but I was running the same version of Julia on Windows, using openBLAS, comparing with a clean installation of Python 2.7 also using openBLAS.
Is there something I'm missing here?
EDIT: I know that my Julia code leaves a lot to be desired in terms of optimization, and I really appreciate any tips to make it faster. However, this is not so much a case of Julia being slower on my laptop as of Python being much faster there: on my desktop, Python runs one epoch in ~13s, while on the laptop it takes only ~4.4s. What I'm most interested in is where this difference comes from. I realize the question may be somewhat poorly formulated.
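To isolate raw BLAS speed from the rest of the code, a quick sanity check I can run on both machines is something like the following (a rough sketch, Julia 0.6 syntax; the matrix size is arbitrary):
# Both Julia and numpy hand a dense multiply like this straight to BLAS, so a large
# gap here would point at the BLAS build rather than at the network code.
Base.LinAlg.BLAS.set_num_threads(Sys.CPU_CORES)  # make sure all cores are used
A = randn(1000, 1000); B = randn(1000, 1000); C = zeros(1000, 1000)
A_mul_B!(C, A, B)                                # warm-up call (JIT compile)
@time for _ in 1:10
    A_mul_B!(C, A, B)
end
println(peakflops(2000))                         # Base's built-in gemm benchmark
# The numpy side would be timing A.dot(B) on arrays of the same size.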
Versions on laptop:
julia> versioninfo()
Julia Version 0.6.2
Commit d386e40c17 (2017-12-13 18:08 UTC)
Platform Info:
OS: macOS (x86_64-apple-darwin17.4.0)
CPU: Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz
WORD_SIZE: 64
BLAS: libmkl_rt
LAPACK: libmkl_rt
LIBM: libopenlibm
LLVM: libLLVM-3.9.1 (ORCJIT, broadwell)
Python 2.7.14 (default, Mar 22 2018, 14:43:05)
[GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.39.2)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import numpy
>>> numpy.show_config()
lapack_opt_info:
extra_link_args = ['-Wl,-framework', '-Wl,Accelerate']
extra_compile_args = ['-msse3']
define_macros = [('NO_ATLAS_INFO', 3), ('HAVE_CBLAS', None)]
openblas_lapack_info:
NOT AVAILABLE
atlas_3_10_blas_threads_info:
NOT AVAILABLE
atlas_threads_info:
NOT AVAILABLE
openblas_clapack_info:
NOT AVAILABLE
atlas_3_10_threads_info:
NOT AVAILABLE
atlas_blas_info:
NOT AVAILABLE
atlas_3_10_blas_info:
NOT AVAILABLE
atlas_blas_threads_info:
NOT AVAILABLE
openblas_info:
NOT AVAILABLE
blas_mkl_info:
NOT AVAILABLE
blas_opt_info:
extra_link_args = ['-Wl,-framework', '-Wl,Accelerate']
extra_compile_args = ['-msse3', '-I/System/Library/Frameworks/vecLib.framework/Headers']
define_macros = [('NO_ATLAS_INFO', 3), ('HAVE_CBLAS', None)]
blis_info:
NOT AVAILABLE
atlas_info:
NOT AVAILABLE
atlas_3_10_info:
NOT AVAILABLE
lapack_mkl_info:
NOT AVAILABLE
Julia code (sequential):
using MLDatasets
mutable struct network
    num_layers::Int64
    sizearr::Array{Int64,1}
    biases::Array{Array{Float64,1},1}
    weights::Array{Array{Float64,2},1}
end

function network(sizes)
    num_layers = length(sizes)
    sizearr = sizes
    biases = [randn(y) for y in sizes[2:end]]
    weights = [randn(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
    network(num_layers, sizearr, biases, weights)
end

σ(z) = 1/(1+e^(-z))
σ_prime(z) = σ(z)*(1-σ(z))

function (net::network)(a)
    for (w, b) in zip(net.weights, net.biases)
        a = σ.(w*a + b)
    end
    return a
end
function SGDtrain(net::network, training_data, epochs, mini_batch_size, η, test_data=nothing)
    n_test = test_data != nothing ? length(test_data) : nothing
    n = length(training_data)
    for j in 1:epochs
        training_data = shuffle(training_data)
        mini_batches = [training_data[k:k+mini_batch_size-1] for k in 1:mini_batch_size:n]
        @time for batch in mini_batches
            update_batch(net, batch, η)
        end
        if test_data != nothing
            println("Epoch ", j, ": ", evaluate(net, test_data), "/", n_test)
        else
            println("Epoch ", j, " complete.")
        end
    end
end
function update_batch(net::network, batch, η)
    ∇_b = net.biases .- net.biases
    ∇_w = net.weights .- net.weights
    for (x, y) in batch
        δ_∇_b, δ_∇_w = backprop(net, x, y)
        ∇_b += δ_∇_b
        ∇_w += δ_∇_w
    end
    net.biases -= (η/length(batch))∇_b
    net.weights -= (η/length(batch))∇_w
end
function backprop(net::network, x, y)
    ∇_b = copy(net.biases)
    ∇_w = copy(net.weights)
    len = length(net.sizearr)
    activation = x
    activations = Array{Array{Float64,1}}(len)
    activations[1] = x
    zs = copy(net.biases)
    for i in 1:len-1
        b = net.biases[i]; w = net.weights[i]
        z = w*activation .+ b
        zs[i] = z
        activation = σ.(z)
        activations[i+1] = activation[:]
    end
    δ = (activations[end] - y) .* σ_prime.(zs[end])
    ∇_b[end] = δ[:]
    ∇_w[end] = δ*activations[end-1]'
    for l in 1:net.num_layers-2
        z = zs[end-l]
        δ = net.weights[end-l+1]'δ .* σ_prime.(z)
        ∇_b[end-l] = δ[:]
        ∇_w[end-l] = δ*activations[end-l-1]'
    end
    return (∇_b, ∇_w)
end
function evaluate(net::network, test_data)
    test_results = [(findmax(net(x))[2] - 1, y) for (x, y) in test_data]
    return sum(Int(x == y) for (x, y) in test_results)
end

function loaddata(rng = 1:50000)
    train_x, train_y = MNIST.traindata(Float64, Vector(rng))
    train_x = [train_x[:,:,x][:] for x in 1:size(train_x, 3)]
    train_y = [vectorize(x) for x in train_y]
    traindata = [(x, y) for (x, y) in zip(train_x, train_y)]
    test_x, test_y = MNIST.testdata(Float64)
    test_x = [test_x[:,:,x][:] for x in 1:size(test_x, 3)]
    testdata = [(x, y) for (x, y) in zip(test_x, test_y)]
    return traindata, testdata
end

function vectorize(n)
    ev = zeros(10, 1)
    ev[n+1] = 1
    return ev
end

function main()
    net = network([784, 30, 10])
    traindata, testdata = loaddata()
    SGDtrain(net, traindata, 10, 10, 1.25, testdata)
end
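For reference, this is roughly how I invoke it (a small harness, not part of the code above): the first epoch's @time includes JIT compilation, so I only compare the per-epoch times from the second epoch onward.
net = network([784, 30, 10])
traindata, testdata = loaddata()
SGDtrain(net, traindata, 1, 10, 1.25, testdata)   # warm-up epoch (compiles everything)
SGDtrain(net, traindata, 3, 10, 1.25, testdata)   # steady-state per-epoch timings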
I started by running your code:
7.110379 seconds (1.37 M allocations: 20.570 GiB, 19.81%gc time)
Epoch 1: 7960/10000
6.147297 seconds (1.27 M allocations: 20.566 GiB, 18.33%gc time)
Ouch, 21GiB allocated per epoch? That's your issue. It's hitting garbage collection a lot, and the less memory your computer has the more it will have to. So let's tackle that.
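Before touching the code, here's a minimal illustration (not from the post itself) of the difference that's about to matter: rebinding with a = a + b allocates a fresh array on every iteration, while a .+= b writes into the existing buffer.
function alloc_demo(n)
    a = rand(1000); b = rand(1000)
    @time for i in 1:n
        a = a + b        # allocates a new 1000-element vector each iteration
    end
    @time for i in 1:n
        a .+= b          # fused broadcast into the existing `a`, essentially no allocations
    end
end
alloc_demo(10_000)        # first @time reports tens of MiB allocated, second almost nothing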
The main idea is to pre-allocate your buffers and then modify arrays instead of creating new ones. In your code, you start backprop with:
∇_b = copy(net.biases)
∇_w = copy(net.weights)
len = length(net.sizearr)
activation = x
activations = Array{Array{Float64,1}}(len)
activations[1] = x
zs = copy(net.biases)
The fact that you're using copy means that you should probably be pre-allocating things! So let's start with zs and activations. I expanded your network to hold those cache arrays:
mutable struct network
    num_layers::Int64
    sizearr::Array{Int64,1}
    biases::Array{Array{Float64,1},1}
    weights::Array{Array{Float64,2},1}
    zs::Array{Array{Float64,1},1}
    activations::Array{Array{Float64,1},1}
end

function network(sizes)
    num_layers = length(sizes)
    sizearr = sizes
    biases = [randn(y) for y in sizes[2:end]]
    weights = [randn(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
    zs = [randn(y) for y in sizes[2:end]]
    activations = [randn(y) for y in sizes[1:end]]
    network(num_layers, sizearr, biases, weights, zs, activations)
end
Then I changed your backprop to use those caches:
function backprop(net::network, x, y)
    ∇_b = copy(net.biases)
    ∇_w = copy(net.weights)
    len = length(net.sizearr)
    activations = net.activations
    activations[1] .= x
    zs = net.zs
    for i in 1:len-1
        b = net.biases[i]; w = net.weights[i]
        z = zs[i]; activation = activations[i+1]
        z .= w*activations[i] .+ b
        activation .= σ.(z)
    end
    δ = (activations[end] - y) .* σ_prime.(zs[end])
    ∇_b[end] = δ[:]
    ∇_w[end] = δ*activations[end-1]'
    for l in 1:net.num_layers-2
        z = zs[end-l]
        δ = net.weights[end-l+1]'δ .* σ_prime.(z)
        ∇_b[end-l] = δ[:]
        ∇_w[end-l] = δ*activations[end-l-1]'
    end
    return (∇_b, ∇_w)
end
This led to a substantial decrease in the memory allocated. But there's still a lot to do. First let's change a * to an A_mul_B!. This function is a matrix multiplication which writes into an array C (A_mul_B!(C,A,B)) instead of creating a new matrix, and this can substantially decrease your memory allocations. So I did:
for l in 1:net.num_layers-2
    z = zs[end-l]
    δ = net.weights[end-l+1]'δ .* σ_prime.(z)
    ∇_b[end-l] .= vec(δ)
    atransp = activations[end-l-1]'
    A_mul_B!(∇_w[end-l], δ, atransp)
end
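As a standalone illustration of what A_mul_B! buys (the shapes here mirror the hidden layer of this network; in Julia 1.0+ the same operation is spelled mul!):
W = randn(30, 784); x = randn(784); out = zeros(30)
out = W * x                # allocates a fresh 30-element result on every call
A_mul_B!(out, W, x)        # writes the product into the existing buffer instead
@time W * x                # reports the allocation of the result vector
@time A_mul_B!(out, W, x)  # essentially no allocation, since it reuses `out`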
But ' allocates, so instead I use reshape because I just want a view:
for l in 1:net.num_layers-2
    z = zs[end-l]
    δ = net.weights[end-l+1]'δ .* σ_prime.(z)
    ∇_b[end-l] .= vec(δ)
    atransp = reshape(activations[end-l-1], 1, length(activations[end-l-1]))
    A_mul_B!(∇_w[end-l], δ, atransp)
end
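A quick side check (not from the original answer) that the reshape really is just another view of the same memory rather than a copy:
v = rand(5)
r = reshape(v, 1, length(v))   # 1×5, backed by the same data as v
r[1] = -1.0
v[1] == -1.0                   # true: mutating r mutated v, so nothing was copied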
(It also hits a faster OpenBLAS dispatch; that may be different with MKL though.) But you're still copying with
∇_b = copy(net.biases)
∇_w = copy(net.weights)
and you're allocating a bunch of δs each step, so the next change I did pre-allocates those and does it all in place (it looks just like the previous changes).
Then I did some profiling. In Juno, this is just:
@profile main()
Juno.profiler()
or if you don't use Juno you can replace the second part with ProfileView.jl.
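Without any plotting package at all, Base's built-in profiler can give a flat text report that is usually enough to find the hot spots (a sketch):
@profile main()
Profile.print(format=:flat, sortedby=:count)   # flat listing, sorted by sample count
Profile.clear()                                # reset the buffer before profiling again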
The profile showed that most of the time is spent in BLAS, but there's a problem: operations like ∇_w += δ_∇_w are creating a bunch of matrices! Instead we want to loop through and update each matrix in place by its change matrix. This expands out to be like:
function update_batch(net::network, batch, η)
    ∇_b = net.∇_b
    ∇_w = net.∇_w
    for i in 1:length(∇_b)
        fill!(∇_b[i], 0.0)
    end
    for i in 1:length(∇_w)
        fill!(∇_w[i], 0.0)
    end
    for (x, y) in batch
        δ_∇_b, δ_∇_w = backprop(net, x, y)
        ∇_b .+= δ_∇_b
        for i in 1:length(∇_w)
            ∇_w[i] .+= δ_∇_w[i]
        end
    end
    for i in 1:length(∇_b)
        net.biases[i] .-= (η/length(batch)).*∇_b[i]
    end
    for i in 1:length(∇_w)
        net.weights[i] .-= (η/length(batch)).*∇_w[i]
    end
end
I did a few more changes along the same lines and my final code is the following:
mutable struct network
    num_layers::Int64
    sizearr::Array{Int64,1}
    biases::Array{Array{Float64,1},1}
    weights::Array{Array{Float64,2},1}
    weights_transp::Array{Array{Float64,2},1}
    zs::Array{Array{Float64,1},1}
    activations::Array{Array{Float64,1},1}
    ∇_b::Array{Array{Float64,1},1}
    ∇_w::Array{Array{Float64,2},1}
    δ_∇_b::Array{Array{Float64,1},1}
    δ_∇_w::Array{Array{Float64,2},1}
    δs::Array{Array{Float64,2},1}
end

function network(sizes)
    num_layers = length(sizes)
    sizearr = sizes
    biases = [randn(y) for y in sizes[2:end]]
    weights = [randn(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
    weights_transp = [randn(x, y) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
    zs = [randn(y) for y in sizes[2:end]]
    activations = [randn(y) for y in sizes[1:end]]
    ∇_b = [zeros(y) for y in sizes[2:end]]
    ∇_w = [zeros(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
    δ_∇_b = [zeros(y) for y in sizes[2:end]]
    δ_∇_w = [zeros(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
    δs = [zeros(y, 1) for y in sizes[2:end]]
    network(num_layers, sizearr, biases, weights, weights_transp, zs, activations, ∇_b, ∇_w, δ_∇_b, δ_∇_w, δs)
end

function update_batch(net::network, batch, η)
    ∇_b = net.∇_b
    ∇_w = net.∇_w
    for i in 1:length(∇_b)
        ∇_b[i] .= 0.0
    end
    for i in 1:length(∇_w)
        ∇_w[i] .= 0.0
    end
    δ_∇_b = net.δ_∇_b
    δ_∇_w = net.δ_∇_w
    for (x, y) in batch
        backprop!(net, x, y)
        for i in 1:length(∇_b)
            ∇_b[i] .+= δ_∇_b[i]
        end
        for i in 1:length(∇_w)
            ∇_w[i] .+= δ_∇_w[i]
        end
    end
    for i in 1:length(∇_b)
        net.biases[i] .-= (η/length(batch)).*∇_b[i]
    end
    for i in 1:length(∇_w)
        net.weights[i] .-= (η/length(batch)).*∇_w[i]
    end
end

function backprop!(net::network, x, y)
    ∇_b = net.δ_∇_b
    ∇_w = net.δ_∇_w
    len = length(net.sizearr)
    activations = net.activations
    activations[1] .= x
    zs = net.zs
    δs = net.δs
    for i in 1:len-1
        b = net.biases[i]; w = net.weights[i]
        z = zs[i]; activation = activations[i+1]
        A_mul_B!(z, w, activations[i])
        z .+= b
        activation .= σ.(z)
    end
    δ = δs[end]
    δ .= (activations[end] .- y) .* σ_prime.(zs[end])
    ∇_b[end] .= vec(δ)
    atransp = reshape(activations[end-1], 1, length(activations[end-1]))
    A_mul_B!(∇_w[end], δ, atransp)
    for l in 1:net.num_layers-2
        z = zs[end-l]
        transpose!(net.weights_transp[end-l+1], net.weights[end-l+1])
        A_mul_B!(δs[end-l], net.weights_transp[end-l+1], δ)
        δ = δs[end-l]
        δ .*= σ_prime.(z)
        ∇_b[end-l] .= vec(δ)
        atransp = reshape(activations[end-l-1], 1, length(activations[end-l-1]))
        A_mul_B!(∇_w[end-l], δ, atransp)
    end
    return nothing
end
All else remains unchanged. To see that I am done, I added @time to the backprop call and get:
0.000070 seconds (8 allocations: 352 bytes)
0.000066 seconds (8 allocations: 352 bytes)
0.000090 seconds (8 allocations: 352 bytes)
so that's non-allocating. I added @time to the for (x, y) in batch loop and get:
0.000636 seconds (80 allocations: 3.438 KiB)
0.000610 seconds (80 allocations: 3.438 KiB)
0.000624 seconds (80 allocations: 3.438 KiB)
so that tells me that essentially all of the allocations left are coming from the iterator (this can be improved, but likely won't improve the timing). So the final timing is:
Epoch 2: 8428/10000
4.005540 seconds (586.87 k allocations: 23.925 MiB)
Epoch 1: 8858/10000
3.488674 seconds (414.49 k allocations: 17.082 MiB)
Epoch 2: 9104/10000
That's almost 2x faster on my machine, but the memory allocations are 1200x less per loop. This means that on machines where the RAM is slower and smaller, this method should do much much better (my desktop has quite a bit of memory so it really doesn't care too much!).
The final profiles show most of the time is spent in A_mul_B! calls, and thus pretty much everything is limited by my OpenBLAS speed now, so I am done. Some extra things I could do are multithreading some of the other loops, but given the profiling the payoff will be small, so I'll leave that to you (basically just put Threads.@threads on loops like ∇_w[i] .+= δ_∇_w[i]).
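For concreteness, that suggestion would look roughly like this (a sketch with stand-in arrays shaped like this network's weight gradients; it only helps if Julia was started with JULIA_NUM_THREADS > 1, and with just two layers the gain will be small):
∇_w   = [zeros(30, 784), zeros(10, 30)]
δ_∇_w = [randn(30, 784), randn(10, 30)]
Threads.@threads for i in 1:length(∇_w)
    ∇_w[i] .+= δ_∇_w[i]   # each layer's accumulation runs on its own thread
end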
Hopefully this not only improves your code, but teaches how to profile, pre-allocate, use in-place operations, and think about performance.