I was working on the Mandelbrot algorithm to learn Rust and found that an empty 25-million-iteration loop (roughly a 6k image) takes 0.5 s. That seemed slow to me, so I tested the same thing in Python and found it takes almost the same time. Is Python's for loop really an almost zero-cost abstraction? Is this really the best I can get on an Intel i7?
Rust:
use std::time::Instant;

fn main() {
    let before = Instant::now();
    for i in 0..5000 {
        for j in 0..5000 {}
    }
    println!("Elapsed time: {:.2?}", before.elapsed());
}
>>> Elapsed time: 406.90ms
Python:
import time

s = time.time()
for i in range(5000):
    for j in range(5000):
        pass
print(time.time() - s)
>>> 0.5715351104736328
UPDATE: If I iterate over a pre-initialized tuple instead of a range, Python is even faster than Rust -> 0.33s
If you're doing performance testing, always build with --release (cargo build --release or cargo run --release). By default, Cargo builds with debug information enabled and optimizations disabled. With optimizations on, the optimizer completely eliminates these loops: on the Playground the measured time drops from 975ms to 1.25µs.
Let's take a look at the assembly on Godbolt for just the loops, no timer:
pub fn main() {
    for i in 0..5000 {
        for j in 0..5000 {}
    }
}
Without optimization:
<i32 as core::iter::range::Step>::forward_unchecked:
push rax
mov eax, esi
add edi, eax
mov dword ptr [rsp + 4], edi
mov eax, dword ptr [rsp + 4]
mov dword ptr [rsp], eax
mov eax, dword ptr [rsp]
pop rcx
ret
core::intrinsics::copy_nonoverlapping:
push rax
mov qword ptr [rsp], rsi
mov rsi, rdi
mov rdi, qword ptr [rsp]
shl rdx, 2
call memcpy@PLT
pop rax
ret
core::cmp::impls::<impl core::cmp::PartialOrd for i32>::lt:
mov eax, dword ptr [rdi]
cmp eax, dword ptr [rsi]
setl al
and al, 1
movzx eax, al
ret
core::mem::replace:
sub rsp, 40
mov qword ptr [rsp], rdi
mov dword ptr [rsp + 12], esi
mov byte ptr [rsp + 23], 0
mov byte ptr [rsp + 23], 1
mov rax, qword ptr [rip + core::ptr::read@GOTPCREL]
call rax
mov ecx, eax
mov dword ptr [rsp + 16], ecx
jmp .LBB3_1
.LBB3_1:
mov esi, dword ptr [rsp + 12]
mov rdi, qword ptr [rsp]
mov byte ptr [rsp + 23], 0
mov rcx, qword ptr [rip + core::ptr::write@GOTPCREL]
call rcx
jmp .LBB3_4
.LBB3_2:
test byte ptr [rsp + 23], 1
jne .LBB3_8
jmp .LBB3_7
mov rcx, rax
mov eax, edx
mov qword ptr [rsp + 24], rcx
mov dword ptr [rsp + 32], eax
jmp .LBB3_2
.LBB3_4:
mov eax, dword ptr [rsp + 16]
add rsp, 40
ret
.LBB3_5:
jmp .LBB3_2
mov rcx, rax
mov eax, edx
mov qword ptr [rsp + 24], rcx
mov dword ptr [rsp + 32], eax
jmp .LBB3_5
.LBB3_7:
mov rdi, qword ptr [rsp + 24]
call _Unwind_Resume@PLT
ud2
.LBB3_8:
jmp .LBB3_7
core::ptr::read:
sub rsp, 24
mov qword ptr [rsp + 8], rdi
mov eax, dword ptr [rsp + 20]
mov dword ptr [rsp + 16], eax
jmp .LBB4_2
.LBB4_2:
mov rdi, qword ptr [rsp + 8]
lea rsi, [rsp + 16]
mov edx, 1
call qword ptr [rip + core::intrinsics::copy_nonoverlapping@GOTPCREL]
mov eax, dword ptr [rsp + 16]
mov dword ptr [rsp + 4], eax
mov eax, dword ptr [rsp + 4]
add rsp, 24
ret
core::ptr::write:
sub rsp, 4
mov dword ptr [rsp], esi
mov eax, dword ptr [rsp]
mov dword ptr [rdi], eax
add rsp, 4
ret
core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next:
push rax
call qword ptr [rip + <core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next@GOTPCREL]
mov dword ptr [rsp], eax
mov dword ptr [rsp + 4], edx
mov edx, dword ptr [rsp + 4]
mov eax, dword ptr [rsp]
pop rcx
ret
core::clone::impls::<impl core::clone::Clone for i32>::clone:
mov eax, dword ptr [rdi]
ret
<I as core::iter::traits::collect::IntoIterator>::into_iter:
mov edx, esi
mov eax, edi
ret
<core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next:
sub rsp, 40
mov rsi, rdi
mov qword ptr [rsp + 16], rsi
mov rdi, rsi
add rsi, 4
call core::cmp::impls::<impl core::cmp::PartialOrd for i32>::lt
mov byte ptr [rsp + 31], al
mov al, byte ptr [rsp + 31]
test al, 1
jne .LBB9_3
jmp .LBB9_2
.LBB9_2:
mov dword ptr [rsp + 32], 0
jmp .LBB9_7
.LBB9_3:
mov rdi, qword ptr [rsp + 16]
call core::clone::impls::<impl core::clone::Clone for i32>::clone
mov dword ptr [rsp + 12], eax
mov edi, dword ptr [rsp + 12]
mov esi, 1
call <i32 as core::iter::range::Step>::forward_unchecked
mov dword ptr [rsp + 8], eax
mov esi, dword ptr [rsp + 8]
mov rdi, qword ptr [rsp + 16]
call qword ptr [rip + core::mem::replace@GOTPCREL]
mov dword ptr [rsp + 4], eax
mov eax, dword ptr [rsp + 4]
mov dword ptr [rsp + 36], eax
mov dword ptr [rsp + 32], 1
.LBB9_7:
mov eax, dword ptr [rsp + 32]
mov edx, dword ptr [rsp + 36]
add rsp, 40
ret
example::main:
sub rsp, 72
mov dword ptr [rsp + 24], 0
mov dword ptr [rsp + 28], 5000
mov edi, dword ptr [rsp + 24]
mov esi, dword ptr [rsp + 28]
call qword ptr [rip + <I as core::iter::traits::collect::IntoIterator>::into_iter@GOTPCREL]
mov dword ptr [rsp + 16], eax
mov dword ptr [rsp + 20], edx
mov eax, dword ptr [rsp + 20]
mov ecx, dword ptr [rsp + 16]
mov dword ptr [rsp + 32], ecx
mov dword ptr [rsp + 36], eax
.LBB10_2:
mov rax, qword ptr [rip + core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next@GOTPCREL]
lea rdi, [rsp + 32]
call rax
mov dword ptr [rsp + 44], edx
mov dword ptr [rsp + 40], eax
mov eax, dword ptr [rsp + 40]
test rax, rax
je .LBB10_5
jmp .LBB10_13
.LBB10_13:
jmp .LBB10_6
ud2
.LBB10_5:
add rsp, 72
ret
.LBB10_6:
mov dword ptr [rsp + 48], 0
mov dword ptr [rsp + 52], 5000
mov edi, dword ptr [rsp + 48]
mov esi, dword ptr [rsp + 52]
call qword ptr [rip + <I as core::iter::traits::collect::IntoIterator>::into_iter@GOTPCREL]
mov dword ptr [rsp + 8], eax
mov dword ptr [rsp + 12], edx
mov eax, dword ptr [rsp + 12]
mov ecx, dword ptr [rsp + 8]
mov dword ptr [rsp + 56], ecx
mov dword ptr [rsp + 60], eax
.LBB10_8:
mov rax, qword ptr [rip + core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next@GOTPCREL]
lea rdi, [rsp + 56]
call rax
mov dword ptr [rsp + 68], edx
mov dword ptr [rsp + 64], eax
mov eax, dword ptr [rsp + 64]
test rax, rax
je .LBB10_11
jmp .LBB10_14
.LBB10_14:
jmp .LBB10_12
ud2
.LBB10_11:
jmp .LBB10_2
.LBB10_12:
jmp .LBB10_8
__rustc_debug_gdb_scripts_section__:
.asciz "\001gdb_load_rust_pretty_printers.py"
DW.ref.rust_eh_personality:
.quad rust_eh_personality
With optimization:
example::main:
ret
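The entire program collapses to a single ret, so the timer in the original code measures an empty interval. If you want the iterations to survive a release build, one option is std::hint::black_box (stable since Rust 1.66), which tells the optimizer to treat a value as used. A minimal sketch, not the only way to do this:

use std::hint::black_box;
use std::time::Instant;

fn main() {
    let before = Instant::now();
    for i in 0..5000 {
        for j in 0..5000 {
            // black_box acts as an optimizer barrier: it forces (i, j) to
            // be treated as observed, so the iterations can't be deleted
            // outright.
            black_box((i, j));
        }
    }
    println!("Elapsed time: {:.2?}", before.elapsed());
}

Even then you're timing the barrier as much as the loop; for real measurements a benchmark harness such as criterion is the more reliable tool.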
Python vs. Rust (seconds vs. picoseconds) - the performance is definitely not the same.
CPython 3.8.10 / rustc 1.55.0 (running on a Linux guest hosted by a 10+ year old Mac).
I'm adding some steps to the Rust code to try to ensure the loops don't get optimized into oblivion. I figure the best way to do that is to take user input to initialize some variables, update those variables in the loops, and print the result to standard output. The compiler is still going to optimize, but at least the loops don't just evaporate.
use std::error::Error;
use std::env::args;
use timeit::timeit;

fn main() -> Result<(), Box<dyn Error>> {
    let a = args().skip(1).map(|s| s.parse())
        .collect::<Result<Vec<usize>, _>>()?;
    let n = a[0];
    let m = a[1];
    let mut d = 0;

    // timeit increases the nesting of the loops to get enough
    // samples of the timed code to calculate a good average.
    // The number of loops timeit takes for sampling is included
    // in the output.
    timeit!({
        for i in 0..5000 {
            d += m * i;
            for j in 0..5000 {
                d += n * j;
            }
        }
    });

    println!("d: {}", d);
    Ok(())
}
Output (average nanoseconds for each timeit loop):
$ cargo run --release -- 52 3
1000000 loops: 0.000029 ns
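For context on why that number is so absurdly small: even with runtime inputs, LLVM's scalar-evolution analysis can often replace an accumulation loop like this with a closed-form expression, so the iterations never actually execute. A minimal sketch of the effect, using a hypothetical extraction of the inner loop compiled in release mode:

// In release builds LLVM can typically rewrite this loop as the
// closed-form product n * (0 + 1 + ... + 4999) = n * 12_497_500,
// so it costs a few instructions regardless of the iteration count.
pub fn inner(n: usize) -> usize {
    let mut d = 0;
    for j in 0..5000 {
        d += n * j;
    }
    d
}

So the sub-nanosecond figure shouldn't be read as a per-iteration cost; it confirms the loops were compiled away into arithmetic.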
The comparable Python code, set up to run a single timeit loop - I don't have the patience to let it go for more. These nested loops are very slow in Python.
import sys
import timeit

if __name__ == '__main__':
    a = list(map(int, sys.argv[1:]))
    n = a[0]
    m = a[1]
    d = 0

    def loops():
        global n, m, d
        for i in range(5000):
            d += m * i
            for j in range(5000):
                d += n * j

    # The Rust timeit version iterated 1000000 times; Python is too
    # slow to let timeit run even a fraction of that number.
    print(timeit.timeit(loops, number=1))
    print(d)
Output (results in seconds):
$ python loops.py 52 3
3.8085460959700868
Even the Rust debug build beats Python: 811 ms vs. 3.8 s.
Running the same program on PyPy 3.6.9 (Rust wins):
$ pypy3 loops.py 52 3
0.05104120302712545
PyPy3 beats the debug build of the Rust program, but doesn't come close to the release version.