Motivated by the book From Mathematics to Generic Programming, I am playing around with functions and different integer sizes.
I have two different prime sieve implementations that work either with `u16` or `u32`. I benchmark them with `cargo bench`, and the `u16` method is always a little faster than the `u32` method.
Why is this the case? My assumption is that my processor (i5-7300u) is able to do two adds at once for `u16` but not for `u32` or `u64`. Yet, I have no idea how to validate this. I've attached the assembly.
Benchmark results
test tests::bench_sift2 ... bench: 74,093 ns/iter (+/- 3,765)
test tests::bench_sift2_u16 ... bench: 61,136 ns/iter (+/- 3,389)
EDIT
Playing around with different array sizes and with the idea of using boolean arrays instead of vectors results in approx. the same speed for both functions. Actually the performance difference is only significant when the size of both vectors is 1<<15.
EDIT 2
An interesting observation: I am running this code on a Windows Surface computer with Windows 10 Pro 10.0.1. More or less by accident, I ran the benchmark with different power-saving configurations. When I set the configuration to highest performance, I see more or less the results reported below. If I set the configuration to any other level, I see results where both functions seem to behave the same, but the measurement error increases drastically.
Rust Code
#![feature(iterator_step_by)]
#![feature(test)]
extern crate test;
/// Builds the `usize` sieve with `1 << 15` slots and prints its first flag.
fn main() {
    // To inspect the `u16` variant instead, build the sieve with
    // `sift2_u16(1 << 15)` here.
    let sieve = sift2(1 << 15);
    println!("{}", sieve[0]);
}
/// Sieve over odd numbers using `usize` arithmetic.
///
/// Returns `n` flags, all initially `true`. Slot `i` stands for the odd
/// number `2 * i + 3`; after the call a slot is `false` when an earlier
/// surviving slot's stride reached it.
///
/// The loop maintains two running values per slot `i`:
/// * `stride` — starts at 3 and grows by 2 each step (`2 * i + 3`);
/// * `square_slot` — starts at 3 and grows by `stride + (stride + 2)` each
///   step, i.e. the slot holding `stride * stride`.
///
/// Marking stops as soon as `square_slot` is no longer inside the vector.
fn sift2(n: usize) -> Vec<bool> {
    let mut sieve = vec![true; n];
    let mut square_slot = 3;
    let mut stride = 3;
    for slot in 0usize.. {
        if square_slot >= n {
            break;
        }
        if sieve[slot] {
            // Still unmarked: strike every `stride`-th slot from its square on.
            mark_sieve(&mut sieve[square_slot..], stride);
        }
        // Advance to the next odd value and to the slot of its square.
        square_slot += stride;
        stride += 2;
        square_slot += stride;
    }
    sieve
}
/// Sieve over odd numbers using `u16` arithmetic for the loop counters.
///
/// Returns `n` flags, all initially `true`. Slot `i` stands for the odd
/// number `2 * i + 3`; a slot ends up `false` when an earlier surviving
/// slot's stride reached it.
///
/// `factor` is the odd number of the current slot and `index_square` is the
/// slot that holds `factor * factor`. Both are advanced incrementally, so the
/// loop needs no multiplication.
///
/// Unlike a plain `+=`, the advance of `index_square` is overflow-checked:
/// for `n` close to `u16::MAX` the next square slot no longer fits in a
/// `u16`. When that happens the next square is necessarily past the end of
/// the vector, so the sieve is complete and the loop stops.
fn sift2_u16(n: u16) -> Vec<bool> {
    let mut vec = vec![true; n as usize];
    let mut i: u16 = 0;
    let mut index_square: u16 = 3;
    let mut factor: u16 = 3;
    while index_square < n {
        // `u16 -> usize` casts below are lossless widenings.
        if vec[i as usize] {
            mark_sieve(&mut vec[index_square as usize..], factor as usize);
        }
        i += 1;
        factor += 2;
        // Next square slot = current + previous factor + new factor.
        // A value that does not fit in `u16` is >= 65536 > n: nothing left to mark.
        let next_square = index_square
            .checked_add(factor - 2)
            .and_then(|partial| partial.checked_add(factor));
        index_square = match next_square {
            Some(next) => next,
            None => break,
        };
    }
    vec
}

/// Sets `data[0]`, `data[factor]`, `data[2 * factor]`, ... to `false`.
///
/// # Panics
///
/// Panics if `factor` is zero, because `step_by` rejects a zero step.
fn mark_sieve(data: &mut [bool], factor: usize) {
    data.iter_mut().step_by(factor).for_each(|k| *k = false);
}
#[cfg(test)]
mod tests {
    use super::*;
    use test::{black_box, Bencher};

    /// Benchmarks the `usize` sieve with `1 << 15` slots.
    ///
    /// The size goes through `black_box` so the optimizer cannot specialise
    /// the sieve for a compile-time-constant `n`.
    #[bench]
    fn bench_sift2(b: &mut Bencher) {
        b.iter(|| sift2(black_box(1 << 15)));
    }

    /// Benchmarks the `u16` sieve with `1 << 15` slots.
    ///
    /// The size goes through `black_box` for the same reason as in
    /// `bench_sift2`, so both variants are measured with a runtime `n`.
    #[bench]
    fn bench_sift2_u16(b: &mut Bencher) {
        b.iter(|| sift2_u16(black_box(1 << 15)));
    }
}
generated assembly for sift2
.text
.def _ZN3std2rt10lang_start17h0092a1d276f89f87E;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.globl _ZN3std2rt10lang_start17h0092a1d276f89f87E
.p2align 4, 0x90
_ZN3std2rt10lang_start17h0092a1d276f89f87E:
.seh_proc _ZN3std2rt10lang_start17h0092a1d276f89f87E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %r8, %r9
movq %rdx, %rax
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r8
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.seh_endproc
.def _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.p2align 4, 0x90
_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E:
.seh_proc _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *(%rcx)
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.seh_endproc
.def _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.p2align 4, 0x90
_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE:
.seh_proc _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *%rcx
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.seh_endproc
.def _ZN4core3ptr13drop_in_place17h98ac405189abf599E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17h98ac405189abf599E
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17h98ac405189abf599E:
movq 8(%rcx), %rdx
testq %rdx, %rdx
je .LBB3_1
movq (%rcx), %rcx
movl $1, %r8d
jmp __rust_dealloc
.LBB3_1:
retq
.def _ZN4core3ptr13drop_in_place17hd909dec568d984beE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17hd909dec568d984beE
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17hd909dec568d984beE:
retq
.def _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.p2align 4, 0x90
_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE:
.seh_proc _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq __rust_oom
ud2
.seh_handlerdata
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.seh_endproc
.def _ZN8chapter34main17hfb06448c1bac2398E;
.scl 3;
.type 32;
.endef
.globl __xmm@00000000000080000000000000008000
.section .rdata,"dr",discard,__xmm@00000000000080000000000000008000
.p2align 4
__xmm@00000000000080000000000000008000:
.quad 32768
.quad 32768
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 4, 0x90
_ZN8chapter34main17hfb06448c1bac2398E:
.Lfunc_begin0:
.seh_proc _ZN8chapter34main17hfb06448c1bac2398E
.seh_handler __CxxFrameHandler3, @unwind, @except
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $128, %rsp
.seh_stackalloc 128
leaq 128(%rsp), %rbp
.seh_setframe 5, 128
.seh_endprologue
movq $-2, -8(%rbp)
leaq -56(%rbp), %r8
movl $32768, %ecx
movl $1, %edx
callq __rust_alloc
movq %rax, %rdi
testq %rdi, %rdi
je .LBB6_21
movl $32768, %r14d
movl $1, %edx
movl $32768, %r8d
movq %rdi, %rcx
callq memset
movq %rdi, -56(%rbp)
movaps __xmm@00000000000080000000000000008000(%rip), %xmm0
movups %xmm0, -48(%rbp)
xorl %edx, %edx
movl $3, %eax
movl $3, %ecx
cmpb $0, (%rdi,%rdx)
jne .LBB6_3
jmp .LBB6_10
.p2align 4, 0x90
.LBB6_12:
addq $2, %rax
movq -56(%rbp), %rdi
cmpb $0, (%rdi,%rdx)
je .LBB6_10
.LBB6_3:
cmpq %rcx, %r14
jb .LBB6_4
cmpq %rcx, %r14
je .LBB6_10
addq %rdi, %r14
leaq (%rdi,%rcx), %rdi
leaq -1(%rax), %rsi
addq $1, %rdi
.p2align 4, 0x90
.LBB6_9:
movb $0, -1(%rdi)
movq %r14, %rbx
subq %rdi, %rbx
addq %rax, %rdi
cmpq %rsi, %rbx
ja .LBB6_9
.LBB6_10:
addq %rax, %rcx
addq %rax, %rcx
addq $2, %rcx
cmpq $32767, %rcx
ja .LBB6_14
addq $1, %rdx
movq -40(%rbp), %r14
cmpq %rdx, %r14
ja .LBB6_12
.Ltmp6:
leaq panic_bounds_check_loc.j(%rip), %rcx
movq %r14, %r8
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp7:
jmp .LBB6_6
.LBB6_14:
movq -40(%rbp), %rax
movq %rax, -64(%rbp)
movups -56(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
cmpq $0, -64(%rbp)
je .LBB6_15
movq -80(%rbp), %rsi
movq %rsi, -96(%rbp)
leaq _ZN43_$LT$bool$u20$as$u20$core..fmt..Display$GT$3fmt17h27a33a0bff6802a9E(%rip), %rax
movq %rax, -88(%rbp)
leaq ref.m(%rip), %rax
movq %rax, -56(%rbp)
movq $2, -48(%rbp)
leaq ref.n(%rip), %rax
movq %rax, -40(%rbp)
movq $1, -32(%rbp)
leaq -96(%rbp), %rax
movq %rax, -24(%rbp)
movq $1, -16(%rbp)
.Ltmp2:
leaq -56(%rbp), %rcx
callq _ZN3std2io5stdio6_print17h38a18b84d105804dE
.Ltmp3:
movq -72(%rbp), %rdx
testq %rdx, %rdx
je .LBB6_19
movl $1, %r8d
movq %rsi, %rcx
callq __rust_dealloc
.LBB6_19:
nop
addq $128, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.LBB6_4:
.Ltmp0:
movq %r14, %rdx
callq _ZN4core5slice22slice_index_order_fail17hbd1edce8e1fe586aE
.Ltmp1:
.LBB6_6:
ud2
.LBB6_21:
movups -48(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
movaps -80(%rbp), %xmm0
movups %xmm0, -48(%rbp)
leaq -56(%rbp), %rcx
callq _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
ud2
.LBB6_15:
.Ltmp4:
leaq panic_bounds_check_loc.j(%rip), %rcx
xorl %edx, %edx
xorl %r8d, %r8d
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp5:
jmp .LBB6_6
.seh_handlerdata
.long ($cppxdata$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_13:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -56(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_20:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -80(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.Lfunc_end0:
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.section .xdata,"dr",associative,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 2
$cppxdata$_ZN8chapter34main17hfb06448c1bac2398E:
.long 429065506
.long 2
.long ($stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 0
.long 0
.long 6
.long ($ip2state$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 120
.long 0
.long 1
$stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E:
.long -1
.long "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
.long -1
.long "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
$ip2state$_ZN8chapter34main17hfb06448c1bac2398E:
.long .Lfunc_begin0@IMGREL
.long -1
.long .Ltmp6@IMGREL+1
.long 0
.long .Ltmp2@IMGREL+1
.long 1
.long .Ltmp0@IMGREL+1
.long 0
.long .Ltmp4@IMGREL+1
.long 1
.long .Ltmp5@IMGREL+1
.long -1
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.def main;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,main
.globl main
.p2align 4, 0x90
main:
.seh_proc main
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %rdx, %rax
movslq %ecx, %r8
leaq _ZN8chapter34main17hfb06448c1bac2398E(%rip), %rcx
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r9
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,main
.seh_endproc
.section .rdata,"dr",one_only,vtable.4
.p2align 3
vtable.4:
.quad _ZN4core3ptr13drop_in_place17hd909dec568d984beE
.quad 8
.quad 8
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.section .rdata,"dr",one_only,str.i
.p2align 4
str.i:
.ascii "C:\\projects\\rust\\src\\liballoc\\vec.rs"
.section .rdata,"dr",one_only,panic_bounds_check_loc.j
.p2align 3
panic_bounds_check_loc.j:
.quad str.i
.quad 36
.long 1551
.long 10
.section .rdata,"dr",one_only,str.k
str.k:
.section .rdata,"dr",one_only,str.l
str.l:
.byte 10
.section .rdata,"dr",one_only,ref.m
.p2align 3
ref.m:
.quad str.k
.quad 0
.quad str.l
.quad 1
.section .rdata,"dr",one_only,ref.n
.p2align 3
ref.n:
.quad 1
.quad 0
.quad 3
.zero 8
.quad 3
.zero 8
.long 32
.long 0
.byte 3
.zero 7
generated assembly for sift2_u16
U16
.text
.def _ZN3std2rt10lang_start17h0092a1d276f89f87E;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.globl _ZN3std2rt10lang_start17h0092a1d276f89f87E
.p2align 4, 0x90
_ZN3std2rt10lang_start17h0092a1d276f89f87E:
.seh_proc _ZN3std2rt10lang_start17h0092a1d276f89f87E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %r8, %r9
movq %rdx, %rax
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r8
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.seh_endproc
.def _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.p2align 4, 0x90
_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E:
.seh_proc _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *(%rcx)
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.seh_endproc
.def _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.p2align 4, 0x90
_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE:
.seh_proc _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *%rcx
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.seh_endproc
.def _ZN4core3ptr13drop_in_place17h98ac405189abf599E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17h98ac405189abf599E
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17h98ac405189abf599E:
movq 8(%rcx), %rdx
testq %rdx, %rdx
je .LBB3_1
movq (%rcx), %rcx
movl $1, %r8d
jmp __rust_dealloc
.LBB3_1:
retq
.def _ZN4core3ptr13drop_in_place17hd909dec568d984beE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17hd909dec568d984beE
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17hd909dec568d984beE:
retq
.def _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.p2align 4, 0x90
_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE:
.seh_proc _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq __rust_oom
ud2
.seh_handlerdata
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.seh_endproc
.def _ZN8chapter34main17hfb06448c1bac2398E;
.scl 3;
.type 32;
.endef
.globl __xmm@00000000000080000000000000008000
.section .rdata,"dr",discard,__xmm@00000000000080000000000000008000
.p2align 4
__xmm@00000000000080000000000000008000:
.quad 32768
.quad 32768
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 4, 0x90
_ZN8chapter34main17hfb06448c1bac2398E:
.Lfunc_begin0:
.seh_proc _ZN8chapter34main17hfb06448c1bac2398E
.seh_handler __CxxFrameHandler3, @unwind, @except
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $128, %rsp
.seh_stackalloc 128
leaq 128(%rsp), %rbp
.seh_setframe 5, 128
.seh_endprologue
movq $-2, -8(%rbp)
leaq -56(%rbp), %r8
movl $32768, %ecx
movl $1, %edx
callq __rust_alloc
movq %rax, %r14
testq %r14, %r14
je .LBB6_23
movl $32768, %edi
movl $1, %edx
movl $32768, %r8d
movq %r14, %rcx
callq memset
movq %r14, -56(%rbp)
movaps __xmm@00000000000080000000000000008000(%rip), %xmm0
movups %xmm0, -48(%rbp)
movw $3, %r8w
xorl %edx, %edx
movw $3, %r9w
cmpb $0, (%r14,%rdx)
jne .LBB6_3
jmp .LBB6_12
.p2align 4, 0x90
.LBB6_14:
movq -56(%rbp), %r14
cmpb $0, (%r14,%rdx)
je .LBB6_12
.LBB6_3:
movzwl %r9w, %ecx
cmpq %rcx, %rdi
jb .LBB6_4
testw %r8w, %r8w
je .LBB6_8
cmpq %rcx, %rdi
je .LBB6_12
addq %r14, %rcx
movzwl %r8w, %ebx
addq %r14, %rdi
leaq -1(%rbx), %rax
addq $1, %rcx
.p2align 4, 0x90
.LBB6_11:
movb $0, -1(%rcx)
movq %rdi, %rsi
subq %rcx, %rsi
addq %rbx, %rcx
cmpq %rax, %rsi
ja .LBB6_11
.LBB6_12:
addl %r8d, %r9d
addl $2, %r8d
addw %r8w, %r9w
js .LBB6_16
addq $1, %rdx
movq -40(%rbp), %rdi
cmpq %rdx, %rdi
ja .LBB6_14
.Ltmp8:
leaq panic_bounds_check_loc.j(%rip), %rcx
movq %rdi, %r8
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp9:
jmp .LBB6_6
.LBB6_16:
movq -40(%rbp), %rax
movq %rax, -64(%rbp)
movups -56(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
cmpq $0, -64(%rbp)
je .LBB6_17
movq -80(%rbp), %rsi
movq %rsi, -96(%rbp)
leaq _ZN43_$LT$bool$u20$as$u20$core..fmt..Display$GT$3fmt17h27a33a0bff6802a9E(%rip), %rax
movq %rax, -88(%rbp)
leaq ref.m(%rip), %rax
movq %rax, -56(%rbp)
movq $2, -48(%rbp)
leaq ref.n(%rip), %rax
movq %rax, -40(%rbp)
movq $1, -32(%rbp)
leaq -96(%rbp), %rax
movq %rax, -24(%rbp)
movq $1, -16(%rbp)
.Ltmp4:
leaq -56(%rbp), %rcx
callq _ZN3std2io5stdio6_print17h38a18b84d105804dE
.Ltmp5:
movq -72(%rbp), %rdx
testq %rdx, %rdx
je .LBB6_21
movl $1, %r8d
movq %rsi, %rcx
callq __rust_dealloc
.LBB6_21:
nop
addq $128, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.LBB6_4:
.Ltmp2:
movq %rdi, %rdx
callq _ZN4core5slice22slice_index_order_fail17hbd1edce8e1fe586aE
.Ltmp3:
jmp .LBB6_6
.LBB6_8:
.Ltmp0:
leaq ref.b(%rip), %rcx
callq _ZN4core9panicking5panic17h42feaa2e0dc2c607E
.Ltmp1:
.LBB6_6:
ud2
.LBB6_23:
movups -48(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
movaps -80(%rbp), %xmm0
movups %xmm0, -48(%rbp)
leaq -56(%rbp), %rcx
callq _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
ud2
.LBB6_17:
.Ltmp6:
leaq panic_bounds_check_loc.j(%rip), %rcx
xorl %edx, %edx
xorl %r8d, %r8d
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp7:
jmp .LBB6_6
.seh_handlerdata
.long ($cppxdata$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_15:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -56(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_22:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -80(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.Lfunc_end0:
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.section .xdata,"dr",associative,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 2
$cppxdata$_ZN8chapter34main17hfb06448c1bac2398E:
.long 429065506
.long 2
.long ($stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 0
.long 0
.long 6
.long ($ip2state$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 120
.long 0
.long 1
$stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E:
.long -1
.long "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
.long -1
.long "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
$ip2state$_ZN8chapter34main17hfb06448c1bac2398E:
.long .Lfunc_begin0@IMGREL
.long -1
.long .Ltmp8@IMGREL+1
.long 0
.long .Ltmp4@IMGREL+1
.long 1
.long .Ltmp2@IMGREL+1
.long 0
.long .Ltmp6@IMGREL+1
.long 1
.long .Ltmp7@IMGREL+1
.long -1
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.def main;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,main
.globl main
.p2align 4, 0x90
main:
.seh_proc main
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %rdx, %rax
movslq %ecx, %r8
leaq _ZN8chapter34main17hfb06448c1bac2398E(%rip), %rcx
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r9
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,main
.seh_endproc
.section .rdata,"dr",one_only,vtable.4
.p2align 3
vtable.4:
.quad _ZN4core3ptr13drop_in_place17hd909dec568d984beE
.quad 8
.quad 8
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.section .rdata,"dr",one_only,str.9
.p2align 4
str.9:
.ascii "assertion failed: step != 0"
.section .rdata,"dr",one_only,str.a
.p2align 4
str.a:
.ascii "libcore\\iter\\iterator.rs"
.section .rdata,"dr",one_only,ref.b
.p2align 3
ref.b:
.quad str.9
.quad 27
.quad str.a
.quad 24
.long 299
.long 9
.section .rdata,"dr",one_only,str.i
.p2align 4
str.i:
.ascii "C:\\projects\\rust\\src\\liballoc\\vec.rs"
.section .rdata,"dr",one_only,panic_bounds_check_loc.j
.p2align 3
panic_bounds_check_loc.j:
.quad str.i
.quad 36
.long 1551
.long 10
.section .rdata,"dr",one_only,str.k
str.k:
.section .rdata,"dr",one_only,str.l
str.l:
.byte 10
.section .rdata,"dr",one_only,ref.m
.p2align 3
ref.m:
.quad str.k
.quad 0
.quad str.l
.quad 1
.section .rdata,"dr",one_only,ref.n
.p2align 3
ref.n:
.quad 1
.quad 0
.quad 3
.zero 8
.quad 3
.zero 8
.long 32
.long 0
.byte 3
.zero 7
I have never tried Rust, but I know some good tools for this kind of performance analysis. So while this may not fully answer your question, it will give you tools to dig deeper.
When trying to understand low-level performance, you must look at the generated assembly, which you seem to have done judging by the output you provided. However, it is pretty unreadable in this form. That's where my tool of choice comes into play: Compiler Explorer. You can see your code here
From the generated assembly, we see some differences. Let's just focus on 2 elements of the loops (you can check the others, but the idea & result is the same)
The test while in 32 bits:
mov rax, qword ptr [rbp - 112]
cmp qword ptr [rbp - 64], rax
jb .LBB124_5
And the same in 16 bits:
mov ax, word ptr [rbp - 98]
cmp word ptr [rbp - 52], ax
jb .LBB125_5
mark_sieve in 32:
.LBB124_8:
mov rax, qword ptr [rbp - 64]
mov qword ptr [rbp - 48], rax
mov rsi, qword ptr [rbp - 48]
lea rdi, [rbp - 96]
call <alloc::vec::Vec<T> as core::ops::index::IndexMut<core::ops::range::RangeFrom<usize>>>::index_mut
mov qword ptr [rbp - 136], rdx
mov qword ptr [rbp - 144], rax
jmp .LBB124_9
.LBB124_9:
mov rdx, qword ptr [rbp - 56]
mov rdi, qword ptr [rbp - 144]
mov rsi, qword ptr [rbp - 136]
call example::mark_sieve
jmp .LBB124_10
And in 16:
.LBB125_8:
movzx eax, word ptr [rbp - 52]
mov ecx, eax
mov qword ptr [rbp - 48], rcx
mov rsi, qword ptr [rbp - 48]
lea rdi, [rbp - 80]
call <alloc::vec::Vec<T> as core::ops::index::IndexMut<core::ops::range::RangeFrom<usize>>>::index_mut
mov qword ptr [rbp - 120], rdx
mov qword ptr [rbp - 128], rax
jmp .LBB125_9
.LBB125_9:
movzx eax, word ptr [rbp - 50]
mov edx, eax
mov rdi, qword ptr [rbp - 128]
mov rsi, qword ptr [rbp - 120]
call example::mark_sieve
jmp .LBB125_10
We can see some differences in this code:
For these different instructions, you can see their relative execution time difference by looking at the excellent Agner instruction tables. (I would love to see them integrated into Compiler Explorer, by the way.) Your CPU seems to be a Kaby Lake (so we'll use the Skylake architecture), so we'll take the tables starting at page 231 (read this page to get the abbreviations used in the tables).
According to Agner's tables, `mov r64,m` and `movzx r,m` take the same number of uops (1 uop each), but the `mov` adds 2 cycles of latency.
The different accumulators also change some of the optimizations done by the processor.
The compiler also performs other architecture dependent optimizations, like loop unrolling depending on the number of ALU available on the CPU. So depending on the compiler decisions, your code may behave differently between different CPUs.
The difference may also be due to code alignment, or cache optimization.
Regarding the power-management differences, this may be impacted by two elements: frequency capping and C-state management. The C-states are states where the CPU goes into different sleep states for short periods of time. The impact of sleep/wakeup on the internals of a CPU is up to the firmware, so it's not something we can really check in detail (the analysis would also alter the results).
I made a post about understanding Meltdown/Spectre attack, which explains the different optimizations a CPU can do under the hood (and that even assembly cannot reflect). You may also want to take a look at it to better understand why CPU optimization is hard, as we can't control a lot of parameters.
Happy hacking !
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With