I am surprised that iterating over a vector with up-to-date compilers produces very different code for what should be the same result. Here is the code:
#include <algorithm>
#include <cstdint>
#include <vector>

using std::uint8_t;
using std::vector;

// Range-based for over the elements.
void fun1(vector<uint8_t> &a, unsigned num) {
    for (auto &&value : a) {
        value += num;
    }
}

// Indexed loop with the size hoisted into a local.
void fun2(vector<uint8_t> &a, unsigned num) {
    for (uint32_t x = 0, p = a.size(); x < p; x++) {
        a[x] += num;
    }
}

// Indexed loop calling a.size() on every iteration.
void fun4(vector<uint8_t> &a, unsigned num) {
    for (uint32_t x = 0; x < a.size(); x++) {
        a[x] += num;
    }
}

// Explicit iterator loop.
void fun3(vector<uint8_t> &a, unsigned num) {
    for (auto it = a.begin(); it != a.end(); ++it) {
        *it += num;
    }
}

// std::for_each with a lambda.
void fun5(vector<uint8_t> &a, unsigned num) {
    std::for_each(a.begin(), a.end(), [num](auto &&val) {
        val += num;
    });
}
You can run it on godbolt.
The generated assembly is very different (fun1 and fun5 even get SIMD vectorization). I understand that for fun4, since as far as the compiler can tell a.size() may change on every iteration. But the others? Different compilers arrive at roughly the same results.
The compiler should be able to generate the same code, since the intent here is really basic (and identical in every case).
I really like the fun2 version. Is there any way to improve it?
Update: when the compiler is given enough information, this becomes a non-issue. So it seems that the current compilers (gcc 9.x) are simply tuned differently for these loops when only partial information is available.
The code is still NOT identical, but they all vectorize now. We will have to accept that, I guess; maybe future versions will improve it.
See the updated code (with useless allocations, but still).
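That updated code is not reproduced here, but the idea can be sketched. A likely explanation for the extra loads is aliasing: uint8_t is in practice unsigned char, a character type, so a store through it may alias anything, including the vector's own internal pointers. Hoisting both the size and the data pointer into locals removes that obstacle. The fun2_ptr below is my own illustration (not code from the question), and recent gcc and clang should vectorize it at -O3:

void fun2_ptr(vector<uint8_t> &a, unsigned num) {
    uint8_t *p = a.data();      // hoisted data pointer: no per-iteration reload
    std::size_t n = a.size();   // hoisted size: cannot "change" mid-loop anymore
    for (std::size_t x = 0; x < n; x++) {
        p[x] += num;            // num is truncated to uint8_t by the add
    }
}

This is essentially fun2 with the indexing done through a raw pointer instead of operator[], so the compiler no longer has to re-read the vector's begin pointer after every store.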
Here are two versions of fun4:
void fun4_a(vector<uint8_t> &a, uint8_t num) {
    size_t n = a.size();
f1: if (n <= 0) goto f2;
    a[--n] += num;
    goto f1;
f2: return;
}

void fun4_b(vector<uint8_t> &a, uint8_t num) {
    size_t n = a.size();
    while (n > 0) a[--n] += num;
}
You can run it on godbolt.
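As an aside, both versions iterate backwards, but since the iterations are independent the direction is irrelevant to the result; a forward-iterating equivalent (my own sketch, not part of the answer) expresses the same operation and is also a shape compilers vectorize readily. Notice below that icc's vectorized code for fun4_b in fact walks the bytes in increasing address order.

void fun4_c(vector<uint8_t> &a, uint8_t num) {
    size_t n = a.size();
    for (size_t i = 0; i < n; ++i) {
        a[i] += num;   // same effect as fun4_b, front to back
    }
}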
Compiler x86-64 gcc 9.1 produced the same assembly for fun4_a and fun4_b, though it still peeled one 'add' iteration off the loop, and it reloads the vector's data pointer (QWORD PTR [rdi]) on every iteration:
fun4_a(std::vector<unsigned char, std::allocator<unsigned char> >&, unsigned char):
        mov     rdx, QWORD PTR [rdi]
        mov     rax, QWORD PTR [rdi+8]
        mov     ecx, esi
        sub     rax, rdx
        je      .L1
        sub     rax, 1
        add     BYTE PTR [rdx+rax], sil   # peeled first iteration (last element)
        test    rax, rax
        je      .L1
.L3:
        mov     rdx, QWORD PTR [rdi]      # reload of a.data(): the byte store below may alias it
        sub     rax, 1
        add     rdx, rax
        add     BYTE PTR [rdx], cl
        test    rax, rax
        jne     .L3
.L1:
While compiler x86-64 icc 19.0.1 went much further for fun4_b, leaving fun4_a as a scalar (two-way unrolled) loop but vectorizing fun4_b with SSE:
fun4_a(std::vector<unsigned char, std::allocator<unsigned char> >&, unsigned char):
        mov     rcx, QWORD PTR [8+rdi]          #806.26
        mov     rdx, rcx                        #806.26
        mov     rax, QWORD PTR [rdi]            #806.52
        sub     rdx, rax                        #806.26
        je      ..B1.6          # Prob 18%      #10.14
        xor     eax, eax                        #10.5
..B1.3:                         # Preds ..B1.4 ..B1.2
        inc     rax                             #10.5
        mov     r8, rcx                         #9.11
        lea     r9, QWORD PTR [rax+rax]         #9.11
        sub     r8, r9                          #9.11
        neg     r9                              #9.11
        add     r9, rdx                         #9.11
        mov     rdi, r9                         #9.11
        add     BYTE PTR [1+r8], sil            #11.3
        inc     rdi                             #9.11
        je      ..B1.6          # Prob 18%      #10.14
        add     BYTE PTR [r8], sil              #11.3
        test    r9, r9                          #10.14
        jne     ..B1.3          # Prob 82%      #10.14
..B1.6:                         # Preds ..B1.3 ..B1.4 ..B1.1
        ret                                     #13.5

fun4_b(std::vector<unsigned char, std::allocator<unsigned char> >&, unsigned char):
        mov     r8d, esi                        #16.47
        mov     rsi, QWORD PTR [rdi]            #806.52
        mov     rcx, QWORD PTR [8+rdi]          #806.26
        sub     rcx, rsi                        #806.26
        je      ..B2.17         # Prob 50%      #18.16
        cmp     rcx, 16                         #18.5
        jb      ..B2.18         # Prob 10%      #18.5
        mov     rdx, rsi                        #18.5
        and     rdx, 15                         #18.5
        je      ..B2.9          # Prob 50%      #18.5
        mov     rax, rdx                        #18.5
        neg     rax                             #18.5
        lea     rdx, QWORD PTR [16+rax]         #18.5
        add     rax, 32                         #18.5
        cmp     rcx, rax                        #18.5
        jb      ..B2.18         # Prob 10%      #18.5
        mov     rax, rcx                        #18.5
        xor     r10d, r10d                      #18.5
        sub     rax, rdx                        #18.5
        mov     r9, rsi                         #18.5
        and     rax, 15                         #18.5
        neg     rax                             #18.5
        add     rax, rcx                        #18.5
        mov     edi, r8d                        #18.20
..B2.7:                         # Preds ..B2.7 ..B2.6
        inc     r10                             #18.5
        add     BYTE PTR [r9], dil              #18.20
        inc     r9                              #18.5
        cmp     r10, rdx                        #18.5
        jb      ..B2.7          # Prob 82%      #18.5
        jmp     ..B2.10         # Prob 100%     #18.5
..B2.9:                         # Preds ..B2.3
        mov     rax, rcx                        #18.5
        and     rax, 15                         #18.5
        neg     rax                             #18.5
        add     rax, rcx                        #18.5
..B2.10:                        # Preds ..B2.7 ..B2.9
        movzx   edi, r8b                        #18.29
        movd    xmm0, edi                       #18.29
        punpcklbw xmm0, xmm0                    #18.29
        punpcklwd xmm0, xmm0                    #18.29
        punpckldq xmm0, xmm0                    #18.29
        punpcklqdq xmm0, xmm0                   #18.29
..B2.11:                        # Preds ..B2.11 ..B2.10
        movdqu  xmm1, XMMWORD PTR [rsi+rdx]     #18.20
        paddb   xmm1, xmm0                      #18.20
        movdqu  XMMWORD PTR [rdx+rsi], xmm1     #18.20
        add     rdx, 16                         #18.5
        cmp     rdx, rax                        #18.5
        jb      ..B2.11         # Prob 82%      #18.5
..B2.13:                        # Preds ..B2.11 ..B2.18
        add     rsi, rax                        #18.5
        cmp     rax, rcx                        #18.5
        jae     ..B2.17         # Prob 9%       #18.5
..B2.15:                        # Preds ..B2.13 ..B2.15
        inc     rax                             #18.5
        add     BYTE PTR [rsi], r8b             #18.20
        inc     rsi                             #18.5
        cmp     rax, rcx                        #18.5
        jb      ..B2.15         # Prob 82%      #18.5
..B2.17:                        # Preds ..B2.15 ..B2.1 ..B2.13
        ret                                     #19.1
..B2.18:                        # Preds ..B2.2 ..B2.4
        xor     eax, eax                        #18.5
        jmp     ..B2.13         # Prob 100%     #18.5
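The structure of that fun4_b output is: a scalar peel loop (..B2.7) to reach 16-byte alignment, a punpcklbw/punpcklwd/punpckldq/punpcklqdq sequence that broadcasts num into all 16 byte lanes of xmm0, a main loop (..B2.11) doing movdqu/paddb/movdqu on 16 bytes per iteration, and a scalar epilogue (..B2.15) for the tail. The same shape, written by hand with SSE2 intrinsics, would look roughly like the sketch below (my own illustration of the pattern, not code from the answer; the alignment peel is omitted since movdqu tolerates unaligned addresses):

#include <emmintrin.h>   // SSE2 intrinsics
#include <cstddef>
#include <cstdint>

// Hedged sketch of the vectorized loop shape seen in icc's fun4_b output.
void add_bytes_sse2(std::uint8_t *p, std::size_t n, std::uint8_t num) {
    __m128i v = _mm_set1_epi8(static_cast<char>(num));   // broadcast num, like the punpckl* chain
    std::size_t i = 0;
    for (; i + 16 <= n; i += 16) {                       // main loop: 16 bytes per step
        __m128i x = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p + i));  // movdqu load
        x = _mm_add_epi8(x, v);                                                 // paddb
        _mm_storeu_si128(reinterpret_cast<__m128i *>(p + i), x);                // movdqu store
    }
    for (; i < n; ++i) {
        p[i] += num;                                     // scalar epilogue for the remainder
    }
}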