As an exercise, I am trying to implement multi-precision addition in C and in x86-64 assembly (the full listing and objdump of the program are at the end of the post).
EDIT: I have added the addN4() asm function, which removes the "partial flags update stall"; addN4() is now the fastest. :)
EDIT2: Added the addN5() and addN6() C functions, which calculate the correct carry. (Thanks to Stephen Canon.)
The program adds the numbers from two arrays into a third array and generates a carry value. The multi-precision numbers are stored in little-endian format. Here is the example code:
int carry = 0;
for (i = 0; i < n; i++) {
    c[i] = a[i] + b[i] + carry;
    carry = (c[i] < a[i]) || (c[i] < b[i]);
}
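For example (an illustration of the storage convention only, not part of the benchmark code), the 128-bit value 2^64 + 5 occupies two limbs, least significant limb first:

unsigned long x[2] = { 5UL, 1UL };  /* x[0] = low limb, x[1] = high limb: x = 1 * 2^64 + 5 */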
I am compiling the program with:
gcc -g -O3 -Wall int.c -o int
and running the code with:
time ./int
I get the following execution times:
addN1():
0.26s user 0.00s system 94% cpu 0.284 total
addN2():
0.42s user 0.00s system 96% cpu 0.441 total
addN3():
0.56s user 0.00s system 97% cpu 0.580 total
addN1() with -DCOUNT_CARRIES:
0.18s user 0.01s system 92% cpu 0.208 total
addN2() with -DCOUNT_CARRIES:
0.41s user 0.00s system 96% cpu 0.433 total
addN4():
0.15s user 0.00s system 89% cpu 0.169 total
addN5():
0.20s user 0.00s system 92% cpu 0.215 total
addN6():
0.42s user 0.00s system 96% cpu 0.441 total
I have a few questions:
Why isn't addN3() the fastest? I expected it to be the fastest, because I took special care to write "nice" assembly code.
Why is addN2() slower than addN1()? In my opinion, addN1() should run slower, because it has an additional jump instruction (jb 400716) inside the for loop. I would expect this to cause trouble for the branch predictor, because this jump has a 50% chance of going either way.
Why does "addN1() with -DCOUNT_CARRIES" run the fastest? In my opinion, this version should run slower than plain addN1(), because it additionally counts the number of carries generated in the benchmark.
Can someone please explain these "unexpected" execution times?
Running environment:
CPU: Intel(R) Core(TM) i7 CPU M 640 @ 2.80GHz
GCC 4.7
Ubuntu 12.10
The whole listing of the program:
// int.c
#include <stdio.h>
#include <stdlib.h>
#define N 1024
unsigned long a[N];
unsigned long b[N];
unsigned long c[N];
int carry_count;
void addN1(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    int i;
    int carry = 0;
    for (i = 0; i < n; i++) {
        c[i] = a[i] + b[i] + carry;
        carry = (c[i] < a[i]) || (c[i] < b[i]);
#ifdef COUNT_CARRIES
        carry_count += carry;
#endif
    }
}
void addN2(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    int i;
    int carry = 0;
    for (i = 0; i < n; i++) {
        c[i] = a[i] + b[i] + carry;
        carry = (c[i] < a[i]) | (c[i] < b[i]);
#ifdef COUNT_CARRIES
        carry_count += carry;
#endif
    }
}
void addN3(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    register unsigned long tmp;
    register unsigned long index;
    asm volatile (
        "xor %[index], %[index]\n"
        "1:\n\t"
        "movq (%[a],%[index],8), %[tmp]\n\t"
        "adcq (%[b],%[index],8), %[tmp]\n\t"
        "movq %[tmp], (%[c],%[index],8)\n\t"
        "inc %[index]\n\t"
        "dec %[n]\n\t"
        "jnz 1b"
        : [a] "+r"(a), [b] "+r"(b), [c] "+r"(c), [n] "+r"(n),
          [tmp] "=r"(tmp), [index] "=r"(index)
        :: "memory"
    );
}
void addN4(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    register unsigned long tmp;
    register unsigned long index;
    unsigned char carry = 0;
    asm volatile (
        "xor %[index], %[index]\n"
        "1:\n\t"
        "shr %[carry]\n\t"
        "movq (%[a],%[index],8), %[tmp]\n\t"
        "adcq (%[b],%[index],8), %[tmp]\n\t"
        "movq %[tmp], (%[c],%[index],8)\n\t"
        "setb %[carry]\n\t"
        "add $1, %[index]\n\t"
        "sub $1, %[n]\n\t"
        "jnz 1b"
        : [a] "+r"(a), [b] "+r"(b), [c] "+r"(c), [n] "+r"(n),
          [tmp] "=r"(tmp), [index] "=r"(index), [carry] "+r"(carry)
        :: "memory"
    );
}
void addN5(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    int i;
    int carry = 0;
    int partial;
    for (i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
        partial = c[i] < a[i];
        c[i] += carry;
        carry = (carry && !c[i]) || partial;
    }
}
void addN6(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    int i;
    int carry = 0;
    int partial;
    for (i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
        partial = c[i] < a[i];
        c[i] += carry;
        carry = (carry & !c[i]) | partial;
    }
}
unsigned long rand_long() {
    unsigned long x, y, z;
    x = rand();
    y = rand();
    z = rand();
    // rand() gives 31 bits
    return (x << 62) | (y << 31) | z;
}
int main() {
    int i;
    srandom(0);
    for (i = 0; i < N; i++) {
        a[i] = rand_long();
        b[i] = rand_long();
    }
    for (i = 0; i < 100000; i++) {
        // I change this function in each run.
        addN1(a, b, c, N);
    }
    for (i = 0; i < N; i++) {
        printf("%lu\n", c[i]);
    }
    printf("%d", carry_count);
    return 0;
}
Objdump:
00000000004006e0 <addN1>:
4006e0: 31 c0 xor %eax,%eax
4006e2: 45 31 c9 xor %r9d,%r9d
4006e5: 85 c9 test %ecx,%ecx
4006e7: 44 8b 15 72 65 20 00 mov 0x206572(%rip),%r10d # 606c60 <carry_count>
4006ee: 7e 38 jle 400728 <addN1+0x48>
4006f0: 4c 8b 04 c7 mov (%rdi,%rax,8),%r8
4006f4: 4c 03 04 c6 add (%rsi,%rax,8),%r8
4006f8: 4d 01 c8 add %r9,%r8
4006fb: 41 b9 01 00 00 00 mov $0x1,%r9d
400701: 4c 89 04 c2 mov %r8,(%rdx,%rax,8)
400705: 4c 3b 04 c7 cmp (%rdi,%rax,8),%r8
400709: 72 0b jb 400716 <addN1+0x36>
40070b: 45 31 c9 xor %r9d,%r9d
40070e: 4c 3b 04 c6 cmp (%rsi,%rax,8),%r8
400712: 41 0f 92 c1 setb %r9b
400716: 48 83 c0 01 add $0x1,%rax
40071a: 45 01 ca add %r9d,%r10d
40071d: 39 c1 cmp %eax,%ecx
40071f: 7f cf jg 4006f0 <addN1+0x10>
400721: 44 89 15 38 65 20 00 mov %r10d,0x206538(%rip) # 606c60 <carry_count>
400728: f3 c3 repz retq
40072a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
0000000000400730 <addN2>:
400730: 31 c0 xor %eax,%eax
400732: 45 31 c0 xor %r8d,%r8d
400735: 85 c9 test %ecx,%ecx
400737: 44 8b 1d 22 65 20 00 mov 0x206522(%rip),%r11d # 606c60 <carry_count>
40073e: 7e 39 jle 400779 <addN2+0x49>
400740: 4c 8b 14 c7 mov (%rdi,%rax,8),%r10
400744: 4c 03 14 c6 add (%rsi,%rax,8),%r10
400748: 4f 8d 0c 02 lea (%r10,%r8,1),%r9
40074c: 4c 89 0c c2 mov %r9,(%rdx,%rax,8)
400750: 4c 3b 0c c6 cmp (%rsi,%rax,8),%r9
400754: 41 0f 92 c0 setb %r8b
400758: 4c 3b 0c c7 cmp (%rdi,%rax,8),%r9
40075c: 41 0f 92 c1 setb %r9b
400760: 48 83 c0 01 add $0x1,%rax
400764: 45 09 c8 or %r9d,%r8d
400767: 45 0f b6 c0 movzbl %r8b,%r8d
40076b: 45 01 c3 add %r8d,%r11d
40076e: 39 c1 cmp %eax,%ecx
400770: 7f ce jg 400740 <addN2+0x10>
400772: 44 89 1d e7 64 20 00 mov %r11d,0x2064e7(%rip) # 606c60 <carry_count>
400779: f3 c3 repz retq
40077b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
0000000000400780 <addN3>:
400780: 4d 31 c0 xor %r8,%r8
400783: 4a 8b 04 c7 mov (%rdi,%r8,8),%rax
400787: 4a 13 04 c6 adc (%rsi,%r8,8),%rax
40078b: 4a 89 04 c2 mov %rax,(%rdx,%r8,8)
40078f: 49 ff c0 inc %r8
400792: ff c9 dec %ecx
400794: 75 ed jne 400783 <addN3+0x3>
400796: c3 retq
0000000000400770 <addN4>:
400770: 31 c0 xor %eax,%eax
400772: 4d 31 c9 xor %r9,%r9
400775: d0 e8 shr %al
400777: 4e 8b 04 cf mov (%rdi,%r9,8),%r8
40077b: 4e 13 04 ce adc (%rsi,%r9,8),%r8
40077f: 4e 89 04 ca mov %r8,(%rdx,%r9,8)
400783: 0f 92 c0 setb %al
400786: 49 83 c1 01 add $0x1,%r9
40078a: 83 e9 01 sub $0x1,%ecx
40078d: 75 e6 jne 400775 <addN4+0x5>
40078f: c3 retq
0000000000400790 <addN5>:
400790: 31 c0 xor %eax,%eax
400792: 45 31 c9 xor %r9d,%r9d
400795: 85 c9 test %ecx,%ecx
400797: 41 bb 01 00 00 00 mov $0x1,%r11d
40079d: 7e 35 jle 4007d4 <addN5+0x44>
40079f: 90 nop
4007a0: 4c 8b 04 c6 mov (%rsi,%rax,8),%r8
4007a4: 4c 03 04 c7 add (%rdi,%rax,8),%r8
4007a8: 4c 89 04 c2 mov %r8,(%rdx,%rax,8)
4007ac: 4c 8b 14 c7 mov (%rdi,%rax,8),%r10
4007b0: 4d 01 c1 add %r8,%r9
4007b3: 4c 89 0c c2 mov %r9,(%rdx,%rax,8)
4007b7: 4d 39 d0 cmp %r10,%r8
4007ba: 41 0f 92 c0 setb %r8b
4007be: 4d 85 c9 test %r9,%r9
4007c1: 45 0f b6 c0 movzbl %r8b,%r8d
4007c5: 45 0f 44 c3 cmove %r11d,%r8d
4007c9: 48 83 c0 01 add $0x1,%rax
4007cd: 39 c1 cmp %eax,%ecx
4007cf: 4d 63 c8 movslq %r8d,%r9
4007d2: 7f cc jg 4007a0 <addN5+0x10>
4007d4: f3 c3 repz retq
4007d6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
4007dd: 00 00 00
00000000004007e0 <addN6>:
4007e0: 31 c0 xor %eax,%eax
4007e2: 45 31 c9 xor %r9d,%r9d
4007e5: 85 c9 test %ecx,%ecx
4007e7: 7e 38 jle 400821 <addN6+0x41>
4007e9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
4007f0: 4c 8b 04 c6 mov (%rsi,%rax,8),%r8
4007f4: 4c 03 04 c7 add (%rdi,%rax,8),%r8
4007f8: 4c 89 04 c2 mov %r8,(%rdx,%rax,8)
4007fc: 4c 3b 04 c7 cmp (%rdi,%rax,8),%r8
400800: 41 0f 92 c2 setb %r10b
400804: 4d 01 c8 add %r9,%r8
400807: 4d 85 c0 test %r8,%r8
40080a: 4c 89 04 c2 mov %r8,(%rdx,%rax,8)
40080e: 41 0f 94 c0 sete %r8b
400812: 48 83 c0 01 add $0x1,%rax
400816: 45 09 d0 or %r10d,%r8d
400819: 39 c1 cmp %eax,%ecx
40081b: 45 0f b6 c8 movzbl %r8b,%r9d
40081f: 7f cf jg 4007f0 <addN6+0x10>
400821: f3 c3 repz retq
400823: 66 66 66 66 2e 0f 1f data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
40082a: 84 00 00 00 00 00
You are encountering a partial flags update stall. This is one of the least talked about architectural hazards.
Because the inc and dec instructions do not write all of the EFLAGS bits, they require any preceding instruction that writes EFLAGS to finish before they can issue (to get the values of the bits they don't write). This essentially serializes your entire loop. See section 3.5.2.6 of Intel's optimization manual for more details.
The upshot is that your very clever loop, which depends on inc and dec not overwriting carry, is unfortunately too clever by half.
Now, what can you do about it?
- Save and restore the carry flag around the loop bookkeeping (as your addN4() does with setb and shr), and use add and sub instead of inc or dec. Suitably unrolled, this is a very fast approach.
- Use lea to handle indexing and counting, and branch on jrcxz, which lets you preserve carry without a partial flags update stall. The details are fun to work out on your own, so I won't give the whole game away.
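For reference, one possible shape of the lea/jrcxz approach is sketched below. This is a sketch, not a tuned implementation: the name addN_jrcxz is invented, n is widened to unsigned long so that jrcxz tests all of rcx, and n is assumed to be positive.

void addN_jrcxz(unsigned long *a, unsigned long *b, unsigned long *c,
                unsigned long n) {
    unsigned long tmp;
    unsigned long index = 0;
    asm volatile (
        "clc\n"                                /* enter the loop with CF = 0 */
        "1:\n\t"
        "movq (%[a],%[index],8), %[tmp]\n\t"
        "adcq (%[b],%[index],8), %[tmp]\n\t"   /* CF survives between iterations */
        "movq %[tmp], (%[c],%[index],8)\n\t"
        "lea 1(%[index]), %[index]\n\t"        /* index++ without writing EFLAGS */
        "lea -1(%[n]), %[n]\n\t"               /* n-- without writing EFLAGS */
        "jrcxz 2f\n\t"                         /* exit on rcx == 0; reads no flags */
        "jmp 1b\n"
        "2:"
        : [n] "+c"(n), [tmp] "=&r"(tmp), [index] "+r"(index)
        : [a] "r"(a), [b] "r"(b), [c] "r"(c)
        : "cc", "memory"
    );
}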
As for why addN2() is slower than addN1(): without a simulator, it is very hard to say precisely why this is happening. However, I would note the following: you are running repeatedly over the same (fairly small) data set. The branch predictor on a modern x86 is very sophisticated, and it is likely predicting the first branch with very high accuracy, which would mean that addN1() executes significantly fewer instructions than addN2().
As an aside: both carry checks in the C code are actually incorrect(!):
c[i] = a[i] + b[i] + carry;
carry = (c[i] < a[i]) || (c[i] < b[i]);
If a[i] = b[i] = 0xffffffffffffffff and carry = 1, then c[i] == a[i] and c[i] == b[i], but a carry has nonetheless occurred. (Further aside: this perfectly illustrates the hazards of trusting randomized testing. The odds of a random test hitting this case are 680564733841876926926749214863536422912:1. If you could test one random add every cycle on every core of a fleet of 12-core Xeons, you would still need to have 3x10^20 computers in your cluster to have a 50% chance of finding this bug in one year.)
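To make the failure concrete, here is a minimal stand-alone reproducer (my example, not part of the benchmark) that hits the missed-carry case directly:

// carry_bug.c
#include <stdio.h>

int main(void) {
    unsigned long a = 0xffffffffffffffffUL;
    unsigned long b = 0xffffffffffffffffUL;
    int carry = 1;                    /* incoming carry */
    unsigned long c = a + b + carry;  /* wraps back to 0xffffffffffffffff */
    int computed = (c < a) || (c < b);
    printf("c = %#lx, computed carry = %d\n", c, computed);
    return 0;
}

This prints computed carry = 0, even though the correct carry out is 1.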
A few options for how to fix it:
carry = (c[i] < a[i]) || ((c[i] == a[i]) & carry);
(The second term works because c[i] == a[i] together with carry == 1 implies that b[i] + carry wrapped all the way around, i.e. b[i] was 0xffffffffffffffff, so a carry out occurred.)
or
partialresult = a[i] + b[i];
partialcarry = partialresult < a[i];
c[i] = partialresult + carry;
carry = (carry & !c[i]) | partialcarry;
(Adding carry to partialresult can wrap to zero only when partialresult is all ones and carry is 1; but c[i] can also be zero when a[i] = b[i] = 0 with no incoming carry, so the zero test must be qualified with carry.)
As for why addN1() with -DCOUNT_CARRIES runs the fastest: honestly, I have no idea. I would need to spend a lot of time thinking about it, time that I don't have. Performance analysis of modern processors is exceedingly complex, and without a simulator they can be perplexing.
The compiler has decided to re-read a[i] and b[i] from memory for the compares, presumably because it is trying to avoid an aliasing hazard between them and c[i]. Since an optimal multi-precision add is entirely load-bound, this limits your throughput to 50% of peak. Either put a[i] and b[i] into temporaries or add the restrict keyword to avoid this hazard.
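For example, a sketch of the temporaries-plus-restrict fix (addN1_restrict is a hypothetical name; restrict asserts that the three arrays never overlap, and the corrected carry check from above is used):

void addN1_restrict(const unsigned long *restrict a, const unsigned long *restrict b,
                    unsigned long *restrict c, int n) {
    int i;
    int carry = 0;
    for (i = 0; i < n; i++) {
        unsigned long ai = a[i];  /* load each operand exactly once */
        unsigned long bi = b[i];
        unsigned long sum = ai + bi + carry;
        c[i] = sum;
        carry = (sum < ai) || ((sum == ai) & carry);
    }
}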
You can make your addN4() faster by unrolling, as you don't need to do the setb/shr dance between adds that don't span a loop boundary.
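For instance, a 4-way unrolled variant might look like the sketch below (addN4_unrolled is a hypothetical name, and n is assumed to be a positive multiple of 4). The carry is reloaded with shr and spilled with setb only once per four limbs; the adc chain in between keeps CF live.

void addN4_unrolled(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    unsigned long t0, t1, t2, t3;
    unsigned long index = 0;
    unsigned char carry = 0;
    asm volatile (
        "1:\n\t"
        "shr %[carry]\n\t"                     /* reload CF from the saved carry */
        "movq   (%[a],%[index],8), %[t0]\n\t"
        "adcq   (%[b],%[index],8), %[t0]\n\t"
        "movq  8(%[a],%[index],8), %[t1]\n\t"
        "adcq  8(%[b],%[index],8), %[t1]\n\t"
        "movq 16(%[a],%[index],8), %[t2]\n\t"
        "adcq 16(%[b],%[index],8), %[t2]\n\t"
        "movq 24(%[a],%[index],8), %[t3]\n\t"
        "adcq 24(%[b],%[index],8), %[t3]\n\t"
        "setb %[carry]\n\t"                    /* spill CF once per block */
        "movq %[t0],   (%[c],%[index],8)\n\t"
        "movq %[t1],  8(%[c],%[index],8)\n\t"
        "movq %[t2], 16(%[c],%[index],8)\n\t"
        "movq %[t3], 24(%[c],%[index],8)\n\t"
        "add $4, %[index]\n\t"
        "sub $4, %[n]\n\t"
        "jnz 1b"
        : [n] "+r"(n), [index] "+r"(index), [carry] "+r"(carry),
          [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3)
        : [a] "r"(a), [b] "r"(b), [c] "r"(c)
        : "cc", "memory"
    );
}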