As an exercise, I am trying to implement multi-precision addition in C and in x86-64 assembly (the full listing and objdump of the program are at the end of the post).
EDIT: I have added the addN4() asm function, which removes the "partial flags update stall"; addN4() is now the fastest. :)
EDIT2: Added the addN5() and addN6() C functions, which calculate the correct carry. (Thanks to Stephen Canon.)
The program adds the numbers from two arrays into a third array and generates a carry value. The multi-precision numbers are stored in little-endian format. Here is the example code:
int carry = 0;
for (i = 0; i < n; i++) {
    c[i] = a[i] + b[i] + carry;
    carry = (c[i] < a[i]) || (c[i] < b[i]);
}
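For example (an illustration of the storage convention only, not part of the benchmark code), the 128-bit value 2^64 + 5 occupies two limbs, least significant limb first:

unsigned long x[2] = { 5UL, 1UL };  /* x[0] = low limb, x[1] = high limb: x = 1 * 2^64 + 5 */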
I am compiling the program with:
gcc -g -O3 -Wall int.c -o int
and running the code with:
time ./int
I get the following execution times:
addN1():
0.26s user 0.00s system 94% cpu 0.284 total
addN2():
0.42s user 0.00s system 96% cpu 0.441 total
addN3():
0.56s user 0.00s system 97% cpu 0.580 total
addN1() with -DCOUNT_CARRIES:
0.18s user 0.01s system 92% cpu 0.208 total
addN2() with -DCOUNT_CARRIES:
0.41s user 0.00s system 96% cpu 0.433 total
addN4():
0.15s user 0.00s system 89% cpu 0.169 total
addN5():
0.20s user 0.00s system 92% cpu 0.215 total
addN6():
0.42s user 0.00s system 96% cpu 0.441 total
I have a few questions:
Why isn't addN3() the fastest? I expected it to be the fastest, because I took special care to write "nice" assembly code.
Why is addN2() slower than addN1()? In my opinion, addN1() should run slower, because it has an additional jump instruction (jb 400716) inside the for loop. I would expect this to cause trouble for the branch predictor, because this jump has a 50% chance of going either way.
Why does "addN1() with -DCOUNT_CARRIES" run the fastest? In my opinion, this version should run slower than plain addN1(), because it additionally counts the number of carries generated in the benchmark.
Can someone please explain these "unexpected" execution times?
Running environment:
CPU: Intel(R) Core(TM) i7 CPU M 640 @ 2.80GHz
GCC 4.7
Ubuntu 12.10
The whole listing of the program:
// int.c
#include <stdio.h>
#include <stdlib.h>
#define N 1024
unsigned long a[N];
unsigned long b[N];
unsigned long c[N];
int carry_count;
void addN1(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    int i;
    int carry = 0;
    for (i = 0; i < n; i++) {
        c[i] = a[i] + b[i] + carry;
        carry = (c[i] < a[i]) || (c[i] < b[i]);
#ifdef COUNT_CARRIES
        carry_count += carry;
#endif
    }
}
void addN2(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    int i;
    int carry = 0;
    for (i = 0; i < n; i++) {
        c[i] = a[i] + b[i] + carry;
        carry = (c[i] < a[i]) | (c[i] < b[i]);
#ifdef COUNT_CARRIES
        carry_count += carry;
#endif
    }
}
void addN3(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    register unsigned long tmp;
    register unsigned long index;
    asm volatile (
        "xor %[index], %[index]\n"
        "1:\n\t"
        "movq (%[a],%[index],8), %[tmp]\n\t"
        "adcq (%[b],%[index],8), %[tmp]\n\t"
        "movq %[tmp], (%[c],%[index],8)\n\t"
        "inc %[index]\n\t"
        "dec %[n]\n\t"
        "jnz 1b"
        : [a] "+r"(a), [b] "+r"(b), [c] "+r"(c), [n] "+r"(n),
          [tmp] "=r"(tmp), [index] "=r"(index)
        :: "memory"
    );
}
void addN4(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    register unsigned long tmp;
    register unsigned long index;
    unsigned char carry = 0;
    asm volatile (
        "xor %[index], %[index]\n"
        "1:\n\t"
        "shr %[carry]\n\t"
        "movq (%[a],%[index],8), %[tmp]\n\t"
        "adcq (%[b],%[index],8), %[tmp]\n\t"
        "movq %[tmp], (%[c],%[index],8)\n\t"
        "setb %[carry]\n\t"
        "add $1, %[index]\n\t"
        "sub $1, %[n]\n\t"
        "jnz 1b"
        : [a] "+r"(a), [b] "+r"(b), [c] "+r"(c), [n] "+r"(n),
          [tmp] "=r"(tmp), [index] "=r"(index), [carry] "+r"(carry)
        :: "memory"
    );
}
void addN5(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    int i;
    int carry = 0;
    int partial;
    for (i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
        partial = c[i] < a[i];
        c[i] += carry;
        carry = (carry && !c[i]) || partial;
    }
}
void addN6(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    int i;
    int carry = 0;
    int partial;
    for (i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
        partial = c[i] < a[i];
        c[i] += carry;
        carry = (carry & !c[i]) | partial;
    }
}
unsigned long rand_long() {
    unsigned long x, y, z;
    x = rand();
    y = rand();
    z = rand();
    // rand() gives 31 bits
    return (x << 62) | (y << 31) | z;
}
int main() {
    int i;
    srandom(0);
    for (i = 0; i < N; i++) {
        a[i] = rand_long();
        b[i] = rand_long();
    }
    for (i = 0; i < 100000; i++) {
        // I change this function in each run.
        addN1(a, b, c, N);
    }
    for (i = 0; i < N; i++) {
        printf("%lu\n", c[i]);
    }
    printf("%d", carry_count);
    return 0;
}
Objdump:
00000000004006e0 <addN1>:
4006e0: 31 c0 xor %eax,%eax
4006e2: 45 31 c9 xor %r9d,%r9d
4006e5: 85 c9 test %ecx,%ecx
4006e7: 44 8b 15 72 65 20 00 mov 0x206572(%rip),%r10d # 606c60 <carry_count>
4006ee: 7e 38 jle 400728 <addN1+0x48>
4006f0: 4c 8b 04 c7 mov (%rdi,%rax,8),%r8
4006f4: 4c 03 04 c6 add (%rsi,%rax,8),%r8
4006f8: 4d 01 c8 add %r9,%r8
4006fb: 41 b9 01 00 00 00 mov $0x1,%r9d
400701: 4c 89 04 c2 mov %r8,(%rdx,%rax,8)
400705: 4c 3b 04 c7 cmp (%rdi,%rax,8),%r8
400709: 72 0b jb 400716 <addN1+0x36>
40070b: 45 31 c9 xor %r9d,%r9d
40070e: 4c 3b 04 c6 cmp (%rsi,%rax,8),%r8
400712: 41 0f 92 c1 setb %r9b
400716: 48 83 c0 01 add $0x1,%rax
40071a: 45 01 ca add %r9d,%r10d
40071d: 39 c1 cmp %eax,%ecx
40071f: 7f cf jg 4006f0 <addN1+0x10>
400721: 44 89 15 38 65 20 00 mov %r10d,0x206538(%rip) # 606c60 <carry_count>
400728: f3 c3 repz retq
40072a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
0000000000400730 <addN2>:
400730: 31 c0 xor %eax,%eax
400732: 45 31 c0 xor %r8d,%r8d
400735: 85 c9 test %ecx,%ecx
400737: 44 8b 1d 22 65 20 00 mov 0x206522(%rip),%r11d # 606c60 <carry_count>
40073e: 7e 39 jle 400779 <addN2+0x49>
400740: 4c 8b 14 c7 mov (%rdi,%rax,8),%r10
400744: 4c 03 14 c6 add (%rsi,%rax,8),%r10
400748: 4f 8d 0c 02 lea (%r10,%r8,1),%r9
40074c: 4c 89 0c c2 mov %r9,(%rdx,%rax,8)
400750: 4c 3b 0c c6 cmp (%rsi,%rax,8),%r9
400754: 41 0f 92 c0 setb %r8b
400758: 4c 3b 0c c7 cmp (%rdi,%rax,8),%r9
40075c: 41 0f 92 c1 setb %r9b
400760: 48 83 c0 01 add $0x1,%rax
400764: 45 09 c8 or %r9d,%r8d
400767: 45 0f b6 c0 movzbl %r8b,%r8d
40076b: 45 01 c3 add %r8d,%r11d
40076e: 39 c1 cmp %eax,%ecx
400770: 7f ce jg 400740 <addN2+0x10>
400772: 44 89 1d e7 64 20 00 mov %r11d,0x2064e7(%rip) # 606c60 <carry_count>
400779: f3 c3 repz retq
40077b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
0000000000400780 <addN3>:
400780: 4d 31 c0 xor %r8,%r8
400783: 4a 8b 04 c7 mov (%rdi,%r8,8),%rax
400787: 4a 13 04 c6 adc (%rsi,%r8,8),%rax
40078b: 4a 89 04 c2 mov %rax,(%rdx,%r8,8)
40078f: 49 ff c0 inc %r8
400792: ff c9 dec %ecx
400794: 75 ed jne 400783 <addN3+0x3>
400796: c3 retq
0000000000400770 <addN4>:
400770: 31 c0 xor %eax,%eax
400772: 4d 31 c9 xor %r9,%r9
400775: d0 e8 shr %al
400777: 4e 8b 04 cf mov (%rdi,%r9,8),%r8
40077b: 4e 13 04 ce adc (%rsi,%r9,8),%r8
40077f: 4e 89 04 ca mov %r8,(%rdx,%r9,8)
400783: 0f 92 c0 setb %al
400786: 49 83 c1 01 add $0x1,%r9
40078a: 83 e9 01 sub $0x1,%ecx
40078d: 75 e6 jne 400775 <addN4+0x5>
40078f: c3 retq
0000000000400790 <addN5>:
400790: 31 c0 xor %eax,%eax
400792: 45 31 c9 xor %r9d,%r9d
400795: 85 c9 test %ecx,%ecx
400797: 41 bb 01 00 00 00 mov $0x1,%r11d
40079d: 7e 35 jle 4007d4 <addN5+0x44>
40079f: 90 nop
4007a0: 4c 8b 04 c6 mov (%rsi,%rax,8),%r8
4007a4: 4c 03 04 c7 add (%rdi,%rax,8),%r8
4007a8: 4c 89 04 c2 mov %r8,(%rdx,%rax,8)
4007ac: 4c 8b 14 c7 mov (%rdi,%rax,8),%r10
4007b0: 4d 01 c1 add %r8,%r9
4007b3: 4c 89 0c c2 mov %r9,(%rdx,%rax,8)
4007b7: 4d 39 d0 cmp %r10,%r8
4007ba: 41 0f 92 c0 setb %r8b
4007be: 4d 85 c9 test %r9,%r9
4007c1: 45 0f b6 c0 movzbl %r8b,%r8d
4007c5: 45 0f 44 c3 cmove %r11d,%r8d
4007c9: 48 83 c0 01 add $0x1,%rax
4007cd: 39 c1 cmp %eax,%ecx
4007cf: 4d 63 c8 movslq %r8d,%r9
4007d2: 7f cc jg 4007a0 <addN5+0x10>
4007d4: f3 c3 repz retq
4007d6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
4007dd: 00 00 00
00000000004007e0 <addN6>:
4007e0: 31 c0 xor %eax,%eax
4007e2: 45 31 c9 xor %r9d,%r9d
4007e5: 85 c9 test %ecx,%ecx
4007e7: 7e 38 jle 400821 <addN6+0x41>
4007e9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
4007f0: 4c 8b 04 c6 mov (%rsi,%rax,8),%r8
4007f4: 4c 03 04 c7 add (%rdi,%rax,8),%r8
4007f8: 4c 89 04 c2 mov %r8,(%rdx,%rax,8)
4007fc: 4c 3b 04 c7 cmp (%rdi,%rax,8),%r8
400800: 41 0f 92 c2 setb %r10b
400804: 4d 01 c8 add %r9,%r8
400807: 4d 85 c0 test %r8,%r8
40080a: 4c 89 04 c2 mov %r8,(%rdx,%rax,8)
40080e: 41 0f 94 c0 sete %r8b
400812: 48 83 c0 01 add $0x1,%rax
400816: 45 09 d0 or %r10d,%r8d
400819: 39 c1 cmp %eax,%ecx
40081b: 45 0f b6 c8 movzbl %r8b,%r9d
40081f: 7f cf jg 4007f0 <addN6+0x10>
400821: f3 c3 repz retq
400823: 66 66 66 66 2e 0f 1f data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
40082a: 84 00 00 00 00 00
You are encountering a partial flags update stall. This is one of the least talked about architectural hazards.
Because the inc and dec instructions do not write all of the EFLAGS bits, they require any preceding instruction that writes EFLAGS to finish before they can issue (to get the values of the bits they don't write). This essentially serializes your entire loop. See section 3.5.2.6 of Intel's optimization manual for more details.
The upshot is that your very clever loop, which depends on inc and dec not overwriting carry, is unfortunately too clever by half.
Now, what can you do about it?
- Save and restore the carry flag around the loop bookkeeping (as your addN4() does with setb and shr), and use add and sub instead of inc or dec. Suitably unrolled, this is a very fast approach.
- Use lea to handle indexing and counting, and branch on jrcxz, which lets you preserve carry without a partial flags update stall. The details are fun to work out on your own, so I won't give the whole game away.
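For reference, one possible shape of the lea/jrcxz approach is sketched below. This is a sketch, not a tuned implementation: the name addN_jrcxz is invented, n is widened to unsigned long so that jrcxz tests all of rcx, and n is assumed to be positive.

void addN_jrcxz(unsigned long *a, unsigned long *b, unsigned long *c,
                unsigned long n) {
    unsigned long tmp;
    unsigned long index = 0;
    asm volatile (
        "clc\n"                                /* enter the loop with CF = 0 */
        "1:\n\t"
        "movq (%[a],%[index],8), %[tmp]\n\t"
        "adcq (%[b],%[index],8), %[tmp]\n\t"   /* CF survives between iterations */
        "movq %[tmp], (%[c],%[index],8)\n\t"
        "lea 1(%[index]), %[index]\n\t"        /* index++ without writing EFLAGS */
        "lea -1(%[n]), %[n]\n\t"               /* n-- without writing EFLAGS */
        "jrcxz 2f\n\t"                         /* exit on rcx == 0; reads no flags */
        "jmp 1b\n"
        "2:"
        : [n] "+c"(n), [tmp] "=&r"(tmp), [index] "+r"(index)
        : [a] "r"(a), [b] "r"(b), [c] "r"(c)
        : "cc", "memory"
    );
}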
As for why addN2() is slower than addN1(): without a simulator, it is very hard to say precisely why this is happening. However, I would note the following: you are running repeatedly over the same (fairly small) data set. The branch predictor on a modern x86 is very sophisticated, and it is likely predicting the first branch with very high accuracy, which would mean that addN1() executes significantly fewer instructions than addN2().
As an aside: both carry checks in the C code are actually incorrect(!):
c[i] = a[i] + b[i] + carry;
carry = (c[i] < a[i]) || (c[i] < b[i]);
If a[i] = b[i] = 0xffffffffffffffff and carry = 1, then c[i] == a[i] and c[i] == b[i], but a carry has nonetheless occurred. (Further aside: this perfectly illustrates the hazards of trusting randomized testing. The odds of a random test hitting this case are 680564733841876926926749214863536422912:1. If you could test one random add every cycle on every core of a fleet of 12-core Xeons, you would still need to have 3x10^20 computers in your cluster to have a 50% chance of finding this bug in one year.)
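To make the failure concrete, here is a minimal stand-alone reproducer (my example, not part of the benchmark) that hits the missed-carry case directly:

// carry_bug.c
#include <stdio.h>

int main(void) {
    unsigned long a = 0xffffffffffffffffUL;
    unsigned long b = 0xffffffffffffffffUL;
    int carry = 1;                    /* incoming carry */
    unsigned long c = a + b + carry;  /* wraps back to 0xffffffffffffffff */
    int computed = (c < a) || (c < b);
    printf("c = %#lx, computed carry = %d\n", c, computed);
    return 0;
}

This prints computed carry = 0, even though the correct carry out is 1.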
A few options for how to fix it:
carry = (c[i] < a[i]) || ((c[i] == a[i]) & carry);
(The second term works because c[i] == a[i] together with carry == 1 implies that b[i] + carry wrapped all the way around, i.e. b[i] was 0xffffffffffffffff, so a carry out occurred.)
or
partialresult = a[i] + b[i];
partialcarry = partialresult < a[i];
c[i] = partialresult + carry;
carry = (carry & !c[i]) | partialcarry;
(Adding carry to partialresult can wrap to zero only when partialresult is all ones and carry is 1; but c[i] can also be zero when a[i] = b[i] = 0 with no incoming carry, so the zero test must be qualified with carry.)
As for why addN1() with -DCOUNT_CARRIES runs the fastest: honestly, I have no idea. I would need to spend a lot of time thinking about it, time that I don't have. Performance analysis of modern processors is exceedingly complex, and without a simulator they can be perplexing.
The compiler has decided to re-read a[i] and b[i] from memory for the compares, presumably because it is trying to avoid an aliasing hazard between them and c[i]. Since an optimal multi-precision add is entirely load-bound, this limits your throughput to 50% of peak. Either put a[i] and b[i] into temporaries or add the restrict keyword to avoid this hazard.
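For example, a sketch of the temporaries-plus-restrict fix (addN1_restrict is a hypothetical name; restrict asserts that the three arrays never overlap, and the corrected carry check from above is used):

void addN1_restrict(const unsigned long *restrict a, const unsigned long *restrict b,
                    unsigned long *restrict c, int n) {
    int i;
    int carry = 0;
    for (i = 0; i < n; i++) {
        unsigned long ai = a[i];  /* load each operand exactly once */
        unsigned long bi = b[i];
        unsigned long sum = ai + bi + carry;
        c[i] = sum;
        carry = (sum < ai) || ((sum == ai) & carry);
    }
}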
You can make your addN4() faster by unrolling, as you don't need to do the setb/shr dance between adds that don't span a loop boundary.
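For instance, a 4-way unrolled variant might look like the sketch below (addN4_unrolled is a hypothetical name, and n is assumed to be a positive multiple of 4). The carry is reloaded with shr and spilled with setb only once per four limbs; the adc chain in between keeps CF live.

void addN4_unrolled(unsigned long *a, unsigned long *b, unsigned long *c, int n) {
    unsigned long t0, t1, t2, t3;
    unsigned long index = 0;
    unsigned char carry = 0;
    asm volatile (
        "1:\n\t"
        "shr %[carry]\n\t"                     /* reload CF from the saved carry */
        "movq   (%[a],%[index],8), %[t0]\n\t"
        "adcq   (%[b],%[index],8), %[t0]\n\t"
        "movq  8(%[a],%[index],8), %[t1]\n\t"
        "adcq  8(%[b],%[index],8), %[t1]\n\t"
        "movq 16(%[a],%[index],8), %[t2]\n\t"
        "adcq 16(%[b],%[index],8), %[t2]\n\t"
        "movq 24(%[a],%[index],8), %[t3]\n\t"
        "adcq 24(%[b],%[index],8), %[t3]\n\t"
        "setb %[carry]\n\t"                    /* spill CF once per block */
        "movq %[t0],   (%[c],%[index],8)\n\t"
        "movq %[t1],  8(%[c],%[index],8)\n\t"
        "movq %[t2], 16(%[c],%[index],8)\n\t"
        "movq %[t3], 24(%[c],%[index],8)\n\t"
        "add $4, %[index]\n\t"
        "sub $4, %[n]\n\t"
        "jnz 1b"
        : [n] "+r"(n), [index] "+r"(index), [carry] "+r"(carry),
          [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3)
        : [a] "r"(a), [b] "r"(b), [c] "r"(c)
        : "cc", "memory"
    );
}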