Is there a working example that shows the benefits of ILP (Instruction-Level Parallelism) on x86_64?

As is known, the CPU is pipelined and works most efficiently when consecutive instructions are independent of each other - this is known as ILP (Instruction-Level Parallelism): http://en.wikipedia.org/wiki/Instruction-level_parallelism

But is there a working example, at least a synthetic one, that shows the benefits of ILP on an x86_64 CPU (with the same number of cmp/jne instructions in both cases)?

I wrote the following example - it adds up all the elements of an array - but it does not show any advantage from ILP: http://ideone.com/fork/poWfsm

  • Sequential:
        for(i = 0; i < arr_size; i += 8) {
            result += arr[i+0] + arr[i+1] + 
                    arr[i+2] + arr[i+3] + 
                    arr[i+4] + arr[i+5] +
                    arr[i+6] + arr[i+7];
        }
  • ILP:
        register unsigned int v0, v1, v2, v3;
        v0 = v1 = v2 = v3 = 0;
        for(i = 0; i < arr_size; i += 8) {              
            v0 += arr[i+0] + arr[i+1];
            v1 += arr[i+2] + arr[i+3];
            v2 += arr[i+4] + arr[i+5];
            v3 += arr[i+6] + arr[i+7];
        }
        result = v0+v1+v2+v3;

Result:

seq: 0.100000 sec, res: 1000000000, ipl: 0.110000 sec, faster 0.909091 X, res: 1000000000

seq: 0.100000 sec, res: 1000000000, ipl: 0.100000 sec, faster 1.000000 X, res: 1000000000

seq: 0.100000 sec, res: 1000000000, ipl: 0.110000 sec, faster 0.909091 X, res: 1000000000

seq: 0.100000 sec, res: 1000000000, ipl: 0.100000 sec, faster 1.000000 X, res: 1000000000

seq: 0.110000 sec, res: 1000000000, ipl: 0.110000 sec, faster 1.000000 X, res: 1000000000

seq: 0.100000 sec, res: 1000000000, ipl: 0.110000 sec, faster 0.909091 X, res: 1000000000

seq: 0.100000 sec, res: 1000000000, ipl: 0.110000 sec, faster 0.909091 X, res: 1000000000

seq: 0.110000 sec, res: 1000000000, ipl: 0.100000 sec, faster 1.100000 X, res: 1000000000

seq: 0.110000 sec, res: 1000000000, ipl: 0.100000 sec, faster 1.100000 X, res: 1000000000

seq: 0.110000 sec, res: 1000000000, ipl: 0.120000 sec, faster 0.916667 X, res: 1000000000

faster AVG: 0.975303

The ILP version is even slightly slower than the sequential one.

C-code: http://ideone.com/fork/poWfsm

#include <time.h>
#include <stdio.h>
#include <stdlib.h>

int main() {
    // create and init array
    const size_t arr_size = 100000000;
    unsigned int *arr = (unsigned int*) malloc(arr_size * sizeof(unsigned int));
    size_t i, k;
    for(i = 0; i < arr_size; ++i)
        arr[i] = 10;

    unsigned int result = 0;
    clock_t start, end;
    const int c_iterations = 10;    // iterations of experiment
    float faster_avg = 0;
    // -----------------------------------------------------------------


    for(k = 0; k < c_iterations; ++k) {
        result = 0; 

        // Sequential
        start = clock();

        for(i = 0; i < arr_size; i += 8) {
            result += arr[i+0] + arr[i+1] + 
                    arr[i+2] + arr[i+3] + 
                    arr[i+4] + arr[i+5] +
                    arr[i+6] + arr[i+7];
        }

        end = clock();
        const float c_time_seq = (float)(end - start)/CLOCKS_PER_SEC;   
        printf("seq: %f sec, res: %u, ", c_time_seq, result);
        // -----------------------------------------------------------------

        result = 0;

        // IPL-optimization
        start = clock();

        register unsigned int v0, v1, v2, v3;
        v0 = v1 = v2 = v3 = 0;

        for(i = 0; i < arr_size; i += 8) {

            v0 += arr[i+0] + arr[i+1];
            v1 += arr[i+2] + arr[i+3];
            v2 += arr[i+4] + arr[i+5];
            v3 += arr[i+6] + arr[i+7];


        }
        result = v0+v1+v2+v3;


        end = clock();
        const float c_time_ipl = (float)(end - start)/CLOCKS_PER_SEC;
        const float c_faster = c_time_seq/c_time_ipl;

        printf("ipl: %f sec, faster %f X, res: %u \n", c_time_ipl, c_faster, result);           
        faster_avg += c_faster;
    }

    faster_avg = faster_avg/c_iterations;
    printf("faster AVG: %f \n", faster_avg);

    return 0;
}

UPDATE:

  • Sequential (disassembly, MS Visual Studio 2013):
    for (i = 0; i < arr_size; i += 8) {
        result += arr[i + 0] + arr[i + 1] +
            arr[i + 2] + arr[i + 3] +
            arr[i + 4] + arr[i + 5] +
            arr[i + 6] + arr[i + 7];
    }

000000013F131080  mov         ecx,dword ptr [rdx-18h]  
000000013F131083  lea         rdx,[rdx+20h]  
000000013F131087  add         ecx,dword ptr [rdx-34h]  
000000013F13108A  add         ecx,dword ptr [rdx-30h]  
000000013F13108D  add         ecx,dword ptr [rdx-2Ch]  
000000013F131090  add         ecx,dword ptr [rdx-28h]  
000000013F131093  add         ecx,dword ptr [rdx-24h]  
000000013F131096  add         ecx,dword ptr [rdx-1Ch]  
000000013F131099  add         ecx,dword ptr [rdx-20h]  
000000013F13109C  add         edi,ecx  
000000013F13109E  dec         r8  
000000013F1310A1  jne         main+80h (013F131080h)  
  • ILP (disassembly, MS Visual Studio 2013):
    for (i = 0; i < arr_size; i += 8) {
        v0 += arr[i + 0] + arr[i + 1];
000000013F1310F0  mov         ecx,dword ptr [rdx-0Ch]  
        v1 += arr[i + 2] + arr[i + 3];
        v2 += arr[i + 4] + arr[i + 5];
000000013F1310F3  mov         eax,dword ptr [rdx+8]  
000000013F1310F6  lea         rdx,[rdx+20h]  
000000013F1310FA  add         ecx,dword ptr [rdx-28h]  
000000013F1310FD  add         eax,dword ptr [rdx-1Ch]  
000000013F131100  add         ebp,ecx  
000000013F131102  mov         ecx,dword ptr [rdx-24h]  
000000013F131105  add         ebx,eax  
000000013F131107  add         ecx,dword ptr [rdx-20h]  
        v3 += arr[i + 6] + arr[i + 7];
000000013F13110A  mov         eax,dword ptr [rdx-10h]  
        v3 += arr[i + 6] + arr[i + 7];
000000013F13110D  add         eax,dword ptr [rdx-14h]  
000000013F131110  add         esi,ecx  
000000013F131112  add         edi,eax  
000000013F131114  dec         r8  
000000013F131117  jne         main+0F0h (013F1310F0h) 
    }
    result = v0 + v1 + v2 + v3;

Compiler command line:

/GS /GL /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /Ob2 /sdl /Fd"x64\Release\vc120.pdb" /fp:precise /D "_MBCS" /errorReport:prompt /WX- /Zc:forScope /Gd /Oi /MT /Fa"x64\Release\" /EHsc /nologo /Fo"x64\Release\" /Ot /Fp"x64\Release\IPL_reduce_test.pch" 

Additional Notes to the answer:

A simple example showing the benefit of ILP, comparing an unrolled loop against an unrolled loop + ILP, for an array of 50,000,000 double elements: http://ideone.com/LgTP6b

faster AVG: 1.152778
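
Since the linked code is not reproduced here, below is a minimal sketch of what that comparison looks like, assuming it mirrors the integer benchmark above (the function names sum_unrolled/sum_unrolled_ilp are illustrative, not taken from the original source):

/* Sketch of the double-precision comparison: one unrolled loop with a single
   accumulator vs. the same unrolling split across four accumulators. */
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

#define ARR_SIZE 50000000

/* Unrolled loop: a single accumulator, every addition depends on the previous one. */
static double sum_unrolled(const double *arr) {
    double result = 0.0;
    size_t i;
    for (i = 0; i < ARR_SIZE; i += 8) {
        result += arr[i+0] + arr[i+1] + arr[i+2] + arr[i+3] +
                  arr[i+4] + arr[i+5] + arr[i+6] + arr[i+7];
    }
    return result;
}

/* Unrolled loop + ILP: four independent accumulators, so up to four
   floating-point additions can be in flight at once. */
static double sum_unrolled_ilp(const double *arr) {
    double v0 = 0.0, v1 = 0.0, v2 = 0.0, v3 = 0.0;
    size_t i;
    for (i = 0; i < ARR_SIZE; i += 8) {
        v0 += arr[i+0] + arr[i+1];
        v1 += arr[i+2] + arr[i+3];
        v2 += arr[i+4] + arr[i+5];
        v3 += arr[i+6] + arr[i+7];
    }
    return v0 + v1 + v2 + v3;
}

int main(void) {
    double *arr = (double*) malloc(ARR_SIZE * sizeof(double));
    size_t i;
    for (i = 0; i < ARR_SIZE; ++i)
        arr[i] = 1.0;

    clock_t start = clock();
    double r0 = sum_unrolled(arr);
    const double t0 = (double)(clock() - start) / CLOCKS_PER_SEC;

    start = clock();
    double r1 = sum_unrolled_ilp(arr);
    const double t1 = (double)(clock() - start) / CLOCKS_PER_SEC;

    printf("unrolled: %f sec (res %f), unrolled+ILP: %f sec (res %f), faster %f X\n",
           t0, r0, t1, r1, t0 / t1);
    free(arr);
    return 0;
}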

  • False-Sequential, which can be optimized by the CPU pipeline (disassembly, MS Visual Studio 2013) - to add 8 elements, each iteration accumulates into the temporary register xmm0, which is then added to the result register xmm6, i.e. register renaming can be used:
result += arr[i + 0] + arr[i + 1] + arr[i + 2] + arr[i + 3] +
    arr[i + 4] + arr[i + 5] + arr[i + 6] + arr[i + 7];
000000013FBA1090  movsd       xmm0,mmword ptr [rcx-10h]  
000000013FBA1095  add         rcx,40h  
000000013FBA1099  addsd       xmm0,mmword ptr [rcx-48h]  
000000013FBA109E  addsd       xmm0,mmword ptr [rcx-40h]  
000000013FBA10A3  addsd       xmm0,mmword ptr [rcx-38h]  
000000013FBA10A8  addsd       xmm0,mmword ptr [rcx-30h]  
000000013FBA10AD  addsd       xmm0,mmword ptr [rcx-28h]  
000000013FBA10B2  addsd       xmm0,mmword ptr [rcx-20h]  
000000013FBA10B7  addsd       xmm0,mmword ptr [rcx-18h]  
000000013FBA10BC  addsd       xmm6,xmm0  
000000013FBA10C0  dec         rdx  
000000013FBA10C3  jne         main+90h (013FBA1090h) 
  • True-Sequential, which cannot be optimized by the CPU pipeline (disassembly, MS Visual Studio 2013) - each of the 8 additions in an iteration goes directly into the result register xmm6, i.e. register renaming cannot be used:
            result += arr[i + 0];
000000013FFC1090  addsd       xmm6,mmword ptr [rcx-10h]  
000000013FFC1095  add         rcx,40h  
            result += arr[i + 1];
000000013FFC1099  addsd       xmm6,mmword ptr [rcx-48h]  
            result += arr[i + 2];
000000013FFC109E  addsd       xmm6,mmword ptr [rcx-40h]  
            result += arr[i + 3];
000000013FFC10A3  addsd       xmm6,mmword ptr [rcx-38h]  
            result += arr[i + 4];
000000013FFC10A8  addsd       xmm6,mmword ptr [rcx-30h]  
            result += arr[i + 5];
000000013FFC10AD  addsd       xmm6,mmword ptr [rcx-28h]  
            result += arr[i + 6];
000000013FFC10B2  addsd       xmm6,mmword ptr [rcx-20h]  
            result += arr[i + 7];
000000013FFC10B7  addsd       xmm6,mmword ptr [rcx-18h]  
000000013FFC10BC  dec         rdx  
000000013FFC10BF  jne         main+90h (013FFC1090h) 
Asked Jan 02 '15 by Alex


2 Answers

On most Intel processors, a floating-point add takes 3 cycles. But the processor can sustain one add per cycle as long as the adds are independent.

We can easily demonstrate ILP by comparing a loop whose floating-point adds all sit on the critical path with one whose adds are independent.


Environment:

  • GCC 4.8.2: -O2
  • Sandy Bridge Xeon

Make sure that the compiler does not perform unsafe floating-point optimizations (for example, with GCC do not enable -ffast-math or -funsafe-math-optimizations, which would allow the adds to be reassociated).

#include <iostream>
using namespace std;

#include <time.h>

const int iterations = 1000000000;

double sequential(){
    double a = 2.3;
    double result = 0;

    for (int c = 0; c < iterations; c += 4){
        //  Every add depends on the previous add. No ILP is possible.
        result += a;
        result += a;
        result += a;
        result += a;
    }

    return result;
}
double optimized(){
    double a = 2.3;
    double result0 = 0;
    double result1 = 0;
    double result2 = 0;
    double result3 = 0;

    for (int c = 0; c < iterations; c += 4){
        //  4 independent adds. Up to 4 adds can be run in parallel.
        result0 += a;
        result1 += a;
        result2 += a;
        result3 += a;
    }

    return result0 + result1 + result2 + result3;
}

int main(){

    clock_t start0 = clock();
    double sum0 = sequential();
    clock_t end0 = clock();
    cout << "sum = " << sum0 << endl;
    cout << "sequential time: " << (double)(end0 - start0) / CLOCKS_PER_SEC << endl;

    clock_t start1 = clock();
    double sum1 = optimized();
    clock_t end1 = clock();
    cout << "sum = " << sum1 << endl;
    cout << "optimized time:  " << (double)(end1 - start1) / CLOCKS_PER_SEC << endl;

}

Output:

sum = 2.3e+09
sequential time: 0.948138
sum = 2.3e+09
optimized time:  0.317293

Notice how the difference is almost exactly 3x (0.948 / 0.317 ≈ 3.0). That's because of the 3-cycle latency and 1-per-cycle throughput of the floating-point add.

The sequential version has very little ILP because all the floating-point adds are on the critical path: each add has to wait until the previous add is done. The unrolled version has 4 separate dependency chains with up to 4 independent adds, all of which can run in parallel. Only 3 are required to saturate the processor core.
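
As a rough check of this explanation (a sketch, not part of the original answer), one more variant with only 2 accumulators can be dropped into the program above; with a 3-cycle latency and a throughput of one add per cycle it should land at roughly 2x the sequential version, while going beyond 3-4 accumulators should give no further gain:

double optimized2(){
    //  Illustrative extra variant; reuses the iterations constant from the code above.
    double a = 2.3;
    double result0 = 0;
    double result1 = 0;

    for (int c = 0; c < iterations; c += 4){
        //  2 independent dependency chains with 2 dependent adds each:
        //  the critical path per iteration is 2 adds instead of 4.
        result0 += a;
        result1 += a;
        result0 += a;
        result1 += a;
    }

    return result0 + result1;
}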

Answered by Mysticial


The difference can also be made visible with integer code, for example:

global cmp1
proc_frame cmp1
[endprolog]
    mov ecx, -10000000
    mov r8d, 1
    xor eax, eax
_cmp1_loop:
    add eax, r8d
    add eax, r8d
    add eax, r8d
    add eax, r8d
    add ecx, 1
    jnz _cmp1_loop
    ret
endproc_frame

global cmp2
proc_frame cmp2
[endprolog]
    mov ecx, -10000000
    mov r8d, 1
    xor eax, eax
    xor edx, edx
    xor r9d, r9d
    xor r10d, r10d
_cmp2_loop:
    add eax, r8d
    add edx, r8d
    add r9d, r8d
    add r10d, r8d
    add ecx, 1
    jnz _cmp2_loop
    add r9d, r10d
    add eax, edx
    add eax, r9d
    ret
endproc_frame

Results on my 4770K are about 35.9 million TSC ticks for the first one vs 11.9 million for the second one (minimum time over 1k runs).

In the first one, the dependency chain on eax is the slowest thing, at 4 cycles per iteration. Nothing else matters: the dependency chain on ecx is shorter, and there is plenty of throughput left to hide it and the control flow. By the way, 35.9 million TSC ticks works out to about 40 million cycles, since the TSC ticks at the base clock rate of 3.5 GHz while the max turbo is 3.9 GHz: 3.9/3.5 * 35.9 ≈ 40.

The version of the second one that I mentioned in the comments (4 accumulators, but loading the constant 1 from [rsp] instead of keeping it in a register) takes 17.9 million ticks, which makes it 2 cycles per iteration. That matches the throughput of the memory loads, which on Haswell is 2 per cycle: 4 loads, so 2 cycles. The loop overhead can still be hidden.

The second one as posted above takes 1.3333 cycles per iteration. The first four adds can go to ports 0, 1, 5 and 6; the fused add/jnz pair can go only to port 6. With the fused pair on p6, that leaves 3 ports for the remaining 4 µops, hence 1.3333 cycles.
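
For completeness, here is a minimal driver sketch (not part of the original answer) for timing the two routines the way described above: read the TSC around each call and keep the minimum over 1000 runs. It assumes cmp1/cmp2 are assembled separately and linked in; __rdtsc comes from <x86intrin.h> on GCC/Clang (use <intrin.h> with MSVC).

/* Illustrative sketch: time cmp1/cmp2 with RDTSC, minimum over 1000 runs. */
#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>

unsigned int cmp1(void);   /* defined in the assembly above */
unsigned int cmp2(void);

static uint64_t min_ticks(unsigned int (*fn)(void)) {
    uint64_t best = UINT64_MAX;
    for (int run = 0; run < 1000; ++run) {
        uint64_t t0 = __rdtsc();
        volatile unsigned int sink = fn();   /* keep the call from being optimized away */
        uint64_t t1 = __rdtsc();
        (void)sink;
        if (t1 - t0 < best)
            best = t1 - t0;
    }
    return best;
}

int main(void) {
    printf("cmp1: %llu TSC ticks\n", (unsigned long long) min_ticks(cmp1));
    printf("cmp2: %llu TSC ticks\n", (unsigned long long) min_ticks(cmp2));
    return 0;
}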

Answered by harold