I've been struggling for a while with the performance of the network coding in an application I'm developing (see Optimzing SSE-code, Improving performance of network coding-encoding and OpenCL distribution). Now I'm quite close to achieving acceptable performance. This is the current state of the innermost loop (which is where >99% of the execution time is being spent):
while (elementIterations-- > 0)
{
    unsigned int firstMessageField = *(currentMessageGaloisFieldsArray++);
    unsigned int secondMessageField = *(currentMessageGaloisFieldsArray++);
    __m128i valuesToMultiply = _mm_set_epi32(0, secondMessageField, 0, firstMessageField);
    __m128i multipliedHalves = _mm_mul_epu32(valuesToMultiply, fragmentCoefficentVector);
}
Do you have any suggestions on how to further optimize this? I understand that it's hard to do without more context but any help is appreciated!
Now that I'm awake, here's my answer:
In your original code, the bottleneck is almost certainly _mm_set_epi32. This single intrinsic gets compiled into this mess in your assembly:
633415EC xor edi,edi
633415EE movd xmm3,edi
...
633415F6 xor ebx,ebx
633415F8 movd xmm4,edi
633415FC movd xmm5,ebx
63341600 movd xmm0,esi
...
6334160B punpckldq xmm5,xmm3
6334160F punpckldq xmm0,xmm4
...
63341618 punpckldq xmm0,xmm5
What is this? 9 instructions?!?!?! Pure overhead...
Another place that seems odd is that the compiler didn't merge the adds and loads:
movdqa xmm3,xmmword ptr [ecx-10h]
paddq xmm0,xmm3
should have been merged into:
paddq xmm0,xmmword ptr [ecx-10h]
I'm not sure if the compiler went brain-dead, or if it actually had a legitimate reason to do that... Anyways, it's a small thing compared to the _mm_set_epi32.
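At the intrinsic level, the pattern behind those two instructions is just an add of an accumulator with a freshly loaded vector; whether the load gets folded into paddq's memory operand is entirely up to the code generator. A minimal sketch of that pattern (accumulate_lane and its names are mine, not from your code):

#include <emmintrin.h>

// The load in the second operand is exactly the kind of thing a compiler
// *can* fold into "paddq xmm, xmmword ptr [mem]" -- or not, as seen above.
static inline __m128i accumulate_lane(__m128i accumulator, const __m128i *source)
{
    return _mm_add_epi64(accumulator, _mm_load_si128(source));
}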
Disclaimer: The code I will present from here on violates strict-aliasing. But non-standard compliant methods are often needed to achieve maximum performance.
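If the casts bother you, routing the 64-bit accesses through memcpy is a standards-compliant alternative that modern compilers typically lower to the same load/add/store. A minimal sketch (add_u64_at is a hypothetical helper, not used in the solutions below):

#include <emmintrin.h>
#include <string.h>

// Add "value" to the index-th 64-bit slot of the fragment array without
// dereferencing it through an __int64* (sidesteps the strict-aliasing issue).
static inline void add_u64_at(__m128i *fragments, int index, unsigned __int64 value)
{
    unsigned __int64 tmp;
    memcpy(&tmp, (char*)fragments + index * sizeof(tmp), sizeof(tmp));
    tmp += value;
    memcpy((char*)fragments + index * sizeof(tmp), &tmp, sizeof(tmp));
}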
Solution 1: No Vectorization
This solution assumes allZero is really all zeros.
The loop is actually simpler than it looks. Since there isn't a lot of arithmetic, it might be better to just not vectorize:
// Test Data
unsigned __int32 fragmentCoefficentVector = 1000000000;
__declspec(align(16)) int currentMessageGaloisFieldsArray_[8] = {10,11,12,13,14,15,16,17};
int *currentMessageGaloisFieldsArray = currentMessageGaloisFieldsArray_;
__m128i currentUnModdedGaloisFieldFragments_[8];
__m128i *currentUnModdedGaloisFieldFragments = currentUnModdedGaloisFieldFragments_;
memset(currentUnModdedGaloisFieldFragments,0,8 * sizeof(__m128i));
int elementIterations = 4;

// The Loop
while (elementIterations > 0){
    elementIterations -= 1;

    // Default 32 x 32 -> 64-bit multiply code
    unsigned __int64 r0 = currentMessageGaloisFieldsArray[0] * (unsigned __int64)fragmentCoefficentVector;
    unsigned __int64 r1 = currentMessageGaloisFieldsArray[1] * (unsigned __int64)fragmentCoefficentVector;

    // Use this for Visual Studio. VS doesn't know how to optimize 32 x 32 -> 64-bit multiply
    // unsigned __int64 r0 = __emulu(currentMessageGaloisFieldsArray[0], fragmentCoefficentVector);
    // unsigned __int64 r1 = __emulu(currentMessageGaloisFieldsArray[1], fragmentCoefficentVector);

    ((__int64*)currentUnModdedGaloisFieldFragments)[0] += r0 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[1] += r0 >> 32;
    ((__int64*)currentUnModdedGaloisFieldFragments)[2] += r1 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[3] += r1 >> 32;

    currentMessageGaloisFieldsArray += 2;
    currentUnModdedGaloisFieldFragments += 2;
}
Which compiles to this on x64:
$LL4@main:
mov ecx, DWORD PTR [rbx]
mov rax, r11
add r9, 32 ; 00000020H
add rbx, 8
mul rcx
mov ecx, DWORD PTR [rbx-4]
mov r8, rax
mov rax, r11
mul rcx
mov ecx, r8d
shr r8, 32 ; 00000020H
add QWORD PTR [r9-48], rcx
add QWORD PTR [r9-40], r8
mov ecx, eax
shr rax, 32 ; 00000020H
add QWORD PTR [r9-24], rax
add QWORD PTR [r9-32], rcx
dec r10
jne SHORT $LL4@main
and this on x86:
$LL4@main:
mov eax, DWORD PTR [esi]
mul DWORD PTR _fragmentCoefficentVector$[esp+224]
mov ebx, eax
mov eax, DWORD PTR [esi+4]
mov DWORD PTR _r0$31463[esp+228], edx
mul DWORD PTR _fragmentCoefficentVector$[esp+224]
add DWORD PTR [ecx-16], ebx
mov ebx, DWORD PTR _r0$31463[esp+228]
adc DWORD PTR [ecx-12], edi
add DWORD PTR [ecx-8], ebx
adc DWORD PTR [ecx-4], edi
add DWORD PTR [ecx], eax
adc DWORD PTR [ecx+4], edi
add DWORD PTR [ecx+8], edx
adc DWORD PTR [ecx+12], edi
add esi, 8
add ecx, 32 ; 00000020H
dec DWORD PTR tv150[esp+224]
jne SHORT $LL4@main
It's possible that both of these are already faster than your original (SSE) code... On x64, unrolling it will make it even better.
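For example, an unroll-by-2 of the scalar loop (4 input elements and 4 __m128i accumulators per pass) might look like this; it's only a sketch, reusing the declarations from the test data above and assuming elementIterations is even:

// The Loop, unrolled by 2 (sketch; same test data as above)
while (elementIterations > 1){
    elementIterations -= 2;

    unsigned __int64 r0 = currentMessageGaloisFieldsArray[0] * (unsigned __int64)fragmentCoefficentVector;
    unsigned __int64 r1 = currentMessageGaloisFieldsArray[1] * (unsigned __int64)fragmentCoefficentVector;
    unsigned __int64 r2 = currentMessageGaloisFieldsArray[2] * (unsigned __int64)fragmentCoefficentVector;
    unsigned __int64 r3 = currentMessageGaloisFieldsArray[3] * (unsigned __int64)fragmentCoefficentVector;

    ((__int64*)currentUnModdedGaloisFieldFragments)[0] += r0 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[1] += r0 >> 32;
    ((__int64*)currentUnModdedGaloisFieldFragments)[2] += r1 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[3] += r1 >> 32;
    ((__int64*)currentUnModdedGaloisFieldFragments)[4] += r2 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[5] += r2 >> 32;
    ((__int64*)currentUnModdedGaloisFieldFragments)[6] += r3 & 0x00000000ffffffff;
    ((__int64*)currentUnModdedGaloisFieldFragments)[7] += r3 >> 32;

    currentMessageGaloisFieldsArray += 4;
    currentUnModdedGaloisFieldFragments += 4;
}

The point is simply to give the out-of-order core more independent multiplies per unit of loop overhead.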
Solution 2: SSE2 Integer Shuffle
This solution unrolls the loop to 2 iterations:
// Test Data
__m128i allZero = _mm_setzero_si128();
__m128i fragmentCoefficentVector = _mm_set1_epi32(1000000000);
__declspec(align(16)) int currentMessageGaloisFieldsArray_[8] = {10,11,12,13,14,15,16,17};
int *currentMessageGaloisFieldsArray = currentMessageGaloisFieldsArray_;
__m128i currentUnModdedGaloisFieldFragments_[8];
__m128i *currentUnModdedGaloisFieldFragments = currentUnModdedGaloisFieldFragments_;
memset(currentUnModdedGaloisFieldFragments,0,8 * sizeof(__m128i));
int elementIterations = 4;

// The Loop
while(elementIterations > 1){
    elementIterations -= 2;

    // Load 4 elements. If needed use unaligned load instead.
    // messageField = {a, b, c, d}
    __m128i messageField = _mm_load_si128((__m128i*)currentMessageGaloisFieldsArray);

    // Get into this form:
    // values0 = {a, x, b, x}
    // values1 = {c, x, d, x}
    __m128i values0 = _mm_shuffle_epi32(messageField,216);
    __m128i values1 = _mm_shuffle_epi32(messageField,114);

    // Multiply by "fragmentCoefficentVector"
    values0 = _mm_mul_epu32(values0, fragmentCoefficentVector);
    values1 = _mm_mul_epu32(values1, fragmentCoefficentVector);

    __m128i halves0 = _mm_unpacklo_epi32(values0, allZero);
    __m128i halves1 = _mm_unpackhi_epi32(values0, allZero);
    __m128i halves2 = _mm_unpacklo_epi32(values1, allZero);
    __m128i halves3 = _mm_unpackhi_epi32(values1, allZero);

    halves0 = _mm_add_epi64(halves0, currentUnModdedGaloisFieldFragments[0]);
    halves1 = _mm_add_epi64(halves1, currentUnModdedGaloisFieldFragments[1]);
    halves2 = _mm_add_epi64(halves2, currentUnModdedGaloisFieldFragments[2]);
    halves3 = _mm_add_epi64(halves3, currentUnModdedGaloisFieldFragments[3]);

    currentUnModdedGaloisFieldFragments[0] = halves0;
    currentUnModdedGaloisFieldFragments[1] = halves1;
    currentUnModdedGaloisFieldFragments[2] = halves2;
    currentUnModdedGaloisFieldFragments[3] = halves3;

    currentMessageGaloisFieldsArray += 4;
    currentUnModdedGaloisFieldFragments += 4;
}
which gets compiled to this on x86 (the x64 version isn't too different):
$LL4@main:
movdqa xmm1, XMMWORD PTR [esi]
pshufd xmm0, xmm1, 216 ; 000000d8H
pmuludq xmm0, xmm3
movdqa xmm4, xmm0
punpckhdq xmm0, xmm2
paddq xmm0, XMMWORD PTR [eax-16]
pshufd xmm1, xmm1, 114 ; 00000072H
movdqa XMMWORD PTR [eax-16], xmm0
pmuludq xmm1, xmm3
movdqa xmm0, xmm1
punpckldq xmm4, xmm2
paddq xmm4, XMMWORD PTR [eax-32]
punpckldq xmm0, xmm2
paddq xmm0, XMMWORD PTR [eax]
punpckhdq xmm1, xmm2
paddq xmm1, XMMWORD PTR [eax+16]
movdqa XMMWORD PTR [eax-32], xmm4
movdqa XMMWORD PTR [eax], xmm0
movdqa XMMWORD PTR [eax+16], xmm1
add esi, 16 ; 00000010H
add eax, 64 ; 00000040H
dec ecx
jne SHORT $LL4@main
Only slightly longer than the non-vectorized version for two iterations. This uses very few registers, so you can further unroll this even on x86.
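For example, a further unroll (two 128-bit loads, 8 input elements and 8 accumulators per pass) might look like this sketch; it reuses the declarations above and assumes elementIterations is a multiple of 4:

// The Loop, further unrolled (sketch; same test data as above)
while (elementIterations > 3){
    elementIterations -= 4;

    __m128i messageField0 = _mm_load_si128((__m128i*)currentMessageGaloisFieldsArray);
    __m128i messageField1 = _mm_load_si128((__m128i*)currentMessageGaloisFieldsArray + 1);

    // Same shuffle trick as above, applied to both loads.
    __m128i values0 = _mm_mul_epu32(_mm_shuffle_epi32(messageField0, 216), fragmentCoefficentVector);
    __m128i values1 = _mm_mul_epu32(_mm_shuffle_epi32(messageField0, 114), fragmentCoefficentVector);
    __m128i values2 = _mm_mul_epu32(_mm_shuffle_epi32(messageField1, 216), fragmentCoefficentVector);
    __m128i values3 = _mm_mul_epu32(_mm_shuffle_epi32(messageField1, 114), fragmentCoefficentVector);

    // Unpack each 64-bit product pair and accumulate, as before.
    currentUnModdedGaloisFieldFragments[0] = _mm_add_epi64(currentUnModdedGaloisFieldFragments[0], _mm_unpacklo_epi32(values0, allZero));
    currentUnModdedGaloisFieldFragments[1] = _mm_add_epi64(currentUnModdedGaloisFieldFragments[1], _mm_unpackhi_epi32(values0, allZero));
    currentUnModdedGaloisFieldFragments[2] = _mm_add_epi64(currentUnModdedGaloisFieldFragments[2], _mm_unpacklo_epi32(values1, allZero));
    currentUnModdedGaloisFieldFragments[3] = _mm_add_epi64(currentUnModdedGaloisFieldFragments[3], _mm_unpackhi_epi32(values1, allZero));
    currentUnModdedGaloisFieldFragments[4] = _mm_add_epi64(currentUnModdedGaloisFieldFragments[4], _mm_unpacklo_epi32(values2, allZero));
    currentUnModdedGaloisFieldFragments[5] = _mm_add_epi64(currentUnModdedGaloisFieldFragments[5], _mm_unpackhi_epi32(values2, allZero));
    currentUnModdedGaloisFieldFragments[6] = _mm_add_epi64(currentUnModdedGaloisFieldFragments[6], _mm_unpacklo_epi32(values3, allZero));
    currentUnModdedGaloisFieldFragments[7] = _mm_add_epi64(currentUnModdedGaloisFieldFragments[7], _mm_unpackhi_epi32(values3, allZero));

    currentMessageGaloisFieldsArray += 8;
    currentUnModdedGaloisFieldFragments += 8;
}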
Explanations:
_mm_set_epi32 (which gets compiled into about 9 instructions in your original code) can be replaced with a single _mm_shuffle_epi32.

I suggest you unroll your loop by a factor of 2 so that you can load 4 messageField values using one _mm_load_XXX, and then unpack these four values into two vector pairs and process them as per the current loop. That way you won't have a lot of messy code being generated by the compiler for _mm_set_epi32, and all your loads and stores will be 128-bit SSE loads/stores. This will also give the compiler more opportunity to schedule instructions optimally within the loop.
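If you want the smallest possible change to the original loop (no unrolling at all), the same idea applies to a single pair of values: one 64-bit load plus one shuffle gets them into the {a, x, b, x} layout that pmuludq needs. A minimal sketch, dropped into the original loop's variable names:

while (elementIterations-- > 0)
{
    // One 64-bit load grabs both message fields at once: pair = {a, b, 0, 0}
    __m128i pair = _mm_loadl_epi64((__m128i*)currentMessageGaloisFieldsArray);
    currentMessageGaloisFieldsArray += 2;

    // Shuffle to {a, a, b, b}; pmuludq only reads elements 0 and 2 (a and b),
    // so this replaces the multi-instruction _mm_set_epi32 with a single pshufd.
    __m128i valuesToMultiply = _mm_shuffle_epi32(pair, _MM_SHUFFLE(1, 1, 0, 0));
    __m128i multipliedHalves = _mm_mul_epu32(valuesToMultiply, fragmentCoefficentVector);

    // ... unpack and accumulate multipliedHalves as in the original loop ...
}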