
Is shufps slower than memory access?

The title may sound like nonsense, but let me explain. I was studying a program the other day when I encountered the following assembly code:

movaps  xmm3, xmmword ptr [rbp-30h]
lea     rdx, [rdi+1320h]
movaps  xmm5, xmm3
movaps  xmm6, xmm3
movaps  xmm0, xmm3
movss   dword ptr [rdx], xmm3
shufps  xmm5, xmm3, 55h
shufps  xmm6, xmm3, 0AAh
shufps  xmm0, xmm3, 0FFh
movaps  xmm4, xmm3
movss   dword ptr [rdx+4], xmm5
movss   dword ptr [rdx+8], xmm6
movss   dword ptr [rdx+0Ch], xmm0
mulss   xmm4, xmm3

It seems that it mostly just copies four floats from [rbp-30h] to [rdx]. The shufps instructions are used just to select one of the four floats in xmm3 (e.g. shufps xmm5, xmm3, 55h selects the second float and places it in xmm5, since 55h is 01 01 01 01 in binary and each two-bit field picks source element 1).

This makes me wonder whether the compiler did this because shufps is actually faster than going through memory (something like movss xmm0, dword ptr [rbp-30h] followed by movss dword ptr [rdx], xmm0).
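
Spelled out, the memory-access alternative I have in mind would look roughly like this (my own sketch, not code from the program; the offsets assume the four floats sit contiguously starting at [rbp-30h]):

movss   xmm0, dword ptr [rbp-30h]
movss   dword ptr [rdx], xmm0
movss   xmm0, dword ptr [rbp-2Ch]
movss   dword ptr [rdx+4], xmm0
movss   xmm0, dword ptr [rbp-28h]
movss   dword ptr [rdx+8], xmm0
movss   xmm0, dword ptr [rbp-24h]
movss   dword ptr [rdx+0Ch], xmm0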

So I wrote some tests to compare these two approaches and found that shufps is always slower than the multiple memory accesses. Now I'm thinking that maybe the use of shufps has nothing to do with performance; it might just be there to obfuscate the code so decompilers cannot produce clean output easily (I tried with IDA Pro and the result was indeed overly complicated).

While I'll probably never use shufps explicitly (via _mm_shuffle_ps, for example) in any practical program anyway, since the compiler is most likely smarter than me, I still want to know why the compiler that built this program generated such code. It's neither faster nor smaller; it makes no sense.

Anyway I'll provide the tests I wrote below.

#include <Windows.h>
#include <xmmintrin.h>  // SSE intrinsics: __m128, _mm_shuffle_ps, _mm_store_ss
#include <iostream>

using namespace std;

__declspec(noinline) DWORD profile_routine(void (*routine)(void *), void *arg, int iterations = 1)
{
    DWORD startTime = GetTickCount();
    while (iterations--)
    {
        routine(arg);
    }
    DWORD timeElapsed = GetTickCount() - startTime;
    return timeElapsed;
}


struct Struct
{
    float x, y, z, w;
};

__declspec(noinline) Struct shuffle1(float *arr)
{
    float x = arr[3];
    float y = arr[2];
    float z = arr[0];
    float w = arr[1];

    return {x, y, z, w};
}


#define SS0     (0x00)
#define SS1     (0x55)
#define SS2     (0xAA)
#define SS3     (0xFF)
__declspec(noinline) Struct shuffle2(float *arr)
{
    Struct r;
    __m128 packed = *reinterpret_cast<__m128 *>(arr);

    __m128 x = _mm_shuffle_ps(packed, packed, SS3);
    __m128 y = _mm_shuffle_ps(packed, packed, SS2);
    __m128 z = _mm_shuffle_ps(packed, packed, SS0);
    __m128 w = _mm_shuffle_ps(packed, packed, SS1);

    _mm_store_ss(&r.x, x);
    _mm_store_ss(&r.y, y);
    _mm_store_ss(&r.z, z);
    _mm_store_ss(&r.w, w);

    return r;
}



void profile_shuffle_r1(void *arg)
{
    float *arr = static_cast<float *>(arg);
    Struct q = shuffle1(arr);
    arr[0] += q.w;
    arr[1] += q.z;
    arr[2] += q.y;
    arr[3] += q.x;
}
void profile_shuffle_r2(void *arg)
{
    float *arr = static_cast<float *>(arg);
    Struct q = shuffle2(arr);
    arr[0] += q.w;
    arr[1] += q.z;
    arr[2] += q.y;
    arr[3] += q.x;
}

int main(int argc, char **argv)
{
    int n = argc + 3;
    float arr1[4], arr2[4];
    for (int i = 0; i < 4; i++)
    {
        arr1[i] = static_cast<float>(n + i);
        arr2[i] = static_cast<float>(n + i);
    }

    int iterations = 20000000;
    DWORD time1 = profile_routine(profile_shuffle_r1, arr1, iterations);
    cout << "time1 = " << time1 << endl;
    DWORD time2 = profile_routine(profile_shuffle_r2, arr2, iterations);
    cout << "time2 = " << time2 << endl;

    return 0;
}

In the test above, I have two shuffle methods, shuffle1 and shuffle2, that do the same thing. Compiled with MSVC -O2, they produce the following code:

shuffle1:
 mov         eax,dword ptr [rdx+0Ch]  
 mov         dword ptr [rcx],eax  
 mov         eax,dword ptr [rdx+8]  
 mov         dword ptr [rcx+4],eax  
 mov         eax,dword ptr [rdx]  
 mov         dword ptr [rcx+8],eax  
 mov         eax,dword ptr [rdx+4]  
 mov         dword ptr [rcx+0Ch],eax  
 mov         rax,rcx  
 ret  
shuffle2:
 movaps      xmm2,xmmword ptr [rdx]  
 mov         rax,rcx  
 movaps      xmm0,xmm2  
 shufps      xmm0,xmm2,0FFh  
 movss       dword ptr [rcx],xmm0  
 movaps      xmm0,xmm2  
 shufps      xmm0,xmm2,0AAh  
 movss       dword ptr [rcx+4],xmm0  
 movss       dword ptr [rcx+8],xmm2  
 shufps      xmm2,xmm2,55h  
 movss       dword ptr [rcx+0Ch],xmm2  
 ret  

shuffle1 is always at least 30% faster than shuffle2 on my machine. I did notice that shuffle2 has two more instructions and that shuffle1 actually uses eax instead of xmm0, so I thought that if I added some junk arithmetic operations, the result might be different.

So I modified them as follows:

__declspec(noinline) Struct shuffle1(float *arr)
{
    float x0 = arr[3];
    float y0 = arr[2];
    float z0 = arr[0];
    float w0 = arr[1];

    float x = x0 + y0 + z0;
    float y = y0 + z0 + w0;
    float z = z0 + w0 + x0;
    float w = w0 + x0 + y0;

    return {x, y, z, w};
}


#define SS0     (0x00)
#define SS1     (0x55)
#define SS2     (0xAA)
#define SS3     (0xFF)
__declspec(noinline) Struct shuffle2(float *arr)
{
    Struct r;
    __m128 packed = *reinterpret_cast<__m128 *>(arr);

    __m128 x0 = _mm_shuffle_ps(packed, packed, SS3);
    __m128 y0 = _mm_shuffle_ps(packed, packed, SS2);
    __m128 z0 = _mm_shuffle_ps(packed, packed, SS0);
    __m128 w0 = _mm_shuffle_ps(packed, packed, SS1);

    __m128 yz = _mm_add_ss(y0, z0);
    __m128 x = _mm_add_ss(x0, yz);
    __m128 y = _mm_add_ss(w0, yz);

    __m128 wx = _mm_add_ss(w0, x0);
    __m128 z = _mm_add_ss(z0, wx);
    __m128 w = _mm_add_ss(y0, wx);

    _mm_store_ss(&r.x, x);
    _mm_store_ss(&r.y, y);
    _mm_store_ss(&r.z, z);
    _mm_store_ss(&r.w, w);

    return r;
}

Now the assembly looks a bit fairer, as both functions have the same number of instructions and both need to use xmm registers.

shuffle1:
 movss       xmm5,dword ptr [rdx+8]  
 mov         rax,rcx  
 movss       xmm3,dword ptr [rdx+0Ch]  
 movaps      xmm0,xmm5  
 movss       xmm2,dword ptr [rdx]  
 addss       xmm0,xmm3  
 movss       xmm4,dword ptr [rdx+4]  
 movaps      xmm1,xmm2  
 addss       xmm1,xmm5  
 addss       xmm0,xmm2  
 addss       xmm1,xmm4  
 movss       dword ptr [rcx],xmm0  
 movaps      xmm0,xmm4  
 addss       xmm0,xmm2  
 addss       xmm4,xmm3  
 movss       dword ptr [rcx+4],xmm1  
 addss       xmm0,xmm3  
 addss       xmm4,xmm5  
 movss       dword ptr [rcx+8],xmm0  
 movss       dword ptr [rcx+0Ch],xmm4  
 ret  
shuffle2:
 movaps      xmm4,xmmword ptr [rdx]  
 mov         rax,rcx  
 movaps      xmm3,xmm4  
 movaps      xmm5,xmm4  
 shufps      xmm5,xmm4,0AAh  
 movaps      xmm2,xmm4  
 shufps      xmm2,xmm4,0FFh  
 movaps      xmm0,xmm5  
 addss       xmm0,xmm3  
 shufps      xmm4,xmm4,55h  
 movaps      xmm1,xmm4  
 addss       xmm1,xmm2  
 addss       xmm2,xmm0  
 addss       xmm4,xmm0  
 addss       xmm3,xmm1  
 addss       xmm5,xmm1  
 movss       dword ptr [rcx],xmm2  
 movss       dword ptr [rcx+4],xmm4  
 movss       dword ptr [rcx+8],xmm3  
 movss       dword ptr [rcx+0Ch],xmm5  
 ret  

But it doesn't matter: shuffle1 is still 30% faster!

asked Feb 11 '17 by uNiverselEgacy


2 Answers

Without the broader context, it is hard to say for sure, but... when optimizing for newer processors, you have to consider the usage of the different execution ports. See Agner Fog's instruction tables here: http://www.agner.org/optimize/instruction_tables.pdf

In this case, while it may seem unlikely, there are a few possibilities that jump out at me if we're assuming that the assembly is, in fact, optimized.

  1. This could appear in a stretch of code where the out-of-order scheduler happens to have more of port 5 (the shuffle port on Haswell, for example) available than ports 2 and 3 (the load ports, to continue the Haswell example).
  2. Similar to #1, but the same effect might be observed with hyperthreading: this code may be intended not to steal load operations from the sibling hyperthread.
  3. Lastly, this is specific to this sort of optimization, and it's where I've used something similar myself. Say you have a branch that is nearly 100% predictable at run-time, but not at compile-time. Let's imagine, hypothetically, that right after the branch there is a read that is often a cache miss. You want that read to start as soon as possible. The out-of-order scheduler will read ahead and begin executing that load early if you aren't occupying the load ports. This could make the shufps instructions essentially "free" to execute. Here's that example:

      MOV ecx, [some computed, mostly constant at run-time global]
    loop_top:
      ADD rdi, 16
      ADD rbp, 16
      CALL shuffle
      SUB ecx, 1
      JNE loop_top

      MOV rax, [rdi]

      ; do a read that could be "predicted" properly
      MOV rbx, [rax]


Honestly though, it just looks like poorly written assembly or poorly generated machine code, so I wouldn't put much thought into it. The example I'm giving is pretty darned unlikely.

answered Sep 28 '22 by Nick Apperson


You don't show whether the later code actually uses the results of broadcasting each element to all 4 positions of a vector (e.g. 0x55 is _MM_SHUFFLE(1,1,1,1)). If you already need that for a ...ps instruction later, then you need those shuffles anyway, so there'd be no reason to also do scalar loads.
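
For example (my illustration, not code from the program in question), if later code consumes the broadcast results in packed math, the shuffles are needed regardless:

#include <xmmintrin.h>

// Hypothetical consumer: scale some other vector by the broadcast second element of arr.
__m128 scale_by_y(__m128 other, const float *arr)
{
    __m128 v  = _mm_load_ps(arr);
    __m128 y4 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));  // 0x55: arr[1] in all four lanes
    return _mm_mul_ps(other, y4);                               // packed use of the broadcast result
}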

If you don't, and the only visible side-effect is the stores to memory, then this is just a hilariously bad missed optimization, either by a human programmer using intrinsics or by a compiler, just like in your examples of MSVC output for your test functions.

Keep in mind that some compilers (like ICC and MSVC) don't really optimize intrinsics: if you write 3x _mm_shuffle_ps you get 3x shufps. So this poor decision could have been made by a human using intrinsics, not by a compiler.


Clang, on the other hand, does aggressively optimize shuffle intrinsics: it compiles both of your shuffle functions to one movaps load, one shufps (or pshufd), and one movups store. This is optimal for most CPUs, getting the work done in the fewest instructions and uops.

(gcc auto-vectorizes shuffle1 but not shuffle2. MSVC fails at everything, just using scalar for shuffle1)

(If you just need each scalar float at the bottom of an xmm register for ...ss instructions, you can use the shuffle that creates your store vector as one of them, because it has a different low element than the input. You'd movaps copy first, though, or use pshufd, to avoid destroying the reg with the original low element.)
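
A rough intrinsics sketch of that idea (the function and variable names here are mine, not from the question):

#include <xmmintrin.h>

// Store the reordered vector, then reuse it: its low element (arr[3]) differs from v's (arr[0]),
// so only two more shuffles are needed to get arr[1] and arr[2] into low positions for ...ss math.
void store_and_keep_scalars(float *dst, const float *arr)
{
    __m128 v = _mm_load_ps(arr);                                // low element = arr[0]
    __m128 s = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 2, 3));   // x,y,z,w store order; low = arr[3]
    _mm_storeu_ps(dst, s);

    __m128 e1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));  // arr[1] in the low element
    __m128 e2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));  // arr[2] in the low element
    // v and s already hold arr[0] and arr[3] in their low elements, so all four scalars are
    // available for addss/mulss etc. without any extra scalar loads.
    (void)e1; (void)e2;
}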

If tuning specifically for a CPU with slow movups stores (like Intel pre-Nehalem) and the result isn't known to be aligned, then you'd still use one shufps but store the result with movlps and movhps. This is what gcc does if you compile with -mtune=core2.

You apparently know your input vector is aligned, so it still makes a huge amount of sense to load it with movaps. K8 will split a movaps into two 8-byte load uops, but most other x86-64 CPUs can do 16-byte aligned loads as a single uop. (Pentium M / Core 1 were the last mainstream Intel CPUs to split 128-bit vector ops like that, and they didn't support 64-bit mode.)

vbroadcastss requires AVX, so without AVX if you want a dword from memory broadcast into an XMM register, you have to use a shuffle instruction that needs a port 5 ALU uop. (vbroadcastss xmm0, [rsi+4] decodes to a pure load uop on Intel CPUs, no ALU uop needed, so it has 2 per clock throughput instead of 1.)
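
A minimal illustration of the AVX form (my own example, assuming AVX is available; the function name is hypothetical):

#include <immintrin.h>

// Broadcast a float straight from memory. With AVX this compiles to vbroadcastss xmm, [mem],
// a pure load uop on Intel CPUs, so no port-5 shuffle uop is needed.
__m128 bcast_avx(const float *arr)
{
    return _mm_broadcast_ss(arr + 1);   // the second element, like the 0x55 shufps above
}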

Old CPUs like Merom and K8 have slow shuffle units that are only 64 bits wide, so shufps is pretty slow because it's a full 128-bit shuffle with granularity smaller than 64 bits. You might consider doing 2x movsd or movq loads to feed pshuflw, which is fast because it only shuffles the low 64 bits. But only if you're specifically tuning for old CPUs.
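
One possible shape for that old-CPU tuning (my own guess at the idea, not code from this answer; it stays in the integer domain, which is fine for a pure copy):

#include <emmintrin.h>

// Two 64-bit movq loads, one pshuflw to swap the pair of floats in the low half,
// then punpcklqdq to combine, avoiding any full-width 128-bit shuffle.
void copy_shuffled_sse2(float *dst, const float *arr)
{
    __m128i lo = _mm_loadl_epi64((const __m128i *)arr);        // movq: arr[0], arr[1]
    __m128i hi = _mm_loadl_epi64((const __m128i *)(arr + 2));  // movq: arr[2], arr[3]
    __m128i hi_swapped = _mm_shufflelo_epi16(hi, _MM_SHUFFLE(1, 0, 3, 2));  // pshuflw: arr[3], arr[2]
    __m128i out = _mm_unpacklo_epi64(hi_swapped, lo);          // arr[3], arr[2], arr[0], arr[1]
    _mm_storeu_si128((__m128i *)dst, out);
}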


 // for gcc, I used __attribute__((ms_abi)) to target the Windows x64 calling convention
Struct shuffle3(float *arr)
{
    Struct r;
    __m128 packed = _mm_load_ps(arr);

    __m128 xyzw = _mm_shuffle_ps(packed, packed, _MM_SHUFFLE(1,0,2,3));
    _mm_storeu_ps(&r.x, xyzw);
    return r;
}

shuffle1 and shuffle3 both compile to identical code with gcc and clang (on the Godbolt compiler explorer), because they auto-vectorize the scalar assignments. The only difference is using a movups load for shuffle1, because nothing guarantees 16-byte alignment there. (If we promised the compiler an aligned pointer for the pure C scalar version, then it would be exactly identical.)

# MSVC compiles shuffle3 like this as well

# from gcc9.1 -O3    (default baseline x86-64, tune=generic)
shuffle3(float*):
        movaps  xmm0, XMMWORD PTR [rdx]        # MSVC still uses movups even for _mm_load_ps
        mov     rax, rcx                       # return the retval pointer
        shufps  xmm0, xmm0, 75
        movups  XMMWORD PTR [rcx], xmm0        # store to the hidden retval pointer
        ret
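
By the way, the way to make that alignment promise for the scalar version with gcc/clang would be something like this (a hypothetical variant, not one of the functions above):

Struct shuffle1_aligned(float *arr)
{
    // Promise the compiler that arr is 16-byte aligned, so the auto-vectorized load can be movaps.
    float *a = static_cast<float *>(__builtin_assume_aligned(arr, 16));
    return { a[3], a[2], a[0], a[1] };
}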

With -mtune=core2, gcc still auto-vectorizes shuffle1. It uses split unaligned loads because we didn't promise the compiler aligned memory.

For shuffle3, it does use movaps but still splits _mm_storeu_ps into movlps + movhps. (This is one of the interesting effects that tuning options can have: they don't let the compiler use new instructions, just change the instruction selection among existing ones.)

# gcc9.1 -O3 -mtune=core2        # auto-vectorizing shuffle1
shuffle1(float*):
        movq    xmm0, QWORD PTR [rdx]
        mov     rax, rcx
        movhps  xmm0, QWORD PTR [rdx+8]
        shufps  xmm0, xmm0, 75
        movlps  QWORD PTR [rcx], xmm0          # store in 2 halves
        movhps  QWORD PTR [rcx+8], xmm0
        ret

MSVC doesn't have tuning options, and doesn't auto-vectorize shuffle1.

answered Sep 28 '22 by Peter Cordes