The title may seem nonsense but let me explain. I was studying a program the other day when I encountered the following assembly code:
movaps xmm3, xmmword ptr [rbp-30h]
lea rdx, [rdi+1320h]
movaps xmm5, xmm3
movaps xmm6, xmm3
movaps xmm0, xmm3
movss dword ptr [rdx], xmm3
shufps xmm5, xmm3, 55h
shufps xmm6, xmm3, 0AAh
shufps xmm0, xmm3, 0FFh
movaps xmm4, xmm3
movss dword ptr [rdx+4], xmm5
movss dword ptr [rdx+8], xmm6
movss dword ptr [rdx+0Ch], xmm0
mulss xmm4, xmm3
and it seems like mostly it just copies four floats from [rbp-30h] to [rdx]. Those shufps
s are used just to select one of four floats in xmm3
(e.g. shufps xmm5, xmm3, 55h
selects the second float and places it in xmm5
).
This makes me wonder if the compiler did so because shufps
is actually faster than memory access (something like movss xmm0, dword ptr [rbp-30h]
, movss dword ptr [rdx], xmm0
).
So I wrote some tests to compare these two approaches and found shufps
always slower than multiple memory accesses. Now I'm thinking maybe the use of shufps
has nothing to do with performance. It might just be there to obfuscate the code so decompilers cannot produce clean code easily (tried with IDA pro and it was indeed overly complicated).
While I'll probably never use shufps
explicitly anyway (by using _mm_shuffle_ps
for example) in any practical programs as the compiler is most likely smarter than me, I still want to know why the compiler that compiled the program generated such code. It's neither faster nor smaller. It makes no sense.
Anyway I'll provide the tests I wrote below.
#include <Windows.h>
#include <iostream>
using namespace std;
__declspec(noinline) DWORD profile_routine(void (*routine)(void *), void *arg, int iterations = 1)
{
DWORD startTime = GetTickCount();
while (iterations--)
{
routine(arg);
}
DWORD timeElapsed = GetTickCount() - startTime;
return timeElapsed;
}
struct Struct
{
float x, y, z, w;
};
__declspec(noinline) Struct shuffle1(float *arr)
{
float x = arr[3];
float y = arr[2];
float z = arr[0];
float w = arr[1];
return {x, y, z, w};
}
#define SS0 (0x00)
#define SS1 (0x55)
#define SS2 (0xAA)
#define SS3 (0xFF)
__declspec(noinline) Struct shuffle2(float *arr)
{
Struct r;
__m128 packed = *reinterpret_cast<__m128 *>(arr);
__m128 x = _mm_shuffle_ps(packed, packed, SS3);
__m128 y = _mm_shuffle_ps(packed, packed, SS2);
__m128 z = _mm_shuffle_ps(packed, packed, SS0);
__m128 w = _mm_shuffle_ps(packed, packed, SS1);
_mm_store_ss(&r.x, x);
_mm_store_ss(&r.y, y);
_mm_store_ss(&r.z, z);
_mm_store_ss(&r.w, w);
return r;
}
void profile_shuffle_r1(void *arg)
{
float *arr = static_cast<float *>(arg);
Struct q = shuffle1(arr);
arr[0] += q.w;
arr[1] += q.z;
arr[2] += q.y;
arr[3] += q.x;
}
void profile_shuffle_r2(void *arg)
{
float *arr = static_cast<float *>(arg);
Struct q = shuffle2(arr);
arr[0] += q.w;
arr[1] += q.z;
arr[2] += q.y;
arr[3] += q.x;
}
int main(int argc, char **argv)
{
int n = argc + 3;
float arr1[4], arr2[4];
for (int i = 0; i < 4; i++)
{
arr1[i] = static_cast<float>(n + i);
arr2[i] = static_cast<float>(n + i);
}
int iterations = 20000000;
DWORD time1 = profile_routine(profile_shuffle_r1, arr1, iterations);
cout << "time1 = " << time1 << endl;
DWORD time2 = profile_routine(profile_shuffle_r2, arr2, iterations);
cout << "time2 = " << time2 << endl;
return 0;
}
In the test above, I have two shuffle methods shuffle1
and shuffle2
that do the same thing. When compiled with MSVC -O2, it produces the following code:
shuffle1:
mov eax,dword ptr [rdx+0Ch]
mov dword ptr [rcx],eax
mov eax,dword ptr [rdx+8]
mov dword ptr [rcx+4],eax
mov eax,dword ptr [rdx]
mov dword ptr [rcx+8],eax
mov eax,dword ptr [rdx+4]
mov dword ptr [rcx+0Ch],eax
mov rax,rcx
ret
shuffle2:
movaps xmm2,xmmword ptr [rdx]
mov rax,rcx
movaps xmm0,xmm2
shufps xmm0,xmm2,0FFh
movss dword ptr [rcx],xmm0
movaps xmm0,xmm2
shufps xmm0,xmm2,0AAh
movss dword ptr [rcx+4],xmm0
movss dword ptr [rcx+8],xmm2
shufps xmm2,xmm2,55h
movss dword ptr [rcx+0Ch],xmm2
ret
shuffle1
is always at least 30% faster than shuffle2
on my machine. I did notice shuffle2
has two more instructions and shuffle1
actually uses eax
instead of xmm0
so I thought if I add some junk arithmetic operations, the result would be different.
So I modified them as the following:
__declspec(noinline) Struct shuffle1(float *arr)
{
float x0 = arr[3];
float y0 = arr[2];
float z0 = arr[0];
float w0 = arr[1];
float x = x0 + y0 + z0;
float y = y0 + z0 + w0;
float z = z0 + w0 + x0;
float w = w0 + x0 + y0;
return {x, y, z, w};
}
#define SS0 (0x00)
#define SS1 (0x55)
#define SS2 (0xAA)
#define SS3 (0xFF)
__declspec(noinline) Struct shuffle2(float *arr)
{
Struct r;
__m128 packed = *reinterpret_cast<__m128 *>(arr);
__m128 x0 = _mm_shuffle_ps(packed, packed, SS3);
__m128 y0 = _mm_shuffle_ps(packed, packed, SS2);
__m128 z0 = _mm_shuffle_ps(packed, packed, SS0);
__m128 w0 = _mm_shuffle_ps(packed, packed, SS1);
__m128 yz = _mm_add_ss(y0, z0);
__m128 x = _mm_add_ss(x0, yz);
__m128 y = _mm_add_ss(w0, yz);
__m128 wx = _mm_add_ss(w0, x0);
__m128 z = _mm_add_ss(z0, wx);
__m128 w = _mm_add_ss(y0, wx);
_mm_store_ss(&r.x, x);
_mm_store_ss(&r.y, y);
_mm_store_ss(&r.z, z);
_mm_store_ss(&r.w, w);
return r;
}
and now the assembly looks a bit more fair as they have the same number of instructions and both need to use xmm registers.
shuffle1:
movss xmm5,dword ptr [rdx+8]
mov rax,rcx
movss xmm3,dword ptr [rdx+0Ch]
movaps xmm0,xmm5
movss xmm2,dword ptr [rdx]
addss xmm0,xmm3
movss xmm4,dword ptr [rdx+4]
movaps xmm1,xmm2
addss xmm1,xmm5
addss xmm0,xmm2
addss xmm1,xmm4
movss dword ptr [rcx],xmm0
movaps xmm0,xmm4
addss xmm0,xmm2
addss xmm4,xmm3
movss dword ptr [rcx+4],xmm1
addss xmm0,xmm3
addss xmm4,xmm5
movss dword ptr [rcx+8],xmm0
movss dword ptr [rcx+0Ch],xmm4
ret
shuffle2:
movaps xmm4,xmmword ptr [rdx]
mov rax,rcx
movaps xmm3,xmm4
movaps xmm5,xmm4
shufps xmm5,xmm4,0AAh
movaps xmm2,xmm4
shufps xmm2,xmm4,0FFh
movaps xmm0,xmm5
addss xmm0,xmm3
shufps xmm4,xmm4,55h
movaps xmm1,xmm4
addss xmm1,xmm2
addss xmm2,xmm0
addss xmm4,xmm0
addss xmm3,xmm1
addss xmm5,xmm1
movss dword ptr [rcx],xmm2
movss dword ptr [rcx+4],xmm4
movss dword ptr [rcx+8],xmm3
movss dword ptr [rcx+0Ch],xmm5
ret
but it doesn't matter. shuffle1
is still 30% faster!
Without the broader context, it is hard to say for sure, but ... when optimizing for newer processors, you have to consider the usage of the different ports. See Agners here: http://www.agner.org/optimize/instruction_tables.pdf
In this case, while it may seem unlikely, there are a few possibilities that jump out at me if we're assuming that the assembly is, in fact, optimized.
Lastly, specific to this sort of optimization and where I've used something similar. Say you have a branch that is run-time near 100% predictable, but not during compile-time. Let's imagine, hypothetically that right after the branch there is a read that is often a cache miss. You want to read as soon as possible. The Out-Of-Order scheduler will read ahead and begin executing that read if it you don't use the read ports. This could make the shufps instructions essentially "free" to execute. Here's that example:
MOV ecx, [some computed, mostly constant at run-time global]
label loop:
ADD rdi, 16
ADD rbp, 16
CALL shuffle
SUB ecx, 1
JNE loop
MOV rax, [rdi]
;do a read that could be "predicted" properly
MOV rbx, [rax]
Honestly though, it just looks like poorly written assembly or poorly generated machine code, so I wouldn't put much thought into it. The example I'm giving is pretty darned unlikely.
You don't show if the later code actually uses the results of broadcasting each element to all 4 positions of a vector. (e.g. 0x55
is _MM_SHUFFLE(1,1,1,1)
). If you already need that for for a ...ps
instruction later, then you need those shuffles anyway, so there'd be no reason to also do scalar loads.
If you don't, and the only visible side-effect is the stores to memory, this is just a hilariously bad missed optimization by either a human programmer using intrinsics, and/or by a compiler. Just like in your examples of MSVC output for your test functions.
Keep in mind that some compilers (like ICC and MSVC) don't really optimize intrinsics, so if you write 3x _mm_shuffle_ps
you get 3x shufps
, so this poor decision could have been made by a human using intrinsics, not a compiler.
But Clang on the other hand does aggressively optimize shuffle intrinsics. clang
optimizes both of your shuffle
functions to one movaps
load, one shufps
(or pshufd
), and one movups
store. This is optimal for most CPUs, getting the work done in the fewest instructions and uops.
(gcc auto-vectorizes shuffle1
but not shuffle2
. MSVC fails at everything, just using scalar for shuffle1
)
(If you just need each scalar float at the bottom of an xmm register for ...ss
instructions, you can use the shuffle that creates your store vector as one of them, because it has a different low element than the input. You'd movaps
copy first, though, or use pshufd
, to avoid destroying the reg with the original low element.)
If tuning specifically for a CPU with slow movups
stores (like Intel pre-Nehalem) and the result isn't known to be aligned, then you'd still use one shufps
but store the result with movlps
and movhps
. This is what gcc does if you compile with -mtune=core2
.
You apparently know your input vector is aligned, so it still makes a huge amount of sense to load it with movaps
. K8 will split a movaps
into two 8-byte load uops, but most other x86-64 CPUs can do 16-byte aligned loads as a single uop. (Pentium M / Core 1 were the last mainstream Intel CPUs to split 128-bit vector ops like that, and they didn't support 64-bit mode.)
vbroadcastss
requires AVX, so without AVX if you want a dword from memory broadcast into an XMM register, you have to use a shuffle instruction that needs a port 5 ALU uop. (vbroadcastss xmm0, [rsi+4]
decodes to a pure load uop on Intel CPUs, no ALU uop needed, so it has 2 per clock throughput instead of 1.)
Old CPUs like Merom and K8 have slow shuffle units that are only 64 bits wide, so shufps
is pretty slow because it's a full 128-bit shuffle with granularity smaller than 64 bits. You might consider doing 2x movsd
or movq
loads to feed pshuflw
, which is fast because it only shuffles the low 64 bits. But only if you're specifically tuning for old CPUs.
// for gcc, I used __attribute__((ms_abi)) to target the Windows x64 calling convention
Struct shuffle3(float *arr)
{
Struct r;
__m128 packed = _mm_load_ps(arr);
__m128 xyzw = _mm_shuffle_ps(packed, packed, _MM_SHUFFLE(1,0,2,3));
_mm_storeu_ps(&r.x, xyzw);
return r;
}
shuffle1
and shuffle3
both compile to identical code with gcc and clang (on the Godbolt compiler explorer), because they auto-vectorize the scalar assignments. The only difference is using a movups
load for shuffle1
, because nothing guarantees 16-byte alignment there. (If we promised the compiler an aligned pointer for the pure C scalar version, then it would be exactly identical.)
# MSVC compiles shuffle3 like this as well
# from gcc9.1 -O3 (default baseline x86-64, tune=generic)
shuffle3(float*):
movaps xmm0, XMMWORD PTR [rdx] # MSVC still uses movups even for _mm_load_ps
mov rax, rcx # return the retval pointer
shufps xmm0, xmm0, 75
movups XMMWORD PTR [rcx], xmm0 # store to the hidden retval pointer
ret
With -mtune=core2
, gcc still auto-vectorizes shuffle1
. It uses split unaligned loads because we didn't promise the compiler aligned memory.
For shuffle3
, it does use movaps
but still splits _mm_storeu_ps
into movlps
+ movhps
. (This is one of the interesting effects that tuning options can have. They don't let the compiler use new instruction, just change the selection for existing ones.)
# gcc9.1 -O3 -mtune=core2 # auto-vectorizing shuffle1
shuffle1(float*):
movq xmm0, QWORD PTR [rdx]
mov rax, rcx
movhps xmm0, QWORD PTR [rdx+8]
shufps xmm0, xmm0, 75
movlps QWORD PTR [rcx], xmm0 # store in 2 halves
movhps QWORD PTR [rcx+8], xmm0
ret
MSVC doesn't have tuning options, and doesn't auto-vectorize shuffle1
.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With