
SSE vector wrapper type performance compared to bare __m128

I found an interesting Gamasutra article about SIMD pitfalls, which states that it is not possible to reach the performance of the "pure" __m128 type with wrapper types. I was skeptical, so I downloaded the project files and put together a comparable test case.

It turned out (to my surprise) that the wrapper version is significantly slower. Since I don't want to argue from thin air, here are the test cases:

In the 1st case Vec4 is a simple alias of the __m128 type with some operators:

#include <xmmintrin.h>
#include <emmintrin.h>

using Vec4 = __m128;

inline __m128 VLoad(float f)
{
    return _mm_set_ps(f, f, f, f);
};

inline Vec4& operator+=(Vec4 &va, Vec4 vb)
{
    return (va = _mm_add_ps(va, vb));
};

inline Vec4& operator*=(Vec4 &va, Vec4 vb)
{
    return (va = _mm_mul_ps(va, vb));
};

inline Vec4 operator+(Vec4 va, Vec4 vb)
{
    return _mm_add_ps(va, vb);
};

inline Vec4 operator-(Vec4 va, Vec4 vb)
{
    return _mm_sub_ps(va, vb);
};

inline Vec4 operator*(Vec4 va, Vec4 vb)
{
    return _mm_mul_ps(va, vb);
};

In the 2nd case Vec4 is a lightweight wrapper around __m128. It is not a complete wrapper, just a short sketch that covers the issue. The operators wrap exactly the same intrinsics; the only difference is that they take Vec4 by const reference (since 16-byte alignment cannot be applied to by-value arguments):

#include <xmmintrin.h>
#include <emmintrin.h>

struct Vec4
{
    __m128 simd;

    inline Vec4() = default;
    inline Vec4(const Vec4&) = default;
    inline Vec4& operator=(const Vec4&) = default;

    inline Vec4(__m128 s)
        : simd(s)
    {}

    inline operator __m128() const
    {
        return simd;
    }

    inline operator __m128&()
    {
        return simd;
    }
};

inline __m128 VLoad(float f)
{
    return _mm_set_ps(f, f, f, f);
};

inline Vec4 VAdd(const Vec4 &va, const Vec4 &vb)
{
    return _mm_add_ps(va, vb);
    // return _mm_add_ps(va.simd, vb.simd); // doesn't make a difference
};

inline Vec4 VSub(const Vec4 &va, const Vec4 &vb)
{
    return _mm_sub_ps(va, vb);
    // return _mm_sub_ps(va.simd, vb.simd); // doesn't make a difference
};

inline Vec4 VMul(const Vec4 &va, const Vec4 &vb)
{
    return _mm_mul_ps(va, vb);
    // return _mm_mul_ps(va.simd, vb.simd); // doesn't make a difference
};
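
For reference, a minimal sketch of the operator overloads for the wrapper version (these are my own additions, mirroring the intrinsics above, so that the TestEQ kernel below compiles unchanged with either Vec4):

inline Vec4& operator+=(Vec4 &va, const Vec4 &vb)
{
    // modify the wrapped __m128 in place, like the alias version does
    va.simd = _mm_add_ps(va, vb);
    return va;
}

inline Vec4& operator*=(Vec4 &va, const Vec4 &vb)
{
    va.simd = _mm_mul_ps(va, vb);
    return va;
}

inline Vec4 operator+(const Vec4 &va, const Vec4 &vb)
{
    return _mm_add_ps(va, vb);
}

inline Vec4 operator-(const Vec4 &va, const Vec4 &vb)
{
    return _mm_sub_ps(va, vb);
}

inline Vec4 operator*(const Vec4 &va, const Vec4 &vb)
{
    return _mm_mul_ps(va, vb);
}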

And here is the test kernel, which produces different performance depending on which version of Vec4 is used:

#include <xmmintrin.h>
#include <emmintrin.h>

struct EQSTATE
{
    // Filter #1 (Low band)

    Vec4  lf;       // Frequency
    Vec4  f1p0;     // Poles ...
    Vec4  f1p1;     
    Vec4  f1p2;
    Vec4  f1p3;

    // Filter #2 (High band)

    Vec4  hf;       // Frequency
    Vec4  f2p0;     // Poles ...
    Vec4  f2p1;
    Vec4  f2p2;
    Vec4  f2p3;

    // Sample history buffer

    Vec4  sdm1;     // Sample data minus 1
    Vec4  sdm2;     //                   2
    Vec4  sdm3;     //                   3

    // Gain Controls

    Vec4  lg;       // low  gain
    Vec4  mg;       // mid  gain
    Vec4  hg;       // high gain

};  

static float vsaf = (1.0f / 4294967295.0f);   // Very small amount (Denormal Fix)
static Vec4 vsa = VLoad(vsaf);

Vec4 TestEQ(EQSTATE* es, Vec4& sample)
{
    // Locals

    Vec4  l,m,h;      // Low / Mid / High - Sample Values

    // Filter #1 (lowpass)

    es->f1p0  += (es->lf * (sample   - es->f1p0)) + vsa;
    //es->f1p0 = VAdd(es->f1p0, VAdd(VMul(es->lf, VSub(sample, es->f1p0)), vsa));

    es->f1p1  += (es->lf * (es->f1p0 - es->f1p1));
    //es->f1p1 = VAdd(es->f1p1, VMul(es->lf, VSub(es->f1p0, es->f1p1)));

    es->f1p2  += (es->lf * (es->f1p1 - es->f1p2));
    //es->f1p2 = VAdd(es->f1p2, VMul(es->lf, VSub(es->f1p1, es->f1p2)));

    es->f1p3  += (es->lf * (es->f1p2 - es->f1p3));
    //es->f1p3 = VAdd(es->f1p3, VMul(es->lf, VSub(es->f1p2, es->f1p3)));

    l          = es->f1p3;

    // Filter #2 (highpass)

    es->f2p0  += (es->hf * (sample   - es->f2p0)) + vsa;
    //es->f2p0 = VAdd(es->f2p0, VAdd(VMul(es->hf, VSub(sample, es->f2p0)), vsa));

    es->f2p1  += (es->hf * (es->f2p0 - es->f2p1));
    //es->f2p1 = VAdd(es->f2p1, VMul(es->hf, VSub(es->f2p0, es->f2p1)));

    es->f2p2  += (es->hf * (es->f2p1 - es->f2p2));
    //es->f2p2 = VAdd(es->f2p2, VMul(es->hf, VSub(es->f2p1, es->f2p2)));

    es->f2p3  += (es->hf * (es->f2p2 - es->f2p3));
    //es->f2p3 = VAdd(es->f2p3, VMul(es->hf, VSub(es->f2p2, es->f2p3)));

    h          = es->sdm3 - es->f2p3;
    //h = VSub(es->sdm3, es->f2p3);

    // Calculate midrange (signal - (low + high))

    m          = es->sdm3 - (h + l);
    //m = VSub(es->sdm3, VAdd(h, l));

    // Scale, Combine and store

    l         *= es->lg;
    m         *= es->mg;
    h         *= es->hg;

    //l = VMul(l, es->lg);
    //m = VMul(m, es->mg);
    //h = VMul(h, es->hg);

    // Shuffle history buffer 

    es->sdm3   = es->sdm2;
    es->sdm2   = es->sdm1;
    es->sdm1   = sample;                

    // Return result

    return(l + m + h);
    //return(VAdd(l, VAdd(m, h)));
}

// make these globals to enforce the function call
static Vec4 sample[1024], result[1024];
static EQSTATE es;

#include <chrono>
#include <iostream>

int main()
{
    auto t0 = std::chrono::high_resolution_clock::now();

    for (int ii=0; ii<1024; ii++)
    {
        result[ii] = TestEQ(&es, sample[ii]);
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    auto t = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
    std::cout << "timing: " << t << '\n';

    std::cin.get();

    return 0;
}
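
As an aside on methodology, a single pass over 1024 samples measured with high_resolution_clock is close to the timer resolution and quite noisy. A sketch of a slightly more robust harness (my own variation on the main above, simply repeating the pass and reporting the average):

int main()
{
    constexpr int repeats = 10000;

    auto t0 = std::chrono::high_resolution_clock::now();

    // repeat the whole pass so the measured interval is well above timer resolution
    for (int r = 0; r < repeats; r++)
    {
        for (int ii = 0; ii < 1024; ii++)
        {
            result[ii] = TestEQ(&es, sample[ii]);
        }
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    auto t = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
    std::cout << "timing per pass: " << (t / repeats) << " ns\n";

    std::cin.get();

    return 0;
}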

Link to working code

  • https://godbolt.org/g/fZ8X0N

MSVC 2015 generated assembly for the 1st version:

;   COMDAT ?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z
_TEXT   SEGMENT
?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z PROC      ; TestEQ, COMDAT
; _es$dead$ = ecx
; _sample$ = edx
    vmovaps xmm0, XMMWORD PTR [edx]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80
    vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4
    vmovaps xmm0, XMMWORD PTR [edx]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vsubps  xmm2, xmm1, xmm0
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0
    vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0
    vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0
    vmovaps xmm0, XMMWORD PTR [edx]
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0
    vaddps  xmm0, xmm4, xmm2
    vsubps  xmm0, xmm1, xmm0
    vmulps  xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224
    vmulps  xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240
    vaddps  xmm1, xmm1, xmm0
    vmulps  xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208
    vaddps  xmm0, xmm1, xmm0
    ret 0
?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z ENDP      ; TestEQ

MSVC 2015 generated assembly for the 2nd version:

?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z PROC ; TestEQ, COMDAT
; ___$ReturnUdt$ = ecx
; _es$dead$ = edx
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -8                 ; fffffff8H
    add esp, 4
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov eax, DWORD PTR _sample$[ebx]
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A
    vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192
    mov DWORD PTR [esp+4], ebp
    vmovaps xmm0, XMMWORD PTR [eax]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4
    vmovaps xmm0, XMMWORD PTR [eax]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vsubps  xmm2, xmm1, xmm0
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0
    vaddps  xmm0, xmm2, xmm4
    vsubps  xmm0, xmm1, xmm0
    vmulps  xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224
    vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176
    vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0
    vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160
    vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0
    vmovdqu xmm0, XMMWORD PTR [eax]
    vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0
    vmulps  xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208
    vaddps  xmm1, xmm0, xmm1
    vmulps  xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240
    vaddps  xmm0, xmm1, xmm0
    vmovaps XMMWORD PTR [ecx], xmm0
    mov eax, ecx
    pop ebp
    mov esp, ebx
    pop ebx
    ret 0
?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z ENDP ; TestEQ

The assembly produced for the 2nd version is significantly longer and slower. This is not strictly a Visual Studio issue, since Clang 3.8 produces similar performance results.


Clang 3.8 generated assembly for the 1st version:

"?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z": # @"\01?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z"
Lfunc_begin0:
Ltmp0:
# BB#0:                                 # %entry
    movl    8(%esp), %eax
    movl    4(%esp), %ecx
    vmovaps _vsa, %xmm0
    vmovaps (%ecx), %xmm1
    vmovaps 16(%ecx), %xmm2
    vmovaps (%eax), %xmm3
    vsubps  %xmm2, %xmm3, %xmm3
    vmulps  %xmm3, %xmm1, %xmm3
    vaddps  %xmm3, %xmm0, %xmm3
    vaddps  %xmm3, %xmm2, %xmm2
    vmovaps %xmm2, 16(%ecx)
    vmovaps 32(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 32(%ecx)
    vmovaps 48(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 48(%ecx)
    vmovaps 64(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm1
    vaddps  %xmm1, %xmm3, %xmm1
    vmovaps %xmm1, 64(%ecx)
    vmovaps 80(%ecx), %xmm2
    vmovaps 96(%ecx), %xmm3
    vmovaps (%eax), %xmm4
    vsubps  %xmm3, %xmm4, %xmm4
    vmulps  %xmm4, %xmm2, %xmm4
    vaddps  %xmm4, %xmm0, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 96(%ecx)
    vmovaps 112(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 112(%ecx)
    vmovaps 128(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 128(%ecx)
    vmovaps 144(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 144(%ecx)
    vmovaps 192(%ecx), %xmm2
    vsubps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm1, %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  208(%ecx), %xmm1, %xmm1
    vmulps  224(%ecx), %xmm2, %xmm2
    vmulps  240(%ecx), %xmm0, %xmm0
    vmovaps 176(%ecx), %xmm3
    vmovaps %xmm3, 192(%ecx)
    vmovaps 160(%ecx), %xmm3
    vmovaps %xmm3, 176(%ecx)
    vmovaps (%eax), %xmm3
    vmovaps %xmm3, 160(%ecx)
    vaddps  %xmm2, %xmm0, %xmm0
    vaddps  %xmm0, %xmm1, %xmm0
    retl
Lfunc_end0:

Clang 3.8 generated assembly for the 2nd version:

"?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z": # @"\01?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z"
Lfunc_begin0:
Ltmp0:
# BB#0:                                 # %entry
    movl    12(%esp), %ecx
    movl    8(%esp), %edx
    vmovaps (%edx), %xmm0
    vmovaps 16(%edx), %xmm1
    vmovaps (%ecx), %xmm2
    vsubps  %xmm1, %xmm2, %xmm2
    vmulps  %xmm0, %xmm2, %xmm2
    vaddps  _vsa, %xmm2, %xmm2
    vaddps  %xmm2, %xmm1, %xmm1
    vmovaps %xmm1, 16(%edx)
    vmovaps 32(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm1
    vaddps  %xmm1, %xmm2, %xmm1
    vmovaps %xmm1, 32(%edx)
    vmovaps 48(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm1
    vaddps  %xmm1, %xmm2, %xmm1
    vmovaps %xmm1, 48(%edx)
    vmovaps 64(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm0
    vaddps  %xmm0, %xmm2, %xmm0
    vmovaps %xmm0, 64(%edx)
    vmovaps 80(%edx), %xmm1
    vmovaps 96(%edx), %xmm2
    vmovaps (%ecx), %xmm3
    vsubps  %xmm2, %xmm3, %xmm3
    vmulps  %xmm1, %xmm3, %xmm3
    vaddps  _vsa, %xmm3, %xmm3
    vaddps  %xmm3, %xmm2, %xmm2
    vmovaps %xmm2, 96(%edx)
    vmovaps 112(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 112(%edx)
    vmovaps 128(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 128(%edx)
    vmovaps 144(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm1
    vaddps  %xmm1, %xmm3, %xmm1
    vmovaps %xmm1, 144(%edx)
    vmovaps 192(%edx), %xmm2
    vsubps  %xmm1, %xmm2, %xmm1
    vaddps  %xmm1, %xmm0, %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  208(%edx), %xmm0, %xmm0
    vmulps  224(%edx), %xmm2, %xmm2
    movl    4(%esp), %eax
    vmulps  240(%edx), %xmm1, %xmm1
    vmovaps 176(%edx), %xmm3
    vmovaps %xmm3, 192(%edx)
    vmovaps 160(%edx), %xmm3
    vmovaps %xmm3, 176(%edx)
    vmovaps (%ecx), %xmm3
    vmovaps %xmm3, 160(%edx)
    vaddps  %xmm2, %xmm0, %xmm0
    vaddps  %xmm0, %xmm1, %xmm0
    vmovaps %xmm0, (%eax)
    retl
Lfunc_end0:

Although the number of instructions is the same, the 1st version is still about 50% faster.


I tried to identify the cause of the issue, without success. There are suspicious things, like those ugly vmovdqu instructions in the 2nd MSVC assembly. Construction, the copy assignment operator and pass-by-reference can also unnecessarily move the data from SSE registers back to memory, but all my attempts to solve or precisely identify the issue were unsuccessful.

I really don't believe that such a simple wrapper cannot reach the same performance as the bare __m128; whatever causes the overhead, it should be possible to eliminate.

So what is going on there?

asked Apr 25 '16 by plasmacel


1 Answer

As it turned out, the problem is not with the user-defined struct Vec4. It is deeply related to the x86 calling conventions.

The default x86 calling convention in Visual C++ is __cdecl, which

Pushes parameters on the stack, in reverse order (right to left)

Now this is a problem, since Vec4 should be kept and passed in an XMM register. But let's see what is actually happening.


1st case

In the first case Vec4 is a simple type alias of __m128.

using Vec4 = __m128;
/* ... */
Vec4 TestEQ(EQSTATE* es, Vec4 &sample) { ... }

The generated function header of TestEQ in assembly is

?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z PROC      ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...

Nice.


2nd case

In the second case Vec4 is not an alias of __m128; it is a user-defined type now.

Here I investigate compilation for both the x86 and x64 platforms.

x86 (32-bit compilation)

Since __cdecl (the default calling convention in x86) doesn't allow aligned values to be passed to functions by value, a by-value Vec4 parameter would emit Error C2719: 'sample': formal parameter with requested alignment of 16 won't be aligned.
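A hypothetical declaration (my own illustration) of what would trigger the error in a 32-bit build:

// would emit error C2719 under x86 __cdecl: the stack-passed argument cannot
// be given the 16-byte alignment that the wrapped __m128 requires
Vec4 TestEQ(EQSTATE* es, Vec4 sample);

So instead we pass it by const reference: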

struct Vec4{ __m128 simd; /* ... */ };
/* ... */
Vec4 TestEQ(EQSTATE* es, const Vec4 &sample) { ... }

which generates the function header for TestEQ as

?TestEQ@@YA?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC        ; TestEQ, COMDAT
; ___$ReturnUdt$ = ecx
; _es$ = edx
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -8                 ; fffffff8H
    add esp, 4
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov eax, DWORD PTR _sample$[ebx]
    ...

This is not as simple as in the 1st case. The arguments are moved through the stack. There are also some additional mov instructions between the first few SSE instructions, which are not listed here. Overall, these extra instructions are enough to hurt the performance somewhat.

x64 (64-bit compilation)

Windows on x64 uses a different calling convention as part of the x64 Application Binary Interface (ABI).

This convention tries to keep data in registers whenever possible; in particular, floating-point data is kept in XMM registers.

From MSDN Overview of x64 Calling Conventions:

The x64 Application Binary Interface (ABI) is a 4 register fast-call calling convention, with stack-backing for those registers. There is a strict one-to-one correspondence between arguments in a function, and the registers for those arguments. Any argument that doesn’t fit in 8 bytes, or is not 1, 2, 4, or 8 bytes, must be passed by reference. (...) All floating point operations are done using the 16 XMM registers. The arguments are passed in registers RCX, RDX, R8, and R9. If the arguments are float/double, they are passed in XMM0L, XMM1L, XMM2L, and XMM3L. 16 byte arguments are passed by reference.

From the Wikipedia page on x86-64 calling conventions:

The Microsoft x64 calling convention is followed on Windows and pre-boot UEFI (for long mode on x86-64). It uses registers RCX, RDX, R8, R9 for the first four integer or pointer arguments (in that order), and XMM0, XMM1, XMM2, XMM3 are used for floating point arguments. Additional arguments are pushed onto the stack (right to left). Integer return values (similar to x86) are returned in RAX if 64 bits or less. Floating point return values are returned in XMM0.

So the second case in x64 mode generates the function header for TestEQ as

?TestEQ@@YQ?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC        ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...

This is exactly the same as the 1st case!


Solution

For x86 mode the presented behavior clearly should be fixed.

The simplest solution is to inline the function. Although inline is just a hint which the compiler can completely ignore, you can tell the compiler to always inline the function. However, sometimes this is not desirable because of the function size or other reasons.
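
A minimal sketch of forcing the inlining with MSVC (GCC and Clang offer __attribute__((always_inline)) for the same purpose); the names are the ones from the example above:

// no call is emitted, so no calling-convention overhead at the call site
__forceinline Vec4 TestEQ(EQSTATE* es, const Vec4 &sample) { ... }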

Fortunately Microsoft introduced the __vectorcall convention in Visual Studio 2013 and above (available in both x86 and x64 mode). It is very similar to the default Windows x64 calling convention, but it makes more registers available for passing arguments.

Let's rewrite the 2nd case with __vectorcall:

Vec4 __vectorcall TestEQ(EQSTATE* es, const Vec4 &sample) { ... }

Now the generated assembly function header for TestEQ is

?TestEQ@@YQ?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC        ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...

which is finally the same as the 1st case and the 2nd case in x64.

As Peter Cordes pointed out, to take full advantage of __vectorcall the Vec4 argument should be passed by value instead of by const reference. To do this the passed type must meet some requirements: for example, it must be trivially copy constructible (no user-defined copy constructors) and must not contain a union. More info in the comments below and here.
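
A sketch of the by-value variant under those requirements (my own illustration, reusing the names from above):

// with __vectorcall and a register-eligible Vec4, the argument can travel
// directly in an XMM register instead of going through memory
Vec4 __vectorcall TestEQ(EQSTATE* es, Vec4 sample) { ... }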

Final words

It looks like MSVC automatically applies the __vectorcall convention under the hood as an optimization when it detects a bare __m128 argument. Otherwise it uses the default calling convention __cdecl (this behavior can be changed via compiler options, e.g. /Gv makes __vectorcall the default).

People told me in the comments that they didn't see much difference between the GCC and Clang generated assembly of the two cases. This is because with the -O2 optimization flag these compilers simply inline the TestEQ function into the test loop body (see). It is also possible that they are simply more clever than MSVC and perform better optimization of the function call.

answered Oct 27 '22 by plasmacel