Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

SSE inline assembly and possible g++ optimization bug

Let's start with the code. I have two structures, one for vectors, and other for matrices.

struct AVector
    {
    explicit AVector(float x=0.0f, float y=0.0f, float z=0.0f, float w=0.0f):
        x(x), y(y), z(z), w(w) {}
    AVector(const AVector& a):
        x(a.x), y(a.y), z(a.z), w(a.w) {}

    AVector& operator=(const AVector& a) {x=a.x; y=a.y; z=a.z; w=a.w; return *this;}

    float x, y, z, w;
    };

struct AMatrix
    {
    // Row-major
    explicit AMatrix(const AVector& a=AVector(), const AVector& b=AVector(), const AVector& c=AVector(), const AVector& d=AVector())
        {row[0]=a; row[1]=b; row[2]=c; row[3]=d;}
    AMatrix(const AMatrix& m) {row[0]=m.row[0]; row[1]=m.row[1]; row[2]=m.row[2]; row[3]=m.row[3];}

    AMatrix& operator=(const AMatrix& m) {row[0]=m.row[0]; row[1]=m.row[1]; row[2]=m.row[2]; row[3]=m.row[3]; return *this;}

    AVector row[4];
    };

Next, code performing calculations on those structures. Dot product using inlined ASM and SSE instructions:

inline AVector AVectorDot(const AVector& a, const AVector& b)
    {
    // XXX
    /*const double v=a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;

    return AVector(v, v, v, v);*/

    AVector c;

    asm volatile(
        "movups (%1), %%xmm0\n\t"
        "movups (%2), %%xmm1\n\t"
        "mulps %%xmm1, %%xmm0\n\t"          // xmm0 -> (a1+b1, , , )
        "movaps %%xmm0, %%xmm1\n\t"         // xmm1 = xmm0
        "shufps $0xB1, %%xmm1, %%xmm1\n\t"  // 0xB1 = 10110001
        "addps %%xmm1, %%xmm0\n\t"          // xmm1 -> (x, y, z, w)+(y, x, w, z)=(x+y, x+y, z+w, z+w)
        "movaps %%xmm0, %%xmm1\n\t"         // xmm1 = xmm0
        "shufps $0x0A, %%xmm1, %%xmm1\n\t"  // 0x0A = 00001010
        "addps %%xmm1, %%xmm0\n\t"          // xmm1 -> (x+y+z+w, , , )
        "movups %%xmm0, %0\n\t"
        : "=m"(c)
        : "r"(&a), "r"(&b)
        );

    return c;
    }

Matrix transposition:

inline AMatrix AMatrixTranspose(const AMatrix& m)
    {
    AMatrix c(
        AVector(m.row[0].x, m.row[1].x, m.row[2].x, m.row[3].x),
        AVector(m.row[0].y, m.row[1].y, m.row[2].y, m.row[3].y),
        AVector(m.row[0].z, m.row[1].z, m.row[2].z, m.row[3].z),
        AVector(m.row[0].w, m.row[1].w, m.row[2].w, m.row[3].w));

    // XXX
    /*printf("AMcrix c:\n    [%5.2f %5.2f %5.2f %5.2f]\n    [%5.2f %5.2f %5.2f %5.2f]\n    [%5.2f %5.2f %5.2f %5.2f]\n    [%5.2f %5.2f %5.2f %5.2f]\n",
        c.row[0].x, c.row[0].y, c.row[0].z, c.row[0].w,
        c.row[1].x, c.row[1].y, c.row[1].z, c.row[1].w,
        c.row[2].x, c.row[2].y, c.row[2].z, c.row[2].w,
        c.row[3].x, c.row[3].y, c.row[3].z, c.row[3].w);*/

    return c;
    }

Matrix-matrix multiplication - transpose first matrix, because when I have it stored as column major, and second one as row major, then I can perform multiplication using dot-products.

inline AMatrix AMatrixMultiply(const AMatrix& a, const AMatrix& b)
    {
    AMatrix c;

    const AMatrix at=AMatrixTranspose(a);

    // XXX
    /*printf("AMatrix at:\n    [%5.2f %5.2f %5.2f %5.2f]\n    [%5.2f %5.2f %5.2f %5.2f]\n    [%5.2f %5.2f %5.2f %5.2f]\n    [%5.2f %5.2f %5.2f %5.2f]\n",
        at.row[0].x, at.row[0].y, at.row[0].z, at.row[0].w,
        at.row[1].x, at.row[1].y, at.row[1].z, at.row[1].w,
        at.row[2].x, at.row[2].y, at.row[2].z, at.row[2].w,
        at.row[3].x, at.row[3].y, at.row[3].z, at.row[3].w);*/

    for(int i=0; i<4; ++i)
        {
        c.row[i].x=AVectorDot(at.row[0], b.row[i]).w;
        c.row[i].y=AVectorDot(at.row[1], b.row[i]).w;
        c.row[i].z=AVectorDot(at.row[2], b.row[i]).w;
        c.row[i].w=AVectorDot(at.row[3], b.row[i]).w;
        }

    return c;
    }

Now time for main (pun intended) part:

int main(int argc, char *argv[])
    {
    AMatrix a(
        AVector(0, 1, 0, 0),
        AVector(1, 0, 0, 0),
        AVector(0, 0, 0, 1),
        AVector(0, 0, 1, 0)
        );

    AMatrix b(
        AVector(1, 0, 0, 0),
        AVector(0, 2, 0, 0),
        AVector(0, 0, 3, 0),
        AVector(0, 0, 0, 4)
        );

    AMatrix c=AMatrixMultiply(a, b);

    printf("AMatrix c:\n    [%5.2f %5.2f %5.2f %5.2f]\n    [%5.2f %5.2f %5.2f %5.2f]\n    [%5.2f %5.2f %5.2f %5.2f]\n    [%5.2f %5.2f %5.2f %5.2f]\n",
        c.row[0].x, c.row[0].y, c.row[0].z, c.row[0].w,
        c.row[1].x, c.row[1].y, c.row[1].z, c.row[1].w,
        c.row[2].x, c.row[2].y, c.row[2].z, c.row[2].w,
        c.row[3].x, c.row[3].y, c.row[3].z, c.row[3].w);

    AVector v(1, 2, 3, 4);
    AVector w(1, 1, 1, 1);

    printf("Dot product: %f (1+2+3+4 = 10)\n", AVectorDot(v, w).w);

    return 0;
    }

In the above code I make two matrices, multiply them and print the resulting matrix. It works fine if I don't use any of the compiler optimizations (g++ main.cpp -O0 -msse). With optimizations enabled (g++ main.cpp -O1 -msse) resulting matrix is empty (all fields are zeroes). Uncommenting any block marked with XXX makes program write correct result.

It seems to me that GCC optimizes-out matrix at from AMatrixMultiply function, because it wrongly assumes it's not used in AVectorDot, which is written using SSE inlines.

Last few lines check if dot-product function really works, and yes, it does.

So, the question is: did I do or understand something wrong, or is this some kind of bug in GCC? My guess is 7:3 mix of above.

I'm using GCC version 5.1.0 (tdm-1).

like image 821
crueltear Avatar asked Dec 24 '22 16:12

crueltear


2 Answers

This is also a very inefficient way of multiplying matrices using SSE. I'd be surprised if it was much faster than a scalar implementation with so much floating-point throughput available on modern CPUs. A better method is outlined here, no explicit transpose needed:

AMatrix & operator *= (AMatrix & m0, const AMatrix & m1)
{
    __m128 r0 = _mm_load_ps(& m1[0][x]);
    __m128 r1 = _mm_load_ps(& m1[1][x]);
    __m128 r2 = _mm_load_ps(& m1[2][x]);
    __m128 r3 = _mm_load_ps(& m1[3][x]);

    for (int i = 0; i < 4; i++)
    {
        __m128 ti = _mm_load_ps(& m0[i][x]), t0, t1, t2, t3;

        t0 = _mm_shuffle_ps(ti, ti, _MM_SHUFFLE(0, 0, 0, 0));
        t1 = _mm_shuffle_ps(ti, ti, _MM_SHUFFLE(1, 1, 1, 1));
        t2 = _mm_shuffle_ps(ti, ti, _MM_SHUFFLE(2, 2, 2, 2));
        t3 = _mm_shuffle_ps(ti, ti, _MM_SHUFFLE(3, 3, 3, 3));

        ti = t0 * r0 + t1 * r1 + t2 * r2 + t3 * r3;
        _mm_store_ps(& m0[i][x], ti);
    }

    return m0;
}

On modern compilers, like gcc and clang, t0 * r0 + t1 * r1 + t2 * r2 + t3 * r3 is actually operating on __m128 types; though you can replace these with _mm_mul_ps and _mm_add_ps intrinsics if you want.

Return by value is then just a matter of adding a function like:

inline AMatrix operator * (const AMatrix & m0, const AMatrix & m1)
{
    AMatrix lhs (m0); return (lhs *= m1);
}

Personally, I'd just replace the float x, y, z, w; with alignas (16) float _s[4] = {}; or similar - so you get a 'zero-vector' by default, or a defaulted constructor:

constexpr AVector () = default;

as well as nice constructors, like:

constexpr Vector (float x, float y, float z, float w)
        : _s {x, y, z, w} {}
like image 92
Brett Hale Avatar answered Jan 04 '23 19:01

Brett Hale


Your inline assembly lacks some constraints:

asm volatile(
    "movups (%1), %%xmm0\n\t"
    "movups (%2), %%xmm1\n\t"
    "mulps %%xmm1, %%xmm0\n\t"          // xmm0 -> (a1+b1, , , )
    "movaps %%xmm0, %%xmm1\n\t"         // xmm1 = xmm0
    "shufps $0xB1, %%xmm1, %%xmm1\n\t"  // 0xB1 = 10110001
    "addps %%xmm1, %%xmm0\n\t"          // xmm1 -> (x, y, z, w)+(y, x, w, z)=(x+y, x+y, z+w, z+w)
    "movaps %%xmm0, %%xmm1\n\t"         // xmm1 = xmm0
    "shufps $0x0A, %%xmm1, %%xmm1\n\t"  // 0x0A = 00001010
    "addps %%xmm1, %%xmm0\n\t"          // xmm1 -> (x+y+z+w, , , )
    "movups %%xmm0, %0\n\t"
    : "=m"(c)
    : "r"(&a), "r"(&b)
    );

GCC does not know that this assembler fragment clobbers %xmm0 and %xmm1, so it might not reload those registers to their previous values after the fragment has run. Some additional clobbers might be missing as well.

like image 32
Florian Weimer Avatar answered Jan 04 '23 20:01

Florian Weimer