Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Why does arm-gcc decrement/increment the stack pointer even when the stack is never accessed?

When compiling this program with arm-elf-gcc-4.5 -O3 -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp:

#include <arm_neon.h>
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
    float32x4x2_t
        xxyyzz1(vzipq_f32(v1, v1)),
        xxyyzz2(vzipq_f32(v2, v2));

    float32x2_t
        xx1(vget_low_f32(xxyyzz1.val[0])),
        yy1(vget_high_f32(xxyyzz1.val[0])),
        zz1(vget_low_f32(xxyyzz1.val[1])),
        xx2(vget_low_f32(xxyyzz2.val[0])),
        yy2(vget_high_f32(xxyyzz2.val[0])),
        zz2(vget_low_f32(xxyyzz2.val[1]));

    float32x2_t
        x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
        y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
        z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));

    return vcombine_f32(vuzp_f32(x, y).val[0], z);
}

...this is what I get. Notice the two useless instructions marked with @<<<

_Z5crossRK19__simd128_float32_tS1_:
    vldmia  r0, {d16-d17}
    vldmia  r1, {d22-d23}
    vmov    q10, q8  @ v4sf
    vmov    q9, q11  @ v4sf
    vzip.32 q8, q10
    vzip.32 q11, q9
    vmov    d24, d17
    vmov    d21, d22
    vmov    d22, d23
    vmul.f32    d17, d24, d18
    vmul.f32    d19, d20, d21
    vmls.f32    d19, d16, d18
    vmls.f32    d17, d20, d22
    vmul.f32    d16, d16, d22
    vuzp.32 d17, d19
    vmls.f32    d16, d24, d21
    sub sp, sp, #80 @<<<
    vswp    d17, d16
    vmov    r0, r1, d16  @ v4sf
    vmov    r2, r3, d17
    add sp, sp, #80 @<<<
    bx

The stack is never accessed, yet the stack pointer get decremented, then incremented by the same amount. Why?

If I modify the original code to include asm comment at the end of the prologue and the begining of the epilogue, like this:

#include <arm_neon.h>
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
    asm volatile("# End of prologue");
    float32x4x2_t
        xxyyzz1(vzipq_f32(v1, v1)),
        xxyyzz2(vzipq_f32(v2, v2));

    float32x2_t
        xx1(vget_low_f32(xxyyzz1.val[0])),
        yy1(vget_high_f32(xxyyzz1.val[0])),
        zz1(vget_low_f32(xxyyzz1.val[1])),
        xx2(vget_low_f32(xxyyzz2.val[0])),
        yy2(vget_high_f32(xxyyzz2.val[0])),
        zz2(vget_low_f32(xxyyzz2.val[1]));

    float32x2_t
        x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
        y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
        z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));

    float32x4_t res(vcombine_f32(vuzp_f32(x, y).val[0], z));
    asm volatile("# Start of epilogue");
    return res;
}

Then I get slightly different version:

_Z5crossRK19__simd128_float32_tS1_:
    sub sp, sp, #80
    # End of prologue
    vldmia  r0, {d16-d17}
    vldmia  r1, {d22-d23}
    vmov    q10, q8  @ v4sf
    vmov    q9, q11  @ v4sf
    vzip.32 q8, q10
    vzip.32 q11, q9
    vmov    d24, d17
    vmov    d21, d22
    vmov    d22, d23
    vmul.f32    d17, d24, d18
    vmul.f32    d19, d20, d21
    vmls.f32    d19, d16, d18
    vmls.f32    d17, d20, d22
    vmul.f32    d16, d16, d22
    vuzp.32 d17, d19
    vmls.f32    d16, d24, d21
    vswp    d17, d16
    # Start of epilogue
    vmov    r0, r1, d16  @ v4sf
    vmov    r2, r3, d17
    add sp, sp, #80
    bx  lr

The stack pointer decrement/increment clearly is part of the prologue/epilogue, and happens even if the stack is not used. Is that to comply with some standard, or is it a gcc optimization bug?

EDIT: Compiler is arm-elf-gcc-4.5 (GCC) 4.5.0, configured with: /opt/local/var/macports/build/_opt_local_var_macports_sources_rsync.macports.org_release_ports_cross_arm-elf-gcc/work/gcc-4.5.0/configure --prefix=/opt/local --infodir=/opt/local/share/info --mandir=/opt/local/share/man --target=arm-elf --program-prefix=arm-elf- --program-suffix=-4.5 --without-included-gettext --enable-obsolete --with-newlib --disable-__cxa_atexit --enable-multilib --enable-biendian --disable-libgfortran --with-gxx-include-dir=/opt/local/arm-elf/include/c++/4.5.0/ --enable-languages=c,c++,objc --build=x86_64-apple-darwin10 --enable-fpu

EDIT: I managed to pinpoint the problem using the following C source. It only happens when using arrays of vector types as temporaries, such as float32x4x2_t which is declared as struct { float32x4_t val[2]; }, even tho these temporaries are made registers. I believe this is a bug, so I reported it.

#include <arm_neon.h>

// This one is ok
extern float32x4_t add(float32x4_t* v1, float32x4_t* v2) {
    return vaddq_f32(*v1, *v2);
#if 0
    produced assembly:

add:
    vldmia  r0, {d16-d17}
    vldmia  r1, {d18-d19}
    vadd.f32    q8, q8, q9
    vmov    r0, r1, d16
    vmov    r2, r3, d17
    bx  lr

#endif
}

// This one uses float32x4x2_t temporaries and has the bug
extern float32x4_t cross(float32x4_t* v1, float32x4_t* v2) {
    float32x4x2_t
        xxyyzz1=vzipq_f32(*v1, *v1),
        xxyyzz2=vzipq_f32(*v2, *v2);

    float32x2_t
        xx1=vget_low_f32(xxyyzz1.val[0]),
        yy1=vget_high_f32(xxyyzz1.val[0]),
        zz1=vget_low_f32(xxyyzz1.val[1]),
        xx2=vget_low_f32(xxyyzz2.val[0]),
        yy2=vget_high_f32(xxyyzz2.val[0]),
        zz2=vget_low_f32(xxyyzz2.val[1]);

    float32x2_t
        x=vmls_f32(vmul_f32(yy1, zz2), zz1, yy2),
        y=vmls_f32(vmul_f32(zz1, xx2), xx1, zz2),
        z=vmls_f32(vmul_f32(xx1, yy2), yy1, xx2);

    return vcombine_f32(vuzp_f32(x, y).val[0], z);
#if 0
    produced assembly:

cross:
    vldmia  r0, {d18-d19}
    vldmia  r1, {d16-d17}
    vmov    q10, q9
    vmov    q11, q8
    vzip.32 q9, q10
    vzip.32 q8, q11
    vmov    d24, d19
    vmov    d21, d16
    vmov    d16, d17
    vmul.f32    d19, d20, d21
    vmul.f32    d17, d24, d22
    vmls.f32    d17, d20, d16
    vmls.f32    d19, d18, d22
    vmul.f32    d16, d18, d16
    vuzp.32 d17, d19
    vmls.f32    d16, d24, d21
    sub sp, sp, #48           @ here
    vswp    d17, d16
    vmov    r0, r1, d16
    vmov    r2, r3, d17
    add sp, sp, #48           @ and here
    bx  lr

#endif
}
like image 785
jcayzac Avatar asked May 10 '11 02:05

jcayzac


People also ask

Does stack pointer increment decrement?

When you PUT something ONTO the stack (PUSH onto the stack), the SP is decremented before the item is placed on the stack. When you take something OFF of the stack (PULL from the stack), the SP is incremented after the item is pulled from the stack.

Does pop increment the stack pointer?

The POP ESP instruction increments the stack pointer (ESP) before data at the old top of stack is written into the destination.

Does the stack pointer move?

The stack pointer points to the last in-use byte of the stack. The standard convention is that when your function starts up, you can claim some of the stack by moving the stack pointer down--this indicates to any functions you might call that you're using those bytes of the stack.

What happens to the stack pointer when you pop an element from a bottom up stack?

The PUSH means pushing or inserting an element into the stack. The PUSH operation always increments the stack pointer and the POP operation always decrements the stack pointer.


1 Answers

This turned out to be a bug, so closing it.

like image 123
jcayzac Avatar answered Oct 26 '22 10:10

jcayzac