When compiling this program with arm-elf-gcc-4.5 -O3 -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp:
#include <arm_neon.h>
// Computes the 3-component cross product of v1 and v2 with NEON intrinsics.
// Lanes 0..2 of each input are treated as x, y, z; lane 3 of the inputs is
// not used by the arithmetic below. The result is laid out as {x, y, z, z}.
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
// Zip each vector with itself so every component is duplicated:
// val[0] = {x, x, y, y}, val[1] = {z, z, w, w}.
float32x4x2_t
xxyyzz1(vzipq_f32(v1, v1)),
xxyyzz2(vzipq_f32(v2, v2));
// Split the zipped halves into 64-bit vectors holding {x, x}, {y, y}, {z, z}.
float32x2_t
xx1(vget_low_f32(xxyyzz1.val[0])),
yy1(vget_high_f32(xxyyzz1.val[0])),
zz1(vget_low_f32(xxyyzz1.val[1])),
xx2(vget_low_f32(xxyyzz2.val[0])),
yy2(vget_high_f32(xxyyzz2.val[0])),
zz2(vget_low_f32(xxyyzz2.val[1]));
// vmls_f32(a, b, c) yields a - b * c, so each line is one cross-product term:
// x = y1*z2 - z1*y2, y = z1*x2 - x1*z2, z = x1*y2 - y1*x2 (in both lanes).
float32x2_t
x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));
// Unzip x and y to obtain {x, y} as the low half, then append {z, z}.
return vcombine_f32(vuzp_f32(x, y).val[0], z);
}
...this is what I get. Notice the two useless instructions marked with @<<<
_Z5crossRK19__simd128_float32_tS1_:
vldmia r0, {d16-d17}
vldmia r1, {d22-d23}
vmov q10, q8 @ v4sf
vmov q9, q11 @ v4sf
vzip.32 q8, q10
vzip.32 q11, q9
vmov d24, d17
vmov d21, d22
vmov d22, d23
vmul.f32 d17, d24, d18
vmul.f32 d19, d20, d21
vmls.f32 d19, d16, d18
vmls.f32 d17, d20, d22
vmul.f32 d16, d16, d22
vuzp.32 d17, d19
vmls.f32 d16, d24, d21
sub sp, sp, #80 @<<<
vswp d17, d16
vmov r0, r1, d16 @ v4sf
vmov r2, r3, d17
add sp, sp, #80 @<<<
bx lr
The stack is never accessed, yet the stack pointer gets decremented and then incremented by the same amount. Why?
If I modify the original code to include asm comments at the end of the prologue and the beginning of the epilogue, like this:
#include <arm_neon.h>
// Same cross product as the first listing, with assembler-comment markers
// emitted right after function entry and right before the return, so the
// compiler-generated prologue and epilogue can be told apart from the body.
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
// Marker: everything emitted before this comment belongs to the prologue.
asm volatile("# End of prologue");
// Zip each vector with itself so every component is duplicated:
// val[0] = {x, x, y, y}, val[1] = {z, z, w, w}.
float32x4x2_t
xxyyzz1(vzipq_f32(v1, v1)),
xxyyzz2(vzipq_f32(v2, v2));
// Split the zipped halves into 64-bit vectors holding {x, x}, {y, y}, {z, z}.
float32x2_t
xx1(vget_low_f32(xxyyzz1.val[0])),
yy1(vget_high_f32(xxyyzz1.val[0])),
zz1(vget_low_f32(xxyyzz1.val[1])),
xx2(vget_low_f32(xxyyzz2.val[0])),
yy2(vget_high_f32(xxyyzz2.val[0])),
zz2(vget_low_f32(xxyyzz2.val[1]));
// vmls_f32(a, b, c) yields a - b * c, so each line is one cross-product term:
// x = y1*z2 - z1*y2, y = z1*x2 - x1*z2, z = x1*y2 - y1*x2 (in both lanes).
float32x2_t
x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));
// The result is held in a named local so the epilogue marker can be placed
// between the computation and the return: {x, y} low half, {z, z} high half.
float32x4_t res(vcombine_f32(vuzp_f32(x, y).val[0], z));
// Marker: everything emitted after this comment belongs to the epilogue.
asm volatile("# Start of epilogue");
return res;
}
Then I get a slightly different version:
_Z5crossRK19__simd128_float32_tS1_:
sub sp, sp, #80
# End of prologue
vldmia r0, {d16-d17}
vldmia r1, {d22-d23}
vmov q10, q8 @ v4sf
vmov q9, q11 @ v4sf
vzip.32 q8, q10
vzip.32 q11, q9
vmov d24, d17
vmov d21, d22
vmov d22, d23
vmul.f32 d17, d24, d18
vmul.f32 d19, d20, d21
vmls.f32 d19, d16, d18
vmls.f32 d17, d20, d22
vmul.f32 d16, d16, d22
vuzp.32 d17, d19
vmls.f32 d16, d24, d21
vswp d17, d16
# Start of epilogue
vmov r0, r1, d16 @ v4sf
vmov r2, r3, d17
add sp, sp, #80
bx lr
The stack pointer decrement/increment clearly is part of the prologue/epilogue, and happens even if the stack is not used. Is that to comply with some standard, or is it a gcc optimization bug?
EDIT: Compiler is arm-elf-gcc-4.5 (GCC) 4.5.0, configured with: /opt/local/var/macports/build/_opt_local_var_macports_sources_rsync.macports.org_release_ports_cross_arm-elf-gcc/work/gcc-4.5.0/configure --prefix=/opt/local --infodir=/opt/local/share/info --mandir=/opt/local/share/man --target=arm-elf --program-prefix=arm-elf- --program-suffix=-4.5 --without-included-gettext --enable-obsolete --with-newlib --disable-__cxa_atexit --enable-multilib --enable-biendian --disable-libgfortran --with-gxx-include-dir=/opt/local/arm-elf/include/c++/4.5.0/ --enable-languages=c,c++,objc --build=x86_64-apple-darwin10 --enable-fpu
EDIT: I managed to pinpoint the problem using the following C source. It only happens when using arrays of vector types as temporaries, such as float32x4x2_t, which is declared as struct { float32x4_t val[2]; }, even though these temporaries are kept in registers. I believe this is a bug, so I reported it.
#include <arm_neon.h>
// This one is ok
// Control case: lane-wise sum of the two vectors pointed to by v1 and v2.
// It uses no array-of-vector temporaries, and the listing below contains
// no stack-pointer adjustment.
extern float32x4_t add(float32x4_t* v1, float32x4_t* v2) {
return vaddq_f32(*v1, *v2);
// The region below is excluded from compilation; it records the compiler's
// assembly output for this function for comparison with cross().
#if 0
produced assembly:
add:
vldmia r0, {d16-d17}
vldmia r1, {d18-d19}
vadd.f32 q8, q8, q9
vmov r0, r1, d16
vmov r2, r3, d17
bx lr
#endif
}
// This one uses float32x4x2_t temporaries and has the bug
// Reproducer: 3-component cross product of *v1 and *v2, result laid out as
// {x, y, z, z}. The float32x4x2_t temporaries are what trigger the spurious
// "sub sp / add sp" pair visible in the listing below.
extern float32x4_t cross(float32x4_t* v1, float32x4_t* v2) {
// Zip each vector with itself so every component is duplicated:
// val[0] = {x, x, y, y}, val[1] = {z, z, w, w}.
float32x4x2_t
xxyyzz1=vzipq_f32(*v1, *v1),
xxyyzz2=vzipq_f32(*v2, *v2);
// Split the zipped halves into 64-bit vectors holding {x, x}, {y, y}, {z, z}.
float32x2_t
xx1=vget_low_f32(xxyyzz1.val[0]),
yy1=vget_high_f32(xxyyzz1.val[0]),
zz1=vget_low_f32(xxyyzz1.val[1]),
xx2=vget_low_f32(xxyyzz2.val[0]),
yy2=vget_high_f32(xxyyzz2.val[0]),
zz2=vget_low_f32(xxyyzz2.val[1]);
// vmls_f32(a, b, c) yields a - b * c, so each line is one cross-product term:
// x = y1*z2 - z1*y2, y = z1*x2 - x1*z2, z = x1*y2 - y1*x2 (in both lanes).
float32x2_t
x=vmls_f32(vmul_f32(yy1, zz2), zz1, yy2),
y=vmls_f32(vmul_f32(zz1, xx2), xx1, zz2),
z=vmls_f32(vmul_f32(xx1, yy2), yy1, xx2);
// Unzip x and y to obtain {x, y} as the low half, then append {z, z}.
return vcombine_f32(vuzp_f32(x, y).val[0], z);
// The region below is excluded from compilation; it records the compiler's
// assembly output. The two lines tagged "here" adjust sp by 48 bytes even
// though no instruction in the listing accesses the stack.
#if 0
produced assembly:
cross:
vldmia r0, {d18-d19}
vldmia r1, {d16-d17}
vmov q10, q9
vmov q11, q8
vzip.32 q9, q10
vzip.32 q8, q11
vmov d24, d19
vmov d21, d16
vmov d16, d17
vmul.f32 d19, d20, d21
vmul.f32 d17, d24, d22
vmls.f32 d17, d20, d16
vmls.f32 d19, d18, d22
vmul.f32 d16, d18, d16
vuzp.32 d17, d19
vmls.f32 d16, d24, d21
sub sp, sp, #48 @ here
vswp d17, d16
vmov r0, r1, d16
vmov r2, r3, d17
add sp, sp, #48 @ and here
bx lr
#endif
}
When you PUT something ONTO the stack (PUSH onto the stack), the SP is decremented before the item is placed on the stack. When you take something OFF of the stack (PULL from the stack), the SP is incremented after the item is pulled from the stack.
The POP ESP instruction increments the stack pointer (ESP) before data at the old top of stack is written into the destination.
The stack pointer points to the last in-use byte of the stack. The standard convention is that when your function starts up, you can claim some of the stack by moving the stack pointer down--this indicates to any functions you might call that you're using those bytes of the stack.
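PUSH means inserting an element onto the stack, and POP means removing one. On a descending stack, such as the one used in the ARM and x86 listings above, PUSH decrements the stack pointer and POP increments it; on an ascending stack the directions are reversed (PUSH increments, POP decrements).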
This turned out to be a bug, so closing it.
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow! Thank you!
Donate Us With