GCC will only vectorize the following loop if I tell it to use associative math, e.g. with -Ofast.
float sumf(float *x)
{
x = (float*)__builtin_assume_aligned(x, 64);
float sum = 0;
for(int i=0; i<2048; i++) sum += x[i];
return sum;
}
Here is the assembly with -Ofast -mavx
sumf(float*):
vxorps %xmm0, %xmm0, %xmm0
leaq 8192(%rdi), %rax
.L2:
vaddps (%rdi), %ymm0, %ymm0
addq $32, %rdi
cmpq %rdi, %rax
jne .L2
vhaddps %ymm0, %ymm0, %ymm0
vhaddps %ymm0, %ymm0, %ymm1
vperm2f128 $1, %ymm1, %ymm1, %ymm0
vaddps %ymm1, %ymm0, %ymm0
vzeroupper
ret
This clearly shows the loop has been vectorized.
But this loop also has a dependency chain. To hide the latency of the addition I need to unroll and keep at least three partial sums on x86_64 (except on Skylake, which needs eight partial sums, and except when doing the addition with FMA instructions, which needs ten partial sums on Haswell and Broadwell). As far as I understand, I can unroll the loop with -funroll-loops.
Here is the assembly with -Ofast -mavx -funroll-loops.
sumf(float*):
vxorps %xmm7, %xmm7, %xmm7
leaq 8192(%rdi), %rax
.L2:
vaddps (%rdi), %ymm7, %ymm0
addq $256, %rdi
vaddps -224(%rdi), %ymm0, %ymm1
vaddps -192(%rdi), %ymm1, %ymm2
vaddps -160(%rdi), %ymm2, %ymm3
vaddps -128(%rdi), %ymm3, %ymm4
vaddps -96(%rdi), %ymm4, %ymm5
vaddps -64(%rdi), %ymm5, %ymm6
vaddps -32(%rdi), %ymm6, %ymm7
cmpq %rdi, %rax
jne .L2
vhaddps %ymm7, %ymm7, %ymm8
vhaddps %ymm8, %ymm8, %ymm9
vperm2f128 $1, %ymm9, %ymm9, %ymm10
vaddps %ymm9, %ymm10, %ymm0
vzeroupper
ret
GCC does unroll the loop eight times. However, it does not do independent sums. It does eight dependent sums. That's pointless and no better than without unrolling.
How can I get GCC to unroll the loop and do independent partial sums?
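To make the goal concrete, here is a minimal scalar sketch of the kind of code I would like the compiler to generate internally (the name sumf_unrolled and the choice of four accumulators are only illustrative):

float sumf_unrolled(float *x)
{
    x = (float*)__builtin_assume_aligned(x, 64);
    /* independent accumulators, so each addition does not have to wait
       for the result of the previous one */
    float s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    for (int i = 0; i < 2048; i += 4) {
        s0 += x[i+0];
        s1 += x[i+1];
        s2 += x[i+2];
        s3 += x[i+3];
    }
    return (s0 + s1) + (s2 + s3);
}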
Edit:
Clang unrolls to four independent partial sums even without -funroll-loops for SSE, but I am not sure its AVX code is as efficient. The compiler should not need -funroll-loops with -Ofast anyway, so it's good to see Clang is doing this right, at least for SSE.
Clang 3.5.1 with -Ofast:
sumf(float*): # @sumf(float*)
xorps %xmm0, %xmm0
xorl %eax, %eax
xorps %xmm1, %xmm1
.LBB0_1: # %vector.body
movups (%rdi,%rax,4), %xmm2
movups 16(%rdi,%rax,4), %xmm3
addps %xmm0, %xmm2
addps %xmm1, %xmm3
movups 32(%rdi,%rax,4), %xmm0
movups 48(%rdi,%rax,4), %xmm1
addps %xmm2, %xmm0
addps %xmm3, %xmm1
addq $16, %rax
cmpq $2048, %rax # imm = 0x800
jne .LBB0_1
addps %xmm0, %xmm1
movaps %xmm1, %xmm2
movhlps %xmm2, %xmm2 # xmm2 = xmm2[1,1]
addps %xmm1, %xmm2
pshufd $1, %xmm2, %xmm0 # xmm0 = xmm2[1,0,0,0]
addps %xmm2, %xmm0
retq
ICC 13.0.1 with -O3 unrolls to two independent partial sums. ICC apparently assumes associative math with only -O3:
.B1.8:
vaddps (%rdi,%rdx,4), %ymm1, %ymm1 #5.29
vaddps 32(%rdi,%rdx,4), %ymm0, %ymm0 #5.29
vaddps 64(%rdi,%rdx,4), %ymm1, %ymm1 #5.29
vaddps 96(%rdi,%rdx,4), %ymm0, %ymm0 #5.29
addq $32, %rdx #5.3
cmpq %rax, %rdx #5.3
jb ..B1.8 # Prob 99% #5.3
Some use of gcc vector extensions and __builtin_ functions produces this:
#include <stddef.h> /* size_t */
#include <stdint.h> /* uint32_t */

typedef float v8sf __attribute__((vector_size(32)));
typedef uint32_t v8u32 __attribute__((vector_size(32)));
static v8sf sumfvhelper1(v8sf arr[4])
{
v8sf retval = {0};
for (size_t i = 0; i < 4; i++)
retval += arr[i];
return retval;
}
static float sumfvhelper2(v8sf x)
{
v8sf t = __builtin_shuffle(x, (v8u32){4,5,6,7,0,1,2,3}); /* swap the two 128-bit halves */
x += t;
t = __builtin_shuffle(x, (v8u32){2,3,0,1,6,7,4,5}); /* swap 64-bit pairs within each half */
x += t;
t = __builtin_shuffle(x, (v8u32){1,0,3,2,5,4,7,6}); /* swap adjacent floats */
x += t;
return x[0];
}
float sumfv(float *x)
{
//x = __builtin_assume_aligned(x, 64);
v8sf *vx = (v8sf*)x;
v8sf sumvv[4] = {{0}}; /* four independent vector accumulators */
for (size_t i = 0; i < 2048/8; i+=4)
{
sumvv[0] += vx[i+0];
sumvv[1] += vx[i+1];
sumvv[2] += vx[i+2];
sumvv[3] += vx[i+3];
}
v8sf sumv = sumfvhelper1(sumvv);
return sumfvhelper2(sumv);
}
Which gcc 4.8.4 with -Wall -Wextra -Wpedantic -std=gnu11 -march=native -O3 -fno-signed-zeros -fno-trapping-math -freciprocal-math -ffinite-math-only -fassociative-math -S turns into:
sumfv:
vxorps %xmm2, %xmm2, %xmm2
xorl %eax, %eax
vmovaps %ymm2, %ymm3
vmovaps %ymm2, %ymm0
vmovaps %ymm2, %ymm1
.L7:
addq $4, %rax
vaddps (%rdi), %ymm1, %ymm1
subq $-128, %rdi
vaddps -96(%rdi), %ymm0, %ymm0
vaddps -64(%rdi), %ymm3, %ymm3
vaddps -32(%rdi), %ymm2, %ymm2
cmpq $256, %rax
jne .L7
vaddps %ymm2, %ymm3, %ymm2
vaddps %ymm0, %ymm1, %ymm0
vaddps %ymm0, %ymm2, %ymm0
vperm2f128 $1, %ymm0, %ymm0, %ymm1
vaddps %ymm0, %ymm1, %ymm0
vpermilps $78, %ymm0, %ymm1
vaddps %ymm0, %ymm1, %ymm0
vpermilps $177, %ymm0, %ymm1
vaddps %ymm0, %ymm1, %ymm0
vzeroupper
ret
The second helper function isn't strictly necessary, but summing over the elements of a vector tends to produce terrible code in gcc. If you're willing to use platform-dependent builtins, you can probably replace most of it with __builtin_ia32_haddps256().
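For illustration, here is a rough, untested sketch of what that replacement could look like, reusing the v8sf/v8u32 typedefs above; the name sumfvhelper2_hadd is mine, and it requires AVX to be enabled (e.g. via -march=native on an AVX machine):

static float sumfvhelper2_hadd(v8sf x)
{
    /* vhaddps adds adjacent pairs within each 128-bit lane, so after two
       rounds every element of a lane holds that lane's total */
    x = __builtin_ia32_haddps256(x, x);
    x = __builtin_ia32_haddps256(x, x);
    /* swap the two 128-bit lanes and add to combine the per-lane totals */
    v8sf t = __builtin_shuffle(x, (v8u32){4,5,6,7,0,1,2,3});
    x += t;
    return x[0];
}

Whether this is actually faster than the pure-shuffle version is another matter, since vhaddps is typically implemented as multiple shuffle and add µops anyway.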