I am using a Haswell Core i7-4790K.
When I compile the following toy example with icc -O3 -std=c99 -march=core-avx2 -g:
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>
/*
 * Test record made of three 256-bit integer vectors, laid out
 * back to back in the order a, b, c.
 */
typedef struct {
    __m256i a, b, c;
} mystruct_t;
/* Number of mystruct_t elements allocated in main() and processed by _do(). */
#define SIZE 1000
/* Base value; _do() adds a per-vector offset to it before broadcasting. */
#define TEST_VAL 42
/*
 * Fill array[0 .. SIZE-1] with byte-broadcast vectors and count sign bits.
 *
 * For element i, the fields a, b and c receive TEST_VAL + 3*i,
 * TEST_VAL + 3*i + 1 and TEST_VAL + 3*i + 2 respectively, each passed to
 * _mm256_set1_epi8 (so only the low byte is kept) and replicated into
 * every byte lane.
 *
 * Returns the sum, over every vector stored, of the population count of
 * its byte sign-mask (_mm256_movemask_epi8).
 *
 * array must point to at least SIZE writable elements.
 */
int _do(mystruct_t* array) {
    int total = 0;
    size_t seed = TEST_VAL;  /* always TEST_VAL + 3*i for the current element */

    for (mystruct_t *elem = array; elem != array + SIZE; ++elem, seed += 3) {
        elem->a = _mm256_set1_epi8(seed);
        elem->b = _mm256_set1_epi8(seed + 1);
        elem->c = _mm256_set1_epi8(seed + 2);

        /* One mask bit per byte lane: set when that byte's top bit is set. */
        const int bits_a = _mm_popcnt_u32(_mm256_movemask_epi8(elem->a));
        const int bits_b = _mm_popcnt_u32(_mm256_movemask_epi8(elem->b));
        const int bits_c = _mm_popcnt_u32(_mm256_movemask_epi8(elem->c));

        total += bits_a + bits_b + bits_c;
    }
    return total;
}
int main() {
mystruct_t* array = (mystruct_t*)_mm_malloc(SIZE * sizeof(*array), 32);
printf("%d\n", _do(array));
_mm_free(array);
}
The following ASM code is produced for the _do() function:
0x0000000000400bc0 <+0>: xor %eax,%eax
0x0000000000400bc2 <+2>: xor %ecx,%ecx
0x0000000000400bc4 <+4>: xor %edx,%edx
0x0000000000400bc6 <+6>: nopl (%rax)
0x0000000000400bc9 <+9>: nopl 0x0(%rax)
0x0000000000400bd0 <+16>: lea 0x2b(%rdx),%r8d
0x0000000000400bd4 <+20>: inc %ecx
0x0000000000400bd6 <+22>: lea 0x2a(%rdx),%esi
0x0000000000400bd9 <+25>: lea 0x2c(%rdx),%r9d
0x0000000000400bdd <+29>: add $0x3,%edx
0x0000000000400be0 <+32>: vmovd %r8d,%xmm1
0x0000000000400be5 <+37>: vpbroadcastb %xmm1,%ymm4
0x0000000000400bea <+42>: vmovd %esi,%xmm0
0x0000000000400bee <+46>: vpmovmskb %ymm4,%r11d
0x0000000000400bf2 <+50>: vmovd %r9d,%xmm2
0x0000000000400bf7 <+55>: vmovdqu %ymm4,0x20(%rdi)
0x0000000000400bfc <+60>: vpbroadcastb %xmm0,%ymm3
0x0000000000400c01 <+65>: vpbroadcastb %xmm2,%ymm5
0x0000000000400c06 <+70>: vpmovmskb %ymm3,%r10d
0x0000000000400c0a <+74>: vmovdqu %ymm3,(%rdi)
0x0000000000400c0e <+78>: vmovdqu %ymm5,0x40(%rdi)
0x0000000000400c13 <+83>: popcnt %r11d,%esi
0x0000000000400c18 <+88>: add $0x60,%rdi
0x0000000000400c1c <+92>: vpmovmskb %ymm5,%r11d
0x0000000000400c20 <+96>: popcnt %r10d,%r9d
0x0000000000400c25 <+101>: popcnt %r11d,%r8d
0x0000000000400c2a <+106>: add %esi,%r9d
0x0000000000400c2d <+109>: add %r8d,%r9d
0x0000000000400c30 <+112>: add %r9d,%eax
0x0000000000400c33 <+115>: cmp $0x3e8,%ecx
0x0000000000400c39 <+121>: jb 0x400bd0 <_do+16>
0x0000000000400c3b <+123>: vzeroupper
0x0000000000400c3e <+126>: retq
0x0000000000400c3f <+127>: nop
If I compile the same code using gcc-5 -O3 -std=c99 -mavx2 -march=native -g, the following ASM code is produced for the _do() function:
0x0000000000400650 <+0>: lea 0x17700(%rdi),%r9
0x0000000000400657 <+7>: mov $0x2a,%r8d
0x000000000040065d <+13>: xor %eax,%eax
0x000000000040065f <+15>: nop
0x0000000000400660 <+16>: lea 0x1(%r8),%edx
0x0000000000400664 <+20>: vmovd %r8d,%xmm2
0x0000000000400669 <+25>: xor %esi,%esi
0x000000000040066b <+27>: vpbroadcastb %xmm2,%ymm2
0x0000000000400670 <+32>: vmovd %edx,%xmm1
0x0000000000400674 <+36>: add $0x60,%rdi
0x0000000000400678 <+40>: lea 0x2(%r8),%edx
0x000000000040067c <+44>: vpbroadcastb %xmm1,%ymm1
0x0000000000400681 <+49>: vmovdqa %ymm2,-0x60(%rdi)
0x0000000000400686 <+54>: add $0x3,%r8d
0x000000000040068a <+58>: vmovd %edx,%xmm0
0x000000000040068e <+62>: vpmovmskb %ymm2,%edx
0x0000000000400692 <+66>: vmovdqa %ymm1,-0x40(%rdi)
0x0000000000400697 <+71>: vpbroadcastb %xmm0,%ymm0
0x000000000040069c <+76>: popcnt %edx,%esi
0x00000000004006a0 <+80>: vpmovmskb %ymm1,%edx
0x00000000004006a4 <+84>: popcnt %edx,%edx
0x00000000004006a8 <+88>: vpmovmskb %ymm0,%ecx
0x00000000004006ac <+92>: add %esi,%edx
0x00000000004006ae <+94>: vmovdqa %ymm0,-0x20(%rdi)
0x00000000004006b3 <+99>: popcnt %ecx,%ecx
0x00000000004006b7 <+103>: add %ecx,%edx
0x00000000004006b9 <+105>: add %edx,%eax
0x00000000004006bb <+107>: cmp %rdi,%r9
0x00000000004006be <+110>: jne 0x400660 <_do+16>
0x00000000004006c0 <+112>: vzeroupper
0x00000000004006c3 <+115>: retq
My questions are:
1) Why does icc use unaligned moves (vmovdqu), unlike gcc?
2) Is there a penalty when vmovdqu is used instead of vmovdqa on aligned memory?
P.S: The problem is the same using SSE instructions/registers.
Thanks
There is no penalty to using VMOVDQU when the address is aligned. The behavior is identical to using VMOVDQA in that case.
As for "why", there may not be a single clear answer. It's possible that ICC does this deliberately so that users who later call _do with an unaligned argument will not crash, but it's also possible that it's simply emergent behavior of the compiler. Someone on the Intel compiler team could answer this question; the rest of us can only speculate.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With