I'm attempting to generate arrays of __m256i
's to reuse in another computation. When I attempt to do that (even with a minimal testcase), I get a segmentation fault - but only if the code is compiled with g++ or clang. If I compile the code with the Intel compiler (version 16.0), no segmentation fault occurs. Here is a test case I created:
int main() {
__m256i *table = new __m256i[10000];
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
table[99] = zeroes;
}
When compiling the above with clang 3.6 and g++ 4.8, a segmentation fault occurs.
Here's the assembly generated by the Intel compiler (from https://gcc.godbolt.org/, icc 13.0):
pushq %rbx #3.12
movq %rsp, %rbx #3.12
andq $-32, %rsp #3.12
pushq %rbp #3.12
pushq %rbp #3.12
movq 8(%rbx), %rbp #3.12
movq %rbp, 8(%rsp) #3.12
movq %rsp, %rbp #3.12
subq $112, %rsp #3.12
movl $3200, %eax #4.38
vzeroupper #4.38
movq %rax, %rdi #4.38
call operator new[](unsigned long) #4.38
movq %rax, -112(%rbp) #4.38
movq -112(%rbp), %rax #4.38
movq %rax, -104(%rbp) #4.20
vxorps %ymm0, %ymm0, %ymm0 #5.22
vmovdqu %ymm0, -80(%rbp) #5.22
vmovdqu -80(%rbp), %ymm0 #5.22
vmovdqu %ymm0, -48(%rbp) #5.20
movl $3168, %eax #6.17
addq -104(%rbp), %rax #6.5
vmovdqu -48(%rbp), %ymm0 #6.17
vmovdqu %ymm0, (%rax) #6.5
movl $0, %eax #7.1
vzeroupper #7.1
leave #7.1
movq %rbx, %rsp #7.1
popq %rbx #7.1
ret #7.1
And here's from clang 3.7:
pushq %rbp
movq %rsp, %rbp
andq $-32, %rsp
subq $192, %rsp
xorl %eax, %eax
movl $3200, %ecx # imm = 0xC80
movl %ecx, %edi
movl %eax, 28(%rsp) # 4-byte Spill
callq operator new[](unsigned long)
movq %rax, 88(%rsp)
movq $0, 168(%rsp)
movq $0, 160(%rsp)
movq $0, 152(%rsp)
movq $0, 144(%rsp)
vmovq 168(%rsp), %xmm0 # xmm0 = mem[0],zero
vmovq 160(%rsp), %xmm1 # xmm1 = mem[0],zero
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vmovq 152(%rsp), %xmm1 # xmm1 = mem[0],zero
vpslldq $8, %xmm1, %xmm1 # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
vmovaps %xmm1, %xmm2
vinserti128 $1, %xmm0, %ymm2, %ymm2
vmovaps %ymm2, 96(%rsp)
vmovaps %ymm2, 32(%rsp)
movq 88(%rsp), %rax
vmovaps %ymm2, 3168(%rax)
movl 28(%rsp), %eax # 4-byte Reload
movq %rbp, %rsp
popq %rbp
vzeroupper
retq
Am I running into a compiler bug in clang/g++? Or am I simply doing something wrong?
I have said many times before that implicit SIMD loads/stores are a bad idea. Stop using them. Use explicit loads/stores like this
int64_t* table = new int64_t[4*10000];
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
_mm256_storeu_si256((__m256i*)&table[4*99], zeroes);
or since this is POD use the cross-compiler/OS function _mm_malloc
int64_t* table = (int64_t*)_mm_malloc(sizeof(int64_t)*4*10000, 32);
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
_mm256_store_si256((__m256i*)&table[4*99], zeroes);
You can use _mm256_setzero_si256()
instead of _mm256_set_epi64x(0, 0, 0, 0)
(note that _mm256_set_epi64x
does not work in 32-bit mode on some version of MSVC) but GCC and Clang are smart enough to know they are the same thing.
Since you're using intrinsics which are not part of the C/C++ specification then some rules such as strict aliasing may be overlooked.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With