Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Segmentation fault with array of __m256i when using clang/g++

I'm attempting to generate arrays of __m256i's to reuse in another computation. When I attempt to do that (even with a minimal testcase), I get a segmentation fault - but only if the code is compiled with g++ or clang. If I compile the code with the Intel compiler (version 16.0), no segmentation fault occurs. Here is a test case I created:

int main() {
    __m256i *table = new __m256i[10000];
    __m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
    table[99] = zeroes;
}

When compiling the above with clang 3.6 and g++ 4.8, a segmentation fault occurs.

Here's the assembly generated by the Intel compiler (from https://gcc.godbolt.org/, icc 13.0):

pushq     %rbx                                          #3.12
movq      %rsp, %rbx                                    #3.12
andq      $-32, %rsp                                    #3.12
pushq     %rbp                                          #3.12
pushq     %rbp                                          #3.12
movq      8(%rbx), %rbp                                 #3.12
movq      %rbp, 8(%rsp)                                 #3.12
movq      %rsp, %rbp                                    #3.12
subq      $112, %rsp                                    #3.12
movl      $3200, %eax                                   #4.38
vzeroupper                                              #4.38
movq      %rax, %rdi                                    #4.38
call      operator new[](unsigned long)                 #4.38
movq      %rax, -112(%rbp)                              #4.38
movq      -112(%rbp), %rax                              #4.38
movq      %rax, -104(%rbp)                              #4.20
vxorps    %ymm0, %ymm0, %ymm0                           #5.22
vmovdqu   %ymm0, -80(%rbp)                              #5.22
vmovdqu   -80(%rbp), %ymm0                              #5.22
vmovdqu   %ymm0, -48(%rbp)                              #5.20
movl      $3168, %eax                                   #6.17
addq      -104(%rbp), %rax                              #6.5
vmovdqu   -48(%rbp), %ymm0                              #6.17
vmovdqu   %ymm0, (%rax)                                 #6.5
movl      $0, %eax                                      #7.1
vzeroupper                                              #7.1
leave                                                   #7.1
movq      %rbx, %rsp                                    #7.1
popq      %rbx                                          #7.1
ret                                                     #7.1

And here's from clang 3.7:

pushq   %rbp
movq    %rsp, %rbp
andq    $-32, %rsp
subq    $192, %rsp
xorl    %eax, %eax
movl    $3200, %ecx             # imm = 0xC80
movl    %ecx, %edi
movl    %eax, 28(%rsp)          # 4-byte Spill
callq   operator new[](unsigned long)
movq    %rax, 88(%rsp)
movq    $0, 168(%rsp)
movq    $0, 160(%rsp)
movq    $0, 152(%rsp)
movq    $0, 144(%rsp)
vmovq   168(%rsp), %xmm0        # xmm0 = mem[0],zero
vmovq   160(%rsp), %xmm1        # xmm1 = mem[0],zero
vpunpcklqdq     %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vmovq   152(%rsp), %xmm1        # xmm1 = mem[0],zero
vpslldq $8, %xmm1, %xmm1        # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
vmovaps %xmm1, %xmm2
vinserti128     $1, %xmm0, %ymm2, %ymm2
vmovaps %ymm2, 96(%rsp)
vmovaps %ymm2, 32(%rsp)
movq    88(%rsp), %rax
vmovaps %ymm2, 3168(%rax)
movl    28(%rsp), %eax          # 4-byte Reload
movq    %rbp, %rsp
popq    %rbp
vzeroupper
retq

Am I running into a compiler bug in clang/g++? Or am I simply doing something wrong?

like image 497
hichris123 Avatar asked Dec 23 '15 17:12

hichris123


1 Answers

I have said many times before that implicit SIMD loads/stores are a bad idea. Stop using them. Use explicit loads/stores like this

int64_t* table = new int64_t[4*10000];
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
_mm256_storeu_si256((__m256i*)&table[4*99], zeroes);

or since this is POD use the cross-compiler/OS function _mm_malloc

int64_t* table = (int64_t*)_mm_malloc(sizeof(int64_t)*4*10000, 32);
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
_mm256_store_si256((__m256i*)&table[4*99], zeroes);

You can use _mm256_setzero_si256() instead of _mm256_set_epi64x(0, 0, 0, 0) (note that _mm256_set_epi64x does not work in 32-bit mode on some version of MSVC) but GCC and Clang are smart enough to know they are the same thing.

Since you're using intrinsics which are not part of the C/C++ specification then some rules such as strict aliasing may be overlooked.

like image 185
Z boson Avatar answered Oct 27 '22 23:10

Z boson