Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Why does -march=native corrupt my program?

Tags:

c++

gcc

I'm compiling the program:

#include <iostream>
#include <vector>
#include <cstddef>
#include <algorithm>

// Pair of integer scoring parameters. init_dp_matrix combines them as
// `open + k * extend` when it fills the matrix borders.
struct Model
{
    int open;
    int extend;
};

// One matrix entry holding two independent integer values, `a` and `b`.
// A value-initialised Cell (`Cell()`) has both set to zero.
struct Cell
{
    int a;
    int b;
};

// Two-dimensional matrix of Cells stored as a vector of vectors.
using DPMatrix = std::vector<std::vector<Cell>>;

void print(const DPMatrix& matrix)
{
    for (std::size_t i = 0; i < matrix.size(); ++i) {
        for (std::size_t j = 0; j < matrix[i].size(); ++j) {
            std::cout << '{' << matrix[i][j].a << ' ' << matrix[i][j].b << "} ";
        }
        std::cout << std::endl;
    }
}

// Builds a matrix of `num_cols` outer elements, each holding `num_rows`
// value-initialised Cells, and fills its two borders from `model`:
//   result[i][0].b = model.open + (i - 1) * model.extend   for 1 <= i < num_cols
//   result[0][j].a = model.open + (j - 1) * model.extend   for 1 <= j < num_rows
// Every other field stays zero.
//
// If either dimension is zero there is no border to fill, so the
// (possibly empty) matrix is returned untouched instead of indexing an
// element that does not exist.
DPMatrix init_dp_matrix(const std::size_t num_cols, const std::size_t num_rows, const Model& model)
{
    DPMatrix result(num_cols, DPMatrix::value_type(num_rows, Cell()));
    if (num_cols == 0 || num_rows == 0) {
        return result;
    }
    // Index with std::size_t so the loop bound comparison is unsigned on both
    // sides; the offset is converted to int only for the score arithmetic.
    for (std::size_t i = 1; i < num_cols; ++i) {
        result[i][0].b = model.open + static_cast<int>(i - 1) * model.extend;
    }
    for (std::size_t j = 1; j < num_rows; ++j) {
        result[0][j].a = model.open + static_cast<int>(j - 1) * model.extend;
    }
    return result;
}

int main()
{
    const Model model = {-8, -1};
    const DPMatrix matrix = init_dp_matrix(10, 2, model);
    print(matrix);
}

With GCC 9.2.0:

$ g++-9 -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)

with -march=native:

$ g++-9 -O3 -march=native -o bug bug.cpp

On an Ubuntu machine with Intel chips:

$ lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description:    Ubuntu 18.04.3 LTS
Release:        18.04
Codename:       bionic

$ grep model /proc/cpuinfo | head -2
model           : 85
model name      : Intel(R) Xeon(R) Platinum 8175M CPU @ 2.50GHz

Running the program I get bogus output:

$ ./bug 
{0 0} {-8 0} 
{-2048 255} {0 0} 
{-2304 255} {0 0} 
{-2560 255} {0 0} 
{-2816 255} {0 0} 
{-3072 255} {0 0} 
{-3328 255} {0 0} 
{-3584 255} {0 0} 
{-3840 255} {0 0} 
{0 -16} {0 0}

If I compile without -march=native I get the correct output:

$ g++-9 -O3 -o bug bug.cpp
$ ./bug 
{0 0} {-8 0} 
{0 -8} {0 0} 
{0 -9} {0 0} 
{0 -10} {0 0} 
{0 -11} {0 0} 
{0 -12} {0 0} 
{0 -13} {0 0} 
{0 -14} {0 0} 
{0 -15} {0 0} 
{0 -16} {0 0}

The assembly for the -march=native version is:

$ g++-9 -O3 -march=native -S bug.cpp
$ cat bug.s
    .file   "bug.cpp"
    .text
    .section    .text._ZNKSt5ctypeIcE8do_widenEc,"axG",@progbits,_ZNKSt5ctypeIcE8do_widenEc,comdat
    .align 2
    .p2align 4
    .weak   _ZNKSt5ctypeIcE8do_widenEc
    .type   _ZNKSt5ctypeIcE8do_widenEc, @function
_ZNKSt5ctypeIcE8do_widenEc:
.LFB1303:
    .cfi_startproc
    movl    %esi, %eax
    ret
    .cfi_endproc
.LFE1303:
    .size   _ZNKSt5ctypeIcE8do_widenEc, .-_ZNKSt5ctypeIcE8do_widenEc
    .section    .rodata.str1.1,"aMS",@progbits,1
.LC0:
    .string "} "
    .text
    .p2align 4
    .globl  _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
    .type   _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, @function
_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB2359:
    .cfi_startproc
    movq    (%rdi), %rdx
    cmpq    %rdx, 8(%rdi)
    je  .L23
    pushq   %r15
    .cfi_def_cfa_offset 16
    .cfi_offset 15, -16
    pushq   %r14
    .cfi_def_cfa_offset 24
    .cfi_offset 14, -24
    pushq   %r13
    .cfi_def_cfa_offset 32
    .cfi_offset 13, -32
    movabsq $-6148914691236517205, %r13
    pushq   %r12
    .cfi_def_cfa_offset 40
    .cfi_offset 12, -40
    xorl    %r12d, %r12d
    pushq   %rbp
    .cfi_def_cfa_offset 48
    .cfi_offset 6, -48
    movq    %rdi, %rbp
    pushq   %rbx
    .cfi_def_cfa_offset 56
    .cfi_offset 3, -56
    subq    $24, %rsp
    .cfi_def_cfa_offset 80
    .p2align 4,,10
    .p2align 3
.L4:
    leaq    (%r12,%r12,2), %rbx
    salq    $3, %rbx
    addq    %rbx, %rdx
    movq    8(%rdx), %rax
    xorl    %r14d, %r14d
    cmpq    %rax, (%rdx)
    je  .L8
    .p2align 4,,10
    .p2align 3
.L5:
    movl    $1, %edx
    leaq    15(%rsp), %rsi
    movl    $_ZSt4cout, %edi
    movb    $123, 15(%rsp)
    call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
    movq    %rax, %rdi
    movq    0(%rbp), %rax
    leaq    0(,%r14,8), %r15
    movq    (%rax,%rbx), %rax
    movl    (%rax,%r14,8), %esi
    incq    %r14
    call    _ZNSolsEi
    movq    %rax, %rdi
    movl    $1, %edx
    leaq    15(%rsp), %rsi
    movb    $32, 15(%rsp)
    call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
    movq    %rax, %rdi
    movq    0(%rbp), %rax
    movq    (%rax,%rbx), %rax
    movl    4(%rax,%r15), %esi
    call    _ZNSolsEi
    movq    %rax, %rdi
    movl    $2, %edx
    movl    $.LC0, %esi
    call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
    movq    0(%rbp), %rdx
    addq    %rbx, %rdx
    movq    8(%rdx), %rax
    subq    (%rdx), %rax
    sarq    $3, %rax
    cmpq    %rax, %r14
    jb  .L5
.L8:
    movq    _ZSt4cout(%rip), %rax
    movq    -24(%rax), %rax
    movq    _ZSt4cout+240(%rax), %r14
    testq   %r14, %r14
    je  .L26
    cmpb    $0, 56(%r14)
    je  .L9
    movsbl  67(%r14), %esi
.L10:
    movl    $_ZSt4cout, %edi
    call    _ZNSo3putEc
    movq    %rax, %rdi
    call    _ZNSo5flushEv
    movq    0(%rbp), %rdx
    movq    8(%rbp), %rax
    incq    %r12
    subq    %rdx, %rax
    sarq    $3, %rax
    imulq   %r13, %rax
    cmpq    %r12, %rax
    ja  .L4
    addq    $24, %rsp
    .cfi_remember_state
    .cfi_def_cfa_offset 56
    popq    %rbx
    .cfi_def_cfa_offset 48
    popq    %rbp
    .cfi_def_cfa_offset 40
    popq    %r12
    .cfi_def_cfa_offset 32
    popq    %r13
    .cfi_def_cfa_offset 24
    popq    %r14
    .cfi_def_cfa_offset 16
    popq    %r15
    .cfi_def_cfa_offset 8
    ret
    .p2align 4,,10
    .p2align 3
.L9:
    .cfi_restore_state
    movq    %r14, %rdi
    call    _ZNKSt5ctypeIcE13_M_widen_initEv
    movq    (%r14), %rax
    movl    $10, %esi
    movq    48(%rax), %rax
    cmpq    $_ZNKSt5ctypeIcE8do_widenEc, %rax
    je  .L10
    movq    %r14, %rdi
    call    *%rax
    movsbl  %al, %esi
    jmp .L10
.L23:
    .cfi_def_cfa_offset 8
    .cfi_restore 3
    .cfi_restore 6
    .cfi_restore 12
    .cfi_restore 13
    .cfi_restore 14
    .cfi_restore 15
    ret
.L26:
    .cfi_def_cfa_offset 80
    .cfi_offset 3, -56
    .cfi_offset 6, -48
    .cfi_offset 12, -40
    .cfi_offset 13, -32
    .cfi_offset 14, -24
    .cfi_offset 15, -16
    call    _ZSt16__throw_bad_castv
    .cfi_endproc
.LFE2359:
    .size   _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
    .section    .rodata.str1.8,"aMS",@progbits,1
    .align 8
.LC2:
    .string "cannot create std::vector larger than max_size()"
    .section    .text.unlikely,"ax",@progbits
.LCOLDB6:
    .text
.LHOTB6:
    .p2align 4
    .globl  _Z14init_dp_matrixmmRK5Model
    .type   _Z14init_dp_matrixmmRK5Model, @function
_Z14init_dp_matrixmmRK5Model:
.LFB2360:
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    .cfi_lsda 0x3,.LLSDA2360
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movabsq $1152921504606846975, %rax
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    pushq   %r15
    pushq   %r14
    pushq   %r13
    pushq   %r12
    pushq   %rbx
    andq    $-32, %rsp
    subq    $64, %rsp
    .cfi_offset 15, -24
    .cfi_offset 14, -32
    .cfi_offset 13, -40
    .cfi_offset 12, -48
    .cfi_offset 3, -56
    movq    %rdi, 24(%rsp)
    movq    %rsi, 40(%rsp)
    movq    %rcx, 16(%rsp)
    cmpq    %rax, %rdx
    ja  .L103
    movq    %rdx, %r15
    testq   %rdx, %rdx
    je  .L71
    leaq    0(,%rdx,8), %rbx
    movq    %rbx, %rdi
.LEHB0:
    call    _Znwm
.LEHE0:
    movq    %rax, %r13
    leaq    -1(%r15), %rax
    cmpq    $3, %rax
    movq    %r15, %rdx
    movq    %r13, %rax
    jbe .L30
    shrq    $2, %rdx
    salq    $5, %rdx
    addq    %r13, %rdx
    vpxor   %xmm0, %xmm0, %xmm0
    .p2align 4,,10
    .p2align 3
.L32:
    vmovdqu32   %ymm0, (%rax)
    addq    $32, %rax
    cmpq    %rdx, %rax
    jne .L32
    movq    %r15, %rcx
    andq    $-4, %rcx
    movq    %r15, %rdx
    andl    $3, %edx
    leaq    0(%r13,%rcx,8), %rax
    cmpq    %rcx, %r15
    je  .L33
.L30:
    movq    $0, (%rax)
    cmpq    $1, %rdx
    je  .L33
    movq    $0, 8(%rax)
    cmpq    $2, %rdx
    je  .L33
    movq    $0, 16(%rax)
    cmpq    $3, %rdx
    je  .L33
    movq    $0, 24(%rax)
.L33:
    leaq    0(%r13,%rbx), %rax
    movq    %rax, 56(%rsp)
.L29:
    movabsq $384307168202282325, %rax
    cmpq    %rax, 40(%rsp)
    ja  .L104
    movq    40(%rsp), %rax
    movq    24(%rsp), %r12
    leaq    (%rax,%rax,2), %rbx
    movq    $0, (%r12)
    movq    $0, 8(%r12)
    movq    $0, 16(%r12)
    salq    $3, %rbx
    testq   %rax, %rax
    je  .L35
    movq    %rbx, %rdi
    vzeroupper
.LEHB1:
    call    _Znwm
.LEHE1:
    addq    %rax, %rbx
    movq    %rax, (%r12)
    movq    %rax, 8(%r12)
    movq    %rbx, 16(%r12)
    movq    56(%rsp), %r12
    movq    %rax, %r14
    subq    %r13, %r12
    movq    %r12, %rax
    sarq    $3, %rax
    je  .L40
    movabsq $1152921504606846975, %rdx
    cmpq    %rdx, %rax
    ja  .L41
    movq    40(%rsp), %rax
    movq    %r14, %rbx
    movq    %rax, 48(%rsp)
    .p2align 4,,10
    .p2align 3
.L46:
    movq    $0, (%rbx)
    movq    $0, 8(%rbx)
    movq    $0, 16(%rbx)
    movq    %r12, %rdi
.LEHB2:
    call    _Znwm
.LEHE2:
    leaq    (%rax,%r12), %rcx
    movq    %rax, (%rbx)
    movq    %rcx, 16(%rbx)
    movq    %rax, %rdi
    cmpq    %r13, 56(%rsp)
    je  .L42
    movq    %r12, %rdx
    movq    %r13, %rsi
    movq    %rcx, 32(%rsp)
    call    memcpy
    movq    32(%rsp), %rcx
    addq    $24, %rbx
    movq    %rcx, -16(%rbx)
    decq    48(%rsp)
    jne .L46
    movq    24(%rsp), %rax
    movq    %rbx, 8(%rax)
.L47:
    movq    %r13, %rdi
    call    _ZdlPv
.L48:
    movq    16(%rsp), %rax
    cmpq    $1, 40(%rsp)
    movl    (%rax), %edx
    jbe .L62
    movl    4(%rax), %edi
    movq    24(%rsp), %rax
    movq    (%rax), %rsi
    movq    40(%rsp), %rax
    leaq    -2(%rax), %rcx
    cmpq    $7, %rcx
    jbe .L73
    movq    %rcx, %r8
    shrq    $3, %r8
    leaq    (%r8,%r8,2), %r8
    salq    $6, %r8
    vmovdqa64   .LC1(%rip), %ymm3
    vmovdqa64   .LC3(%rip), %ymm4
    vmovdqa64   .LC4(%rip), %ymm6
    vmovdqa64   .LC5(%rip), %ymm5
    vpbroadcastd    %edi, %ymm10
    vpbroadcastd    %edx, %ymm9
    leaq    24(%rsi), %rax
    leaq    24(%rsi,%r8), %r8
    vpcmpeqd    %ymm8, %ymm8, %ymm8
    kxnorb  %k1, %k1, %k1
    .p2align 4,,10
    .p2align 3
.L61:
    vmovdqa64   %ymm3, %ymm0
    vpaddd  %ymm8, %ymm0, %ymm0
    vpmulld %ymm10, %ymm0, %ymm0
    vmovdqu64   (%rax), %ymm2
    vmovdqu64   96(%rax), %ymm1
    vpermt2q    32(%rax), %ymm6, %ymm2
    vpermt2q    128(%rax), %ymm6, %ymm1
    vpermt2q    64(%rax), %ymm5, %ymm2
    vpaddd  %ymm9, %ymm0, %ymm0
    vpermt2q    160(%rax), %ymm5, %ymm1
    kmovb   %k1, %k2
    addq    $192, %rax
    vpscatterqd %xmm0, 4(,%ymm2,1){%k2}
    vperm2i128  $17, %ymm0, %ymm0, %ymm0
    kmovb   %k1, %k3
    vpaddd  %ymm4, %ymm3, %ymm3
    vpscatterqd %xmm0, 4(,%ymm1,1){%k3}
    cmpq    %r8, %rax
    jne .L61
    andq    $-8, %rcx
    leaq    1(%rcx), %r8
    leal    1(%rcx), %eax
.L59:
    leaq    (%r8,%r8,2), %rcx
    movq    (%rsi,%rcx,8), %r8
    leal    -1(%rax), %ecx
    imull   %edi, %ecx
    movq    40(%rsp), %rbx
    addl    %edx, %ecx
    movl    %ecx, 4(%r8)
    leal    1(%rax), %ecx
    movslq  %ecx, %r8
    cmpq    %r8, %rbx
    jbe .L62
    leaq    (%r8,%r8,2), %r8
    movq    (%rsi,%r8,8), %r9
    movl    %edi, %r8d
    imull   %eax, %r8d
    addl    %edx, %r8d
    movl    %r8d, 4(%r9)
    leal    2(%rax), %r8d
    movslq  %r8d, %r9
    cmpq    %r9, %rbx
    jbe .L62
    imull   %edi, %ecx
    leaq    (%r9,%r9,2), %r9
    movq    (%rsi,%r9,8), %r9
    addl    %edx, %ecx
    movl    %ecx, 4(%r9)
    leal    3(%rax), %ecx
    movslq  %ecx, %r9
    cmpq    %r9, %rbx
    jbe .L62
    imull   %edi, %r8d
    leaq    (%r9,%r9,2), %r9
    movq    (%rsi,%r9,8), %r9
    addl    %edx, %r8d
    movl    %r8d, 4(%r9)
    leal    4(%rax), %r8d
    movslq  %r8d, %r9
    cmpq    %r9, %rbx
    jbe .L62
    imull   %edi, %ecx
    leaq    (%r9,%r9,2), %r9
    movq    (%rsi,%r9,8), %r9
    addl    %edx, %ecx
    movl    %ecx, 4(%r9)
    leal    5(%rax), %ecx
    movslq  %ecx, %r9
    cmpq    %r9, %rbx
    jbe .L62
    imull   %edi, %r8d
    leaq    (%r9,%r9,2), %r9
    movq    (%rsi,%r9,8), %r9
    addl    %edx, %r8d
    movl    %r8d, 4(%r9)
    leal    6(%rax), %r8d
    movslq  %r8d, %r9
    cmpq    %r9, %rbx
    jbe .L62
    imull   %edi, %ecx
    leaq    (%r9,%r9,2), %r9
    movq    (%rsi,%r9,8), %r9
    addl    $7, %eax
    addl    %edx, %ecx
    cltq
    movl    %ecx, 4(%r9)
    cmpq    %rax, %rbx
    jbe .L62
    imull   %r8d, %edi
    leaq    (%rax,%rax,2), %rax
    movq    (%rsi,%rax,8), %rax
    leal    (%rdi,%rdx), %r8d
    movl    %r8d, 4(%rax)
.L62:
    cmpq    $1, %r15
    jbe .L27
    movq    16(%rsp), %rax
    leaq    -1(%r15), %r8
    movl    4(%rax), %edi
    movq    24(%rsp), %rax
    movq    (%rax), %rax
    movq    (%rax), %rsi
    leaq    -2(%r15), %rax
    cmpq    $6, %rax
    jbe .L74
    movq    %r8, %rcx
    shrq    $3, %rcx
    salq    $6, %rcx
    vmovdqa64   .LC1(%rip), %ymm2
    vmovdqa64   .LC3(%rip), %ymm4
    vpbroadcastd    %edi, %ymm6
    vpbroadcastd    %edx, %ymm5
    movq    %rsi, %rax
    addq    %rsi, %rcx
    vpcmpeqd    %ymm3, %ymm3, %ymm3
    .p2align 4,,10
    .p2align 3
.L66:
    vmovdqa64   %ymm2, %ymm0
    vpaddd  %ymm3, %ymm0, %ymm0
    vpmulld %ymm6, %ymm0, %ymm0
    addq    $64, %rax
    vpaddd  %ymm4, %ymm2, %ymm2
    vpaddd  %ymm5, %ymm0, %ymm0
    vmovd   %xmm0, -56(%rax)
    vpextrd $1, %xmm0, -48(%rax)
    vpextrd $2, %xmm0, -40(%rax)
    vpextrd $3, %xmm0, -32(%rax)
    vextracti128    $0x1, %ymm0, %xmm0
    vmovd   %xmm0, -24(%rax)
    vpextrd $1, %xmm0, -16(%rax)
    vpextrd $2, %xmm0, -8(%rax)
    vpextrd $3, %xmm0, (%rax)
    cmpq    %rcx, %rax
    jne .L66
    movq    %r8, %rcx
    andq    $-8, %rcx
    leaq    1(%rcx), %r9
    leal    1(%rcx), %eax
    cmpq    %r8, %rcx
    je  .L27
.L64:
    leal    -1(%rax), %ecx
    imull   %edi, %ecx
    addl    %edx, %ecx
    movl    %ecx, (%rsi,%r9,8)
    leal    1(%rax), %ecx
    movslq  %ecx, %r9
    cmpq    %r15, %r9
    jnb .L27
    movl    %edi, %r8d
    imull   %eax, %r8d
    addl    %edx, %r8d
    movl    %r8d, (%rsi,%r9,8)
    leal    2(%rax), %r8d
    movslq  %r8d, %r9
    cmpq    %r9, %r15
    jbe .L27
    imull   %edi, %ecx
    addl    %edx, %ecx
    movl    %ecx, (%rsi,%r9,8)
    leal    3(%rax), %ecx
    movslq  %ecx, %r9
    cmpq    %r15, %r9
    jnb .L27
    imull   %edi, %r8d
    addl    %edx, %r8d
    movl    %r8d, (%rsi,%r9,8)
    leal    4(%rax), %r8d
    movslq  %r8d, %r9
    cmpq    %r9, %r15
    jbe .L27
    imull   %edi, %ecx
    addl    %edx, %ecx
    movl    %ecx, (%rsi,%r9,8)
    leal    5(%rax), %ecx
    movslq  %ecx, %r9
    cmpq    %r9, %r15
    jbe .L27
    imull   %edi, %r8d
    addl    $6, %eax
    cltq
    addl    %edx, %r8d
    movl    %r8d, (%rsi,%r9,8)
    cmpq    %rax, %r15
    jbe .L27
    imull   %ecx, %edi
    addl    %edi, %edx
    movl    %edx, (%rsi,%rax,8)
.L27:
    movq    24(%rsp), %rax
    vzeroupper
    leaq    -40(%rbp), %rsp
    popq    %rbx
    popq    %r12
    popq    %r13
    popq    %r14
    popq    %r15
    popq    %rbp
    .cfi_remember_state
    .cfi_def_cfa 7, 8
    ret
    .p2align 4,,10
    .p2align 3
.L37:
    .cfi_restore_state
    movq    %r12, 8(%r14)
    addq    $24, %r14
    cmpq    %r14, %rbx
    je  .L45
.L40:
    movq    $0, (%r14)
    movq    %r12, 16(%r14)
    cmpq    %r13, 56(%rsp)
    je  .L37
    movq    %r12, %rdx
    movq    %r13, %rsi
    xorl    %edi, %edi
    call    memcpy
    addq    $24, %r14
    movq    %r12, -16(%r14)
    cmpq    %r14, %rbx
    jne .L40
.L45:
    movq    24(%rsp), %rax
    movq    %rbx, 8(%rax)
    testq   %r13, %r13
    je  .L48
.L105:
    movq    %r13, %rdi
    call    _ZdlPv
    jmp .L48
    .p2align 4,,10
    .p2align 3
.L42:
    movq    %rcx, 8(%rbx)
    addq    $24, %rbx
    decq    48(%rsp)
    jne .L46
    movq    24(%rsp), %rax
    movq    %rbx, 8(%rax)
    testq   %r13, %r13
    je  .L48
    jmp .L105
    .p2align 4,,10
    .p2align 3
.L71:
    movq    $0, 56(%rsp)
    xorl    %r13d, %r13d
    jmp .L29
    .p2align 4,,10
    .p2align 3
.L35:
    testq   %r13, %r13
    je  .L106
    vzeroupper
    jmp .L47
.L73:
    movl    $1, %eax
    movl    $1, %r8d
    jmp .L59
.L74:
    movl    $1, %eax
    movl    $1, %r9d
    jmp .L64
.L106:
    movq    16(%rsp), %rax
    movl    (%rax), %edx
    jmp .L62
.L41:
    movq    $0, (%r14)
    movq    $0, 8(%r14)
    movq    $0, 16(%r14)
.LEHB3:
    call    _ZSt17__throw_bad_allocv
.LEHE3:
.L104:
    movl    $.LC2, %edi
    vzeroupper
.LEHB4:
    call    _ZSt20__throw_length_errorPKc
.LEHE4:
.L103:
    movl    $.LC2, %edi
.LEHB5:
    call    _ZSt20__throw_length_errorPKc
.LEHE5:
.L78:
    movq    %rax, %rdi
    jmp .L49
.L77:
    movq    %rax, %rdi
    jmp .L50
.L75:
    movq    %rax, %r12
    vzeroupper
    jmp .L56
    .globl  __gxx_personality_v0
    .section    .gcc_except_table,"a",@progbits
    .align 4
.LLSDA2360:
    .byte   0xff
    .byte   0x3
    .uleb128 .LLSDATT2360-.LLSDATTD2360
.LLSDATTD2360:
    .byte   0x1
    .uleb128 .LLSDACSE2360-.LLSDACSB2360
.LLSDACSB2360:
    .uleb128 .LEHB0-.LFB2360
    .uleb128 .LEHE0-.LEHB0
    .uleb128 0
    .uleb128 0
    .uleb128 .LEHB1-.LFB2360
    .uleb128 .LEHE1-.LEHB1
    .uleb128 .L75-.LFB2360
    .uleb128 0
    .uleb128 .LEHB2-.LFB2360
    .uleb128 .LEHE2-.LEHB2
    .uleb128 .L77-.LFB2360
    .uleb128 0x1
    .uleb128 .LEHB3-.LFB2360
    .uleb128 .LEHE3-.LEHB3
    .uleb128 .L78-.LFB2360
    .uleb128 0x1
    .uleb128 .LEHB4-.LFB2360
    .uleb128 .LEHE4-.LEHB4
    .uleb128 .L75-.LFB2360
    .uleb128 0
    .uleb128 .LEHB5-.LFB2360
    .uleb128 .LEHE5-.LEHB5
    .uleb128 0
    .uleb128 0
.LLSDACSE2360:
    .byte   0x1
    .byte   0
    .align 4
    .long   0

.LLSDATT2360:
    .text
    .cfi_endproc
    .section    .text.unlikely
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    .cfi_lsda 0x3,.LLSDAC2360
    .type   _Z14init_dp_matrixmmRK5Model.cold, @function
_Z14init_dp_matrixmmRK5Model.cold:
.LFSB2360:
.L49:
    .cfi_def_cfa 6, 16
    .cfi_offset 3, -56
    .cfi_offset 6, -16
    .cfi_offset 12, -48
    .cfi_offset 13, -40
    .cfi_offset 14, -32
    .cfi_offset 15, -24
    movq    %r14, %rbx
.L50:
    vzeroupper
    call    __cxa_begin_catch
.L53:
    cmpq    %rbx, %r14
    jne .L107
.LEHB6:
    call    __cxa_rethrow
.LEHE6:
.L76:
    movq    %rax, %r12
    vzeroupper
    call    __cxa_end_catch
    movq    24(%rsp), %rax
    movq    (%rax), %rdi
    testq   %rdi, %rdi
    je  .L56
    call    _ZdlPv
.L56:
    testq   %r13, %r13
    je  .L69
    movq    %r13, %rdi
    call    _ZdlPv
.L69:
    movq    %r12, %rdi
.LEHB7:
    call    _Unwind_Resume
.LEHE7:
.L107:
    movq    (%r14), %rdi
    testq   %rdi, %rdi
    je  .L52
    call    _ZdlPv
.L52:
    addq    $24, %r14
    jmp .L53
    .cfi_endproc
.LFE2360:
    .section    .gcc_except_table
    .align 4
.LLSDAC2360:
    .byte   0xff
    .byte   0x3
    .uleb128 .LLSDATTC2360-.LLSDATTDC2360
.LLSDATTDC2360:
    .byte   0x1
    .uleb128 .LLSDACSEC2360-.LLSDACSBC2360
.LLSDACSBC2360:
    .uleb128 .LEHB6-.LCOLDB6
    .uleb128 .LEHE6-.LEHB6
    .uleb128 .L76-.LCOLDB6
    .uleb128 0
    .uleb128 .LEHB7-.LCOLDB6
    .uleb128 .LEHE7-.LEHB7
    .uleb128 0
    .uleb128 0
.LLSDACSEC2360:
    .byte   0x1
    .byte   0
    .align 4
    .long   0

.LLSDATTC2360:
    .section    .text.unlikely
    .text
    .size   _Z14init_dp_matrixmmRK5Model, .-_Z14init_dp_matrixmmRK5Model
    .section    .text.unlikely
    .size   _Z14init_dp_matrixmmRK5Model.cold, .-_Z14init_dp_matrixmmRK5Model.cold
.LCOLDE6:
    .text
.LHOTE6:
    .section    .text._ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev,"axG",@progbits,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED5Ev,comdat
    .align 2
    .p2align 4
    .weak   _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
    .type   _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, @function
_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev:
.LFB2637:
    .cfi_startproc
    pushq   %r12
    .cfi_def_cfa_offset 16
    .cfi_offset 12, -16
    movq    %rdi, %r12
    pushq   %rbp
    .cfi_def_cfa_offset 24
    .cfi_offset 6, -24
    pushq   %rbx
    .cfi_def_cfa_offset 32
    .cfi_offset 3, -32
    movq    8(%rdi), %rbx
    movq    (%rdi), %rbp
    cmpq    %rbp, %rbx
    je  .L109
    .p2align 4,,10
    .p2align 3
.L113:
    movq    0(%rbp), %rdi
    testq   %rdi, %rdi
    je  .L110
    addq    $24, %rbp
    call    _ZdlPv
    cmpq    %rbp, %rbx
    jne .L113
.L111:
    movq    (%r12), %rbp
.L109:
    testq   %rbp, %rbp
    je  .L115
    popq    %rbx
    .cfi_remember_state
    .cfi_def_cfa_offset 24
    movq    %rbp, %rdi
    popq    %rbp
    .cfi_def_cfa_offset 16
    popq    %r12
    .cfi_def_cfa_offset 8
    jmp _ZdlPv
    .p2align 4,,10
    .p2align 3
.L110:
    .cfi_restore_state
    addq    $24, %rbp
    cmpq    %rbp, %rbx
    jne .L113
    jmp .L111
    .p2align 4,,10
    .p2align 3
.L115:
    popq    %rbx
    .cfi_def_cfa_offset 24
    popq    %rbp
    .cfi_def_cfa_offset 16
    popq    %r12
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE2637:
    .size   _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, .-_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
    .weak   _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
    .set    _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
    .section    .text.unlikely
.LCOLDB7:
    .section    .text.startup,"ax",@progbits
.LHOTB7:
    .p2align 4
    .globl  main
    .type   main, @function
main:
.LFB2371:
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    .cfi_lsda 0x3,.LLSDA2371
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movl    $2, %edx
    movl    $10, %esi
    subq    $48, %rsp
    .cfi_def_cfa_offset 64
    leaq    16(%rsp), %rdi
    leaq    8(%rsp), %rcx
    movq    $-8, 8(%rsp)
.LEHB8:
    call    _Z14init_dp_matrixmmRK5Model
.LEHE8:
    leaq    16(%rsp), %rdi
.LEHB9:
    call    _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.LEHE9:
    leaq    16(%rsp), %rdi
    call    _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
    addq    $48, %rsp
    .cfi_remember_state
    .cfi_def_cfa_offset 16
    xorl    %eax, %eax
    popq    %rbp
    .cfi_def_cfa_offset 8
    ret
.L119:
    .cfi_restore_state
    movq    %rax, %rbp
    jmp .L118
    .section    .gcc_except_table
.LLSDA2371:
    .byte   0xff
    .byte   0xff
    .byte   0x1
    .uleb128 .LLSDACSE2371-.LLSDACSB2371
.LLSDACSB2371:
    .uleb128 .LEHB8-.LFB2371
    .uleb128 .LEHE8-.LEHB8
    .uleb128 0
    .uleb128 0
    .uleb128 .LEHB9-.LFB2371
    .uleb128 .LEHE9-.LEHB9
    .uleb128 .L119-.LFB2371
    .uleb128 0
.LLSDACSE2371:
    .section    .text.startup
    .cfi_endproc
    .section    .text.unlikely
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    .cfi_lsda 0x3,.LLSDAC2371
    .type   main.cold, @function
main.cold:
.LFSB2371:
.L118:
    .cfi_def_cfa_offset 64
    .cfi_offset 6, -16
    leaq    16(%rsp), %rdi
    vzeroupper
    call    _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
    movq    %rbp, %rdi
.LEHB10:
    call    _Unwind_Resume
.LEHE10:
    .cfi_endproc
.LFE2371:
    .section    .gcc_except_table
.LLSDAC2371:
    .byte   0xff
    .byte   0xff
    .byte   0x1
    .uleb128 .LLSDACSEC2371-.LLSDACSBC2371
.LLSDACSBC2371:
    .uleb128 .LEHB10-.LCOLDB7
    .uleb128 .LEHE10-.LEHB10
    .uleb128 0
    .uleb128 0
.LLSDACSEC2371:
    .section    .text.unlikely
    .section    .text.startup
    .size   main, .-main
    .section    .text.unlikely
    .size   main.cold, .-main.cold
.LCOLDE7:
    .section    .text.startup
.LHOTE7:
    .p2align 4
    .type   _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, @function
_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB3017:
    .cfi_startproc
    subq    $8, %rsp
    .cfi_def_cfa_offset 16
    movl    $_ZStL8__ioinit, %edi
    call    _ZNSt8ios_base4InitC1Ev
    movl    $__dso_handle, %edx
    movl    $_ZStL8__ioinit, %esi
    movl    $_ZNSt8ios_base4InitD1Ev, %edi
    addq    $8, %rsp
    .cfi_def_cfa_offset 8
    jmp __cxa_atexit
    .cfi_endproc
.LFE3017:
    .size   _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
    .section    .init_array,"aw"
    .align 8
    .quad   _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
    .local  _ZStL8__ioinit
    .comm   _ZStL8__ioinit,1,1
    .section    .rodata.cst32,"aM",@progbits,32
    .align 32
.LC1:
    .long   1
    .long   2
    .long   3
    .long   4
    .long   5
    .long   6
    .long   7
    .long   8
    .align 32
.LC3:
    .long   8
    .long   8
    .long   8
    .long   8
    .long   8
    .long   8
    .long   8
    .long   8
    .align 32
.LC4:
    .quad   0
    .quad   3
    .quad   6
    .quad   0
    .align 32
.LC5:
    .quad   0
    .quad   1
    .quad   2
    .quad   5
    .hidden __dso_handle
    .ident  "GCC: (Homebrew GCC 9.2.0) 9.2.0"
    .section    .note.GNU-stack,"",@progbits

The assembly for the non -march=native version is available on godbolt.

What is going wrong? Is this a compiler bug, or is my program ill-formed? How can I mitigate this issue if it is a compiler bug?

Additional info

Compiling with -v:

$ ~/tools/octopus/build/brew/bin/g++-9 -O3 -march=native -S bug.cpp -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0) 
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-S' '-v' '-shared-libgcc'
 /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /home/dcooke/tools/octopus/build/brew/nonexistent -idirafter /home/dcooke/tools/octopus/build/brew/include -idirafter /usr/include/x86_64-linux-gnu -idirafter /usr/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mpku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=33792 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o bug.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
        compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP

Compiling with -O2 or less makes the problem go away:

$ g++-9 -O2 -march=native -o bug bug.cpp
$ ./bug 
{0 0} {-8 0} 
{0 -8} {0 0} 
{0 -9} {0 0} 
{0 -10} {0 0} 
{0 -11} {0 0} 
{0 -12} {0 0} 
{0 -13} {0 0} 
{0 -14} {0 0} 
{0 -15} {0 0} 
{0 -16} {0 0}

I tried building on a different machine with Intel chips:

$ rpm -q centos-release
centos-release-7-3.1611.el7.centos.x86_64

$ grep model /proc/cpuinfo | head -2
model       : 85
model name  : Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz

$ g++-9 -O3 -march=native -o bug bug.cpp -v
Reading specs from /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/well/gerton/dan/apps/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0) 
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-o' 'bug' '-v' '-shared-libgcc'
 /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /gpfs1/well/gerton/dan/apps/octopus/build/brew/nonexistent -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/include -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/opt/glibc/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mno-pku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=28160 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o /tmp/cczPrvHP.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
    compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP

$ ./bug 
{0 0} {-8 0} 
{0 -8} {0 0} 
{0 -9} {0 0} 
{0 -10} {0 0} 
{0 -11} {0 0} 
{0 -12} {0 0} 
{0 -13} {0 0} 
{0 -14} {0 0} 
{0 -15} {0 0} 
{0 -16} {0 0} 

The correct output...

-ftree-loop-vectorize is the culprit:

$ g++-9 -march=native -O2 -o bug bug.cpp -ftree-loop-vectorize
$ ./bug
{0 0} {-8 0} 
{-2048 255} {0 0} 
{-2304 255} {0 0} 
{-2560 255} {0 0} 
{-2816 255} {0 0} 
{-3072 255} {0 0} 
{-3328 255} {0 0} 
{-3584 255} {0 0} 
{-3840 255} {0 0} 
{0 -16} {0 0}

None of the other O3 flags result in this behaviour.

like image 348
Daniel Avatar asked Sep 20 '19 10:09

Daniel


1 Answer

This turned out to be due to a bug in the binutils assembler (gas). The solution was to upgrade my binutils to 2.32.

like image 111
Daniel Avatar answered Oct 01 '22 22:10

Daniel