I'm compiling the program:
#include <iostream>
#include <vector>
#include <cstddef>
#include <algorithm>
// Scoring parameters consumed by init_dp_matrix().
// NOTE(review): presumably affine gap-open / gap-extend penalties - confirm.
struct Model
{
    int open;    // value written to the first seeded border cell
    int extend;  // amount added for each further border cell
};
// One entry of the DP matrix: a pair of integer values.
struct Cell
{
    int a;  // seeded by init_dp_matrix() along result[0][j], j >= 1
    int b;  // seeded by init_dp_matrix() along result[i][0], i >= 1
};
// Two-dimensional table of Cells stored as a vector of vectors.
using DPMatrix = std::vector<std::vector<Cell>>;
void print(const DPMatrix& matrix)
{
for (std::size_t i = 0; i < matrix.size(); ++i) {
for (std::size_t j = 0; j < matrix[i].size(); ++j) {
std::cout << '{' << matrix[i][j].a << ' ' << matrix[i][j].b << "} ";
}
std::cout << std::endl;
}
}
// Builds a num_cols x num_rows matrix of zero-initialised Cells and seeds
// its two borders from `model`:
//   result[i][0].b = model.open + (i - 1) * model.extend   for 1 <= i < num_cols
//   result[0][j].a = model.open + (j - 1) * model.extend   for 1 <= j < num_rows
// result[0][0] and all interior cells stay {0, 0}.
//
// Note that `num_cols` is the size of the OUTER vector and `num_rows` the
// size of each inner vector.
//
// If either dimension is zero there is no border to seed, and the matrix is
// returned as constructed (empty, or num_cols empty inner vectors).
DPMatrix init_dp_matrix(const std::size_t num_cols, const std::size_t num_rows, const Model& model)
{
    DPMatrix result(num_cols, DPMatrix::value_type(num_rows, Cell()));

    // Guard: result[i][0] / result[0][j] below would index past the end of an
    // empty vector, which is undefined behaviour for operator[].
    if (num_cols == 0 || num_rows == 0) {
        return result;
    }

    // Indices share the type of the bounds (no signed/unsigned comparison);
    // the step count is narrowed explicitly so the arithmetic stays in int.
    for (std::size_t i = 1; i < num_cols; ++i) {
        result[i][0].b = model.open + static_cast<int>(i - 1) * model.extend;
    }
    for (std::size_t j = 1; j < num_rows; ++j) {
        result[0][j].a = model.open + static_cast<int>(j - 1) * model.extend;
    }
    return result;
}
int main()
{
const Model model = {-8, -1};
const DPMatrix matrix = init_dp_matrix(10, 2, model);
print(matrix);
}
With GCC 9.2.0:
$ g++-9 -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)
with -march=native
:
$ g++-9 -O3 -march=native -o bug bug.cpp
On an Ubuntu machine with Intel chips:
$ lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 18.04.3 LTS
Release: 18.04
Codename: bionic
$ grep model /proc/cpuinfo | head -2
model : 85
model name : Intel(R) Xeon(R) Platinum 8175M CPU @ 2.50GHz
Running the program I get bogus output:
$ ./bug
{0 0} {-8 0}
{-2048 255} {0 0}
{-2304 255} {0 0}
{-2560 255} {0 0}
{-2816 255} {0 0}
{-3072 255} {0 0}
{-3328 255} {0 0}
{-3584 255} {0 0}
{-3840 255} {0 0}
{0 -16} {0 0}
If I compile without -march=native
I get the correct output:
$ g++-9 -O3 -o bug bug.cpp
$ ./bug
{0 0} {-8 0}
{0 -8} {0 0}
{0 -9} {0 0}
{0 -10} {0 0}
{0 -11} {0 0}
{0 -12} {0 0}
{0 -13} {0 0}
{0 -14} {0 0}
{0 -15} {0 0}
{0 -16} {0 0}
The assembly for the -march=native
version is:
$ g++-9 -O3 -march=native -S bug.cpp
$ cat bug.s
.file "bug.cpp"
.text
.section .text._ZNKSt5ctypeIcE8do_widenEc,"axG",@progbits,_ZNKSt5ctypeIcE8do_widenEc,comdat
.align 2
.p2align 4
.weak _ZNKSt5ctypeIcE8do_widenEc
.type _ZNKSt5ctypeIcE8do_widenEc, @function
_ZNKSt5ctypeIcE8do_widenEc:
.LFB1303:
.cfi_startproc
movl %esi, %eax
ret
.cfi_endproc
.LFE1303:
.size _ZNKSt5ctypeIcE8do_widenEc, .-_ZNKSt5ctypeIcE8do_widenEc
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "} "
.text
.p2align 4
.globl _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.type _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, @function
_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB2359:
.cfi_startproc
movq (%rdi), %rdx
cmpq %rdx, 8(%rdi)
je .L23
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
movabsq $-6148914691236517205, %r13
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
xorl %r12d, %r12d
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
movq %rdi, %rbp
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
subq $24, %rsp
.cfi_def_cfa_offset 80
.p2align 4,,10
.p2align 3
.L4:
leaq (%r12,%r12,2), %rbx
salq $3, %rbx
addq %rbx, %rdx
movq 8(%rdx), %rax
xorl %r14d, %r14d
cmpq %rax, (%rdx)
je .L8
.p2align 4,,10
.p2align 3
.L5:
movl $1, %edx
leaq 15(%rsp), %rsi
movl $_ZSt4cout, %edi
movb $123, 15(%rsp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq %rax, %rdi
movq 0(%rbp), %rax
leaq 0(,%r14,8), %r15
movq (%rax,%rbx), %rax
movl (%rax,%r14,8), %esi
incq %r14
call _ZNSolsEi
movq %rax, %rdi
movl $1, %edx
leaq 15(%rsp), %rsi
movb $32, 15(%rsp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq %rax, %rdi
movq 0(%rbp), %rax
movq (%rax,%rbx), %rax
movl 4(%rax,%r15), %esi
call _ZNSolsEi
movq %rax, %rdi
movl $2, %edx
movl $.LC0, %esi
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq 0(%rbp), %rdx
addq %rbx, %rdx
movq 8(%rdx), %rax
subq (%rdx), %rax
sarq $3, %rax
cmpq %rax, %r14
jb .L5
.L8:
movq _ZSt4cout(%rip), %rax
movq -24(%rax), %rax
movq _ZSt4cout+240(%rax), %r14
testq %r14, %r14
je .L26
cmpb $0, 56(%r14)
je .L9
movsbl 67(%r14), %esi
.L10:
movl $_ZSt4cout, %edi
call _ZNSo3putEc
movq %rax, %rdi
call _ZNSo5flushEv
movq 0(%rbp), %rdx
movq 8(%rbp), %rax
incq %r12
subq %rdx, %rax
sarq $3, %rax
imulq %r13, %rax
cmpq %r12, %rax
ja .L4
addq $24, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
.L9:
.cfi_restore_state
movq %r14, %rdi
call _ZNKSt5ctypeIcE13_M_widen_initEv
movq (%r14), %rax
movl $10, %esi
movq 48(%rax), %rax
cmpq $_ZNKSt5ctypeIcE8do_widenEc, %rax
je .L10
movq %r14, %rdi
call *%rax
movsbl %al, %esi
jmp .L10
.L23:
.cfi_def_cfa_offset 8
.cfi_restore 3
.cfi_restore 6
.cfi_restore 12
.cfi_restore 13
.cfi_restore 14
.cfi_restore 15
ret
.L26:
.cfi_def_cfa_offset 80
.cfi_offset 3, -56
.cfi_offset 6, -48
.cfi_offset 12, -40
.cfi_offset 13, -32
.cfi_offset 14, -24
.cfi_offset 15, -16
call _ZSt16__throw_bad_castv
.cfi_endproc
.LFE2359:
.size _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.section .rodata.str1.8,"aMS",@progbits,1
.align 8
.LC2:
.string "cannot create std::vector larger than max_size()"
.section .text.unlikely,"ax",@progbits
.LCOLDB6:
.text
.LHOTB6:
.p2align 4
.globl _Z14init_dp_matrixmmRK5Model
.type _Z14init_dp_matrixmmRK5Model, @function
_Z14init_dp_matrixmmRK5Model:
.LFB2360:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA2360
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movabsq $1152921504606846975, %rax
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 15, -24
.cfi_offset 14, -32
.cfi_offset 13, -40
.cfi_offset 12, -48
.cfi_offset 3, -56
movq %rdi, 24(%rsp)
movq %rsi, 40(%rsp)
movq %rcx, 16(%rsp)
cmpq %rax, %rdx
ja .L103
movq %rdx, %r15
testq %rdx, %rdx
je .L71
leaq 0(,%rdx,8), %rbx
movq %rbx, %rdi
.LEHB0:
call _Znwm
.LEHE0:
movq %rax, %r13
leaq -1(%r15), %rax
cmpq $3, %rax
movq %r15, %rdx
movq %r13, %rax
jbe .L30
shrq $2, %rdx
salq $5, %rdx
addq %r13, %rdx
vpxor %xmm0, %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L32:
vmovdqu32 %ymm0, (%rax)
addq $32, %rax
cmpq %rdx, %rax
jne .L32
movq %r15, %rcx
andq $-4, %rcx
movq %r15, %rdx
andl $3, %edx
leaq 0(%r13,%rcx,8), %rax
cmpq %rcx, %r15
je .L33
.L30:
movq $0, (%rax)
cmpq $1, %rdx
je .L33
movq $0, 8(%rax)
cmpq $2, %rdx
je .L33
movq $0, 16(%rax)
cmpq $3, %rdx
je .L33
movq $0, 24(%rax)
.L33:
leaq 0(%r13,%rbx), %rax
movq %rax, 56(%rsp)
.L29:
movabsq $384307168202282325, %rax
cmpq %rax, 40(%rsp)
ja .L104
movq 40(%rsp), %rax
movq 24(%rsp), %r12
leaq (%rax,%rax,2), %rbx
movq $0, (%r12)
movq $0, 8(%r12)
movq $0, 16(%r12)
salq $3, %rbx
testq %rax, %rax
je .L35
movq %rbx, %rdi
vzeroupper
.LEHB1:
call _Znwm
.LEHE1:
addq %rax, %rbx
movq %rax, (%r12)
movq %rax, 8(%r12)
movq %rbx, 16(%r12)
movq 56(%rsp), %r12
movq %rax, %r14
subq %r13, %r12
movq %r12, %rax
sarq $3, %rax
je .L40
movabsq $1152921504606846975, %rdx
cmpq %rdx, %rax
ja .L41
movq 40(%rsp), %rax
movq %r14, %rbx
movq %rax, 48(%rsp)
.p2align 4,,10
.p2align 3
.L46:
movq $0, (%rbx)
movq $0, 8(%rbx)
movq $0, 16(%rbx)
movq %r12, %rdi
.LEHB2:
call _Znwm
.LEHE2:
leaq (%rax,%r12), %rcx
movq %rax, (%rbx)
movq %rcx, 16(%rbx)
movq %rax, %rdi
cmpq %r13, 56(%rsp)
je .L42
movq %r12, %rdx
movq %r13, %rsi
movq %rcx, 32(%rsp)
call memcpy
movq 32(%rsp), %rcx
addq $24, %rbx
movq %rcx, -16(%rbx)
decq 48(%rsp)
jne .L46
movq 24(%rsp), %rax
movq %rbx, 8(%rax)
.L47:
movq %r13, %rdi
call _ZdlPv
.L48:
movq 16(%rsp), %rax
cmpq $1, 40(%rsp)
movl (%rax), %edx
jbe .L62
movl 4(%rax), %edi
movq 24(%rsp), %rax
movq (%rax), %rsi
movq 40(%rsp), %rax
leaq -2(%rax), %rcx
cmpq $7, %rcx
jbe .L73
movq %rcx, %r8
shrq $3, %r8
leaq (%r8,%r8,2), %r8
salq $6, %r8
vmovdqa64 .LC1(%rip), %ymm3
vmovdqa64 .LC3(%rip), %ymm4
vmovdqa64 .LC4(%rip), %ymm6
vmovdqa64 .LC5(%rip), %ymm5
vpbroadcastd %edi, %ymm10
vpbroadcastd %edx, %ymm9
leaq 24(%rsi), %rax
leaq 24(%rsi,%r8), %r8
vpcmpeqd %ymm8, %ymm8, %ymm8
kxnorb %k1, %k1, %k1
.p2align 4,,10
.p2align 3
.L61:
vmovdqa64 %ymm3, %ymm0
vpaddd %ymm8, %ymm0, %ymm0
vpmulld %ymm10, %ymm0, %ymm0
vmovdqu64 (%rax), %ymm2
vmovdqu64 96(%rax), %ymm1
vpermt2q 32(%rax), %ymm6, %ymm2
vpermt2q 128(%rax), %ymm6, %ymm1
vpermt2q 64(%rax), %ymm5, %ymm2
vpaddd %ymm9, %ymm0, %ymm0
vpermt2q 160(%rax), %ymm5, %ymm1
kmovb %k1, %k2
addq $192, %rax
vpscatterqd %xmm0, 4(,%ymm2,1){%k2}
vperm2i128 $17, %ymm0, %ymm0, %ymm0
kmovb %k1, %k3
vpaddd %ymm4, %ymm3, %ymm3
vpscatterqd %xmm0, 4(,%ymm1,1){%k3}
cmpq %r8, %rax
jne .L61
andq $-8, %rcx
leaq 1(%rcx), %r8
leal 1(%rcx), %eax
.L59:
leaq (%r8,%r8,2), %rcx
movq (%rsi,%rcx,8), %r8
leal -1(%rax), %ecx
imull %edi, %ecx
movq 40(%rsp), %rbx
addl %edx, %ecx
movl %ecx, 4(%r8)
leal 1(%rax), %ecx
movslq %ecx, %r8
cmpq %r8, %rbx
jbe .L62
leaq (%r8,%r8,2), %r8
movq (%rsi,%r8,8), %r9
movl %edi, %r8d
imull %eax, %r8d
addl %edx, %r8d
movl %r8d, 4(%r9)
leal 2(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %ecx
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %ecx
movl %ecx, 4(%r9)
leal 3(%rax), %ecx
movslq %ecx, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %r8d
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %r8d
movl %r8d, 4(%r9)
leal 4(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %ecx
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %ecx
movl %ecx, 4(%r9)
leal 5(%rax), %ecx
movslq %ecx, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %r8d
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %r8d
movl %r8d, 4(%r9)
leal 6(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %ecx
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl $7, %eax
addl %edx, %ecx
cltq
movl %ecx, 4(%r9)
cmpq %rax, %rbx
jbe .L62
imull %r8d, %edi
leaq (%rax,%rax,2), %rax
movq (%rsi,%rax,8), %rax
leal (%rdi,%rdx), %r8d
movl %r8d, 4(%rax)
.L62:
cmpq $1, %r15
jbe .L27
movq 16(%rsp), %rax
leaq -1(%r15), %r8
movl 4(%rax), %edi
movq 24(%rsp), %rax
movq (%rax), %rax
movq (%rax), %rsi
leaq -2(%r15), %rax
cmpq $6, %rax
jbe .L74
movq %r8, %rcx
shrq $3, %rcx
salq $6, %rcx
vmovdqa64 .LC1(%rip), %ymm2
vmovdqa64 .LC3(%rip), %ymm4
vpbroadcastd %edi, %ymm6
vpbroadcastd %edx, %ymm5
movq %rsi, %rax
addq %rsi, %rcx
vpcmpeqd %ymm3, %ymm3, %ymm3
.p2align 4,,10
.p2align 3
.L66:
vmovdqa64 %ymm2, %ymm0
vpaddd %ymm3, %ymm0, %ymm0
vpmulld %ymm6, %ymm0, %ymm0
addq $64, %rax
vpaddd %ymm4, %ymm2, %ymm2
vpaddd %ymm5, %ymm0, %ymm0
vmovd %xmm0, -56(%rax)
vpextrd $1, %xmm0, -48(%rax)
vpextrd $2, %xmm0, -40(%rax)
vpextrd $3, %xmm0, -32(%rax)
vextracti128 $0x1, %ymm0, %xmm0
vmovd %xmm0, -24(%rax)
vpextrd $1, %xmm0, -16(%rax)
vpextrd $2, %xmm0, -8(%rax)
vpextrd $3, %xmm0, (%rax)
cmpq %rcx, %rax
jne .L66
movq %r8, %rcx
andq $-8, %rcx
leaq 1(%rcx), %r9
leal 1(%rcx), %eax
cmpq %r8, %rcx
je .L27
.L64:
leal -1(%rax), %ecx
imull %edi, %ecx
addl %edx, %ecx
movl %ecx, (%rsi,%r9,8)
leal 1(%rax), %ecx
movslq %ecx, %r9
cmpq %r15, %r9
jnb .L27
movl %edi, %r8d
imull %eax, %r8d
addl %edx, %r8d
movl %r8d, (%rsi,%r9,8)
leal 2(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %r15
jbe .L27
imull %edi, %ecx
addl %edx, %ecx
movl %ecx, (%rsi,%r9,8)
leal 3(%rax), %ecx
movslq %ecx, %r9
cmpq %r15, %r9
jnb .L27
imull %edi, %r8d
addl %edx, %r8d
movl %r8d, (%rsi,%r9,8)
leal 4(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %r15
jbe .L27
imull %edi, %ecx
addl %edx, %ecx
movl %ecx, (%rsi,%r9,8)
leal 5(%rax), %ecx
movslq %ecx, %r9
cmpq %r9, %r15
jbe .L27
imull %edi, %r8d
addl $6, %eax
cltq
addl %edx, %r8d
movl %r8d, (%rsi,%r9,8)
cmpq %rax, %r15
jbe .L27
imull %ecx, %edi
addl %edi, %edx
movl %edx, (%rsi,%rax,8)
.L27:
movq 24(%rsp), %rax
vzeroupper
leaq -40(%rbp), %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.p2align 4,,10
.p2align 3
.L37:
.cfi_restore_state
movq %r12, 8(%r14)
addq $24, %r14
cmpq %r14, %rbx
je .L45
.L40:
movq $0, (%r14)
movq %r12, 16(%r14)
cmpq %r13, 56(%rsp)
je .L37
movq %r12, %rdx
movq %r13, %rsi
xorl %edi, %edi
call memcpy
addq $24, %r14
movq %r12, -16(%r14)
cmpq %r14, %rbx
jne .L40
.L45:
movq 24(%rsp), %rax
movq %rbx, 8(%rax)
testq %r13, %r13
je .L48
.L105:
movq %r13, %rdi
call _ZdlPv
jmp .L48
.p2align 4,,10
.p2align 3
.L42:
movq %rcx, 8(%rbx)
addq $24, %rbx
decq 48(%rsp)
jne .L46
movq 24(%rsp), %rax
movq %rbx, 8(%rax)
testq %r13, %r13
je .L48
jmp .L105
.p2align 4,,10
.p2align 3
.L71:
movq $0, 56(%rsp)
xorl %r13d, %r13d
jmp .L29
.p2align 4,,10
.p2align 3
.L35:
testq %r13, %r13
je .L106
vzeroupper
jmp .L47
.L73:
movl $1, %eax
movl $1, %r8d
jmp .L59
.L74:
movl $1, %eax
movl $1, %r9d
jmp .L64
.L106:
movq 16(%rsp), %rax
movl (%rax), %edx
jmp .L62
.L41:
movq $0, (%r14)
movq $0, 8(%r14)
movq $0, 16(%r14)
.LEHB3:
call _ZSt17__throw_bad_allocv
.LEHE3:
.L104:
movl $.LC2, %edi
vzeroupper
.LEHB4:
call _ZSt20__throw_length_errorPKc
.LEHE4:
.L103:
movl $.LC2, %edi
.LEHB5:
call _ZSt20__throw_length_errorPKc
.LEHE5:
.L78:
movq %rax, %rdi
jmp .L49
.L77:
movq %rax, %rdi
jmp .L50
.L75:
movq %rax, %r12
vzeroupper
jmp .L56
.globl __gxx_personality_v0
.section .gcc_except_table,"a",@progbits
.align 4
.LLSDA2360:
.byte 0xff
.byte 0x3
.uleb128 .LLSDATT2360-.LLSDATTD2360
.LLSDATTD2360:
.byte 0x1
.uleb128 .LLSDACSE2360-.LLSDACSB2360
.LLSDACSB2360:
.uleb128 .LEHB0-.LFB2360
.uleb128 .LEHE0-.LEHB0
.uleb128 0
.uleb128 0
.uleb128 .LEHB1-.LFB2360
.uleb128 .LEHE1-.LEHB1
.uleb128 .L75-.LFB2360
.uleb128 0
.uleb128 .LEHB2-.LFB2360
.uleb128 .LEHE2-.LEHB2
.uleb128 .L77-.LFB2360
.uleb128 0x1
.uleb128 .LEHB3-.LFB2360
.uleb128 .LEHE3-.LEHB3
.uleb128 .L78-.LFB2360
.uleb128 0x1
.uleb128 .LEHB4-.LFB2360
.uleb128 .LEHE4-.LEHB4
.uleb128 .L75-.LFB2360
.uleb128 0
.uleb128 .LEHB5-.LFB2360
.uleb128 .LEHE5-.LEHB5
.uleb128 0
.uleb128 0
.LLSDACSE2360:
.byte 0x1
.byte 0
.align 4
.long 0
.LLSDATT2360:
.text
.cfi_endproc
.section .text.unlikely
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDAC2360
.type _Z14init_dp_matrixmmRK5Model.cold, @function
_Z14init_dp_matrixmmRK5Model.cold:
.LFSB2360:
.L49:
.cfi_def_cfa 6, 16
.cfi_offset 3, -56
.cfi_offset 6, -16
.cfi_offset 12, -48
.cfi_offset 13, -40
.cfi_offset 14, -32
.cfi_offset 15, -24
movq %r14, %rbx
.L50:
vzeroupper
call __cxa_begin_catch
.L53:
cmpq %rbx, %r14
jne .L107
.LEHB6:
call __cxa_rethrow
.LEHE6:
.L76:
movq %rax, %r12
vzeroupper
call __cxa_end_catch
movq 24(%rsp), %rax
movq (%rax), %rdi
testq %rdi, %rdi
je .L56
call _ZdlPv
.L56:
testq %r13, %r13
je .L69
movq %r13, %rdi
call _ZdlPv
.L69:
movq %r12, %rdi
.LEHB7:
call _Unwind_Resume
.LEHE7:
.L107:
movq (%r14), %rdi
testq %rdi, %rdi
je .L52
call _ZdlPv
.L52:
addq $24, %r14
jmp .L53
.cfi_endproc
.LFE2360:
.section .gcc_except_table
.align 4
.LLSDAC2360:
.byte 0xff
.byte 0x3
.uleb128 .LLSDATTC2360-.LLSDATTDC2360
.LLSDATTDC2360:
.byte 0x1
.uleb128 .LLSDACSEC2360-.LLSDACSBC2360
.LLSDACSBC2360:
.uleb128 .LEHB6-.LCOLDB6
.uleb128 .LEHE6-.LEHB6
.uleb128 .L76-.LCOLDB6
.uleb128 0
.uleb128 .LEHB7-.LCOLDB6
.uleb128 .LEHE7-.LEHB7
.uleb128 0
.uleb128 0
.LLSDACSEC2360:
.byte 0x1
.byte 0
.align 4
.long 0
.LLSDATTC2360:
.section .text.unlikely
.text
.size _Z14init_dp_matrixmmRK5Model, .-_Z14init_dp_matrixmmRK5Model
.section .text.unlikely
.size _Z14init_dp_matrixmmRK5Model.cold, .-_Z14init_dp_matrixmmRK5Model.cold
.LCOLDE6:
.text
.LHOTE6:
.section .text._ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev,"axG",@progbits,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED5Ev,comdat
.align 2
.p2align 4
.weak _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.type _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, @function
_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev:
.LFB2637:
.cfi_startproc
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
movq %rdi, %r12
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
movq 8(%rdi), %rbx
movq (%rdi), %rbp
cmpq %rbp, %rbx
je .L109
.p2align 4,,10
.p2align 3
.L113:
movq 0(%rbp), %rdi
testq %rdi, %rdi
je .L110
addq $24, %rbp
call _ZdlPv
cmpq %rbp, %rbx
jne .L113
.L111:
movq (%r12), %rbp
.L109:
testq %rbp, %rbp
je .L115
popq %rbx
.cfi_remember_state
.cfi_def_cfa_offset 24
movq %rbp, %rdi
popq %rbp
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
jmp _ZdlPv
.p2align 4,,10
.p2align 3
.L110:
.cfi_restore_state
addq $24, %rbp
cmpq %rbp, %rbx
jne .L113
jmp .L111
.p2align 4,,10
.p2align 3
.L115:
popq %rbx
.cfi_def_cfa_offset 24
popq %rbp
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE2637:
.size _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, .-_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.weak _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
.set _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.section .text.unlikely
.LCOLDB7:
.section .text.startup,"ax",@progbits
.LHOTB7:
.p2align 4
.globl main
.type main, @function
main:
.LFB2371:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA2371
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movl $2, %edx
movl $10, %esi
subq $48, %rsp
.cfi_def_cfa_offset 64
leaq 16(%rsp), %rdi
leaq 8(%rsp), %rcx
movq $-8, 8(%rsp)
.LEHB8:
call _Z14init_dp_matrixmmRK5Model
.LEHE8:
leaq 16(%rsp), %rdi
.LEHB9:
call _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.LEHE9:
leaq 16(%rsp), %rdi
call _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
addq $48, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 16
xorl %eax, %eax
popq %rbp
.cfi_def_cfa_offset 8
ret
.L119:
.cfi_restore_state
movq %rax, %rbp
jmp .L118
.section .gcc_except_table
.LLSDA2371:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSE2371-.LLSDACSB2371
.LLSDACSB2371:
.uleb128 .LEHB8-.LFB2371
.uleb128 .LEHE8-.LEHB8
.uleb128 0
.uleb128 0
.uleb128 .LEHB9-.LFB2371
.uleb128 .LEHE9-.LEHB9
.uleb128 .L119-.LFB2371
.uleb128 0
.LLSDACSE2371:
.section .text.startup
.cfi_endproc
.section .text.unlikely
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDAC2371
.type main.cold, @function
main.cold:
.LFSB2371:
.L118:
.cfi_def_cfa_offset 64
.cfi_offset 6, -16
leaq 16(%rsp), %rdi
vzeroupper
call _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
movq %rbp, %rdi
.LEHB10:
call _Unwind_Resume
.LEHE10:
.cfi_endproc
.LFE2371:
.section .gcc_except_table
.LLSDAC2371:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSEC2371-.LLSDACSBC2371
.LLSDACSBC2371:
.uleb128 .LEHB10-.LCOLDB7
.uleb128 .LEHE10-.LEHB10
.uleb128 0
.uleb128 0
.LLSDACSEC2371:
.section .text.unlikely
.section .text.startup
.size main, .-main
.section .text.unlikely
.size main.cold, .-main.cold
.LCOLDE7:
.section .text.startup
.LHOTE7:
.p2align 4
.type _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, @function
_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB3017:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $_ZStL8__ioinit, %edi
call _ZNSt8ios_base4InitC1Ev
movl $__dso_handle, %edx
movl $_ZStL8__ioinit, %esi
movl $_ZNSt8ios_base4InitD1Ev, %edi
addq $8, %rsp
.cfi_def_cfa_offset 8
jmp __cxa_atexit
.cfi_endproc
.LFE3017:
.size _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.section .init_array,"aw"
.align 8
.quad _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.section .rodata.cst32,"aM",@progbits,32
.align 32
.LC1:
.long 1
.long 2
.long 3
.long 4
.long 5
.long 6
.long 7
.long 8
.align 32
.LC3:
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.align 32
.LC4:
.quad 0
.quad 3
.quad 6
.quad 0
.align 32
.LC5:
.quad 0
.quad 1
.quad 2
.quad 5
.hidden __dso_handle
.ident "GCC: (Homebrew GCC 9.2.0) 9.2.0"
.section .note.GNU-stack,"",@progbits
The assembly for the non -march=native
version is available on godbolt.
What is going wrong? Is this a compiler bug, or is my program ill-formed? How can I mitigate this issue if it is a compiler bug?
Additional info
Compiling with -v
:
$ ~/tools/octopus/build/brew/bin/g++-9 -O3 -march=native -S bug.cpp -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-S' '-v' '-shared-libgcc'
/home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /home/dcooke/tools/octopus/build/brew/nonexistent -idirafter /home/dcooke/tools/octopus/build/brew/include -idirafter /usr/include/x86_64-linux-gnu -idirafter /usr/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mpku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=33792 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o bug.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP
Compiling with -O2
or less makes the problem go away:
$ g++-9 -O2 -march=native -o bug bug.cpp
$ ./bug
{0 0} {-8 0}
{0 -8} {0 0}
{0 -9} {0 0}
{0 -10} {0 0}
{0 -11} {0 0}
{0 -12} {0 0}
{0 -13} {0 0}
{0 -14} {0 0}
{0 -15} {0 0}
{0 -16} {0 0}
I tried building on a different machine with Intel chips:
$ rpm -q centos-release
centos-release-7-3.1611.el7.centos.x86_64
$ grep model /proc/cpuinfo | head -2
model : 85
model name : Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
$ g++-9 -O3 -march=native -o bug bug.cpp -v
Reading specs from /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/well/gerton/dan/apps/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-o' 'bug' '-v' '-shared-libgcc'
/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /gpfs1/well/gerton/dan/apps/octopus/build/brew/nonexistent -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/include -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/opt/glibc/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mno-pku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=28160 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o /tmp/cczPrvHP.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP
$ ./bug
{0 0} {-8 0}
{0 -8} {0 0}
{0 -9} {0 0}
{0 -10} {0 0}
{0 -11} {0 0}
{0 -12} {0 0}
{0 -13} {0 0}
{0 -14} {0 0}
{0 -15} {0 0}
{0 -16} {0 0}
The correct output...
-ftree-loop-vectorize
is the culprit:
$ g++-9 -march=native -O2 -o bug bug.cpp -ftree-loop-vectorize
$ ./bug
{0 0} {-8 0}
{-2048 255} {0 0}
{-2304 255} {0 0}
{-2560 255} {0 0}
{-2816 255} {0 0}
{-3072 255} {0 0}
{-3328 255} {0 0}
{-3584 255} {0 0}
{-3840 255} {0 0}
{0 -16} {0 0}
None of the other O3
flags result in this behaviour.
This turned out to be due to a bug in binutils gas. The solution was to upgrade my binutils to 2.32.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With