Visual Studio 2012 different values Release/Debug mode

Tags:

This code produces different values in MSVS 2012, Windows 7, when switching between Debug and Release mode:

#include <iostream> using namespace std;  int A[20000];  int main() {      int shift = 0;     int Period = 30;     //Fill array     for(int i = 0; i < 20000; i++) {         A[i] = i * 2 + 123;     }      int sumTotal = 0;     int sum = 0;      for(int bars = Period + 10; bars < 1000; bars++) {         sum = 0;         for(int i = 0; i< Period; i++) {             sum += A[bars - i];         }         sumTotal += sum;     }     cout << sumTotal << endl; }

Can you reproduce or find why? I've been testing with all kind of settings on project properties.

Debug (the correct result): 32630400
Release: 32814720

/GS /GL /analyze- /W3 /Gy /Zc:wchar_t /I"C:\Program Files (x86)\Visual Leak Detector\include" /Z7 /Gm- /O2 /Fd"Release\vc110.pdb" /fp:precise /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /Gd /Oy- /Oi /MD /Fa"Release\" /EHsc /nologo /Fo"Release\" /Fp"Release\Testing.pch"

406

asked Jun 10 '14 14:06

user3023605

1 Answers

I tested the "reduced" version of the code using VS2012 C compiler

int main() {   int A[12] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };    int sum = 0;   int i;    for (i = 0; i < 12; ++i)      sum += A[11 - i];    printf("%d\n", sum);    return 0; }

I compiled it in x64 mode Release config optimized for speed. The bug is still there, but depending on other optimization and code generation settings it reveals itself differently. One version of the code generated "random" result, while another consistently produced 8 as the sum (instead of the proper 12).

This is what the generated code looks like for the version that consistently produces 8

000000013FC81DF0  mov         rax,rsp   000000013FC81DF3  sub         rsp,68h   000000013FC81DF7  movd        xmm1,dword ptr [rax-18h]   000000013FC81DFC  movd        xmm2,dword ptr [rax-10h]   000000013FC81E01  movd        xmm5,dword ptr [rax-0Ch]   000000013FC81E06  xorps       xmm0,xmm0   000000013FC81E09  xorps       xmm3,xmm3    for (i = 0; i < 12; ++i) 000000013FC81E0C  xor         ecx,ecx   000000013FC81E0E  mov         dword ptr [rax-48h],1   000000013FC81E15  mov         dword ptr [rax-44h],1   000000013FC81E1C  mov         dword ptr [rax-40h],1   000000013FC81E23  punpckldq   xmm2,xmm1   000000013FC81E27  mov         dword ptr [rax-3Ch],1   000000013FC81E2E  mov         dword ptr [rax-38h],1   000000013FC81E35  mov         dword ptr [rax-34h],1   {      sum += A[11 - i]; 000000013FC81E3C  movdqa      xmm4,xmmword ptr [__xmm@00000001000000010000000100000001 (013FC83360h)]   000000013FC81E44  paddd       xmm4,xmm0   000000013FC81E48  movd        xmm0,dword ptr [rax-14h]   000000013FC81E4D  mov         dword ptr [rax-30h],1   000000013FC81E54  mov         dword ptr [rax-2Ch],1   000000013FC81E5B  mov         dword ptr [rax-28h],1   000000013FC81E62  mov         dword ptr [rax-24h],1   000000013FC81E69  punpckldq   xmm5,xmm0   000000013FC81E6D  punpckldq   xmm5,xmm2   000000013FC81E71  paddd       xmm5,xmm3   000000013FC81E75  paddd       xmm5,xmm4   000000013FC81E79  mov         dword ptr [rax-20h],1   000000013FC81E80  mov         dword ptr [rax-1Ch],1   000000013FC81E87  mov         r8d,ecx   000000013FC81E8A  movdqa      xmm0,xmm5   000000013FC81E8E  psrldq      xmm0,8   000000013FC81E93  paddd       xmm5,xmm0   000000013FC81E97  movdqa      xmm0,xmm5   000000013FC81E9B  lea         rax,[rax-40h]   000000013FC81E9F  mov         r9d,2   000000013FC81EA5  psrldq      xmm0,4   000000013FC81EAA  paddd       xmm5,xmm0   000000013FC81EAE  movd        edx,xmm5   000000013FC81EB2  nop         word ptr [rax+rax]   {      sum += A[11 - i]; 000000013FC81EC0  add         ecx,dword ptr [rax+4]   000000013FC81EC3  add         r8d,dword ptr [rax]   000000013FC81EC6  lea         rax,[rax-8]   000000013FC81ECA  dec         r9   000000013FC81ECD  jne         main+0D0h (013FC81EC0h)   }  printf("%d\n", sum); 000000013FC81ECF  lea         eax,[r8+rcx]   000000013FC81ED3  lea         rcx,[__security_cookie_complement+8h (013FC84040h)]   000000013FC81EDA  add         edx,eax   000000013FC81EDC  call        qword ptr [__imp_printf (013FC83140h)]    return 0; 000000013FC81EE2  xor         eax,eax   } 000000013FC81EE4  add         rsp,68h   000000013FC81EE8  ret

There's a lot of strange and seemingly unnecessary mumbo-jumbo there left over by the code generator and optimizer, but what this code does can be briefly described as follows.

There are two independent algorithms there employed to produce the final sum, which are apparently supposed to process different portions of the array. I'd guess that two processing flows (non-SSE and SSE) are used to promote parallelism through instruction pipelining.

One algorithm is a simple cycle, which sums array elements, processing two elements per iteration. It can be extracted from the above "interleaved" code as follows

; Initialization 000000013F1E1E0C  xor         ecx,ecx                 ; ecx - odd element sum 000000013F1E1E87  mov         r8d,ecx                 ; r8 - even element sum 000000013F1E1E9B  lea         rax,[rax-40h]           ; start from i = 2 000000013F1E1E9F  mov         r9d,2                   ; do 2 iterations  ; The cycle 000000013F1E1EC0  add         ecx,dword ptr [rax+4]   ; ecx += A[i + 1] 000000013F1E1EC3  add         r8d,dword ptr [rax]     ; r8d += A[i] 000000013F1E1EC6  lea         rax,[rax-8]             ; i -= 2 000000013F1E1ECA  dec         r9                       000000013F1E1ECD  jne         main+0D0h (013F1E1EC0h) ; loop again if r9 is not zero

This algorithm begins adding elements from address rax - 40h, which in my experiment was equal to &A[2] and makes two iterations backwards skipping back over two elements. This accumulates the sum of A[0] and A[2] in register r8 and sum of A[1] and A[3] in register ecx. Thus this part of the algorithm processes 4 elements of the array and properly generates values 2 in both r8 and ecx.

The other part of the algorithm is written using SSE instructions and is apparently responsible for summing the remaining portion of the array. It can be extracted from the code as follows

; Initially xmm5 is zero 000000013F1E1E3C  movdqa      xmm4,xmmword ptr [__xmm@00000001000000010000000100000001 (013F1E3360h)]   000000013F1E1E75  paddd       xmm5,xmm4    000000013F1E1E8A  movdqa      xmm0,xmm5               ; copy 000000013F1E1E8E  psrldq      xmm0,8                  ; shift 000000013F1E1E93  paddd       xmm5,xmm0               ; and add  000000013F1E1E8A  movdqa      xmm0,xmm5               ; copy 000000013F1E1E8E  psrldq      xmm0,4                  ; shift 000000013F1E1E93  paddd       xmm5,xmm0               ; and add  000000013F1E1EAE  movd        edx,xmm5                ; edx - the sum

The general algorithm used by that part is simple: it places value 0x00000001000000010000000100000001 in 128-bit register xmm5, then shifts it 8 bytes to the right (0x00000000000000000000000100000001) and adds it to the original value, producing 0x00000001000000010000000200000002. This is again shifted 4 bytes to the right (0x00000000000000010000000100000002) and added to the previous value again, producing 0x00000001000000020000000300000004. The last 32-bit word 0x00000004 of xmm5 is taken as the result and placed into register edx. Thus this algorithm produces 4 as its final result. It is pretty obvious that this algorithm simply performs the "parallel" addition of consecutive 32-bit words in a 128-bit register. Note, BTW that this algorithm does not even attempt to access A, it starts summing from an embedded constant produced by the compiler/optimizer.

Now, in the end the value of r8 + ecx + edx is reported as the final sum. Obviously, this is only 8, instead of the correct 12. It looks like one of these two algorithms forgot to do some of its work. I don't know which one, but judging by the abundance of "redundant" instructions it looks like it was the SSE algorithm that was supposed to generate 8 in edx instead of 4. One suspicious instruction is this one

000000013FC81E71  paddd       xmm5,xmm3

At that moment xmm3 always contains zero. So, this instruction looks completely redundant and unnecessary. But if xmm3 actually contained another "magic" constant representing another 4 elements of the array (just like xmm4 did), then the algorithm would work properly and would produce the proper sum.

If one uses distinctive initial values for array elements

int A[12] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };

it can be clearly seen that the first (non-SSE) algorithm successfully sums 1, 2, 3, 4, while the second (SSE) algorithm sums 9, 10, 11, 12. 5, 6, 7, 8 remain excluded from consideration, resulting in 52 as final sum instead of the correct 78.

This is definitely a compiler/optimizer bug.

P.S. The same project with the same settings imported into VS2013 Update 2 does not seem to suffer from this bug.

191

answered Sep 22 '22 04:09

AnT

Related questions
                            
                                Why can't I initialize a reference in an initializer list with uniform initialization?
                            
                                Accessing the name of a private-inherited class from a subclass
                            
                                Easy to use SNMP client library for c++? [closed]
                            
                                Singleton instance declared as static variable of GetInstance method, is it thread-safe? [duplicate]
                            
                                How to check for inf (and | or) NaN in a double variable
                            
                                jni.h: No such file or directory
                            
                                In C++, what are the differences between static_cast<double>(a) and double(a)?
                            
                                How are C++11 lambdas represented and passed?
                            
                                Achieve same random number sequence on different OS with same seed
                            
                                CRTP with Protected Derived Member
                            
                                OOP: Which class should own a method? [closed]
                            
                                Get HWND on windows with Qt5 (from WId)
                            
                                Why does this C++ function produce so many branch mispredictions?
                            
                                Array initialization optimization
                            
                                Is size_t the word size?
                            
                                lifetime of a std::initializer_list return value
                            
                                Hinnant's short_alloc and alignment guarantees
                            
                                Simpler way to create a C++ memorystream from (char*, size_t), without copying the data?
                            
                                Testing C++ code for endian-independence
                            
                                Semicolon on a function parameters

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

Visual Studio 2012 different values Release/Debug mode

Tags:

c++

visual-studio-2012

visual-studio-2013

visual-c++

compiler-bug

user3023605

People also ask

1 Answers

AnT

Recent Activity

Donate For Us