Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Why does godbolt generate different asm output than my actual asm code in Visual Studio?

Here's the code generated by godbolt.

Here's the same code as generated by Visual Studio in my main.asm file (enabled via Project->C/C++->Output Files->Assembly With Source Code (/FAs) in the Assembler Output field):

; Listing generated by Microsoft (R) Optimizing Compiler Version 19.15.26732.1 

    TITLE   c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    .686P
    .XMM
    include listing.inc
    .model  flat

INCLUDELIB OLDNAMES

EXTRN   __imp____std_terminate:PROC
EXTRN   @__security_check_cookie@4:PROC
EXTRN   __imp____CxxFrameHandler3:PROC
PUBLIC  ??$?RABNABN@?$less@X@std@@QBE_NABN0@Z       ; std::less<void>::operator()<double const &,double const &>
PUBLIC  ??$clamp@NU?$less@X@std@@@std@@YAABNABN00U?$less@X@0@@Z ; std::clamp<double,std::less<void> >
PUBLIC  ??$clamp@N@std@@YAABNABN00@Z            ; std::clamp<double>
PUBLIC  _main
PUBLIC  ?ProcessOptimized@MyPlugin@@QAEXH@Z     ; MyPlugin::ProcessOptimized
PUBLIC  ?Process@MyPlugin@@QAEXH@Z          ; MyPlugin::Process
PUBLIC  ??1MyPlugin@@QAE@XZ             ; MyPlugin::~MyPlugin
PUBLIC  ??0MyPlugin@@QAE@XZ             ; MyPlugin::MyPlugin
PUBLIC  ?ProcessOptimized@Param@@QAEXHH@Z       ; Param::ProcessOptimized
PUBLIC  ?Process@Param@@QAEXHH@Z            ; Param::Process
PUBLIC  ??0Param@@QAE@XZ                ; Param::Param
PUBLIC  __real@3ff0000000000000
PUBLIC  __real@400921fb54442d18
PUBLIC  __real@4024000000000000
PUBLIC  __real@406fe00000000000
PUBLIC  __xmm@00000003000000020000000100000000
PUBLIC  __xmm@400921fb54442d18400921fb54442d18
PUBLIC  __xmm@406fe00000000000406fe00000000000
EXTRN   __chkstk:PROC
EXTRN   ___security_cookie:DWORD
EXTRN   __fltused:DWORD
;   COMDAT __xmm@406fe00000000000406fe00000000000
; 16-byte constant: two doubles, each 406fe00000000000H (255.0).
; Bytes are listed little-endian; the listing shows 6fH as 'o' and 40H as '@'.
; Loaded with movaps as the divisor in MyPlugin::MyPlugin's fill loop.
CONST   SEGMENT
__xmm@406fe00000000000406fe00000000000 DB 00H, 00H, 00H, 00H, 00H, 0e0H, 'o'
    DB  '@', 00H, 00H, 00H, 00H, 00H, 0e0H, 'o@'
CONST   ENDS
;   COMDAT __xmm@400921fb54442d18400921fb54442d18
; 16-byte constant: two doubles, each 400921fb54442d18H (PI).
; '-DT' is 2dH 44H 54H and '!' is 21H, again in little-endian byte order.
; Used as the minpd upper bound in the ProcessOptimized loops.
CONST   SEGMENT
__xmm@400921fb54442d18400921fb54442d18 DB 018H, '-DT', 0fbH, '!', 09H, '@'
    DB  018H, '-DT', 0fbH, '!', 09H, '@'
CONST   ENDS
;   COMDAT __xmm@00000003000000020000000100000000
; 16-byte constant: four 32-bit integers {0, 1, 2, 3} (lowest lane first).
; Used as the lane-index offsets in MyPlugin::MyPlugin's fill loop.
CONST   SEGMENT
__xmm@00000003000000020000000100000000 DB 00H, 00H, 00H, 00H, 01H, 00H, 00H
    DB  00H, 02H, 00H, 00H, 00H, 03H, 00H, 00H, 00H
CONST   ENDS
; Scalar double constants; the symbol name is the IEEE-754 bit pattern and
; the trailing comment is the decimal value.
;   COMDAT __real@406fe00000000000
CONST   SEGMENT
__real@406fe00000000000 DQ 0406fe00000000000r   ; 255
CONST   ENDS
;   COMDAT __real@4024000000000000
CONST   SEGMENT
__real@4024000000000000 DQ 04024000000000000r   ; 10
CONST   ENDS
;   COMDAT __real@400921fb54442d18
CONST   SEGMENT
__real@400921fb54442d18 DQ 0400921fb54442d18r   ; 3.14159
CONST   ENDS
;   COMDAT __real@3ff0000000000000
CONST   SEGMENT
__real@3ff0000000000000 DQ 03ff0000000000000r   ; 1
CONST   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ??0Param@@QAE@XZ
_TEXT   SEGMENT
; Param::Param() -- default constructor.
; In:  ecx = this.
; Out: eax = this.
; Writes five doubles at 16-byte strides: 0.0 at [this+0] and [this+16],
; 10.0 at [this+32], 1.0 at [this+48] and [this+64]. These offsets match the
; mPhase / mPhaseOptimized / mNoteFrequency / mHostPitch / mRadiansPerSample
; initialisers annotated in MyPlugin::MyPlugin's listing.
??0Param@@QAE@XZ PROC                   ; Param::Param, COMDAT
; _this$ = ecx

; 23   :    Param() { }

    xorps   xmm0, xmm0
    ; Return value (this) is set up early; ecx is still used for the stores.
    mov eax, ecx
    movsd   QWORD PTR [ecx], xmm0
    movsd   QWORD PTR [ecx+16], xmm0
    movsd   xmm0, QWORD PTR __real@4024000000000000
    movsd   QWORD PTR [ecx+32], xmm0
    ; The same 1.0 in xmm0 is stored to both [ecx+48] and [ecx+64].
    movsd   xmm0, QWORD PTR __real@3ff0000000000000
    movsd   QWORD PTR [ecx+48], xmm0
    movsd   QWORD PTR [ecx+64], xmm0
    ret 0
??0Param@@QAE@XZ ENDP                   ; Param::Param
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?Process@Param@@QAEXHH@Z
_TEXT   SEGMENT
; Stack temporaries used by the clamp: $T1 receives PI, $T2 receives 0.0 and
; $T3 receives the unclamped per-sample value.
$T1 = -24                       ; size = 8
$T3 = -16                       ; size = 8
$T2 = -8                        ; size = 8
_voiceIndex$ = 8                    ; size = 4
_blockSize$dead$ = 12                   ; size = 4
; Param::Process(int voiceIndex, int blockSize) -- scalar phase accumulation.
; In:   ecx = this; voiceIndex and blockSize on the stack (popped by ret 8).
;       blockSize is never read (marked $dead$); the trip count is the
;       immediate 256.
; Regs: esi = this; edx = pointer into c[voiceIndex], with b[voiceIndex]
;       2048 bytes below it; ecx = remaining iterations;
;       xmm1 = phase accumulator; xmm2 = PI; xmm5 = 0.0;
;       xmm3 = [this+48] * [this+32] (bp0); xmm4 = [this+64].
; Out:  [this+0] (mPhase) updated. esi and edi are saved and restored.
?Process@Param@@QAEXHH@Z PROC               ; Param::Process, COMDAT
; _this$ = ecx

; 25   :    inline void Process(int voiceIndex, int blockSize) {

    push    ebp
    mov ebp, esp
    sub esp, 24                 ; 00000018H

; 26   :        double *pB = b[voiceIndex];

    mov eax, DWORD PTR _voiceIndex$[ebp]
    xorps   xmm5, xmm5

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm2, QWORD PTR __real@400921fb54442d18
    push    esi
    mov esi, ecx
    ; eax = voiceIndex * 2048, the byte offset of one 256-double row.
    shl eax, 11                 ; 0000000bH
    push    edi
    ; NOTE(review): $T1 is written here but never read in this function.
    movsd   QWORD PTR $T1[ebp], xmm2
    mov ecx, 256                ; 00000100H
    ; 0.0 is spilled so the clamp below can select it by address.
    movsd   QWORD PTR $T2[ebp], xmm5
    movsd   xmm3, QWORD PTR [esi+48]
    lea edx, DWORD PTR [esi+2128]
    movsd   xmm1, QWORD PTR [esi]
    add edx, eax
    mulsd   xmm3, QWORD PTR [esi+32]
    movsd   xmm4, QWORD PTR [esi+64]
    npad    11
$LL4@Process:
    ; xmm0 = ([edx-2048] * xmm3 + [edx]) * xmm4, i.e. the clamp argument.
    movsd   xmm0, QWORD PTR [edx-2048]
    mulsd   xmm0, xmm3
    addsd   xmm0, QWORD PTR [edx]
    mulsd   xmm0, xmm4
    comisd  xmm0, xmm2
    ; movsd does not alter EFLAGS, so the jbe below still tests the comisd.
    movsd   QWORD PTR $T3[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    ; value <= PI (or unordered): go check the lower bound; otherwise the
    ; result is PI.
    jbe SHORT $LN10@Process
    movaps  xmm0, xmm2
    jmp SHORT $LN11@Process
$LN10@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287  :        return (static_cast<_Ty1&&>(_Left)

    comisd  xmm5, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    ; Branch-free select by address: eax = &$T2 (0.0) by default, replaced
    ; with &$T3 (the value) when 0.0 <= value or the compare is unordered.
    ; lea leaves the flags from comisd intact.
    lea eax, DWORD PTR $T2[ebp]
    lea edi, DWORD PTR $T3[ebp]
    cmovbe  eax, edi
    movsd   xmm0, QWORD PTR [eax]
$LN11@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp

; 31   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {

    add edx, 8

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    addsd   xmm1, xmm0
    sub ecx, 1
    jne SHORT $LL4@Process

; 35   :        }
; 36   : 
; 37   :        mPhase = phase;
; 38   :    }

    pop edi
    movsd   QWORD PTR [esi], xmm1
    pop esi
    mov esp, ebp
    pop ebp
    ret 8
?Process@Param@@QAEXHH@Z ENDP               ; Param::Process
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?ProcessOptimized@Param@@QAEXHH@Z
_TEXT   SEGMENT
_v_phase$ = -16                     ; size = 16
_voiceIndex$ = 8                    ; size = 4
_blockSize$dead$ = 12                   ; size = 4
; Param::ProcessOptimized(int voiceIndex, int blockSize) -- SSE2 version of
; the phase accumulation, two doubles per loop iteration.
; In:   ecx = this; voiceIndex and blockSize on the stack (popped by ret 8).
;       blockSize is never read (marked $dead$); the trip count is the
;       immediate 128.
; Regs: edx = this; eax = running pointer, initially
;       this + voiceIndex*2048 + 2136 (one double past the row start that
;       the pC loads use at +2128); ecx = remaining iterations;
;       xmm5 = [this+48]*[this+32]*[this+64] in both lanes;
;       xmm6 = [this+64] in both lanes; xmm7 = 0.0 (lower bound);
;       xmm0 = v_phase; xmm4/xmm1 = v_pB0/v_pC0; xmm3/xmm2 = v_pB1/v_pC1.
; Out:  [this+16] (mPhaseOptimized) = upper lane of v_phase.
?ProcessOptimized@Param@@QAEXHH@Z PROC          ; Param::ProcessOptimized, COMDAT
; _this$ = ecx

; 39   :    inline void ProcessOptimized(int voiceIndex, int blockSize) {

    ; Stack-realigning prologue: ebx keeps the incoming esp (used to reach
    ; the arguments and by the epilogue), esp is rounded down to a 16-byte
    ; boundary, and the return address at [ebx+4] is copied next to the new
    ; ebp. This leaves ebp 16-byte aligned, which the movaps store to
    ; _v_phase$[ebp] relies on.
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -16                ; fffffff0H
    add esp, 4
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov DWORD PTR [esp+4], ebp
    mov ebp, esp

; 40   :        double *pB = b[voiceIndex];

    mov eax, DWORD PTR _voiceIndex$[ebx]
    mov edx, ecx
    ; eax = voiceIndex * 2048, the byte offset of one 256-double row.
    shl eax, 11                 ; 0000000bH
    xorps   xmm3, xmm3
    xorps   xmm2, xmm2
    sub esp, 16                 ; 00000010H
    xorps   xmm7, xmm7
    mov ecx, 128                ; 00000080H

; 41   :        double *pC = c[voiceIndex];
; 42   :        double phase = mPhaseOptimized;
; 43   :        double bp0 = mNoteFrequency * mHostPitch;

    movsd   xmm5, QWORD PTR [edx+48]
    mulsd   xmm5, QWORD PTR [edx+32]

; 44   : 
; 45   :        __m128d v_boundLower = _mm_set1_pd(0.0);
; 46   :        __m128d v_boundUpper = _mm_set1_pd(PI);
; 47   :        __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);

    movsd   xmm6, QWORD PTR [edx+64]

; 48   :        __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49   : 
; 50   :        __m128d v_pB0 = _mm_load_pd(pB);
; 51   :        v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 52   :        __m128d v_pC0 = _mm_load_pd(pC);
; 53   :        v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 54   : 
; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    ; The two _mm_load_pd loads (source lines 50 and 52) appear here as
    ; movups, not movaps.
    movsd   xmm0, QWORD PTR [eax+edx+80]
    movups  xmm4, XMMWORD PTR [eax+edx+80]
    movups  xmm1, XMMWORD PTR [eax+edx+2128]
    mulsd   xmm5, xmm6
    ; xmm3 was zeroed above, so this forms {0.0, pB[0]}.
    unpcklpd xmm3, xmm0

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    movsd   xmm0, QWORD PTR [eax+edx+2128]
    add eax, 2136               ; 00000858H
    ; xmm2 was zeroed above, so this forms {0.0, pC[0]}.
    unpcklpd xmm2, xmm0
    add eax, edx

; 58   :        v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
; 59   : 
; 60   :        __m128d v_phaseAcc1;
; 61   :        __m128d v_phaseAcc2;
; 62   :        __m128d v_phase = _mm_set1_pd(phase);

    movsd   xmm0, QWORD PTR [edx+16]
    ; unpcklpd reg, reg duplicates the low lane, implementing _mm_set1_pd.
    unpcklpd xmm5, xmm5
    unpcklpd xmm6, xmm6
    mulpd   xmm4, xmm5
    mulpd   xmm1, xmm6
    mulpd   xmm3, xmm5
    mulpd   xmm2, xmm6
    unpcklpd xmm0, xmm0
    npad    2
$LL4@ProcessOpt:

; 63   : 
; 64   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65   :            // some other code (that will use phase, like sin(phase))
; 66   : 
; 67   :            v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);

    ; v_phaseAcc1 is built in xmm1 and v_phaseAcc2 in xmm2; the loads for
    ; the next iteration are interleaved with this iteration's max/min/add.
    addpd   xmm1, xmm4

; 68   :            v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69   :            v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70   :            v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71   :            v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72   :            v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75   : 
; 76   :            v_pB0 = _mm_load_pd(pB + 2);

    ; Relative to eax: [eax-2048] and [eax-2040] are the b-row loads,
    ; [eax] and [eax+8] the c-row loads.
    ; NOTE(review): on the 128th iteration these look-ahead loads address
    ; elements 256 and 257 of each row, past the 256 the loop consumes. The
    ; loaded values are not used after the loop, but confirm the memory
    ; behind c is readable.
    movups  xmm4, XMMWORD PTR [eax-2040]
    addpd   xmm2, xmm3

; 77   :            v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78   :            v_pC0 = _mm_load_pd(pC + 2);
; 79   :            v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80   : 
; 81   :            v_pB1 = _mm_loadu_pd(pB + 1);

    movups  xmm3, XMMWORD PTR [eax-2048]
    maxpd   xmm1, xmm7
    maxpd   xmm2, xmm7
    minpd   xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    minpd   xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    addpd   xmm0, xmm1
    movups  xmm1, XMMWORD PTR [eax+8]
    addpd   xmm0, xmm2

; 82   :            v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83   :            v_pC1 = _mm_loadu_pd(pC + 1);

    movups  xmm2, XMMWORD PTR [eax]
    add eax, 16                 ; 00000010H
    ; v_phase is written to its stack slot on every iteration; the element
    ; read after the loop comes from this slot.
    movaps  XMMWORD PTR _v_phase$[ebp], xmm0
    mulpd   xmm4, xmm5
    mulpd   xmm1, xmm6
    mulpd   xmm3, xmm5

; 84   :            v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

    mulpd   xmm2, xmm6
    sub ecx, 1
    jne SHORT $LL4@ProcessOpt

; 85   :        }
; 86   : 
; 87   :        mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];

    ; The index expression has been folded to element 1 (offset +8).
    movsd   xmm0, QWORD PTR _v_phase$[ebp+8]
    movsd   QWORD PTR [edx+16], xmm0

; 88   :    }

    ; Unwind the aligned frame first, then restore the original esp from ebx.
    mov esp, ebp
    pop ebp
    mov esp, ebx
    pop ebx
    ret 8
?ProcessOptimized@Param@@QAEXHH@Z ENDP          ; Param::ProcessOptimized
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ??0MyPlugin@@QAE@XZ
_TEXT   SEGMENT
; MyPlugin::MyPlugin() -- constructor.
; In:  ecx = this.
; Out: eax = this. esi and edi are saved and restored.
; Does three things, interleaved by the scheduler:
;   1. stores the five member initialisers at [this+0..64] (source lines
;      14-18);
;   2. fills 256 doubles starting at [this+80] with sampleIndex / 255.0,
;      four per loop iteration;
;   3. zeroes 2048 bytes starting at [this+2128] with rep stosd.
; Regs: esi = this; edx = sampleIndex (step 4); ecx = store cursor;
;       xmm2 = integers {0, 1, 2, 3}; xmm3 = {255.0, 255.0}.
??0MyPlugin@@QAE@XZ PROC                ; MyPlugin::MyPlugin, COMDAT
; _this$ = ecx

; 97   :        // fill b
; 98   :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    movaps  xmm2, XMMWORD PTR __xmm@00000003000000020000000100000000
    xorps   xmm0, xmm0
    movaps  xmm3, XMMWORD PTR __xmm@406fe00000000000406fe00000000000
    xor edx, edx
    push    esi
    mov esi, ecx
    push    edi

; 14   :    alignas(16) double mPhase = 0.0;

    movsd   QWORD PTR [esi], xmm0

; 97   :        // fill b
; 98   :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    ; ecx starts 8 bytes past [this+80]; the loop adds 32 before storing at
    ; negative displacements, so the first stores land at this+80..this+104.
    lea ecx, DWORD PTR [esi+88]

; 15   :    alignas(16) double mPhaseOptimized = 0.0;

    movsd   QWORD PTR [esi+16], xmm0

; 16   :    alignas(16) double mNoteFrequency = 10.0;

    movsd   xmm0, QWORD PTR __real@4024000000000000
    movsd   QWORD PTR [esi+32], xmm0

; 17   :    alignas(16) double mHostPitch = 1.0;

    movsd   xmm0, QWORD PTR __real@3ff0000000000000
    movsd   QWORD PTR [esi+48], xmm0

; 18   :    alignas(16) double mRadiansPerSample = 1.0;

    movsd   QWORD PTR [esi+64], xmm0
$LL7@MyPlugin:

; 100  :                double value = (sampleIndex / ((double)bufferSize - 1));

    ; First pair: broadcast edx, add lane offsets {0, 1} (movq copies only
    ; the low 64 bits of xmm2), convert the two low int32 lanes to double
    ; and divide by 255.0. eax = edx + 2 is captured for the second pair
    ; before edx is advanced.
    movd    xmm0, edx
    lea eax, DWORD PTR [edx+2]
    pshufd  xmm1, xmm0, 0
    lea ecx, DWORD PTR [ecx+32]
    movq    xmm0, xmm2
    add edx, 4
    paddd   xmm1, xmm0
    cvtdq2pd xmm0, xmm1
    divpd   xmm0, xmm3

; 101  : 
; 102  :                mParam1.b[voiceIndex][sampleIndex] = value;

    movlpd  QWORD PTR [ecx-40], xmm0
    movhpd  QWORD PTR [ecx-32], xmm0
    ; Second pair: same computation starting from eax (old edx + 2).
    movd    xmm0, eax
    pshufd  xmm1, xmm0, 0
    movq    xmm0, xmm2
    paddd   xmm1, xmm0
    cvtdq2pd xmm0, xmm1
    divpd   xmm0, xmm3
    movlpd  QWORD PTR [ecx-24], xmm0
    movhpd  QWORD PTR [ecx-16], xmm0
    cmp edx, 256                ; 00000100H
    jl  SHORT $LL7@MyPlugin

; 103  :            }
; 104  :        }
; 105  : 
; 106  :        // fill c
; 107  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    ; The "fill c" loops became one rep stosd: 512 zero dwords = 2048 bytes
    ; (256 doubles of 0.0) at [this+2128].
    lea edi, DWORD PTR [esi+2128]
    xor eax, eax
    mov ecx, 512                ; 00000200H
    rep stosd

; 109  :                double value = 0.0;
; 110  : 
; 111  :                mParam1.c[voiceIndex][sampleIndex] = value;
; 112  :            }
; 113  :        }
; 114  :    }

    pop edi
    mov eax, esi
    pop esi
    ret 0
??0MyPlugin@@QAE@XZ ENDP                ; MyPlugin::MyPlugin
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ??1MyPlugin@@QAE@XZ
_TEXT   SEGMENT
; MyPlugin::~MyPlugin() -- empty destructor. The body is a single ret; the
; this pointer in ecx is never read (marked $dead$).
??1MyPlugin@@QAE@XZ PROC                ; MyPlugin::~MyPlugin, COMDAT
; _this$dead$ = ecx

; 115  :    ~MyPlugin() { }

    ret 0
??1MyPlugin@@QAE@XZ ENDP                ; MyPlugin::~MyPlugin
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?Process@MyPlugin@@QAEXH@Z
_TEXT   SEGMENT
; Stack temporaries used by the clamp: $T2 receives PI, $T3 receives 0.0 and
; $T4 receives the unclamped per-sample value.
$T2 = -28                       ; size = 8
$T4 = -20                       ; size = 8
$T3 = -12                       ; size = 8
_blockSize$dead$ = 8                    ; size = 4
; MyPlugin::Process(int blockSize) -- Param::Process inlined into its caller
; (the annotations below interleave source lines 26-37 with 117-121).
; In:   ecx = this; blockSize on the stack (popped by ret 4) but never read
;       (marked $dead$); the trip count is the immediate 256.
; Regs: esi = this; ecx = pointer starting at this+2128, with the other
;       array 2048 bytes below it; edx = remaining iterations;
;       xmm1 = phase accumulator; xmm2 = PI; xmm5 = 0.0;
;       xmm3 = [this+48] * [this+32] (bp0); xmm4 = [this+64].
; Out:  [this+0] (mPhase) updated. esi and edi are saved and restored.
; NOTE(review): no voiceIndex offset or outer loop remains, so exactly one
; row is processed -- presumably voiceSize == 1; confirm in main.cpp.
?Process@MyPlugin@@QAEXH@Z PROC             ; MyPlugin::Process, COMDAT
; _this$ = ecx

; 117  :    void Process(int blockSize) {

    push    ebp
    mov ebp, esp
    sub esp, 28                 ; 0000001cH

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm2, QWORD PTR __real@400921fb54442d18
    xorps   xmm5, xmm5

; 117  :    void Process(int blockSize) {

    push    esi
    mov esi, ecx

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    ; NOTE(review): $T2 is written here but never read in this function.
    movsd   QWORD PTR $T2[ebp], xmm2

; 117  :    void Process(int blockSize) {

    push    edi

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    ; 0.0 is spilled so the clamp below can select it by address.
    movsd   QWORD PTR $T3[ebp], xmm5
    mov edx, 256                ; 00000100H
    movsd   xmm3, QWORD PTR [esi+48]

; 27   :        double *pC = c[voiceIndex];

    lea ecx, DWORD PTR [esi+2128]

; 28   :        double phase = mPhase;
; 29   :        double bp0 = mNoteFrequency * mHostPitch;

    movsd   xmm1, QWORD PTR [esi]
    mulsd   xmm3, QWORD PTR [esi+32]
    movsd   xmm4, QWORD PTR [esi+64]
    npad    3
$LL9@Process:

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    ; xmm0 = ([ecx-2048] * xmm3 + [ecx]) * xmm4, i.e. the clamp argument.
    movsd   xmm0, QWORD PTR [ecx-2048]
    mulsd   xmm0, xmm3
    addsd   xmm0, QWORD PTR [ecx]
    mulsd   xmm0, xmm4
    comisd  xmm0, xmm2
    ; movsd does not alter EFLAGS, so the jbe below still tests the comisd.
    movsd   QWORD PTR $T4[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    ; value <= PI (or unordered): go check the lower bound; otherwise the
    ; result is PI.
    jbe SHORT $LN15@Process
    movaps  xmm0, xmm2
    jmp SHORT $LN16@Process
$LN15@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287  :        return (static_cast<_Ty1&&>(_Left)

    comisd  xmm5, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    ; Branch-free select by address: eax = &$T3 (0.0) by default, replaced
    ; with &$T4 (the value) when 0.0 <= value or the compare is unordered.
    lea eax, DWORD PTR $T3[ebp]
    lea edi, DWORD PTR $T4[ebp]
    cmovbe  eax, edi
    movsd   xmm0, QWORD PTR [eax]
$LN16@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp

; 31   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {

    add ecx, 8

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    addsd   xmm1, xmm0
    sub edx, 1
    jne SHORT $LL9@Process

; 118  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119  :            mParam1.Process(voiceIndex, blockSize);
; 120  :        }
; 121  :    }

    pop edi

; 37   :        mPhase = phase;

    movsd   QWORD PTR [esi], xmm1

; 118  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119  :            mParam1.Process(voiceIndex, blockSize);
; 120  :        }
; 121  :    }

    pop esi
    mov esp, ebp
    pop ebp
    ret 4
?Process@MyPlugin@@QAEXH@Z ENDP             ; MyPlugin::Process
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?ProcessOptimized@MyPlugin@@QAEXH@Z
_TEXT   SEGMENT
_v_phase$31 = -16                   ; size = 16
_blockSize$dead$ = 8                    ; size = 4
; MyPlugin::ProcessOptimized(int blockSize) -- Param::ProcessOptimized
; inlined into its caller (the annotations below interleave source lines
; 40-87 with 122-126).
; In:   ecx = this; blockSize on the stack (popped by ret 4) but never read
;       (marked $dead$); the trip count is the immediate 128.
; Regs: edx = this; eax = running pointer, initially this+2136;
;       ecx = remaining iterations;
;       xmm6 = [this+48]*[this+32]*[this+64] in both lanes;
;       xmm7 = [this+64] in both lanes; xmm0 = 0.0 (lower bound);
;       xmm5 = v_phase; xmm4/xmm1 = v_pB0/v_pC0; xmm3/xmm2 = v_pB1/v_pC1.
; Out:  [this+16] (mPhaseOptimized) = upper lane of v_phase.
; NOTE(review): no voiceIndex offset or outer loop remains, so exactly one
; row is processed -- presumably voiceSize == 1; confirm in main.cpp.
?ProcessOptimized@MyPlugin@@QAEXH@Z PROC        ; MyPlugin::ProcessOptimized, COMDAT
; _this$ = ecx

; 122  :    void ProcessOptimized(int blockSize) {

    ; Stack-realigning prologue: ebx keeps the incoming esp for the
    ; epilogue, esp is rounded down to a 16-byte boundary, and the return
    ; address at [ebx+4] is copied next to the new ebp. This leaves ebp
    ; 16-byte aligned, which the movaps store to _v_phase$31[ebp] relies on.
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -16                ; fffffff0H
    add esp, 4
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov DWORD PTR [esp+4], ebp
    mov ebp, esp
    mov edx, ecx
    xorps   xmm3, xmm3
    xorps   xmm2, xmm2
    sub esp, 16                 ; 00000010H

; 40   :        double *pB = b[voiceIndex];

    mov ecx, 128                ; 00000080H
    movsd   xmm6, QWORD PTR [edx+48]
    lea eax, DWORD PTR [edx+2136]
    mulsd   xmm6, QWORD PTR [edx+32]

; 41   :        double *pC = c[voiceIndex];
; 42   :        double phase = mPhaseOptimized;
; 43   :        double bp0 = mNoteFrequency * mHostPitch;
; 44   : 
; 45   :        __m128d v_boundLower = _mm_set1_pd(0.0);
; 46   :        __m128d v_boundUpper = _mm_set1_pd(PI);
; 47   :        __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);

    movsd   xmm7, QWORD PTR [edx+64]

; 54   : 
; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    movsd   xmm0, QWORD PTR [edx+80]
    movsd   xmm5, QWORD PTR [edx+16]
    movups  xmm4, XMMWORD PTR [edx+80]
    movups  xmm1, XMMWORD PTR [edx+2128]
    mulsd   xmm6, xmm7
    ; xmm3 was zeroed above, so this forms {0.0, pB[0]}.
    unpcklpd xmm3, xmm0

; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    movsd   xmm0, QWORD PTR [edx+2128]
    ; unpcklpd reg, reg duplicates the low lane, implementing _mm_set1_pd.
    unpcklpd xmm7, xmm7
    unpcklpd xmm6, xmm6
    ; xmm2 was zeroed above, so this forms {0.0, pC[0]}.
    unpcklpd xmm2, xmm0
    ; From here on xmm0 holds the 0.0 lower bound used by maxpd.
    xorps   xmm0, xmm0

; 48   :        __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49   : 
; 50   :        __m128d v_pB0 = _mm_load_pd(pB);
; 51   :        v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);

    mulpd   xmm4, xmm6

; 52   :        __m128d v_pC0 = _mm_load_pd(pC);
; 53   :        v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);

    mulpd   xmm1, xmm7

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);

    mulpd   xmm3, xmm6

; 58   :        v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

    mulpd   xmm2, xmm7

; 59   : 
; 60   :        __m128d v_phaseAcc1;
; 61   :        __m128d v_phaseAcc2;
; 62   :        __m128d v_phase = _mm_set1_pd(phase);

    unpcklpd xmm5, xmm5
    npad    13
$LL9@ProcessOpt:

; 63   : 
; 64   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65   :            // some other code (that will use phase, like sin(phase))
; 66   : 
; 67   :            v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);

    ; v_phaseAcc1 is built in xmm1 and v_phaseAcc2 in xmm2; the loads for
    ; the next iteration are interleaved with this iteration's max/min/add.
    addpd   xmm1, xmm4

; 68   :            v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69   :            v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70   :            v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71   :            v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72   :            v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75   : 
; 76   :            v_pB0 = _mm_load_pd(pB + 2);

    ; Relative to eax: [eax-2048] and [eax-2040] are the b-row loads,
    ; [eax] and [eax+8] the c-row loads.
    ; NOTE(review): on the 128th iteration these look-ahead loads address
    ; elements 256 and 257 of each row, past the 256 the loop consumes. The
    ; loaded values are not used after the loop, but confirm the memory
    ; behind c is readable.
    movups  xmm4, XMMWORD PTR [eax-2040]
    addpd   xmm2, xmm3

; 77   :            v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78   :            v_pC0 = _mm_load_pd(pC + 2);
; 79   :            v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80   : 
; 81   :            v_pB1 = _mm_loadu_pd(pB + 1);

    movups  xmm3, XMMWORD PTR [eax-2048]
    maxpd   xmm1, xmm0
    maxpd   xmm2, xmm0
    minpd   xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    minpd   xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    addpd   xmm5, xmm1
    movups  xmm1, XMMWORD PTR [eax+8]
    addpd   xmm5, xmm2

; 82   :            v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83   :            v_pC1 = _mm_loadu_pd(pC + 1);

    movups  xmm2, XMMWORD PTR [eax]
    add eax, 16                 ; 00000010H
    ; v_phase is written to its stack slot on every iteration; the element
    ; read after the loop comes from this slot.
    movaps  XMMWORD PTR _v_phase$31[ebp], xmm5
    mulpd   xmm4, xmm6
    mulpd   xmm1, xmm7
    mulpd   xmm3, xmm6

; 84   :            v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

    mulpd   xmm2, xmm7
    sub ecx, 1
    jne SHORT $LL9@ProcessOpt

; 85   :        }
; 86   : 
; 87   :        mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];

    ; The index expression has been folded to element 1 (offset +8).
    movsd   xmm0, QWORD PTR _v_phase$31[ebp+8]
    movsd   QWORD PTR [edx+16], xmm0

; 123  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 124  :            mParam1.ProcessOptimized(voiceIndex, blockSize);
; 125  :        }
; 126  :    }

    ; Unwind the aligned frame first, then restore the original esp from ebx.
    mov esp, ebp
    pop ebp
    mov esp, ebx
    pop ebx
    ret 4
?ProcessOptimized@MyPlugin@@QAEXH@Z ENDP        ; MyPlugin::ProcessOptimized
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT _main
_TEXT   SEGMENT
_counterProcessing$1$ = -4304               ; size = 4
_counterProcessing$ = -4304             ; size = 8
_bp0$1$ = -4296                     ; size = 8
_v_radiansPerSample$1$ = -4288              ; size = 16
$T3 = -4264                     ; size = 8
_v_phase$38 = -4256                 ; size = 16
$T4 = -4256                     ; size = 8
$T2 = -4232                     ; size = 8
tv1040 = -4224                      ; size = 16
tv1039 = -4208                      ; size = 16
_myPlugin$ = -4192                  ; size = 4176
__$ArrayPad$ = -4                   ; size = 4
_main   PROC                        ; COMDAT

; 129  : int main() {

    push    ebp
    mov ebp, esp
    and esp, -16                ; fffffff0H
    mov eax, 4312               ; 000010d8H
    call    __chkstk
    mov eax, DWORD PTR ___security_cookie
    xor eax, esp
    mov DWORD PTR __$ArrayPad$[esp+4312], eax

; 16   :    alignas(16) double mNoteFrequency = 10.0;

    movsd   xmm0, QWORD PTR __real@4024000000000000

; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea ecx, DWORD PTR _myPlugin$[esp+4392]
    movsd   xmm1, QWORD PTR __real@406fe00000000000
    xorps   xmm2, xmm2

; 16   :    alignas(16) double mNoteFrequency = 10.0;

    movsd   QWORD PTR _myPlugin$[esp+4344], xmm0

; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    xor eax, eax

; 17   :    alignas(16) double mHostPitch = 1.0;

    movsd   xmm0, QWORD PTR __real@3ff0000000000000

; 129  : int main() {

    push    esi
    push    edi

; 14   :    alignas(16) double mPhase = 0.0;

    movsd   QWORD PTR _myPlugin$[esp+4320], xmm2

; 15   :    alignas(16) double mPhaseOptimized = 0.0;

    movsd   QWORD PTR _myPlugin$[esp+4336], xmm2

; 17   :    alignas(16) double mHostPitch = 1.0;

    movsd   QWORD PTR _myPlugin$[esp+4368], xmm0

; 18   :    alignas(16) double mRadiansPerSample = 1.0;

    movsd   QWORD PTR _myPlugin$[esp+4384], xmm0
$LL11@main:
    movd    xmm0, eax

; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea ecx, DWORD PTR [ecx+8]

; 100  :                double value = (sampleIndex / ((double)bufferSize - 1));

    cvtdq2pd xmm0, xmm0
    inc eax
    divsd   xmm0, xmm1

; 101  : 
; 102  :                mParam1.b[voiceIndex][sampleIndex] = value;

    movsd   QWORD PTR [ecx-8], xmm0
    cmp eax, 256                ; 00000100H
    jl  SHORT $LL11@main

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm6, QWORD PTR __real@400921fb54442d18

; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea edi, DWORD PTR _myPlugin$[esp+6448]
    mov ecx, 512                ; 00000200H

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T2[esp+4320], xmm6

; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    xor eax, eax

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T3[esp+4320], xmm2

; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    rep stosd
    movsd   xmm3, QWORD PTR _myPlugin$[esp+4352]
    xorps   xmm0, xmm0
    mulsd   xmm3, QWORD PTR _myPlugin$[esp+4368]

; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    movaps  xmm4, xmm2
    movsd   xmm1, QWORD PTR _myPlugin$[esp+4384]

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    movsd   xmm5, QWORD PTR _myPlugin$[esp+4336]

; 130  :    MyPlugin myPlugin;
; 131  : 
; 132  :    long long numProcessing = 5;
; 133  :    long long counterProcessing = 0;

    movlpd  QWORD PTR _counterProcessing$[esp+4320], xmm0

; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    movsd   xmm0, QWORD PTR _myPlugin$[esp+4400]
    movaps  xmm7, xmm3
    mulsd   xmm7, QWORD PTR _myPlugin$[esp+4384]

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    mov edi, DWORD PTR _counterProcessing$[esp+4324]
    mov esi, DWORD PTR _counterProcessing$[esp+4320]
    unpcklpd xmm4, xmm0
    movsd   xmm0, QWORD PTR _myPlugin$[esp+6448]
    movups  XMMWORD PTR tv1040[esp+4320], xmm4
    movaps  xmm4, xmm2
    unpcklpd xmm1, xmm1
    unpcklpd xmm4, xmm0
    movups  XMMWORD PTR tv1039[esp+4320], xmm4
    movsd   xmm4, QWORD PTR _myPlugin$[esp+4320]
    movsd   QWORD PTR _bp0$1$[esp+4320], xmm3
    unpcklpd xmm7, xmm7
    movaps  XMMWORD PTR _v_radiansPerSample$1$[esp+4320], xmm1
    npad    8
$LL2@main:

; 134  : 
; 135  :    // I'll only process once block, just for analysis
; 136  :    while (counterProcessing++ < numProcessing) {

    add esi, 1

; 26   :        double *pB = b[voiceIndex];

    lea ecx, DWORD PTR _myPlugin$[esp+6448]

; 134  : 
; 135  :    // I'll only process once block, just for analysis
; 136  :    while (counterProcessing++ < numProcessing) {

    mov DWORD PTR _counterProcessing$1$[esp+4320], esi

; 26   :        double *pB = b[voiceIndex];

    mov edx, 256                ; 00000100H

; 134  : 
; 135  :    // I'll only process once block, just for analysis
; 136  :    while (counterProcessing++ < numProcessing) {

    adc edi, 0
    npad    10
$LL29@main:

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm0, QWORD PTR [ecx-2048]
    mulsd   xmm0, xmm3
    addsd   xmm0, QWORD PTR [ecx]
    mulsd   xmm0, QWORD PTR _myPlugin$[esp+4384]
    comisd  xmm0, xmm6
    movsd   QWORD PTR $T4[esp+4320], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    jbe SHORT $LN35@main
    movaps  xmm0, xmm6
    jmp SHORT $LN36@main
$LN35@main:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287  :        return (static_cast<_Ty1&&>(_Left)

    comisd  xmm2, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    lea eax, DWORD PTR $T3[esp+4320]
    lea esi, DWORD PTR $T4[esp+4320]
    cmovbe  eax, esi
    movsd   xmm0, QWORD PTR [eax]

// ...

(Note: I've removed some lines because StackOverflow limits the length of a post.)

It's pretty different. Also, the code generated by VS looks a bit redundant: for example, search for the string phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI); and you will find it many times.

Which settings am I missing? I've matched the same MSVC version (19.15) on an x86 build, and I've also specified the same optimization flags that I actually use.

like image 936
markzzz Avatar asked Oct 24 '25 13:10

markzzz


1 Answers

It doesn't seem that you're using the same compiler flags. The assembly dump from Visual Studio shows that each function was optimized with the flags /Ogtp, which are used internally when you specify /Og in the command line. On the other hand, in the godbolt version, you used /Ot /O2, which internally correspond to /Ogtpy. If I manually add the /Oy flag, the code becomes slightly different, but still not the same as the one generated by Visual Studio.

I realize that the compiler versions are not exactly the same, but the difference between 19.15.26726.0 and 19.15.26732.1 is very minor and probably only includes bug fixes. I think there are other flags that are different. You can go to the Property Pages of your project and find all the compiler options that have been used in the "All Options" and "Additional Options" panes. In the Release build, many options are used other than /arch:SSE2 /Ot /O2. Note that /arch:SSE2 is the default, so you don't have to explicitly specify it. Also, /O2 implies /Ot. So /arch:SSE2 /Ot /O2 is equivalent to /O2.

like image 98
Hadi Brais Avatar answered Oct 27 '25 04:10

Hadi Brais