Here's the code generated by godbolt.
Here's the same code as generated by Visual Studio in my main.asm listing file (enabled via Project -> C/C++ -> Output Files -> Assembler Output: Assembly With Source Code (/FAs)):
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.15.26732.1
TITLE c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB OLDNAMES
EXTRN __imp____std_terminate:PROC
EXTRN @__security_check_cookie@4:PROC
EXTRN __imp____CxxFrameHandler3:PROC
PUBLIC ??$?RABNABN@?$less@X@std@@QBE_NABN0@Z ; std::less<void>::operator()<double const &,double const &>
PUBLIC ??$clamp@NU?$less@X@std@@@std@@YAABNABN00U?$less@X@0@@Z ; std::clamp<double,std::less<void> >
PUBLIC ??$clamp@N@std@@YAABNABN00@Z ; std::clamp<double>
PUBLIC _main
PUBLIC ?ProcessOptimized@MyPlugin@@QAEXH@Z ; MyPlugin::ProcessOptimized
PUBLIC ?Process@MyPlugin@@QAEXH@Z ; MyPlugin::Process
PUBLIC ??1MyPlugin@@QAE@XZ ; MyPlugin::~MyPlugin
PUBLIC ??0MyPlugin@@QAE@XZ ; MyPlugin::MyPlugin
PUBLIC ?ProcessOptimized@Param@@QAEXHH@Z ; Param::ProcessOptimized
PUBLIC ?Process@Param@@QAEXHH@Z ; Param::Process
PUBLIC ??0Param@@QAE@XZ ; Param::Param
PUBLIC __real@3ff0000000000000
PUBLIC __real@400921fb54442d18
PUBLIC __real@4024000000000000
PUBLIC __real@406fe00000000000
PUBLIC __xmm@00000003000000020000000100000000
PUBLIC __xmm@400921fb54442d18400921fb54442d18
PUBLIC __xmm@406fe00000000000406fe00000000000
EXTRN __chkstk:PROC
EXTRN ___security_cookie:DWORD
EXTRN __fltused:DWORD
; Read-only constant pool used by the functions in this listing. Each symbol
; name spells out the hexadecimal bit pattern of the value it holds.
; COMDAT __xmm@406fe00000000000406fe00000000000
CONST SEGMENT
; 16 bytes = two little-endian doubles 0x406fe00000000000 (255.0 each);
; in the byte string 'o' is 6fH and '@' is 40H.
__xmm@406fe00000000000406fe00000000000 DB 00H, 00H, 00H, 00H, 00H, 0e0H, 'o'
DB '@', 00H, 00H, 00H, 00H, 00H, 0e0H, 'o@'
CONST ENDS
; COMDAT __xmm@400921fb54442d18400921fb54442d18
CONST SEGMENT
; 16 bytes = two little-endian doubles 0x400921fb54442d18 (PI each);
; '-DT' is 2dH 44H 54H and '!' is 21H.
__xmm@400921fb54442d18400921fb54442d18 DB 018H, '-DT', 0fbH, '!', 09H, '@'
DB 018H, '-DT', 0fbH, '!', 09H, '@'
CONST ENDS
; COMDAT __xmm@00000003000000020000000100000000
CONST SEGMENT
; 16 bytes = four little-endian 32-bit integers {0, 1, 2, 3} (lane offsets
; added to a broadcast sample index in MyPlugin::MyPlugin).
__xmm@00000003000000020000000100000000 DB 00H, 00H, 00H, 00H, 01H, 00H, 00H
DB 00H, 02H, 00H, 00H, 00H, 03H, 00H, 00H, 00H
CONST ENDS
; Scalar double constants (value shown in the trailing comment of each DQ).
; COMDAT __real@406fe00000000000
CONST SEGMENT
__real@406fe00000000000 DQ 0406fe00000000000r ; 255
CONST ENDS
; COMDAT __real@4024000000000000
CONST SEGMENT
__real@4024000000000000 DQ 04024000000000000r ; 10
CONST ENDS
; COMDAT __real@400921fb54442d18
CONST SEGMENT
__real@400921fb54442d18 DQ 0400921fb54442d18r ; 3.14159
CONST ENDS
; COMDAT __real@3ff0000000000000
CONST SEGMENT
__real@3ff0000000000000 DQ 03ff0000000000000r ; 1
CONST ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??0Param@@QAE@XZ
_TEXT SEGMENT
; Param::Param() -- default constructor. `this` arrives in ecx and is returned
; in eax. It stores the default values of five doubles spaced 16 bytes apart:
;   [this+0] = 0.0, [this+16] = 0.0, [this+32] = 10.0, [this+48] = 1.0, [this+64] = 1.0
; The member-initializer lines quoted in MyPlugin::MyPlugin's listing name these
; slots mPhase, mPhaseOptimized, mNoteFrequency, mHostPitch and mRadiansPerSample.
??0Param@@QAE@XZ PROC ; Param::Param, COMDAT
; _this$ = ecx
; 23 : Param() { }
; xmm0 = 0.0 (xorps is the zeroing idiom)
xorps xmm0, xmm0
; return value: eax = this
mov eax, ecx
movsd QWORD PTR [ecx], xmm0
movsd QWORD PTR [ecx+16], xmm0
; xmm0 = 10.0
movsd xmm0, QWORD PTR __real@4024000000000000
movsd QWORD PTR [ecx+32], xmm0
; xmm0 = 1.0, stored to two consecutive slots
movsd xmm0, QWORD PTR __real@3ff0000000000000
movsd QWORD PTR [ecx+48], xmm0
movsd QWORD PTR [ecx+64], xmm0
ret 0
??0Param@@QAE@XZ ENDP ; Param::Param
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?Process@Param@@QAEXHH@Z
_TEXT SEGMENT
; Param::Process(int voiceIndex, int blockSize) -- scalar reference version.
; Accumulates clamp(mRadiansPerSample * (bp0 * pB[i] + pC[i]), 0.0, PI) into
; the phase for 256 samples and writes the result back to [this+0].
;
; In:    ecx = this; [ebp+8] = voiceIndex; [ebp+12] = blockSize (marked $dead$:
;        never read, the trip count is the immediate 256).
; Out:   none; callee pops both stack arguments (ret 8).
; Saves: esi, edi (pushed/popped).
;
; Register roles inside the loop:
;   esi  = this                      edx = &c row element (cursor, +8 per sample)
;   ecx  = remaining iterations      [edx-2048] = matching b row element
;   xmm1 = running phase             xmm2 = PI (upper bound)
;   xmm3 = bp0 = [this+48]*[this+32] xmm4 = [this+64] (mRadiansPerSample)
;   xmm5 = 0.0 (lower bound)
;
; Stack temporaries exist because std::clamp returns a reference, so the
; compiler materializes its operands in memory:
;   $T1 = PI (written, not read in this loop), $T2 = 0.0, $T3 = computed value.
$T1 = -24 ; size = 8
$T3 = -16 ; size = 8
$T2 = -8 ; size = 8
_voiceIndex$ = 8 ; size = 4
_blockSize$dead$ = 12 ; size = 4
?Process@Param@@QAEXHH@Z PROC ; Param::Process, COMDAT
; _this$ = ecx
; 25 : inline void Process(int voiceIndex, int blockSize) {
push ebp
mov ebp, esp
sub esp, 24 ; 00000018H
; 26 : double *pB = b[voiceIndex];
mov eax, DWORD PTR _voiceIndex$[ebp]
xorps xmm5, xmm5
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm2, QWORD PTR __real@400921fb54442d18
push esi
mov esi, ecx
; eax = voiceIndex * 2048 = byte offset of the voice's row
shl eax, 11 ; 0000000bH
push edi
movsd QWORD PTR $T1[ebp], xmm2
mov ecx, 256 ; 00000100H
movsd QWORD PTR $T2[ebp], xmm5
movsd xmm3, QWORD PTR [esi+48]
; edx = this + 2128 (+ row offset below); the same pointer minus 2048
; addresses the b row, since 2128 - 2048 = 80
lea edx, DWORD PTR [esi+2128]
movsd xmm1, QWORD PTR [esi]
add edx, eax
mulsd xmm3, QWORD PTR [esi+32]
movsd xmm4, QWORD PTR [esi+64]
npad 11
$LL4@Process:
; xmm0 = (b[i] * bp0 + c[i]) * mRadiansPerSample
movsd xmm0, QWORD PTR [edx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [edx]
mulsd xmm0, xmm4
; upper-bound test: fall through only when value > PI
comisd xmm0, xmm2
movsd QWORD PTR $T3[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
jbe SHORT $LN10@Process
; value > PI: result is PI
movaps xmm0, xmm2
jmp SHORT $LN11@Process
$LN10@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; 287 : return (static_cast<_Ty1&&>(_Left)
; lower-bound test: compare 0.0 against value
comisd xmm5, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
; select by address: eax = &0.0, replaced with &value when 0.0 <= value
lea eax, DWORD PTR $T2[ebp]
lea edi, DWORD PTR $T3[ebp]
cmovbe eax, edi
movsd xmm0, QWORD PTR [eax]
$LN11@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; 31 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
add edx, 8
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
addsd xmm1, xmm0
sub ecx, 1
jne SHORT $LL4@Process
; 35 : }
; 36 :
; 37 : mPhase = phase;
; 38 : }
pop edi
; write the accumulated phase back to [this+0]
movsd QWORD PTR [esi], xmm1
pop esi
mov esp, ebp
pop ebp
ret 8
?Process@Param@@QAEXHH@Z ENDP ; Param::Process
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?ProcessOptimized@Param@@QAEXHH@Z
_TEXT SEGMENT
; Param::ProcessOptimized(int voiceIndex, int blockSize) -- hand-vectorized
; (SSE2 intrinsics) version. Runs 128 iterations; each iteration adds two
; clamped two-lane vectors (the "0" stream from pB/pC and the "1" stream from
; the same rows shifted by one element) to v_phase, then stores the high lane
; of v_phase to [this+16].
;
; In:    ecx = this; [ebx+8] = voiceIndex; [ebx+12] = blockSize (marked $dead$:
;        never read, the trip count is the immediate 128).
; Out:   none; callee pops both stack arguments (ret 8).
;
; Frame: ebx keeps the incoming stack pointer (argument access), ebp addresses
; a 16-byte-aligned local area so _v_phase$ can be written with movaps.
;
; Register roles inside the loop:
;   edx  = this                       ecx  = remaining iterations
;   eax  = &c[voice][2k+1] cursor (+16 per iteration); eax-2048 is the same
;          position in the b row
;   xmm0 = v_phase                    xmm7 = 0.0 (lower bound)
;   xmm5 = {rps*bp0, rps*bp0}         xmm6 = {rps, rps}
;   xmm4 = v_pB0   xmm1 = v_pC0 / v_phaseAcc1
;   xmm3 = v_pB1   xmm2 = v_pC1 / v_phaseAcc2
_v_phase$ = -16 ; size = 16
_voiceIndex$ = 8 ; size = 4
_blockSize$dead$ = 12 ; size = 4
?ProcessOptimized@Param@@QAEXHH@Z PROC ; Param::ProcessOptimized, COMDAT
; _this$ = ecx
; 39 : inline void ProcessOptimized(int voiceIndex, int blockSize) {
; Stack realignment prologue: round esp down to a 16-byte boundary, then lay
; out saved ebp plus a copy of the return address ([ebx+4]) above it --
; presumably so the aligned frame still looks like a conventional ebp frame.
push ebx
mov ebx, esp
sub esp, 8
and esp, -16 ; fffffff0H
add esp, 4
push ebp
mov ebp, DWORD PTR [ebx+4]
mov DWORD PTR [esp+4], ebp
mov ebp, esp
; 40 : double *pB = b[voiceIndex];
mov eax, DWORD PTR _voiceIndex$[ebx]
mov edx, ecx
; eax = voiceIndex * 2048 = byte offset of the voice's row
shl eax, 11 ; 0000000bH
xorps xmm3, xmm3
xorps xmm2, xmm2
sub esp, 16 ; 00000010H
xorps xmm7, xmm7
mov ecx, 128 ; 00000080H
; 41 : double *pC = c[voiceIndex];
; 42 : double phase = mPhaseOptimized;
; 43 : double bp0 = mNoteFrequency * mHostPitch;
movsd xmm5, QWORD PTR [edx+48]
mulsd xmm5, QWORD PTR [edx+32]
; 44 :
; 45 : __m128d v_boundLower = _mm_set1_pd(0.0);
; 46 : __m128d v_boundUpper = _mm_set1_pd(PI);
; 47 : __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
movsd xmm6, QWORD PTR [edx+64]
; 48 : __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49 :
; 50 : __m128d v_pB0 = _mm_load_pd(pB);
; 51 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 52 : __m128d v_pC0 = _mm_load_pd(pC);
; 53 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 54 :
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
; b row starts at this+80, c row at this+2128 (both plus the row offset in eax)
movsd xmm0, QWORD PTR [eax+edx+80]
movups xmm4, XMMWORD PTR [eax+edx+80]
movups xmm1, XMMWORD PTR [eax+edx+2128]
mulsd xmm5, xmm6
; xmm3 = {0.0, pB[0]}
unpcklpd xmm3, xmm0
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
movsd xmm0, QWORD PTR [eax+edx+2128]
; eax becomes this + 2136 + row offset = address of c row element 1
add eax, 2136 ; 00000858H
; xmm2 = {0.0, pC[0]}
unpcklpd xmm2, xmm0
add eax, edx
; 58 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
; 59 :
; 60 : __m128d v_phaseAcc1;
; 61 : __m128d v_phaseAcc2;
; 62 : __m128d v_phase = _mm_set1_pd(phase);
movsd xmm0, QWORD PTR [edx+16]
; broadcast the two scale factors, then pre-scale all four initial vectors
unpcklpd xmm5, xmm5
unpcklpd xmm6, xmm6
mulpd xmm4, xmm5
mulpd xmm1, xmm6
mulpd xmm3, xmm5
mulpd xmm2, xmm6
unpcklpd xmm0, xmm0
npad 2
$LL4@ProcessOpt:
; 63 :
; 64 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65 : // some other code (that will use phase, like sin(phase))
; 66 :
; 67 : v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);
addpd xmm1, xmm4
; 68 : v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69 : v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70 : v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71 : v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72 : v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73 : v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74 : v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75 :
; 76 : v_pB0 = _mm_load_pd(pB + 2);
; The loads for the NEXT iteration are interleaved with this iteration's
; clamp/accumulate: [eax-2040] = b[i+2..i+3], [eax-2048] = b[i+1..i+2],
; [eax+8] = c[i+2..i+3], [eax] = c[i+1..i+2].
; NOTE(review): on the final iteration eax points at c row element 255, so
; [eax+8] and [eax-2040] read elements 256..257, i.e. 16 bytes past the
; 256-element row; the loaded values are never used -- confirm the source
; intends this over-read.
movups xmm4, XMMWORD PTR [eax-2040]
addpd xmm2, xmm3
; 77 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78 : v_pC0 = _mm_load_pd(pC + 2);
; 79 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80 :
; 81 : v_pB1 = _mm_loadu_pd(pB + 1);
movups xmm3, XMMWORD PTR [eax-2048]
; clamp both accumulators to [0.0, PI] with max/min (branch-free)
maxpd xmm1, xmm7
maxpd xmm2, xmm7
minpd xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
minpd xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
addpd xmm0, xmm1
movups xmm1, XMMWORD PTR [eax+8]
addpd xmm0, xmm2
; 82 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83 : v_pC1 = _mm_loadu_pd(pC + 1);
movups xmm2, XMMWORD PTR [eax]
add eax, 16 ; 00000010H
; v_phase is spilled to the aligned stack slot every iteration; the slot is
; read back lane-wise after the loop (m128d_f64 access in source line 87)
movaps XMMWORD PTR _v_phase$[ebp], xmm0
mulpd xmm4, xmm5
mulpd xmm1, xmm6
mulpd xmm3, xmm5
; 84 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
mulpd xmm2, xmm6
sub ecx, 1
jne SHORT $LL4@ProcessOpt
; 85 : }
; 86 :
; 87 : mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
; the lane index is fixed here: the high lane (+8) is always taken
movsd xmm0, QWORD PTR _v_phase$[ebp+8]
movsd QWORD PTR [edx+16], xmm0
; 88 : }
; unwind the aligned frame first, then the original one held in ebx
mov esp, ebp
pop ebp
mov esp, ebx
pop ebx
ret 8
?ProcessOptimized@Param@@QAEXHH@Z ENDP ; Param::ProcessOptimized
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??0MyPlugin@@QAE@XZ
_TEXT SEGMENT
; MyPlugin::MyPlugin() -- constructor with the member initializers and both
; fill loops emitted inline. `this` arrives in ecx and is returned in eax.
;   1. Stores the five scalar defaults at [this+0..64].
;   2. Fills 256 doubles starting at this+80 with sampleIndex / 255.0,
;      four samples per loop iteration.
;   3. Zero-fills 2048 bytes (256 doubles) starting at this+2128.
; Only one row of each buffer is written; no outer voiceIndex loop appears in
; the generated code (presumably voiceSize is 1 -- confirm against the source).
; Saves: esi, edi (pushed/popped).
??0MyPlugin@@QAE@XZ PROC ; MyPlugin::MyPlugin, COMDAT
; _this$ = ecx
; 97 : // fill b
; 98 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
; xmm2 = integer lanes {0,1,2,3}; xmm3 = {255.0, 255.0}; edx = sampleIndex
movaps xmm2, XMMWORD PTR __xmm@00000003000000020000000100000000
xorps xmm0, xmm0
movaps xmm3, XMMWORD PTR __xmm@406fe00000000000406fe00000000000
xor edx, edx
push esi
mov esi, ecx
push edi
; 14 : alignas(16) double mPhase = 0.0;
movsd QWORD PTR [esi], xmm0
; 97 : // fill b
; 98 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
; ecx = store cursor, biased by +8; it is advanced by 32 before the stores,
; so the first store at [ecx-40] lands on this+80
lea ecx, DWORD PTR [esi+88]
; 15 : alignas(16) double mPhaseOptimized = 0.0;
movsd QWORD PTR [esi+16], xmm0
; 16 : alignas(16) double mNoteFrequency = 10.0;
movsd xmm0, QWORD PTR __real@4024000000000000
movsd QWORD PTR [esi+32], xmm0
; 17 : alignas(16) double mHostPitch = 1.0;
movsd xmm0, QWORD PTR __real@3ff0000000000000
movsd QWORD PTR [esi+48], xmm0
; 18 : alignas(16) double mRadiansPerSample = 1.0;
movsd QWORD PTR [esi+64], xmm0
$LL7@MyPlugin:
; 100 : double value = (sampleIndex / ((double)bufferSize - 1));
; first pair: broadcast edx, add lanes {0,1}, convert the low two ints to
; doubles and divide by 255.0
movd xmm0, edx
; eax = sampleIndex + 2, captured before edx is advanced (used by second pair)
lea eax, DWORD PTR [edx+2]
pshufd xmm1, xmm0, 0
lea ecx, DWORD PTR [ecx+32]
; movq copies only the low 64 bits of xmm2, i.e. lanes {0,1}
movq xmm0, xmm2
add edx, 4
paddd xmm1, xmm0
cvtdq2pd xmm0, xmm1
divpd xmm0, xmm3
; 101 :
; 102 : mParam1.b[voiceIndex][sampleIndex] = value;
movlpd QWORD PTR [ecx-40], xmm0
movhpd QWORD PTR [ecx-32], xmm0
; second pair: same computation for sampleIndex+2 and sampleIndex+3
movd xmm0, eax
pshufd xmm1, xmm0, 0
movq xmm0, xmm2
paddd xmm1, xmm0
cvtdq2pd xmm0, xmm1
divpd xmm0, xmm3
movlpd QWORD PTR [ecx-24], xmm0
movhpd QWORD PTR [ecx-16], xmm0
cmp edx, 256 ; 00000100H
jl SHORT $LL7@MyPlugin
; 103 : }
; 104 : }
; 105 :
; 106 : // fill c
; 107 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
; the whole "fill c" loop collapses to a zero fill: 512 dwords = 2048 bytes
lea edi, DWORD PTR [esi+2128]
xor eax, eax
mov ecx, 512 ; 00000200H
rep stosd
; 109 : double value = 0.0;
; 110 :
; 111 : mParam1.c[voiceIndex][sampleIndex] = value;
; 112 : }
; 113 : }
; 114 : }
pop edi
; return value: eax = this
mov eax, esi
pop esi
ret 0
??0MyPlugin@@QAE@XZ ENDP ; MyPlugin::MyPlugin
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??1MyPlugin@@QAE@XZ
_TEXT SEGMENT
; MyPlugin::~MyPlugin() -- empty destructor: returns immediately without
; touching `this` (the listing marks the ecx argument as $dead$).
??1MyPlugin@@QAE@XZ PROC ; MyPlugin::~MyPlugin, COMDAT
; _this$dead$ = ecx
; 115 : ~MyPlugin() { }
ret 0
??1MyPlugin@@QAE@XZ ENDP ; MyPlugin::~MyPlugin
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?Process@MyPlugin@@QAEXH@Z
_TEXT SEGMENT
; MyPlugin::Process(int blockSize) -- the voice loop and the call to
; Param::Process have been inlined: the body is the scalar clamp/accumulate
; loop over the first row of each buffer (no voiceIndex offset is applied).
;
; In:    ecx = this; [ebp+8] = blockSize (marked $dead$: never read, the trip
;        count is the immediate 256).
; Out:   none; callee pops its one stack argument (ret 4).
; Saves: esi, edi (pushed/popped).
;
; Register roles inside the loop:
;   esi  = this                      ecx = &c element cursor (+8 per sample)
;   edx  = remaining iterations      [ecx-2048] = matching b element
;   xmm1 = running phase             xmm2 = PI (upper bound)
;   xmm3 = bp0 = [this+48]*[this+32] xmm4 = [this+64] (mRadiansPerSample)
;   xmm5 = 0.0 (lower bound)
; Stack temporaries: $T2 = PI (written, not read in this loop), $T3 = 0.0,
; $T4 = computed value.
$T2 = -28 ; size = 8
$T4 = -20 ; size = 8
$T3 = -12 ; size = 8
_blockSize$dead$ = 8 ; size = 4
?Process@MyPlugin@@QAEXH@Z PROC ; MyPlugin::Process, COMDAT
; _this$ = ecx
; 117 : void Process(int blockSize) {
push ebp
mov ebp, esp
sub esp, 28 ; 0000001cH
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm2, QWORD PTR __real@400921fb54442d18
xorps xmm5, xmm5
; 117 : void Process(int blockSize) {
push esi
mov esi, ecx
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T2[ebp], xmm2
; 117 : void Process(int blockSize) {
push edi
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T3[ebp], xmm5
mov edx, 256 ; 00000100H
movsd xmm3, QWORD PTR [esi+48]
; 27 : double *pC = c[voiceIndex];
; ecx = this + 2128; the same pointer minus 2048 addresses the b data at this+80
lea ecx, DWORD PTR [esi+2128]
; 28 : double phase = mPhase;
; 29 : double bp0 = mNoteFrequency * mHostPitch;
movsd xmm1, QWORD PTR [esi]
mulsd xmm3, QWORD PTR [esi+32]
movsd xmm4, QWORD PTR [esi+64]
npad 3
$LL9@Process:
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
; xmm0 = (b[i] * bp0 + c[i]) * mRadiansPerSample
movsd xmm0, QWORD PTR [ecx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [ecx]
mulsd xmm0, xmm4
; upper-bound test: fall through only when value > PI
comisd xmm0, xmm2
movsd QWORD PTR $T4[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
jbe SHORT $LN15@Process
; value > PI: result is PI
movaps xmm0, xmm2
jmp SHORT $LN16@Process
$LN15@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; 287 : return (static_cast<_Ty1&&>(_Left)
; lower-bound test: compare 0.0 against value
comisd xmm5, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
; select by address: eax = &0.0, replaced with &value when 0.0 <= value
lea eax, DWORD PTR $T3[ebp]
lea edi, DWORD PTR $T4[ebp]
cmovbe eax, edi
movsd xmm0, QWORD PTR [eax]
$LN16@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; 31 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
add ecx, 8
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
addsd xmm1, xmm0
sub edx, 1
jne SHORT $LL9@Process
; 118 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119 : mParam1.Process(voiceIndex, blockSize);
; 120 : }
; 121 : }
pop edi
; 37 : mPhase = phase;
movsd QWORD PTR [esi], xmm1
; 118 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119 : mParam1.Process(voiceIndex, blockSize);
; 120 : }
; 121 : }
pop esi
mov esp, ebp
pop ebp
ret 4
?Process@MyPlugin@@QAEXH@Z ENDP ; MyPlugin::Process
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?ProcessOptimized@MyPlugin@@QAEXH@Z
_TEXT SEGMENT
; MyPlugin::ProcessOptimized(int blockSize) -- the voice loop and the call to
; Param::ProcessOptimized have been inlined: the body is the SSE2 loop over
; the first row of each buffer (no voiceIndex offset is applied). Runs 128
; iterations and stores the high lane of v_phase to [this+16].
;
; In:    ecx = this; [ebx+8] = blockSize (marked $dead$: never read, the trip
;        count is the immediate 128).
; Out:   none; callee pops its one stack argument (ret 4).
;
; Frame: ebx keeps the incoming stack pointer, ebp addresses a 16-byte-aligned
; local area so _v_phase$31 can be written with movaps.
;
; Register roles inside the loop (note they differ from the stand-alone
; Param::ProcessOptimized listing):
;   edx  = this                       ecx  = remaining iterations
;   eax  = &c[2k+1] cursor (+16 per iteration); eax-2048 is the same
;          position in the b data
;   xmm5 = v_phase                    xmm0 = 0.0 (lower bound)
;   xmm6 = {rps*bp0, rps*bp0}         xmm7 = {rps, rps}
;   xmm4 = v_pB0   xmm1 = v_pC0 / v_phaseAcc1
;   xmm3 = v_pB1   xmm2 = v_pC1 / v_phaseAcc2
_v_phase$31 = -16 ; size = 16
_blockSize$dead$ = 8 ; size = 4
?ProcessOptimized@MyPlugin@@QAEXH@Z PROC ; MyPlugin::ProcessOptimized, COMDAT
; _this$ = ecx
; 122 : void ProcessOptimized(int blockSize) {
; Stack realignment prologue: round esp down to a 16-byte boundary, then lay
; out saved ebp plus a copy of the return address ([ebx+4]) above it.
push ebx
mov ebx, esp
sub esp, 8
and esp, -16 ; fffffff0H
add esp, 4
push ebp
mov ebp, DWORD PTR [ebx+4]
mov DWORD PTR [esp+4], ebp
mov ebp, esp
mov edx, ecx
xorps xmm3, xmm3
xorps xmm2, xmm2
sub esp, 16 ; 00000010H
; 40 : double *pB = b[voiceIndex];
mov ecx, 128 ; 00000080H
movsd xmm6, QWORD PTR [edx+48]
; eax = this + 2136 = address of c element 1 (c data starts at this+2128)
lea eax, DWORD PTR [edx+2136]
mulsd xmm6, QWORD PTR [edx+32]
; 41 : double *pC = c[voiceIndex];
; 42 : double phase = mPhaseOptimized;
; 43 : double bp0 = mNoteFrequency * mHostPitch;
; 44 :
; 45 : __m128d v_boundLower = _mm_set1_pd(0.0);
; 46 : __m128d v_boundUpper = _mm_set1_pd(PI);
; 47 : __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
movsd xmm7, QWORD PTR [edx+64]
; 54 :
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
movsd xmm0, QWORD PTR [edx+80]
movsd xmm5, QWORD PTR [edx+16]
movups xmm4, XMMWORD PTR [edx+80]
movups xmm1, XMMWORD PTR [edx+2128]
mulsd xmm6, xmm7
; xmm3 = {0.0, pB[0]}
unpcklpd xmm3, xmm0
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
movsd xmm0, QWORD PTR [edx+2128]
unpcklpd xmm7, xmm7
unpcklpd xmm6, xmm6
; xmm2 = {0.0, pC[0]}
unpcklpd xmm2, xmm0
; xmm0 is now free and becomes the 0.0 lower bound for the loop
xorps xmm0, xmm0
; 48 : __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49 :
; 50 : __m128d v_pB0 = _mm_load_pd(pB);
; 51 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
mulpd xmm4, xmm6
; 52 : __m128d v_pC0 = _mm_load_pd(pC);
; 53 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
mulpd xmm1, xmm7
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
mulpd xmm3, xmm6
; 58 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
mulpd xmm2, xmm7
; 59 :
; 60 : __m128d v_phaseAcc1;
; 61 : __m128d v_phaseAcc2;
; 62 : __m128d v_phase = _mm_set1_pd(phase);
unpcklpd xmm5, xmm5
npad 13
$LL9@ProcessOpt:
; 63 :
; 64 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65 : // some other code (that will use phase, like sin(phase))
; 66 :
; 67 : v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);
addpd xmm1, xmm4
; 68 : v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69 : v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70 : v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71 : v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72 : v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73 : v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74 : v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75 :
; 76 : v_pB0 = _mm_load_pd(pB + 2);
; The loads for the NEXT iteration are interleaved with this iteration's
; clamp/accumulate: [eax-2040] = b[i+2..i+3], [eax-2048] = b[i+1..i+2],
; [eax+8] = c[i+2..i+3], [eax] = c[i+1..i+2].
; NOTE(review): on the final iteration eax = this + 2136 + 127*16, so
; [eax+8] reads this+4176..4191; main's listing gives _myPlugin$ a size of
; 4176, so that load appears to read 16 bytes past the object (the loaded
; value is never used) -- confirm against the source.
movups xmm4, XMMWORD PTR [eax-2040]
addpd xmm2, xmm3
; 77 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78 : v_pC0 = _mm_load_pd(pC + 2);
; 79 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80 :
; 81 : v_pB1 = _mm_loadu_pd(pB + 1);
movups xmm3, XMMWORD PTR [eax-2048]
; clamp both accumulators to [0.0, PI] with max/min (branch-free)
maxpd xmm1, xmm0
maxpd xmm2, xmm0
minpd xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
minpd xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
addpd xmm5, xmm1
movups xmm1, XMMWORD PTR [eax+8]
addpd xmm5, xmm2
; 82 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83 : v_pC1 = _mm_loadu_pd(pC + 1);
movups xmm2, XMMWORD PTR [eax]
add eax, 16 ; 00000010H
; v_phase is spilled to the aligned stack slot every iteration; the slot is
; read back lane-wise after the loop (m128d_f64 access in source line 87)
movaps XMMWORD PTR _v_phase$31[ebp], xmm5
mulpd xmm4, xmm6
mulpd xmm1, xmm7
mulpd xmm3, xmm6
; 84 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
mulpd xmm2, xmm7
sub ecx, 1
jne SHORT $LL9@ProcessOpt
; 85 : }
; 86 :
; 87 : mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
; the lane index is fixed here: the high lane (+8) is always taken
movsd xmm0, QWORD PTR _v_phase$31[ebp+8]
movsd QWORD PTR [edx+16], xmm0
; 123 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 124 : mParam1.ProcessOptimized(voiceIndex, blockSize);
; 125 : }
; 126 : }
; unwind the aligned frame first, then the original one held in ebx
mov esp, ebp
pop ebp
mov esp, ebx
pop ebx
ret 4
?ProcessOptimized@MyPlugin@@QAEXH@Z ENDP ; MyPlugin::ProcessOptimized
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT _main
_TEXT SEGMENT
_counterProcessing$1$ = -4304 ; size = 4
_counterProcessing$ = -4304 ; size = 8
_bp0$1$ = -4296 ; size = 8
_v_radiansPerSample$1$ = -4288 ; size = 16
$T3 = -4264 ; size = 8
_v_phase$38 = -4256 ; size = 16
$T4 = -4256 ; size = 8
$T2 = -4232 ; size = 8
tv1040 = -4224 ; size = 16
tv1039 = -4208 ; size = 16
_myPlugin$ = -4192 ; size = 4176
__$ArrayPad$ = -4 ; size = 4
_main PROC ; COMDAT
; 129 : int main() {
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
mov eax, 4312 ; 000010d8H
call __chkstk
mov eax, DWORD PTR ___security_cookie
xor eax, esp
mov DWORD PTR __$ArrayPad$[esp+4312], eax
; 16 : alignas(16) double mNoteFrequency = 10.0;
movsd xmm0, QWORD PTR __real@4024000000000000
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
lea ecx, DWORD PTR _myPlugin$[esp+4392]
movsd xmm1, QWORD PTR __real@406fe00000000000
xorps xmm2, xmm2
; 16 : alignas(16) double mNoteFrequency = 10.0;
movsd QWORD PTR _myPlugin$[esp+4344], xmm0
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
xor eax, eax
; 17 : alignas(16) double mHostPitch = 1.0;
movsd xmm0, QWORD PTR __real@3ff0000000000000
; 129 : int main() {
push esi
push edi
; 14 : alignas(16) double mPhase = 0.0;
movsd QWORD PTR _myPlugin$[esp+4320], xmm2
; 15 : alignas(16) double mPhaseOptimized = 0.0;
movsd QWORD PTR _myPlugin$[esp+4336], xmm2
; 17 : alignas(16) double mHostPitch = 1.0;
movsd QWORD PTR _myPlugin$[esp+4368], xmm0
; 18 : alignas(16) double mRadiansPerSample = 1.0;
movsd QWORD PTR _myPlugin$[esp+4384], xmm0
$LL11@main:
movd xmm0, eax
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
lea ecx, DWORD PTR [ecx+8]
; 100 : double value = (sampleIndex / ((double)bufferSize - 1));
cvtdq2pd xmm0, xmm0
inc eax
divsd xmm0, xmm1
; 101 :
; 102 : mParam1.b[voiceIndex][sampleIndex] = value;
movsd QWORD PTR [ecx-8], xmm0
cmp eax, 256 ; 00000100H
jl SHORT $LL11@main
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm6, QWORD PTR __real@400921fb54442d18
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
lea edi, DWORD PTR _myPlugin$[esp+6448]
mov ecx, 512 ; 00000200H
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T2[esp+4320], xmm6
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
xor eax, eax
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T3[esp+4320], xmm2
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
rep stosd
movsd xmm3, QWORD PTR _myPlugin$[esp+4352]
xorps xmm0, xmm0
mulsd xmm3, QWORD PTR _myPlugin$[esp+4368]
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
movaps xmm4, xmm2
movsd xmm1, QWORD PTR _myPlugin$[esp+4384]
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
movsd xmm5, QWORD PTR _myPlugin$[esp+4336]
; 130 : MyPlugin myPlugin;
; 131 :
; 132 : long long numProcessing = 5;
; 133 : long long counterProcessing = 0;
movlpd QWORD PTR _counterProcessing$[esp+4320], xmm0
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
movsd xmm0, QWORD PTR _myPlugin$[esp+4400]
movaps xmm7, xmm3
mulsd xmm7, QWORD PTR _myPlugin$[esp+4384]
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
mov edi, DWORD PTR _counterProcessing$[esp+4324]
mov esi, DWORD PTR _counterProcessing$[esp+4320]
unpcklpd xmm4, xmm0
movsd xmm0, QWORD PTR _myPlugin$[esp+6448]
movups XMMWORD PTR tv1040[esp+4320], xmm4
movaps xmm4, xmm2
unpcklpd xmm1, xmm1
unpcklpd xmm4, xmm0
movups XMMWORD PTR tv1039[esp+4320], xmm4
movsd xmm4, QWORD PTR _myPlugin$[esp+4320]
movsd QWORD PTR _bp0$1$[esp+4320], xmm3
unpcklpd xmm7, xmm7
movaps XMMWORD PTR _v_radiansPerSample$1$[esp+4320], xmm1
npad 8
$LL2@main:
; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {
add esi, 1
; 26 : double *pB = b[voiceIndex];
lea ecx, DWORD PTR _myPlugin$[esp+6448]
; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {
mov DWORD PTR _counterProcessing$1$[esp+4320], esi
; 26 : double *pB = b[voiceIndex];
mov edx, 256 ; 00000100H
; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {
adc edi, 0
npad 10
$LL29@main:
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm0, QWORD PTR [ecx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [ecx]
mulsd xmm0, QWORD PTR _myPlugin$[esp+4384]
comisd xmm0, xmm6
movsd QWORD PTR $T4[esp+4320], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
jbe SHORT $LN35@main
movaps xmm0, xmm6
jmp SHORT $LN36@main
$LN35@main:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; 287 : return (static_cast<_Ty1&&>(_Left)
comisd xmm2, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
lea eax, DWORD PTR $T3[esp+4320]
lea esi, DWORD PTR $T4[esp+4320]
cmovbe eax, esi
movsd xmm0, QWORD PTR [eax]
// ...
(Note: I've removed some lines because Stack Overflow limits the length of a post.)
It's pretty different. Also, the code generated by VS looks a bit redundant: search for the string phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI); and you will find it many times.
Which settings am I missing? I've matched the same MSVC version (19.15) on an x86 build, and I've also applied the same optimization flags I actually use.
It doesn't seem that you're using the same compiler flags. The assembly dump from Visual Studio shows that each function was optimized with the flags /Ogtp, which are used internally when you specify /Og in the command line. On the other hand, in the godbolt version, you used /Ot /O2, which internally correspond to /Ogtpy. If I manually add the /Oy flag, the code becomes slightly different, but still not the same as the one generated by Visual Studio.
I realize that the compiler versions are not exactly the same, but the difference between 19.15.26726.0 and 19.15.26732.1 is very minor and probably only includes bug fixes. I think there are other flags that are different. You can go to the Property Pages of your project and find all the compiler options that have been used in the "All Options" and "Additional Options" panes. In the Release build, many options are used other than /arch:SSE2 /Ot /O2. Note that /arch:SSE2 is the default, so you don't have to explicitly specify it. Also, /O2 implies /Ot. So /arch:SSE2 /Ot /O2 is equivalent to /O2.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With