This is my second attempt at implementing a fast mul128x64x2 function. The first time I asked the question without a comparison against MSVC's _umul128. Now I have made such a comparison, and the results show that _umul128 is slower than both the native scalar and the handmade SIMD AVX 1.0 code.
Below is my test code:
#include <iostream>
#include <chrono>
#include <intrin.h>
#include <emmintrin.h>
#include <immintrin.h>
#pragma intrinsic(_umul128)
constexpr uint32_t LOW[4] = { 4294967295u, 0u, 4294967295u, 0u };
__forceinline void multiply128x128( const uint32_t ABCD[4], const uint32_t EFGH[4], uint32_t OUT[2][4] ) noexcept
{
__m128i L = _mm_lddqu_si128( reinterpret_cast< __m128i const* >( LOW ) );
__m128i IN = _mm_lddqu_si128( reinterpret_cast< __m128i const* >( EFGH ) );
__m128i A = _mm_set1_epi32( ABCD[0] );
__m128i B = _mm_set1_epi32( ABCD[1] );
__m128i C = _mm_set1_epi32( ABCD[2] );
__m128i D = _mm_set1_epi32( ABCD[3] );
__m128i ED = _mm_mul_epu32( IN, D );
__m128i EC = _mm_mul_epu32( IN, C );
__m128i EB = _mm_mul_epu32( IN, B );
__m128i EA = _mm_mul_epu32( IN, A );
IN = _mm_srli_epi64( IN, 32 );
__m128i FD = _mm_mul_epu32( IN, D );
__m128i FC = _mm_mul_epu32( IN, C );
__m128i FB = _mm_mul_epu32( IN, B );
__m128i FA = _mm_mul_epu32( IN, A );
__m128i FD_H = _mm_srli_epi64( FD, 32 );
__m128i FD_L = _mm_and_si128 ( L, FD );
__m128i FC_H = _mm_srli_epi64( FC, 32 );
__m128i FC_L = _mm_and_si128 ( L, FC );
__m128i FB_H = _mm_srli_epi64( FB, 32 );
__m128i FB_L = _mm_and_si128 ( L, FB );
__m128i FA_H = _mm_srli_epi64( FA, 32 );
__m128i FA_L = _mm_and_si128 ( L, FA );
__m128i ED_H = _mm_srli_epi64( ED, 32 );
__m128i ED_L = _mm_and_si128 ( L, ED );
__m128i EC_H = _mm_srli_epi64( EC, 32 );
__m128i EC_L = _mm_and_si128 ( L, EC );
__m128i EB_H = _mm_srli_epi64( EB, 32 );
__m128i EB_L = _mm_and_si128 ( L, EB );
__m128i EA_H = _mm_srli_epi64( EA, 32 );
__m128i EA_L = _mm_and_si128 ( L, EA );
__m128i SUM_FC_L_FD_H = _mm_add_epi64( FC_L, FD_H );
__m128i SUM_FB_L_FC_H = _mm_add_epi64( FB_L, FC_H );
__m128i SUM_FA_L_FB_H = _mm_add_epi64( FA_L, FB_H );
__m128i SUM_EC_L_ED_H = _mm_add_epi64( EC_L, ED_H );
__m128i SUM_EB_L_EC_H = _mm_add_epi64( EB_L, EC_H );
__m128i SUM_EA_L_EB_H = _mm_add_epi64( EA_L, EB_H );
__m128i SUM_FC_L_FD_H_ED_L = _mm_add_epi64( SUM_FC_L_FD_H, ED_L );
__m128i SUM_FB_L_FC_H_EC_L_ED_H = _mm_add_epi64( SUM_FB_L_FC_H, SUM_EC_L_ED_H );
__m128i SUM_FA_L_FB_H_EB_L_EC_H = _mm_add_epi64( SUM_FA_L_FB_H, SUM_EB_L_EC_H );
__m128i SUM_FA_H_EA_L_EB_H = _mm_add_epi64( FA_H, SUM_EA_L_EB_H );
__m128i SUM_FC_L_FD_H_ED_L_L = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L, 32 );
SUM_FC_L_FD_H_ED_L_L = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L, SUM_FB_L_FC_H_EC_L_ED_H );
__m128i SUM_FC_L_FD_H_ED_L_L_L = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L_L, 32 );
SUM_FC_L_FD_H_ED_L_L_L = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L_L, SUM_FA_L_FB_H_EB_L_EC_H );
__m128i SUM_FC_L_FD_H_ED_L_L_L_L = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L_L_L, 32 );
SUM_FC_L_FD_H_ED_L_L_L_L = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L_L_L, SUM_FA_H_EA_L_EB_H );
__m128i SUM_FC_L_FD_H_ED_L_L_L_L_L = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L_L_L_L, 32 );
SUM_FC_L_FD_H_ED_L_L_L_L_L = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L_L_L_L, EA_H );
OUT[0][0] = SUM_FC_L_FD_H_ED_L_L_L_L_L.m128i_u32[0];
OUT[0][1] = SUM_FC_L_FD_H_ED_L_L_L_L.m128i_u32[0];
OUT[0][2] = SUM_FC_L_FD_H_ED_L_L_L.m128i_u32[0];
OUT[0][3] = SUM_FC_L_FD_H_ED_L_L.m128i_u32[0];
OUT[1][0] = SUM_FC_L_FD_H_ED_L_L_L_L_L.m128i_u32[2];
OUT[1][1] = SUM_FC_L_FD_H_ED_L_L_L_L.m128i_u32[2];
OUT[1][2] = SUM_FC_L_FD_H_ED_L_L_L.m128i_u32[2];
OUT[1][3] = SUM_FC_L_FD_H_ED_L_L.m128i_u32[2];
}
__forceinline void multiply128x128_1( const uint32_t ABCD[4], const uint32_t EFGH[4], uint32_t OUT[2][4] ) noexcept
{
uint64_t ED = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t EC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t EB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t EA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t FD = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t FC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t FB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t FA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t GD = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t GC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t GB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t GA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t HD = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t HC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t HB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t HA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t SUM_FC_L_FD_H = ( FC & 0xFFFFFFFF ) + ( FD >> 32u );
uint64_t SUM_FB_L_FC_H = ( FB & 0xFFFFFFFF ) + ( FC >> 32u );
uint64_t SUM_FA_L_FB_H = ( FA & 0xFFFFFFFF ) + ( FB >> 32u );
uint64_t SUM_EC_L_ED_H = ( EC & 0xFFFFFFFF ) + ( ED >> 32u );
uint64_t SUM_EB_L_EC_H = ( EB & 0xFFFFFFFF ) + ( EC >> 32u );
uint64_t SUM_EA_L_EB_H = ( EA & 0xFFFFFFFF ) + ( EB >> 32u );
uint64_t SUM_HC_L_HD_H = ( HC & 0xFFFFFFFF ) + ( HD >> 32u );
uint64_t SUM_HB_L_HC_H = ( HB & 0xFFFFFFFF ) + ( HC >> 32u );
uint64_t SUM_HA_L_HB_H = ( HA & 0xFFFFFFFF ) + ( HB >> 32u );
uint64_t SUM_GC_L_GD_H = ( GC & 0xFFFFFFFF ) + ( GD >> 32u );
uint64_t SUM_GB_L_GC_H = ( GB & 0xFFFFFFFF ) + ( GC >> 32u );
uint64_t SUM_GA_L_GB_H = ( GA & 0xFFFFFFFF ) + ( GB >> 32u );
uint64_t SUM_FC_L_FD_H_ED_L = SUM_FC_L_FD_H + ( ED & 0xFFFFFFFF );
uint64_t SUM_FB_L_FC_H_EC_L_ED_H = SUM_FB_L_FC_H + SUM_EC_L_ED_H;
uint64_t SUM_FA_L_FB_H_EB_L_EC_H = SUM_FA_L_FB_H + SUM_EB_L_EC_H;
uint64_t SUM_FA_H_EA_L_EB_H = SUM_EA_L_EB_H + ( FA >> 32u );
uint64_t SUM_FC_L_FD_H_ED_L_L = ( SUM_FC_L_FD_H_ED_L >> 32u ) + SUM_FB_L_FC_H_EC_L_ED_H;
uint64_t SUM_FC_L_FD_H_ED_L_L_L = ( SUM_FC_L_FD_H_ED_L_L >> 32u ) + SUM_FA_L_FB_H_EB_L_EC_H;
uint64_t SUM_FC_L_FD_H_ED_L_L_L_L = ( SUM_FC_L_FD_H_ED_L_L_L >> 32u ) + SUM_FA_H_EA_L_EB_H;
uint64_t SUM_FC_L_FD_H_ED_L_L_L_L_L = ( SUM_FC_L_FD_H_ED_L_L_L_L >> 32u ) + ( EA >> 32u );
uint64_t SUM_HC_L_HD_H_GD_L = SUM_HC_L_HD_H + ( GD & 0xFFFFFFFF );
uint64_t SUM_HB_L_HC_H_GC_L_GD_H = SUM_HB_L_HC_H + SUM_GC_L_GD_H;
uint64_t SUM_HA_L_HB_H_GB_L_GC_H = SUM_HA_L_HB_H + SUM_GB_L_GC_H;
uint64_t SUM_HA_H_GA_L_GB_H = SUM_GA_L_GB_H + ( HA >> 32u );
uint64_t SUM_HC_L_HD_H_GD_L_L = ( SUM_HC_L_HD_H_GD_L >> 32u ) + SUM_HB_L_HC_H_GC_L_GD_H;
uint64_t SUM_HC_L_HD_H_GD_L_L_L = ( SUM_HC_L_HD_H_GD_L_L >> 32u ) + SUM_HA_L_HB_H_GB_L_GC_H;
uint64_t SUM_HC_L_HD_H_GD_L_L_L_L = ( SUM_HC_L_HD_H_GD_L_L_L >> 32u ) + SUM_HA_H_GA_L_GB_H;
uint64_t SUM_HC_L_HD_H_GD_L_L_L_L_L = ( SUM_HC_L_HD_H_GD_L_L_L_L >> 32u ) + ( GA >> 32u );
OUT[0][0] = SUM_FC_L_FD_H_ED_L_L_L_L_L;
OUT[0][1] = SUM_FC_L_FD_H_ED_L_L_L_L;
OUT[0][2] = SUM_FC_L_FD_H_ED_L_L_L;
OUT[0][3] = SUM_FC_L_FD_H_ED_L_L;
OUT[1][0] = SUM_HC_L_HD_H_GD_L_L_L_L_L;
OUT[1][1] = SUM_HC_L_HD_H_GD_L_L_L_L;
OUT[1][2] = SUM_HC_L_HD_H_GD_L_L_L;
OUT[1][3] = SUM_HC_L_HD_H_GD_L_L;
}
__forceinline void mulShift( const uint64_t* const m, const uint64_t* const mul , uint32_t OUT[2][4]) noexcept
{
uint64_t B0[2];
uint64_t B2[2];
{
B0[0] = _umul128( m[1], mul[0], &B0[1] );
B2[0] = _umul128( m[0], mul[0], &B2[1] );
uint64_t S = B0[1] + B2[0];
OUT[0][2] = S >> 32;
OUT[0][3] = S & 0xFFFFFFFF;
uint64_t M = B2[1] + ( S < B2[0] );
OUT[0][1] = M & 0xFFFFFFFF;
OUT[0][0] = M >> 32;
}
{
B0[0] = _umul128( m[1], mul[1], &B0[1] );
B2[0] = _umul128( m[0], mul[1], &B2[1] );
uint64_t S = B0[1] + B2[0];
OUT[1][2] = S >> 32;
OUT[1][3] = S & 0xFFFFFFFF;
uint64_t M = B2[1] + ( S < B2[0] );
OUT[1][1] = M & 0xFFFFFFFF;
OUT[1][0] = M >> 32;
}
}
constexpr uint32_t N = 1 << 28;
int main()
{
uint32_t OUT[2][4];
uint32_t ABCD[4] = { 4294967295u, 4294967295u, 4294967295u, 4294967295u };
uint32_t EFGH[4] = { 4294967295u, 4294967295u, 4294967295u, 4294967295u };
multiply128x128_1( ABCD, EFGH, OUT );
uint64_t S_1 = 0u;
uint64_t S_2 = 0u;
uint64_t S_3 = 0u;
auto start_1 = std::chrono::high_resolution_clock::now();
for ( uint32_t i = 0; i < N; ++i )
{
EFGH[0] = i;
EFGH[1] = i;
EFGH[2] = i + 1;
EFGH[3] = i + 1;
ABCD[0] = i;
ABCD[1] = i;
ABCD[2] = i + 1;
ABCD[3] = i + 1;
multiply128x128( ABCD, EFGH, OUT );
S_1 += OUT[0][0] + OUT[0][1] + OUT[0][2] + OUT[0][3];
S_1 += OUT[1][0] + OUT[1][1] + OUT[1][2] + OUT[1][3];
}
auto stop_1 = std::chrono::high_resolution_clock::now();
std::cout << "Test A: " << std::chrono::duration_cast<std::chrono::milliseconds>( stop_1 - start_1 ).count() << '\n';
auto start_2 = std::chrono::high_resolution_clock::now();
for ( uint32_t i = 0; i < N; ++i )
{
EFGH[0] = i;
EFGH[1] = i;
EFGH[2] = i + 1;
EFGH[3] = i + 1;
ABCD[0] = i;
ABCD[1] = i;
ABCD[2] = i + 1;
ABCD[3] = i + 1;
mulShift( reinterpret_cast<const uint64_t*>( ABCD ), reinterpret_cast<const uint64_t*>( EFGH ), OUT );
S_2 += OUT[0][0] + OUT[0][1] + OUT[0][2] + OUT[0][3];
S_2 += OUT[1][0] + OUT[1][1] + OUT[1][2] + OUT[1][3];
}
auto stop_2 = std::chrono::high_resolution_clock::now();
std::cout << "Test B: " << std::chrono::duration_cast<std::chrono::milliseconds>( stop_2 - start_2 ).count() << '\n';
auto start_3 = std::chrono::high_resolution_clock::now();
for ( uint32_t i = 0; i < N; ++i )
{
EFGH[0] = i;
EFGH[1] = i;
EFGH[2] = i + 1;
EFGH[3] = i + 1;
ABCD[0] = i;
ABCD[1] = i;
ABCD[2] = i + 1;
ABCD[3] = i + 1;
multiply128x128_1( ABCD, EFGH, OUT );
S_3 += OUT[0][0] + OUT[0][1] + OUT[0][2] + OUT[0][3];
S_3 += OUT[1][0] + OUT[1][1] + OUT[1][2] + OUT[1][3];
}
auto stop_3 = std::chrono::high_resolution_clock::now();
std::cout << "Test C: " << std::chrono::duration_cast<std::chrono::milliseconds>( stop_3 - start_3 ).count() << '\n';
std::cout << S_1 << " " << S_2 << " " << S_3 << '\n';
}
Why is _umul128 so slow? Maybe I made some mistakes in my test code above?
My results: Test A (SIMD): 4546 ms. Test B (_umul128): 6637 ms. Test C (scalar): 2333 ms.
Tested on Windows 10, x64, MSVC 2019.
The _umul128 version isn't really that slow, but you're gimping it with store-forwarding stalls by messing around with 32-bit arrays, which makes MSVC emit terrible asm.
Optimization is defeating your benchmark; the pure C version isn't really that fast.
Especially with the simple input data:
ABCD[0] = EFGH[0] = i;
ABCD[1] = EFGH[1] = i;
ABCD[2] = EFGH[2] = i + 1;
ABCD[3] = EFGH[3] = i + 1;
Initializing both inputs like this creates a huge amount of opportunity for optimization after inlining the pure C version. It does i*i 4 times, i*(i+1) = i*i + i another 8 times, and also (i+1)*(i+1) 4 times. MSVC isn't dumb and notices this. This is called Common Subexpression Elimination (CSE).
You'll need to come up with a more sophisticated way to fake input if you want to see how slow the pure C really is. Maybe generate ahead of time then loop over memory containing inputs? Setting up inputs from a loop counter costs almost as much as a multiply.
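One way to do that (my own sketch, not from the question; the make_inputs name and the fixed seed are arbitrary) is to fill buffers with random operands up front and let the timed loop only read from them:
#include <cstdint>
#include <random>
#include <vector>
// Hypothetical benchmark setup: generate the operands once, outside the timed
// region, so the compiler can't CSE the multiplies away and the cost of
// producing inputs isn't counted as multiply throughput.
std::vector<uint32_t> make_inputs( std::size_t n_operands )
{
    std::mt19937 gen( 42 );                        // fixed seed: repeatable runs
    std::uniform_int_distribution<uint32_t> dist;  // full 32-bit range
    std::vector<uint32_t> v( n_operands * 4 );     // 4 x 32-bit limbs per operand
    for ( auto& limb : v ) limb = dist( gen );
    return v;
}
// In the timed loop, index into two pre-generated buffers instead of writing
// the loop counter into ABCD / EFGH:
//     multiply128x128_1( &inA[4 * i], &inB[4 * i], OUT );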
MSVC's asm output confirms that much of the work was optimized away for the pure C version (Godbolt with MSVC 19.22 for x64):
...
$LL10@main:
lea r15, QWORD PTR [rax+1]
mov rcx, r15
mov r9, r15
imul rcx, rax # only 3, not 16, imul instructions.
imul rax, rax # (None appear later in this loop in the ... part)
imul r9, r15
mov edi, ecx
mov r14, rcx
mov r8d, eax
shr r14, 32 ; 00000020H
shr rax, 32 ; 00000020H
...
sub r13, 1
jne $LL10@main
MSVC is bad at optimizing intrinsics and does all 4 mul m64 instructions instead of noticing that the same i:i * (i+1):(i+1) product is done twice.
More importantly, the _umul128 loop is hurt by store-forwarding stalls because it actually stores your array to memory with 32-bit stores and then uses 64-bit loads to feed mul m64.
Also, handling the output in 32-bit chunks is just shooting yourself in the foot, introducing extra shifts and mov operations.
This is not complicated; literally just 3 instructions, mul r64 and imul r64, r64 plus an add for the high half, is all that's needed. GCC/clang easily emit the right thing, and the x86-64 System V calling convention can return a 128-bit int in registers.
On Godbolt: https://godbolt.org/z/DcZhSl
#include <stdint.h>
#ifdef __GNUC__
typedef unsigned __int128 u128;
u128 mul128x64( u128 a, uint64_t b) {
return a * b;
}
#endif
# clang -O3 for the x86-64 System V ABI (Linux)
mul128x64(unsigned __int128, unsigned long): #
mov rax, rdi
imul rsi, rdx
mul rdx
add rdx, rsi
ret
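For completeness, a small usage sketch of the GNU-C version (my own example, not part of the original answer): the __int128 product splits back into two 64-bit halves with a shift and a cast.
#ifdef __GNUC__
#include <stdint.h>
// Split the 128-bit product into two 64-bit halves; a*b here is the same
// truncating 128x64-bit multiply that mul128x64 performs.
static uint64_t mul128x64_split( uint64_t a_lo, uint64_t a_hi, uint64_t b, uint64_t *hi )
{
    unsigned __int128 a = ( (unsigned __int128)a_hi << 64 ) | a_lo;
    unsigned __int128 p = a * b;
    *hi = (uint64_t)( p >> 64 );   // high 64 bits
    return (uint64_t)p;            // low 64 bits
}
#endif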
For MSVC we have to do that ourselves, and the calling convention means the result is returned in memory.
#ifdef _MSC_VER
#include <intrin.h>
struct u128 { uint64_t u64[2]; };
u128 mul128x64( uint64_t a_lo, uint64_t a_hi, uint64_t b)
{
uint64_t lolo_high;
uint64_t lolo = _umul128( a_lo, b, &lolo_high );
uint64_t lohi = a_hi * b;
return {{lolo, lohi + lolo_high}};
}
#endif
# MSVC x64 -O2
u128 mul128x64(unsigned __int64,unsigned __int64,unsigned __int64) PROC
mov rax, r9
mul rdx
imul r8, r9
mov QWORD PTR [rcx], rax # store the retval into hidden pointer
mov rax, rcx
add r8, rdx
mov QWORD PTR [rcx+8], r8
ret 0
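Applied to the question's benchmark, the same idea means feeding _umul128 from 64-bit variables and keeping the two interesting halves as uint64_t instead of slicing them into a uint32_t[2][4]. A sketch of that rework (the mulShift_u64 name and the 64-bit OUT layout are mine, assuming the same semantics as the question's mulShift):
#include <stdint.h>
#include <intrin.h>
// Hypothetical rework of the question's mulShift: everything stays in 64-bit
// registers, so MSVC doesn't have to do 32-bit stores followed by 64-bit
// reloads (the store-forwarding stall), and no extra shifts are needed to
// slice the result into 32-bit chunks.
__forceinline void mulShift_u64( const uint64_t m[2], const uint64_t mul[2],
                                 uint64_t OUT[2][2] ) noexcept
{
    for ( int j = 0; j < 2; ++j )
    {
        uint64_t hi0, hi2;
        uint64_t lo0 = _umul128( m[1], mul[j], &hi0 );
        uint64_t lo2 = _umul128( m[0], mul[j], &hi2 );
        uint64_t S = hi0 + lo2;            // middle 64 bits
        uint64_t M = hi2 + ( S < lo2 );    // top 64 bits, with the carry out of S
        OUT[j][0] = M;                     // corresponds to OUT[j][0..1] in the question
        OUT[j][1] = S;                     // corresponds to OUT[j][2..3] in the question
        (void)lo0;                         // lowest 64 bits are discarded, as in the question
    }
}
The surrounding benchmark would then also accumulate the 64-bit halves directly instead of summing eight 32-bit elements.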
Your __m128i intrinsics version is unlikely to be a win. Modern x86 (mainstream Intel SnB-family, AMD Ryzen) has 1/clock throughput for mul and imul. (Except Ryzen, where widening i/mul r64 has 2c throughput, but still 1/clock for imul r64, r64.)
So overall throughput for a 64 x 128-bit multiply on Sandybridge-family is one per 2 cycles (bottlenecked on port 1), if you implement it in C that compiles to asm like this.
Given that you need more than 4 pmuludq instructions to implement a multiply, AVX1 is a non-starter. (Skylake has 0.5c throughput for pmuludq. Sandybridge has 1c throughput, so you'd need to get the job done in 2 pmuludq insns per multiply (on average) to compete with scalar. And that's without considering all the shift / shuffle / add work that needs doing.)
Possibly worth considering on Bulldozer-family, where 64-bit scalar multiply is 4c throughput but pmuludq is 1c. (https://agner.org/optimize/) Producing 128 product bits per cycle (two 32x32 => 64-bit products) is better than producing 128 product bits per 4 cycles, if you can get them shifted and added without eating up too many extra cycles.
Again, MSVC is bad at constant-propagation and CSE optimization through intrinsics, so your intrinsics version doesn't benefit from any of that.
Your test code also uses _mm_set1_epi32( ) from scalar integer loop variables, requiring vmovd and vpshufd instructions.
And you get scalar store / vector reload for the lddqu intrinsics on those arrays, so again you have store-forwarding stalls.
The only hope for this being good with SSE2 or AVX1 is if your data comes from memory, not registers. Or if you can keep your data in vector registers for a long time, not constantly moving it back and forth. Especially on Bulldozer-family where int <-> SIMD has high latency.
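For example, if ABCD really is sitting in memory, the four broadcasts can come from one vector load plus pshufd shuffles instead of four _mm_set1_epi32 calls on scalar ints. A sketch of that idea (the helper name is mine):
#include <emmintrin.h>
#include <stdint.h>
// Sketch: load ABCD once and broadcast each 32-bit lane with pshufd, so the
// data stays in vector registers instead of taking a vmovd + vpshufd trip
// through integer registers for every lane.
static inline void load_and_broadcast( const uint32_t ABCD[4],
                                       __m128i& A, __m128i& B, __m128i& C, __m128i& D )
{
    __m128i abcd = _mm_loadu_si128( reinterpret_cast<const __m128i*>( ABCD ) );
    A = _mm_shuffle_epi32( abcd, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    B = _mm_shuffle_epi32( abcd, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    C = _mm_shuffle_epi32( abcd, _MM_SHUFFLE( 2, 2, 2, 2 ) );
    D = _mm_shuffle_epi32( abcd, _MM_SHUFFLE( 3, 3, 3, 3 ) );
}
This only helps if ABCD was prepared earlier in memory; if the benchmark loop just stored it 32 bits at a time, the 128-bit reload still hits a store-forwarding stall.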