I have found that manually calculating the %
operator on __int128
is significantly faster than the built-in compiler operator. I will show you how to calculate modulo 9, but the method can be used to calculate modulo any other number.
First, consider the built-in compiler operator:
uint64_t mod9_v1(unsigned __int128 n)
{
return n % 9;
}
Now consider my manual implementation:
uint64_t mod9_v2(unsigned __int128 n)
{
uint64_t r = 0;
r += (uint32_t)(n);
r += (uint32_t)(n >> 32) * (uint64_t)4;
r += (uint32_t)(n >> 64) * (uint64_t)7;
r += (uint32_t)(n >> 96);
return r % 9;
}
Measuring over 100,000,000 random numbers gives the following results:
mod9_v1 | 3.986052 secs
mod9_v2 | 1.814339 secs
GCC 9.3.0 with -march=native -O3
was used on AMD Ryzen Threadripper 2990WX.
Here is a link to godbolt.
I would like to ask if it behaves the same way on your side? (Before reporting a bug to GCC Bugzilla).
UPDATE: On request, I supply a generated assembly:
mod9_v1:
sub rsp, 8
mov edx, 9
xor ecx, ecx
call __umodti3
add rsp, 8
ret
mod9_v2:
mov rax, rdi
shrd rax, rsi, 32
mov rdx, rsi
mov r8d, eax
shr rdx, 32
mov eax, edi
add rax, rdx
lea rax, [rax+r8*4]
mov esi, esi
lea rcx, [rax+rsi*8]
sub rcx, rsi
mov rax, rcx
movabs rdx, -2049638230412172401
mul rdx
mov rax, rdx
shr rax, 3
and rdx, -8
add rdx, rax
mov rax, rcx
sub rax, rdx
ret
Integer division and modulo are relatively slow because there is no direct hardware support (they compile to multiple instruction sequences).
Modulo operator is expensive but the division is expensive too. So converting your code from using modulo operator to division is not going to optimize your code. Show activity on this post. Modulo can be done with a single processor instruction on most architectures (ex.
A modulo-operation is very slow. An if is most likely to be faster than a modulo and more readable.
Modular exponentiation can be performed with a negative exponent e by finding the modular multiplicative inverse d of b modulo m using the extended Euclidean algorithm. That is: c = be mod m = d−e mod m, where e < 0 and b ⋅ d ≡ 1 (mod m). Modular exponentiation is efficient to compute, even for very large integers.
The reason for this difference is clear from the assembly listings: the %
operator applied to 128-bit integers is implemented via a library call to a generic function that cannot take advantage of compile time knowledge of the divisor value, which makes it possible to turn division and modulo operations into much faster multiplications.
The timing difference is even more significant on my old Macbook-pro using clang, where I mod_v2()
is x15 times faster than mod_v1()
.
Note however these remarks:
for
loop, not after the first printf
as currently coded.rand_u128()
only produces 124 bits assuming RAND_MAX
is 0x7fffffff
.Using your slicing approach, I extended you code to reduce the number of steps using slices of 42, 42 and 44 bits, which further improves the timings (because 242 % 9 == 1):
#pragma GCC diagnostic ignored "-Wpedantic"
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <time.h>
static uint64_t mod9_v1(unsigned __int128 n) {
return n % 9;
}
static uint64_t mod9_v2(unsigned __int128 n) {
uint64_t r = 0;
r += (uint32_t)(n);
r += (uint32_t)(n >> 32) * (uint64_t)(((uint64_t)1ULL << 32) % 9);
r += (uint32_t)(n >> 64) * (uint64_t)(((unsigned __int128)1 << 64) % 9);
r += (uint32_t)(n >> 96);
return r % 9;
}
static uint64_t mod9_v3(unsigned __int128 n) {
return (((uint64_t)(n >> 0) & 0x3ffffffffff) +
((uint64_t)(n >> 42) & 0x3ffffffffff) +
((uint64_t)(n >> 84))) % 9;
}
unsigned __int128 rand_u128() {
return ((unsigned __int128)rand() << 97 ^
(unsigned __int128)rand() << 66 ^
(unsigned __int128)rand() << 35 ^
(unsigned __int128)rand() << 4 ^
(unsigned __int128)rand());
}
#define N 100000000
int main() {
srand(42);
unsigned __int128 *arr = malloc(sizeof(unsigned __int128) * N);
if (arr == NULL) {
return 1;
}
for (size_t n = 0; n < N; ++n) {
arr[n] = rand_u128();
}
#if 1
/* check that modulo 9 is calculated correctly */
for (size_t n = 0; n < N; ++n) {
uint64_t m = mod9_v1(arr[n]);
assert(m == mod9_v2(arr[n]));
assert(m == mod9_v3(arr[n]));
}
#endif
clock_t clk1 = -clock();
uint64_t sum1 = 0;
for (size_t n = 0; n < N; ++n) {
sum1 += mod9_v1(arr[n]);
}
clk1 += clock();
clock_t clk2 = -clock();
uint64_t sum2 = 0;
for (size_t n = 0; n < N; ++n) {
sum2 += mod9_v2(arr[n]);
}
clk2 += clock();
clock_t clk3 = -clock();
uint64_t sum3 = 0;
for (size_t n = 0; n < N; ++n) {
sum3 += mod9_v3(arr[n]);
}
clk3 += clock();
printf("mod9_v1: sum=%"PRIu64", elapsed time: %.3f secs\n", sum1, clk1 / (double)CLOCKS_PER_SEC);
printf("mod9_v2: sum=%"PRIu64", elapsed time: %.3f secs\n", sum2, clk2 / (double)CLOCKS_PER_SEC);
printf("mod9_v3: sum=%"PRIu64", elapsed time: %.3f secs\n", sum3, clk3 / (double)CLOCKS_PER_SEC);
free(arr);
return 0;
}
Here are the timings on my linux server (gcc):
mod9_v1: sum=400041273, elapsed time: 7.992 secs
mod9_v2: sum=400041273, elapsed time: 1.295 secs
mod9_v3: sum=400041273, elapsed time: 1.131 secs
The same code on my Macbook (clang):
mod9_v1: sum=399978071, elapsed time: 32.900 secs
mod9_v2: sum=399978071, elapsed time: 0.204 secs
mod9_v3: sum=399978071, elapsed time: 0.185 secs
In the mean time (while waiting for Bugzilla), you could let the preprocessor do the optimization for you. E.g. define a macro called MOD_INT128(n,d) :
#define MODCALC0(n,d) ((65536*n)%d)
#define MODCALC1(n,d) MODCALC0(MODCALC0(n,d),d)
#define MODCALC2(n,d) MODCALC1(MODCALC1(n,d),d)
#define MODCALC3(n,d) MODCALC2(MODCALC1(n,d),d)
#define MODPARAM(n,d,a,b,c) \
((uint64_t)((uint32_t)(n) ) + \
(uint64_t)((uint32_t)(n >> 32) * (uint64_t)a) + \
(uint64_t)((uint32_t)(n >> 64) * (uint64_t)b) + \
(uint64_t)((uint32_t)(n >> 96) * (uint64_t)c) ) % d
#define MOD_INT128(n,d) MODPARAM(n,d,MODCALC1(1,d),MODCALC2(1,d),MODCALC3(1,d))
Now,
uint64_t mod9_v3(unsigned __int128 n)
{
return MOD_INT128( n, 9 );
}
will generate similar assembly language as the mod9_v2() function, and
uint64_t mod8_v3(unsigned __int128 n)
{
return MOD_INT128( n, 8 );
}
works fine with already existing optimization (GCC 10.2.0)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With