I've been trying to improve the performance of large (multi-gigabyte) bitarray operations. I'm no SIMD expert, but it appears SIMD is, in all cases, slower than scalar operations. I've tried several optimizations, including loop unrolling, to no avail. Based on the assembly, it appears it's because scalars are able to use the registers. But, if I'm doing something stupid, please let me know. Otherwise, I'm happy to keep scalars... it's much, much simpler.
/* gcc -Wall -O3 bitwise-and.c -o bitwise-and -m64 -fomit-frame-pointer -mtune=nocona -msse2 */
#ifdef ENABLE_PREFETCH
#warning "SIMD PREFETCHING ENABLED"
#else
#warning "SIMD PREFETCHING DISABLED"
#endif
#ifdef ENABLE_SIMD_UNROLLING
#warning "UNROLLING SIMD"
#else
#warning "NOT UNROLLING SIMD"
#endif
#ifdef AVOID_TEMP_VARS
#warning "AVOIDING SIMD TEMPORARY VARIABLES"
#else
#warning "USING SIMD TEMPORARY VARIABLES"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include <setjmp.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include <assert.h>
#define __forceinline __attribute__((always_inline))
double
microtime (void)
{
struct timeval time;
gettimeofday(&time, NULL);
return (double) time.tv_sec * 1E6 + (double) time.tv_usec;
}
__forceinline void
simd_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size)
{
const __m128i *wrd_ptr = (__m128i *) src;
const __m128i *wrd_end = (__m128i *) (src + block_size);
__m128i *dst_ptr = (__m128i *) dst;
_mm_empty();
do
{
__m128i xmm1;
__m128i xmm2;
#ifdef ENABLE_SIMD_UNROLLING
# ifdef ENABLE_PREFETCH
_mm_prefetch((src + 512), _MM_HINT_NTA);
# endif
xmm1 = _mm_load_si128(wrd_ptr++);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
_mm_store_si128(dst_ptr++, xmm1);
xmm1 = _mm_load_si128(wrd_ptr++);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
_mm_store_si128(dst_ptr++, xmm1);
xmm1 = _mm_load_si128(wrd_ptr++);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
_mm_store_si128(dst_ptr++, xmm1);
xmm1 = _mm_load_si128(wrd_ptr++);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
_mm_store_si128(dst_ptr++, xmm1);
#else
# ifdef AVOID_TEMP_VARS
xmm1 = _mm_and_si128(*dst_ptr, *wrd_ptr);
# else
xmm1 = _mm_load_si128(wrd_ptr);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
# endif
_mm_store_si128(dst_ptr, xmm1);
++dst_ptr;
++wrd_ptr;
#endif
} while (wrd_ptr < wrd_end);
}
__forceinline void
word_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size)
{
unsigned int *wrd_ptr = (unsigned int *) src;
unsigned int *wrd_end = (unsigned int *) (src + block_size);
unsigned int *dst_ptr = (unsigned int *) dst;
do
{
dst_ptr[0] &= wrd_ptr[0];
dst_ptr[1] &= wrd_ptr[1];
dst_ptr[2] &= wrd_ptr[2];
dst_ptr[3] &= wrd_ptr[3];
dst_ptr += 4;
wrd_ptr += 4;
} while (wrd_ptr < wrd_end);
}
int
main (int argc, char **argv)
{
unsigned char *dest;
unsigned char *key1;
unsigned char *key2;
size_t minlen = (1024UL * 1024UL * 512UL);
double start_time = 0.0f;
double end_time = 0.0f;
posix_memalign((void *) &key1, sizeof(__m128i), minlen);
posix_memalign((void *) &key2, sizeof(__m128i), minlen);
posix_memalign((void *) &dest, sizeof(__m128i), minlen);
key1[128] = 0xff;
key2[128] = 0x03;
// 128-bit SIMD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
simd_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("Elapsed: %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
// 4xWORD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
word_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("Elapsed: %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
free(dest);
free(key2);
free(key1);
return EXIT_SUCCESS;
}
/* vi: set et sw=2 ts=2: */
In the multiplication case, the normal version actually performs about 20% faster than the bitwise equivalent. On the other hand, division is nearly twice as fast with the bitwise shift and the bitwise modulus (really just an & ) is more than three times faster!
Bitwise operations are incredibly simple and thus usually faster than arithmetic operations. For example to get the green portion of an rgb value, the arithmetic approach is (rgb / 256) % 256 .
No. First, using bitwise operators in contrast to logical operators is prone to error (e.g., doing a right-shift by 1 is NOT equivalent to multiplying by two). Second, the performance benefit is negligible (if any).
On most older microprocessors, bitwise operations are slightly faster than addition and subtraction operations and usually significantly faster than multiplication and division operations.
What's happening here is that you're being bitten by lazy allocation of virtual memory. If you change your code to this:
// 128-bit SIMD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
simd_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("SIMD Elapsed : %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
// 4xWORD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
word_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("Scalar Elapsed: %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
// 128-bit SIMD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
simd_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("SIMD Elapsed : %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
// 4xWORD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
word_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("Scalar Elapsed: %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
you should see results something like this:
$ ./bitwise-and
SIMD Elapsed : 630061.000000s
Scalar Elapsed: 228156.000000s
SIMD Elapsed : 182645.000000s
Scalar Elapsed: 202697.000000s
$
Explanation: the first time you iterate through your large memory allocations you are generating page faults, as previously unused pages get wired in. This gives an artificially high time for the first benchmark, which happens to be the SIMD benchmark. On the second and subsequent benchmarks the pages are all wired in and you get a more accurate benchmark, and as expected the SIMD routine is slightly faster than the scalar routine. The difference is not as large as might be expected, as you are executing only one ALU instruction for every 2 loads + 1 store, so performance is limited by DRAM bandwidth rather than computational efficiency.
As a general rule when writing benchmarking code: always call the benchmark routine at least once prior to any actual timing measurements, so that all memory allocations are properly wired in. After that run the benchmark routine a number of times in a loop and ignore any outliers.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With