Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Bitwise-AND Slower with SIMD than Scalar

I've been trying to improve the performance of large (multi-gigabyte) bitarray operations. I'm no SIMD expert, but it appears SIMD is, in all cases, slower than scalar operations. I've tried several optimizations, including loop unrolling, to no avail. Based on the assembly, it appears it's because scalars are able to use the registers. But, if I'm doing something stupid, please let me know. Otherwise, I'm happy to keep scalars... it's much, much simpler.

/* gcc -Wall -O3 bitwise-and.c -o bitwise-and -m64 -fomit-frame-pointer -mtune=nocona -msse2 */

#ifdef ENABLE_PREFETCH
#warning "SIMD PREFETCHING ENABLED"
#else
#warning "SIMD PREFETCHING DISABLED"
#endif

#ifdef ENABLE_SIMD_UNROLLING
#warning "UNROLLING SIMD"
#else
#warning "NOT UNROLLING SIMD"
#endif

#ifdef AVOID_TEMP_VARS
#warning "AVOIDING SIMD TEMPORARY VARIABLES"
#else
#warning "USING SIMD TEMPORARY VARIABLES"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include <setjmp.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include <assert.h>

#define __forceinline __attribute__((always_inline))

double
microtime (void)
{
  struct timeval time;
  gettimeofday(&time, NULL);

  return (double) time.tv_sec * 1E6 + (double) time.tv_usec;
}

__forceinline void
simd_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size)
{
  const __m128i *wrd_ptr = (__m128i *) src;
  const __m128i *wrd_end = (__m128i *) (src + block_size);
  __m128i *dst_ptr = (__m128i *) dst;

  _mm_empty();

  do
  {
    __m128i xmm1;
    __m128i xmm2;

#ifdef ENABLE_SIMD_UNROLLING

# ifdef ENABLE_PREFETCH
    _mm_prefetch((src + 512), _MM_HINT_NTA);
# endif

    xmm1 = _mm_load_si128(wrd_ptr++);
    xmm2 = _mm_load_si128(dst_ptr);
    xmm1 = _mm_and_si128(xmm1, xmm2);
    _mm_store_si128(dst_ptr++, xmm1);

    xmm1 = _mm_load_si128(wrd_ptr++);
    xmm2 = _mm_load_si128(dst_ptr);
    xmm1 = _mm_and_si128(xmm1, xmm2);
    _mm_store_si128(dst_ptr++, xmm1);

    xmm1 = _mm_load_si128(wrd_ptr++);
    xmm2 = _mm_load_si128(dst_ptr);
    xmm1 = _mm_and_si128(xmm1, xmm2);
    _mm_store_si128(dst_ptr++, xmm1);

    xmm1 = _mm_load_si128(wrd_ptr++);
    xmm2 = _mm_load_si128(dst_ptr);
    xmm1 = _mm_and_si128(xmm1, xmm2);
    _mm_store_si128(dst_ptr++, xmm1);
#else
# ifdef AVOID_TEMP_VARS
    xmm1 = _mm_and_si128(*dst_ptr, *wrd_ptr);
# else
    xmm1 = _mm_load_si128(wrd_ptr);
    xmm2 = _mm_load_si128(dst_ptr);
    xmm1 = _mm_and_si128(xmm1, xmm2);
# endif
    _mm_store_si128(dst_ptr, xmm1);
    ++dst_ptr;
    ++wrd_ptr;
#endif
  } while (wrd_ptr < wrd_end);
}

__forceinline void
word_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size)
{
  unsigned int *wrd_ptr = (unsigned int *) src;
  unsigned int *wrd_end = (unsigned int *) (src + block_size);
  unsigned int *dst_ptr = (unsigned int *) dst;
  do
  {
    dst_ptr[0] &= wrd_ptr[0];
    dst_ptr[1] &= wrd_ptr[1];
    dst_ptr[2] &= wrd_ptr[2];
    dst_ptr[3] &= wrd_ptr[3];

    dst_ptr += 4;
    wrd_ptr += 4;
  } while (wrd_ptr < wrd_end);
}

int
main (int argc, char **argv)
{
  unsigned char *dest;
  unsigned char *key1;
  unsigned char *key2;
  size_t minlen = (1024UL * 1024UL * 512UL);
  double start_time = 0.0f;
  double end_time = 0.0f;

  posix_memalign((void *) &key1, sizeof(__m128i), minlen);
  posix_memalign((void *) &key2, sizeof(__m128i), minlen);
  posix_memalign((void *) &dest, sizeof(__m128i), minlen);

  key1[128] = 0xff;
  key2[128] = 0x03;

  // 128-bit SIMD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  simd_bitwise_and(dest, key2, minlen);
  end_time = microtime();
  printf("Elapsed: %8.6fs\n", (end_time - start_time));
  assert(0x03 == dest[128]);

  // 4xWORD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  word_bitwise_and(dest, key2, minlen);
  end_time = microtime();
  printf("Elapsed: %8.6fs\n", (end_time - start_time));
  assert(0x03 == dest[128]);

  free(dest);
  free(key2);
  free(key1);

  return EXIT_SUCCESS;
}

/* vi: set et sw=2 ts=2: */
like image 452
Jonah H. Harris Avatar asked Sep 19 '13 20:09

Jonah H. Harris


People also ask

Is Bitwise faster than modulus?

In the multiplication case, the normal version actually performs about 20% faster than the bitwise equivalent. On the other hand, division is nearly twice as fast with the bitwise shift and the bitwise modulus (really just an & ) is more than three times faster!

Are bitwise operations faster than arithmetic?

Bitwise operations are incredibly simple and thus usually faster than arithmetic operations. For example to get the green portion of an rgb value, the arithmetic approach is (rgb / 256) % 256 .

Is Bitwise or faster than logical or?

No. First, using bitwise operators in contrast to logical operators is prone to error (e.g., doing a right-shift by 1 is NOT equivalent to multiplying by two). Second, the performance benefit is negligible (if any).

Is bit shifting faster than addition?

On most older microprocessors, bitwise operations are slightly faster than addition and subtraction operations and usually significantly faster than multiplication and division operations.


1 Answers

What's happening here is that you're being bitten by lazy allocation of virtual memory. If you change your code to this:

  // 128-bit SIMD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  simd_bitwise_and(dest, key2, minlen);
  end_time = microtime();
  printf("SIMD Elapsed  : %8.6fs\n", (end_time - start_time));
  assert(0x03 == dest[128]);

  // 4xWORD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  word_bitwise_and(dest, key2, minlen);
  end_time = microtime();
  printf("Scalar Elapsed: %8.6fs\n", (end_time - start_time));
  assert(0x03 == dest[128]);

  // 128-bit SIMD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  simd_bitwise_and(dest, key2, minlen);
  end_time = microtime();
  printf("SIMD Elapsed  : %8.6fs\n", (end_time - start_time));
  assert(0x03 == dest[128]);

  // 4xWORD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  word_bitwise_and(dest, key2, minlen);
  end_time = microtime();
  printf("Scalar Elapsed: %8.6fs\n", (end_time - start_time));
  assert(0x03 == dest[128]);

you should see results something like this:

$ ./bitwise-and 
SIMD Elapsed  : 630061.000000s
Scalar Elapsed: 228156.000000s
SIMD Elapsed  : 182645.000000s
Scalar Elapsed: 202697.000000s
$ 

Explanation: the first time you iterate through your large memory allocations you are generating page faults, as previously unused pages get wired in. This gives an artificially high time for the first benchmark, which happens to be the SIMD benchmark. On the second and subsequent benchmarks the pages are all wired in and you get a more accurate benchmark, and as expected the SIMD routine is slightly faster than the scalar routine. The difference is not as large as might be expected, as you are executing only one ALU instruction for every 2 loads + 1 store, so performance is limited by DRAM bandwidth rather than computational efficiency.

As a general rule when writing benchmarking code: always call the benchmark routine at least once prior to any actual timing measurements, so that all memory allocations are properly wired in. After that run the benchmark routine a number of times in a loop and ignore any outliers.

like image 57
Paul R Avatar answered Sep 30 '22 15:09

Paul R