Fast counting the number of set bits in __m128i register

2 Answers

Here are some codes I used in an old project (there is a research paper about it). The function popcnt8 below computes the number of bits set in each byte.

SSE2-only version (based on Algorithm 3 in Hacker's Delight book):

static const __m128i popcount_mask1 = _mm_set1_epi8(0x77);
static const __m128i popcount_mask2 = _mm_set1_epi8(0x0F);
static inline __m128i popcnt8(__m128i x) {
    __m128i n;
    // Count bits in each 4-bit field.
    n = _mm_srli_epi64(x, 1);
    n = _mm_and_si128(popcount_mask1, n);
    x = _mm_sub_epi8(x, n);
    n = _mm_srli_epi64(n, 1);
    n = _mm_and_si128(popcount_mask1, n);
    x = _mm_sub_epi8(x, n);
    n = _mm_srli_epi64(n, 1);
    n = _mm_and_si128(popcount_mask1, n);
    x = _mm_sub_epi8(x, n);
    x = _mm_add_epi8(x, _mm_srli_epi16(x, 4));
    x = _mm_and_si128(popcount_mask2, x);
    return x;
}

SSSE3 version (due to Wojciech Mula):

static const __m128i popcount_mask = _mm_set1_epi8(0x0F);
static const __m128i popcount_table = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
static inline __m128i popcnt8(__m128i n) {
    const __m128i pcnt0 = _mm_shuffle_epi8(popcount_table, _mm_and_si128(n, popcount_mask));
    const __m128i pcnt1 = _mm_shuffle_epi8(popcount_table, _mm_and_si128(_mm_srli_epi16(n, 4), popcount_mask));
    return _mm_add_epi8(pcnt0, pcnt1);
}

XOP version (equivalent to SSSE3, but uses XOP instructions which are faster on AMD Bulldozer)

static const __m128i popcount_mask = _mm_set1_epi8(0x0F);
static const __m128i popcount_table = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
static const __m128i popcount_shift = _mm_set1_epi8(-4);
static inline __m128i popcount8(__m128i n) {
    const __m128i pcnt0 = _mm_perm_epi8(popcount_table, popcount_table, _mm_and_si128(n, popcount_mask));
    const __m128i pcnt1 = _mm_perm_epi8(popcount_table, popcount_table, _mm_shl_epi8(n, popcount_shift));
    return _mm_add_epi8(pcnt0, pcnt1);
}

Function popcnt64 below counts the number of bits in the low and high 64-bit parts of the SSE register:

SSE2 version:

static inline __m128i popcnt64(__m128i n) {
    const __m128i cnt8 = popcnt8(n);
    return _mm_sad_epu8(cnt8, _mm_setzero_si128());
}

XOP version:

static inline __m128i popcnt64(__m128i n) {
    const __m128i cnt8 = popcnt8(n);
    return _mm_haddq_epi8(cnt8);
}

Finally, the function popcnt128 below count the number of bits in the whole 128-bit register:

static inline int popcnt128(__m128i n) {
    const __m128i cnt64 = popcnt64(n);
    const __m128i cnt64_hi = _mm_unpackhi_epi64(cnt64, cnt64);
    const __m128i cnt128 = _mm_add_epi32(cnt64, cnt64_hi);
    return _mm_cvtsi128_si32(cnt128);
}

However, a more efficient way to implement popcnt128 is to use hardware POPCNT instruction (on processors which support it):

static inline int popcnt128(__m128i n) {
    const __m128i n_hi = _mm_unpackhi_epi64(n, n);
    #ifdef _MSC_VER
        return __popcnt64(_mm_cvtsi128_si64(n)) + __popcnt64(_mm_cvtsi128_si64(n_hi));
    #else
        return __popcntq(_mm_cvtsi128_si64(n)) + __popcntq(_mm_cvtsi128_si64(n_hi));
    #endif
}

104

answered Nov 10 '22 09:11

Marat Dukhan

Here is a version base on Bit Twiddling Hacks - Counting Set Bits in Parallel with naming similar to other intrinsic functions as well as some extra functions for 16 32 and 64 bit vectors

#include "immintrin.h"

/* bit masks: 0x55 = 01010101, 0x33 = 00110011, 0x0f = 00001111 */
static const __m128i m1 = {0x5555555555555555ULL,0x5555555555555555ULL};
static const __m128i m2 = {0x3333333333333333ULL,0x3333333333333333ULL};
static const __m128i m3 = {0x0f0f0f0f0f0f0f0fULL,0x0f0f0f0f0f0f0f0fULL};
static const __m128i m4 = {0x001f001f001f001fULL,0x001f001f001f001fULL};
static const __m128i m5 = {0x0000003f0000003fULL,0x0000003f0000003fULL};

__m128i _mm_popcnt_epi8(__m128i x) {
    /* Note: if we returned x here it would be like _mm_popcnt_epi1(x) */ 
    __m128i y;
    /* add even and odd bits*/
    y = _mm_srli_epi64(x,1);  //put even bits in odd place
    y = _mm_and_si128(y,m1);  //mask out the even bits (0x55)
    x = _mm_subs_epu8(x,y);   //shortcut to mask even bits and add
    /* if we just returned x here it would be like _mm_popcnt_epi2(x) */ 
    /* now add the half nibbles */
    y = _mm_srli_epi64 (x,2); //move half nibbles in place to add
    y = _mm_and_si128(y,m2);  //mask off the extra half nibbles (0x0f)
    x = _mm_and_si128(x,m2);  //ditto
    x = _mm_adds_epu8(x,y);   //totals are a maximum of 5 bits (0x1f)
    /* if we just returned x here it would be like _mm_popcnt_epi4(x) */ 
    /* now add the nibbles */
    y = _mm_srli_epi64(x,4);  //move nibbles in place to add
    x = _mm_adds_epu8(x,y);   //totals are a maximum of 6 bits (0x3f)
    x = _mm_and_si128(x,m3);  //mask off the extra bits
    return x;
}

__m128i _mm_popcnt_epi16(__m128i x) {
    __m128i y;
    x = _mm_popcnt_epi8(x);    //get byte popcount
    y = _mm_srli_si128(x,1);   //copy even bytes for adding
    x = _mm_add_epi16(x,y);    //add even bytes into the odd bytes
    return _mm_and_si128(x,m4);//mask off the even byte and return
}

__m128i _mm_popcnt_epi32(__m128i x) {
    __m128i y;
    x = _mm_popcnt_epi16(x);   //get word popcount
    y = _mm_srli_si128(x,2);   //copy even words for adding
    x = _mm_add_epi32(x,y);    //add even words into odd words
    return _mm_and_si128(x,m5);//mask off the even words and return
}

__m128i _mm_popcnt_epi64(__m128i x){
    /* _mm_sad_epu8() is weird
       It takes the absolute difference of bytes between 2 __m128i
       then horizontal adds the lower and upper 8 differences
       and stores the sums in the lower and upper 64 bits
    */
    return _mm_sad_epu8(_mm_popcnt_epi8(x),(__m128i){0});
}

int _mm_popcnt_si128(__m128i x){
    x = _mm_popcnt_epi64(x);
    __m128i y = _mm_srli_si128(x,8);
    return _mm_add_epi64(x,y)[0];
    //alternative: __builtin_popcntll(x[0])+__builtin_popcntll(x[1]);
}

answered Nov 10 '22 09:11

technosaurus

Related questions
                            
                                Allocate executable ram in c on linux
                            
                                Ensure that file state on the client is in sync with NFS server
                            
                                Never defined structure
                            
                                What is ellipsis operator in c [duplicate]
                            
                                What is a comma separated set of assignments?
                            
                                To invoke a variadic function with unamed arguments of another variadic function
                            
                                compile error C2099: initializer is not a constant [duplicate]
                            
                                Use one socket in two threads [duplicate]
                            
                                Enumeration Scope
                            
                                why sizeof('a') is 4 in C? [duplicate]
                            
                                how to find per-user disk-space-usage?
                            
                                DBusWatch and DBusTimeout examples
                            
                                Define Array in C
                            
                                C struct to void* pointer
                            
                                C socket: recv and send all data
                            
                                Why does this C array reference look incorrect?
                            
                                Is it a bad idea to use GCC's -fms-extensions?
                            
                                Send data to multiple sockets using pipes, tee() and splice()
                            
                                C: wrongly nested switch/case blocks surprisingly works
                            
                                open() what happens if I open twice the same file?

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

Fast counting the number of set bits in __m128i register

Tags:

c

hammingweight

simd

sse

sse2

enzom83

People also ask

2 Answers

Marat Dukhan

technosaurus

Recent Activity

Donate For Us