I had to calculate the Hamming weight for a quite fast continuous flow of 64-bit data, and using the popcnt
assembly instruction throws an exception on my Intel Core i7-4650U.
I checked my bible Hacker's delight and scanned the web for all kinds of algorithms (it's a bunch out there since they started tackling this 'problem' at the birth of computing).
I spent the weekend playing around with some ideas of my own and came up with these algorithms, where I'm almost at the speed I can move data in and out of the CPU.
//64-bit popcnt using BMI2
//SysV args: %rdi = source array, %rsi = element count (must be non-zero),
//%rdx = destination array
_popcnt_bmi2:
mov (%rdi),%r11
pext %r11,%r11,%r11        // data as its own mask: packs every set bit to the bottom
not %r11                   // run of ones -> run of zeros
tzcnt %r11,%r11            // trailing zeros == original popcount
mov %r11,(%rdx)
add $8,%rdi                // was "$8h": h-suffixed immediates are Intel syntax; GAS/AT&T needs $8 (cf. engine.s below)
add $8,%rdx
dec %rsi
jnz _popcnt_bmi2
ret
In the above code I use pext
(BMI2), where the incoming data is used as its own mask. All the bits that are set then collapse toward the least significant bit of the result register (the same register again). To count the collapsed bits I invert everything and use tzcnt
to count the now-zeroed bits. I thought it was quite a nice idea.
Then I also tried an AVX2 approach:
//64-bit popcnt using AVX2
//SysV args: %rdi = source, %rsi = number of 32-byte chunks (must be non-zero),
//%rdx = destination, %rcx = 3*32-byte constant table
//(row 0: 0x0f low-nibble mask, row 1: nibble popcount LUT, row 2: zeros)
_popcnt_avx2:
vmovdqa (%rcx),%ymm2       // low-nibble mask
add $0x20,%rcx             // was "$20h": invalid AT&T immediate; GAS needs $0x20
vmovdqa (%rcx),%ymm3       // nibble popcount lookup table for vpshufb
add $0x20,%rcx
vmovdqa (%rcx),%ymm4       // all zeros (second operand for vpsadbw)
popcnt_avx2_loop:
vmovdqa (%rdi),%ymm0
vpand %ymm0, %ymm2, %ymm1  // isolate low nibbles
vpandn %ymm0, %ymm2, %ymm0 // isolate high nibbles
vpsrld $4,%ymm0, %ymm0     // was "$4h"; safe across byte boundaries since low nibbles are zero
vpshufb %ymm1, %ymm3, %ymm1
vpshufb %ymm0, %ymm3, %ymm0
vpaddb %ymm1,%ymm0,%ymm0 //popcnt (8-bits)
vpsadbw %ymm0,%ymm4,%ymm0 //popcnt (64-bits)
vmovdqa %ymm0,(%rdx)
add $0x20,%rdi
add $0x20,%rdx
dec %rsi
jnz popcnt_avx2_loop
ret                        // was missing: execution would fall off the end of the routine
In the AVX2 case I read 32 bytes, then mask out the nibbles (ymm2
), then I use ymm3
as a look up table for bit counting the nibbles. Then I add the results to 8-bit's, and then I use the super condensed vpsadbw
to add 8 bytes to a 64-bit value (ymm4
= 0).
Anyone got something faster up their sleeves?
Edit:
The failing POPCNT
was due to an error I made in my code; that function works on my Intel Core i7-4650U. Please see my post below displaying the bench results.
OK, I came to the conclusion that there was no point in trying to be 'smart', so I benched:
the built in intrinsic popcount: _mm_popcnt_u64
bmi2:__tzcnt_u64(~_pext_u64(data[i],data[i]));
against three assembler functions
popcnt, bmi2 and avx2.
They all run at the speed you can move memory in and out of my:
cat /proc/cpuinfo
-Intel(R) Xeon(R) CPU E3-1275 v3 @ 3.50GHz
FYI:
main.c:
// Hamming weight bench
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <smmintrin.h>
#include <immintrin.h>
#include <x86intrin.h>
#include <math.h>
// Geometry of the debug dump produced by display_source_data()/display_dest_data().
#define DISPLAY_HEIGHT 4
#define DISPLAY_WIDTH 32
// Number of 64-bit words processed per benchmark run.
#define NUM_DATA_OBJECTS 40000000
#define ITTERATIONS 20
// The source data (+32 to avoid the quantization out of memory problem)
// 32-byte alignment is required: the AVX2 kernel uses aligned vmovdqa loads/stores.
__attribute__ ((aligned(32))) static long long unsigned data[NUM_DATA_OBJECTS+32]={};
__attribute__ ((aligned(32))) static long long unsigned data_out[NUM_DATA_OBJECTS+32]={};
// Constant table for the AVX2 kernel, three 32-byte rows:
//   row 0: 0x0f in every byte (low-nibble mask)
//   row 1: popcount of nibble values 0..15, repeated for both 128-bit lanes (vpshufb LUT)
//   row 2: all zeros (zero operand for vpsadbw)
__attribute__ ((aligned(32))) static unsigned char k1[32*3]={
0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,
0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03,0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03,0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};
// Assembler kernels from engine.s (C linkage, SysV calling convention).
extern "C" {
void popcnt_popcnt(long long unsigned[],unsigned int,long long unsigned[]);
void popcnt_bmi2(long long unsigned[],unsigned int,long long unsigned[]);
void popcnt_avx2(long long unsigned[],unsigned int,long long unsigned[],unsigned char[]);
}
// Fill the input buffer with pseudo-random 64-bit words.
// A single rand() call returns at most 31 random bits, which left the top
// half of every word permanently zero — the benchmark was only ever counting
// half-empty data. Combining three shifted calls covers all 64 bits.
void populate_data()
{
for(unsigned int i = 0; i < NUM_DATA_OBJECTS; i++)
{
data[i] = ((long long unsigned)rand() << 33)
        ^ ((long long unsigned)rand() << 16)
        ^ (long long unsigned)rand();
}
}
// Dump the first DISPLAY_HEIGHT x DISPLAY_WIDTH input words as hex for a
// visual sanity check of the data fed to the kernels.
void display_source_data()
{
printf ("\r\nData in(start):\r\n");
for (unsigned int j = 0; j < DISPLAY_HEIGHT; j++)
{
for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
{
// was "0x%02llux": that printed the value in decimal followed by a literal
// "ux"; %llx produces the intended hex dump.
printf ("0x%02llx,",data[i+(j*DISPLAY_WIDTH)]);
}
printf ("\r\n");
}
}
// Baseline: count bits of every input word with the hardware POPCNT
// instruction via the compiler intrinsic.
void bench_popcnt()
{
unsigned int idx = 0;
while (idx < NUM_DATA_OBJECTS)
{
data_out[idx] = _mm_popcnt_u64(data[idx]);
++idx;
}
}
// Pure-copy baseline: measures raw memory bandwidth with no bit counting,
// the lower bound every kernel is compared against.
void bench_move_data_memcpy()
{
memcpy(data_out, data, NUM_DATA_OBJECTS * sizeof(data[0]));
}
// __tzcnt64 ??
// BMI2 trick: pext with the word as its own mask packs every set bit to the
// bottom; inverting and counting trailing zeros then yields the popcount.
void bench_bmi2()
{
// Iterations are independent, so traversal order is irrelevant.
for (unsigned int idx = NUM_DATA_OBJECTS; idx-- > 0; )
{
data_out[idx] = __tzcnt_u64(~_pext_u64(data[idx], data[idx]));
}
}
// Dump the first DISPLAY_HEIGHT x DISPLAY_WIDTH result words as hex so the
// kernel outputs can be eyeballed against each other.
void display_dest_data()
{
printf ("\r\nData out:\r\n");
for (unsigned int j = 0; j < DISPLAY_HEIGHT; j++)
{
for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
{
// was "0x%02llux": that printed the value in decimal followed by a literal
// "ux"; %llx produces the intended hex dump.
printf ("0x%02llx,",data_out[i+(j*DISPLAY_WIDTH)]);
}
printf ("\r\n");
}
}
int main() {
struct timeval t0;
struct timeval t1;
long elapsed[ITTERATIONS]={0};
long avrg=0;
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
bench_move_data_memcpy();
gettimeofday(&t1, 0);
elapsed[i]= (((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000);
printf ("Time_to_move_data_without_processing: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average time_to_move_data: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
bench_popcnt();
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("popcnt: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average popcnt: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
bench_bmi2();
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("bmi2: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average bmi2: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
printf ("Now test the assembler functions\n");
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
popcnt_popcnt(data,NUM_DATA_OBJECTS,data_out);
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("popcnt_asm: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average popcnt_asm: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
popcnt_bmi2(data,NUM_DATA_OBJECTS,data_out);
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("bmi2_asm: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average bmi2_asm: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
popcnt_avx2(data,(unsigned int)ceil((NUM_DATA_OBJECTS*8)/32.0),data_out,k1);
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("avx2_asm: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average avx2_asm: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
return 0;
}
The engine.s
//
// avx2_bmi2_popcnt bench
//
.global popcnt_bmi2 , popcnt_avx2, popcnt_popcnt
.align 2
//64-bit popcnt using the built-in popcnt instruction
//SysV args: %rdi = source array, %rsi = element count, %rdx = destination array
//NOTE(review): dec/jnz tests AFTER the body runs, so a count of 0 would loop
//~2^64 times — callers must pass a non-zero count.
popcnt_popcnt:
popcntq (%rdi), %r11       // count set bits of one 64-bit word
mov %r11,(%rdx)            // store the result
add $8,%rdi                // advance source by one qword
add $8,%rdx                // advance destination by one qword
dec %rsi
jnz popcnt_popcnt
ret
//64-bit popcnt using BMI2
//SysV args: %rdi = source array, %rsi = element count (non-zero, see note on
//popcnt_popcnt above), %rdx = destination array
popcnt_bmi2:
mov (%rdi),%r11
pextq %r11,%r11,%r11       // data as its own mask: packs all set bits to the bottom
not %r11                   // contiguous run of ones -> run of zeros
tzcnt %r11,%r11            // trailing-zero count == popcount of the original word
mov %r11,(%rdx)
add $8,%rdi
add $8,%rdx
dec %rsi
jnz popcnt_bmi2
ret
//64-bit popcnt using AVX2 (nibble-LUT method): produces four 64-bit popcounts
//per iteration, one per source qword in the 32-byte chunk.
//SysV args: %rdi = source, %rsi = number of 32-byte chunks (non-zero),
//%rdx = destination, %rcx = k1 table (row 0: 0x0f mask, row 1: nibble
//popcount LUT, row 2: zeros). All pointers must be 32-byte aligned (vmovdqa).
//NOTE(review): no vzeroupper before ret — may penalize subsequent SSE code; confirm.
popcnt_avx2:
vmovdqa (%rcx),%ymm2       // ymm2 = 0x0f low-nibble mask
add $0x20,%rcx
vmovdqa (%rcx),%ymm3       // ymm3 = per-nibble popcount lookup table
add $0x20,%rcx
vmovdqa (%rcx),%ymm4       // ymm4 = all zeros, for vpsadbw
popcnt_avx2_loop:
vmovdqa (%rdi),%ymm0
vpand %ymm0, %ymm2, %ymm1  // ymm1 = low nibbles
vpandn %ymm0, %ymm2, %ymm0 // ymm0 = high nibbles (low nibble of each byte cleared)
vpsrld $4,%ymm0, %ymm0     // shift into low-nibble position; cleared bits make the dword shift byte-safe
vpshufb %ymm1, %ymm3, %ymm1 // LUT: popcount of each low nibble
vpshufb %ymm0, %ymm3, %ymm0 // LUT: popcount of each high nibble
vpaddb %ymm1,%ymm0,%ymm0   // per-byte popcounts
vpsadbw %ymm0,%ymm4,%ymm0  // horizontal sum of 8 bytes into each 64-bit lane
vmovdqa %ymm0,(%rdx)
add $0x20,%rdi
add $0x20,%rdx
dec %rsi
jnz popcnt_avx2_loop
ret
Compile the sources:
g++ -march=native -mavx -mpopcnt -O3 main.c engine.s
set the CPU to performance:
cpufreq-set -g performance
Run the bench:
sudo chrt -r 10 ./a.out
Result:
Average time_to_move_data: 61
Average popcnt: 61
Average bmi2: 61
Now test the assembler functions
Average popcnt_asm: 61
Average bmi2_asm: 61
Average avx2_asm: 61
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With