I had to calculate the Hamming weight for a quite fast continuous flow of 64-bit data, and using the popcnt
assembly instruction throws an exception on my Intel Core i7-4650U.
I checked my bible Hacker's delight and scanned the web for all kinds of algorithms (it's a bunch out there since they started tackling this 'problem' at the birth of computing).
I spent the weekend playing around with some ideas of my own and came up with these algorithms, where I'm almost at the speed I can move data in and out of the CPU.
//64-bit popcnt using BMI2
//SysV args: %rdi = source array, %rsi = element count (must be non-zero),
//%rdx = destination array
_popcnt_bmi2:
mov (%rdi),%r11
pext %r11,%r11,%r11        // data as its own mask: packs every set bit to the bottom
not %r11                   // run of ones -> run of zeros
tzcnt %r11,%r11            // trailing zeros == original popcount
mov %r11,(%rdx)
add $8,%rdi                // was "$8h": h-suffixed immediates are Intel syntax; GAS/AT&T needs $8 (cf. engine.s below)
add $8,%rdx
dec %rsi
jnz _popcnt_bmi2
ret
In the above code I use pext
(BMI2), where the incoming data is used as its own mask. All the bits that are set then collapse toward the least significant bit of the result register (the same register again). To count the collapsed bits I invert everything and use tzcnt
to count the now-zeroed bits. I thought it was quite a nice idea.
Then I also tried an AVX2 approach:
//64-bit popcnt using AVX2
//SysV args: %rdi = source, %rsi = number of 32-byte chunks (must be non-zero),
//%rdx = destination, %rcx = 3*32-byte constant table
//(row 0: 0x0f low-nibble mask, row 1: nibble popcount LUT, row 2: zeros)
_popcnt_avx2:
vmovdqa (%rcx),%ymm2       // low-nibble mask
add $0x20,%rcx             // was "$20h": invalid AT&T immediate; GAS needs $0x20
vmovdqa (%rcx),%ymm3       // nibble popcount lookup table for vpshufb
add $0x20,%rcx
vmovdqa (%rcx),%ymm4       // all zeros (second operand for vpsadbw)
popcnt_avx2_loop:
vmovdqa (%rdi),%ymm0
vpand %ymm0, %ymm2, %ymm1  // isolate low nibbles
vpandn %ymm0, %ymm2, %ymm0 // isolate high nibbles
vpsrld $4,%ymm0, %ymm0     // was "$4h"; safe across byte boundaries since low nibbles are zero
vpshufb %ymm1, %ymm3, %ymm1
vpshufb %ymm0, %ymm3, %ymm0
vpaddb %ymm1,%ymm0,%ymm0 //popcnt (8-bits)
vpsadbw %ymm0,%ymm4,%ymm0 //popcnt (64-bits)
vmovdqa %ymm0,(%rdx)
add $0x20,%rdi
add $0x20,%rdx
dec %rsi
jnz popcnt_avx2_loop
ret                        // was missing: execution would fall off the end of the routine
In the AVX2 case I read 32 bytes, then mask out the nibbles (ymm2
), then I use ymm3
as a look up table for bit counting the nibbles. Then I add the results to 8-bit's, and then I use the super condensed vpsadbw
to add 8 bytes to a 64-bit value (ymm4
= 0).
Anyone got something faster up their sleeves?
Edit:
The failing POPCNT
was due to an error I made in my code; that function works on my Intel Core i7-4650U. Please see my post below displaying the bench results.
OK, I came to the conclusion that there was no point in trying to be 'smart', so I benched:
the built in intrinsic popcount: _mm_popcnt_u64
bmi2:__tzcnt_u64(~_pext_u64(data[i],data[i]));
against three assembler functions
popcnt, bmi2 and avx2.
They all run at the speed you can move memory in and out of my:
cat /proc/cpuinfo
-Intel(R) Xeon(R) CPU E3-1275 v3 @ 3.50GHz
FYI:
main.c:
// Hamming weight bench
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <smmintrin.h>
#include <immintrin.h>
#include <x86intrin.h>
#include <math.h>
// Geometry of the debug dump produced by display_source_data()/display_dest_data().
#define DISPLAY_HEIGHT 4
#define DISPLAY_WIDTH 32
// Number of 64-bit words processed per benchmark run.
#define NUM_DATA_OBJECTS 40000000
#define ITTERATIONS 20
// The source data (+32 to avoid the quantization out of memory problem)
// 32-byte alignment is required: the AVX2 kernel uses aligned vmovdqa loads/stores.
__attribute__ ((aligned(32))) static long long unsigned data[NUM_DATA_OBJECTS+32]={};
__attribute__ ((aligned(32))) static long long unsigned data_out[NUM_DATA_OBJECTS+32]={};
// Constant table for the AVX2 kernel, three 32-byte rows:
//   row 0: 0x0f in every byte (low-nibble mask)
//   row 1: popcount of nibble values 0..15, repeated for both 128-bit lanes (vpshufb LUT)
//   row 2: all zeros (zero operand for vpsadbw)
__attribute__ ((aligned(32))) static unsigned char k1[32*3]={
0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,
0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03,0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03,0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};
// Assembler kernels from engine.s (C linkage, SysV calling convention).
extern "C" {
void popcnt_popcnt(long long unsigned[],unsigned int,long long unsigned[]);
void popcnt_bmi2(long long unsigned[],unsigned int,long long unsigned[]);
void popcnt_avx2(long long unsigned[],unsigned int,long long unsigned[],unsigned char[]);
}
// Fill the input buffer with pseudo-random 64-bit words.
// A single rand() call returns at most 31 random bits, which left the top
// half of every word permanently zero — the benchmark was only ever counting
// half-empty data. Combining three shifted calls covers all 64 bits.
void populate_data()
{
for(unsigned int i = 0; i < NUM_DATA_OBJECTS; i++)
{
data[i] = ((long long unsigned)rand() << 33)
        ^ ((long long unsigned)rand() << 16)
        ^ (long long unsigned)rand();
}
}
// Dump the first DISPLAY_HEIGHT x DISPLAY_WIDTH input words as hex for a
// visual sanity check of the data fed to the kernels.
void display_source_data()
{
printf ("\r\nData in(start):\r\n");
for (unsigned int j = 0; j < DISPLAY_HEIGHT; j++)
{
for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
{
// was "0x%02llux": that printed the value in decimal followed by a literal
// "ux"; %llx produces the intended hex dump.
printf ("0x%02llx,",data[i+(j*DISPLAY_WIDTH)]);
}
printf ("\r\n");
}
}
// Baseline: count bits of every input word with the hardware POPCNT
// instruction via the compiler intrinsic.
void bench_popcnt()
{
unsigned int idx = 0;
while (idx < NUM_DATA_OBJECTS)
{
data_out[idx] = _mm_popcnt_u64(data[idx]);
++idx;
}
}
// Pure-copy baseline: measures raw memory bandwidth with no bit counting,
// the lower bound every kernel is compared against.
void bench_move_data_memcpy()
{
memcpy(data_out, data, NUM_DATA_OBJECTS * sizeof(data[0]));
}
// __tzcnt64 ??
// BMI2 trick: pext with the word as its own mask packs every set bit to the
// bottom; inverting and counting trailing zeros then yields the popcount.
void bench_bmi2()
{
// Iterations are independent, so traversal order is irrelevant.
for (unsigned int idx = NUM_DATA_OBJECTS; idx-- > 0; )
{
data_out[idx] = __tzcnt_u64(~_pext_u64(data[idx], data[idx]));
}
}
// Dump the first DISPLAY_HEIGHT x DISPLAY_WIDTH result words as hex so the
// kernel outputs can be eyeballed against each other.
void display_dest_data()
{
printf ("\r\nData out:\r\n");
for (unsigned int j = 0; j < DISPLAY_HEIGHT; j++)
{
for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
{
// was "0x%02llux": that printed the value in decimal followed by a literal
// "ux"; %llx produces the intended hex dump.
printf ("0x%02llx,",data_out[i+(j*DISPLAY_WIDTH)]);
}
printf ("\r\n");
}
}
int main() {
struct timeval t0;
struct timeval t1;
long elapsed[ITTERATIONS]={0};
long avrg=0;
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
bench_move_data_memcpy();
gettimeofday(&t1, 0);
elapsed[i]= (((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000);
printf ("Time_to_move_data_without_processing: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average time_to_move_data: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
bench_popcnt();
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("popcnt: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average popcnt: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
bench_bmi2();
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("bmi2: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average bmi2: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
printf ("Now test the assembler functions\n");
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
popcnt_popcnt(data,NUM_DATA_OBJECTS,data_out);
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("popcnt_asm: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average popcnt_asm: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
popcnt_bmi2(data,NUM_DATA_OBJECTS,data_out);
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("bmi2_asm: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average bmi2_asm: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
popcnt_avx2(data,(unsigned int)ceil((NUM_DATA_OBJECTS*8)/32.0),data_out,k1);
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("avx2_asm: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average avx2_asm: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
return 0;
}
The engine.s
//
// avx2_bmi2_popcnt bench
//
.global popcnt_bmi2 , popcnt_avx2, popcnt_popcnt
.align 2
//64-bit popcnt using the built-in popcnt instruction
//SysV args: %rdi = source array, %rsi = element count, %rdx = destination array
//NOTE(review): dec/jnz tests AFTER the body runs, so a count of 0 would loop
//~2^64 times — callers must pass a non-zero count.
popcnt_popcnt:
popcntq (%rdi), %r11       // count set bits of one 64-bit word
mov %r11,(%rdx)            // store the result
add $8,%rdi                // advance source by one qword
add $8,%rdx                // advance destination by one qword
dec %rsi
jnz popcnt_popcnt
ret
//64-bit popcnt using BMI2
//SysV args: %rdi = source array, %rsi = element count (non-zero, see note on
//popcnt_popcnt above), %rdx = destination array
popcnt_bmi2:
mov (%rdi),%r11
pextq %r11,%r11,%r11       // data as its own mask: packs all set bits to the bottom
not %r11                   // contiguous run of ones -> run of zeros
tzcnt %r11,%r11            // trailing-zero count == popcount of the original word
mov %r11,(%rdx)
add $8,%rdi
add $8,%rdx
dec %rsi
jnz popcnt_bmi2
ret
//64-bit popcnt using AVX2 (nibble-LUT method): produces four 64-bit popcounts
//per iteration, one per source qword in the 32-byte chunk.
//SysV args: %rdi = source, %rsi = number of 32-byte chunks (non-zero),
//%rdx = destination, %rcx = k1 table (row 0: 0x0f mask, row 1: nibble
//popcount LUT, row 2: zeros). All pointers must be 32-byte aligned (vmovdqa).
//NOTE(review): no vzeroupper before ret — may penalize subsequent SSE code; confirm.
popcnt_avx2:
vmovdqa (%rcx),%ymm2       // ymm2 = 0x0f low-nibble mask
add $0x20,%rcx
vmovdqa (%rcx),%ymm3       // ymm3 = per-nibble popcount lookup table
add $0x20,%rcx
vmovdqa (%rcx),%ymm4       // ymm4 = all zeros, for vpsadbw
popcnt_avx2_loop:
vmovdqa (%rdi),%ymm0
vpand %ymm0, %ymm2, %ymm1  // ymm1 = low nibbles
vpandn %ymm0, %ymm2, %ymm0 // ymm0 = high nibbles (low nibble of each byte cleared)
vpsrld $4,%ymm0, %ymm0     // shift into low-nibble position; cleared bits make the dword shift byte-safe
vpshufb %ymm1, %ymm3, %ymm1 // LUT: popcount of each low nibble
vpshufb %ymm0, %ymm3, %ymm0 // LUT: popcount of each high nibble
vpaddb %ymm1,%ymm0,%ymm0   // per-byte popcounts
vpsadbw %ymm0,%ymm4,%ymm0  // horizontal sum of 8 bytes into each 64-bit lane
vmovdqa %ymm0,(%rdx)
add $0x20,%rdi
add $0x20,%rdx
dec %rsi
jnz popcnt_avx2_loop
ret
Compile the sources:
g++ -march=native -mavx -mpopcnt -O3 main.c engine.s
set the CPU to performance:
cpufreq-set -g performance
Run the bench:
sudo chrt -r 10 ./a.out
Result:
Average time_to_move_data: 61
Average popcnt: 61
Average bmi2: 61
Now test the assembler functions
Average popcnt_asm: 61
Average bmi2_asm: 61
Average avx2_asm: 61
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With