Bit count : preprocessor magic vs modern C++

Question

Suppose that I want to create a compile time constructed bit count lookup table for 64bit integers in 16 bit chunks. The only way I know to do this is the following code:

#define B4(n) n, n + 1, n + 1, n + 2 #define B6(n)   B4(n),   B4(n + 1),   B4(n + 1),  B4(n + 2)   #define B8(n)   B6(n),   B6(n + 1),   B6(n + 1),  B6(n + 2) #define B10(n)  B8(n),   B8(n + 1),   B8(n + 1),  B8(n + 2) #define B12(n)  B10(n),  B10(n + 1),  B10(n + 1), B10(n + 2) #define B14(n)  B12(n),  B12(n + 1),  B12(n + 1), B12(n + 2) #define B16(n)  B14(n),  B14(n + 1),  B14(n + 1), B14(n + 2) #define COUNT_BITS B16(0), B16(1), B16(1), B16(2)  unsigned int lookup[65536] = { COUNT_BITS };

Is there a modern (C++11/14) way to obtain the same result?

Richard Hodges · Accepted Answer

Why not use the standard library?

#include <bitset>  int bits_in(std::uint64_t u) {     auto bs = std::bitset<64>(u);     return bs.count(); }

resulting assembler (Compiled with -O2 -march=native):

bits_in(unsigned long):         xor     eax, eax         popcnt  rax, rdi         ret

It is worth mentioning at this point that not all x86 processors have this instruction so (at least with gcc) you will need to let it know what architecture to compile for.

@tambre mentioned that in reality, when it can, the optimiser will go further:

volatile int results[3];  int main() {     results[0] = bits_in(255);     results[1] = bits_in(1023);     results[2] = bits_in(0x8000800080008000);    }

resulting assembler:

main:         mov     DWORD PTR results[rip], 8         xor     eax, eax         mov     DWORD PTR results[rip+4], 10         mov     DWORD PTR results[rip+8], 4         ret

Old-school bit-twiddlers like me need to find new problems to solve :)

Update

Not everyone was happy that the solution relies on cpu help to compute the bitcount. So what if we used an autogenerated table but allowed the developer to configure the size of it? (warning - long compile time for the 16-bit table version)

#include <utility> #include <cstdint> #include <array> #include <numeric> #include <bitset>   template<std::size_t word_size, std::size_t...Is> constexpr auto generate(std::integral_constant<std::size_t, word_size>, std::index_sequence<Is...>) {     struct popcount_type {         constexpr auto operator()(int i) const {             int bits = 0;             while (i) {                 i &= i - 1;                 ++bits;             }             return bits;         }     };     constexpr auto popcnt = popcount_type();      return std::array<int, sizeof...(Is)>             {                     {popcnt(Is)...}             }; }  template<class T> constexpr auto power2(T x) {     T result = 1;     for (T i = 0; i < x; ++i)         result *= 2;     return result; }   template<class TableWord> struct table {     static constexpr auto word_size = std::numeric_limits<TableWord>::digits;     static constexpr auto table_length = power2(word_size);     using array_type = std::array<int, table_length>;     static const array_type& get_data() {         static const array_type data = generate(std::integral_constant<std::size_t, word_size>(),                                            std::make_index_sequence<table_length>());         return data;     };  };  template<class Word> struct use_table_word { };  template<class Word, class TableWord = std::uint8_t> int bits_in(Word val, use_table_word<TableWord> = use_table_word<TableWord>()) {     constexpr auto table_word_size = std::numeric_limits<TableWord>::digits;      constexpr auto word_size = std::numeric_limits<Word>::digits;     constexpr auto times = word_size / table_word_size;     static_assert(times > 0, "incompatible");      auto reduce = [val](auto times) {         return (val >> (table_word_size * times)) & (power2(table_word_size) - 1);     };      auto const& data = table<TableWord>::get_data();     auto result = 0;     for (int i = 0; i < times; ++i) {         result += data[reduce(i)];     }     return result; }  volatile int results[3];  #include <iostream>  int main() {     auto input = std::uint64_t(1023);     results[0] = bits_in(input);     results[0] = bits_in(input, use_table_word<std::uint16_t>());      results[1] = bits_in(0x8000800080008000);     results[2] = bits_in(34567890);      for (int i = 0; i < 3; ++i) {         std::cout << results[i] << std::endl;     }     return 0; }

Final Update

This version allows the use of any number of bits in the lookup table and supports any input type, even if it's smaller than the number of bits in the lookup table.

It also short-circuits if the high bits are zero.

#include <utility> #include <cstdint> #include <array> #include <numeric> #include <algorithm>  namespace detail {     template<std::size_t bits, typename = void>     struct smallest_word;      template<std::size_t bits>     struct smallest_word<bits, std::enable_if_t<(bits <= 8)>>     {         using type = std::uint8_t;     };      template<std::size_t bits>     struct smallest_word<bits, std::enable_if_t<(bits > 8 and bits <= 16)>>     {         using type = std::uint16_t;     };      template<std::size_t bits>     struct smallest_word<bits, std::enable_if_t<(bits > 16 and bits <= 32)>>     {         using type = std::uint32_t;     };      template<std::size_t bits>     struct smallest_word<bits, std::enable_if_t<(bits > 32 and bits <= 64)>>     {         using type = std::uint64_t;     }; }  template<std::size_t bits> using smallest_word = typename detail::smallest_word<bits>::type;  template<class WordType, std::size_t bits, std::size_t...Is> constexpr auto generate(std::index_sequence<Is...>) {      using word_type = WordType;      struct popcount_type {         constexpr auto operator()(word_type i) const {             int result = 0;             while (i) {                 i &= i - 1;                 ++result;             }             return result;         }     };     constexpr auto popcnt = popcount_type();      return std::array<word_type, sizeof...(Is)>             {                     {popcnt(Is)...}             }; }  template<class T> constexpr auto power2(T x) {     return T(1) << x; }  template<std::size_t word_size> struct table {      static constexpr auto table_length = power2(word_size);      using word_type = smallest_word<word_size>;      using array_type = std::array<word_type, table_length>;      static const array_type& get_data() {         static const array_type data = generate<word_type, word_size>(std::make_index_sequence<table_length>());         return data;     };      template<class Type, std::size_t bits>     static constexpr auto n_bits() {         auto result = Type();         auto b = bits;         while(b--) {             result = (result << 1) | Type(1);         }         return result;     };      template<class Uint>     int operator()(Uint i) const {         constexpr auto mask = n_bits<Uint, word_size>();         return get_data()[i & mask];     }  };  template<int bits> struct use_bits {     static constexpr auto digits = bits; };  template<class T> constexpr auto minimum(T x, T y) { return x < y ? x : y; }  template<class Word, class UseBits = use_bits<8>> int bits_in(Word val, UseBits = UseBits()) {      using word_type = std::make_unsigned_t<Word>;     auto uval = static_cast<word_type>(val);       constexpr auto table_word_size = UseBits::digits;     constexpr auto word_size = std::numeric_limits<word_type>::digits;      auto const& mytable = table<table_word_size>();     int result = 0;     while (uval)     {         result += mytable(uval); #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wshift-count-overflow"                 uval >>= minimum(table_word_size, word_size); #pragma clang diagnostic pop     }      return result; }  volatile int results[4];  #include <iostream>  int main() {     auto input = std::uint8_t(127);     results[0] = bits_in(input);     results[1] = bits_in(input, use_bits<4>());     results[2] = bits_in(input, use_bits<11>());     results[3] = bits_in(input, use_bits<15>());      for (auto&& i : results) {         std::cout << i << std::endl;     }      auto input2 = 0xabcdef;     results[0] = bits_in(input2);     results[1] = bits_in(input2, use_bits<4>());     results[2] = bits_in(input2, use_bits<11>());     results[3] = bits_in(input2, use_bits<15>());      for (auto&& i : results) {         std::cout << i << std::endl;     }      auto input3 = -1;     results[0] = bits_in(input3);     results[1] = bits_in(input3, use_bits<4>());     results[2] = bits_in(input3, use_bits<11>());     results[3] = bits_in(input3, use_bits<15>());      for (auto&& i : results) {         std::cout << i << std::endl;     }      return 0; }

example output:

7 7 7 7 17 17 17 17 32 32 32 32

The resulting assembly output for a call to bits_in(int, use_bits<11>()) for example, becomes:

.L16:         mov     edx, edi         and     edx, 2047         movzx   edx, WORD PTR table<11ul>::get_data()::data[rdx+rdx]         add     eax, edx         shr     edi, 11         jne     .L16

Which seems reasonable to me.

DAle · Answer

This is a C++14 solution, built basically around the usage of constexpr:

// this struct is a primitive replacement of the std::array that  // has no 'constexpr reference operator[]' in C++14  template<int N> struct lookup_table {     int table[N];      constexpr int& operator[](size_t i) { return table[i]; }     constexpr const int& operator[](size_t i) const { return table[i]; } };  constexpr int bit_count(int i) {      int bits = 0;      while (i) { i &= i-1; ++bits; }      return bits; }  template<int N>  constexpr lookup_table<N> generate() {     lookup_table<N> table = {};      for (int i = 0; i < N; ++i)         table[i] = bit_count(i);      return table; }  template<int I> struct Check {     Check() { std::cout <<  I << "
"; } };  constexpr auto table = generate<65536>();  int main() {     // checks that they are evaluated at compile-time      Check<table[5]>();     Check<table[65535]>();     return 0; }

Runnable version: http://ideone.com/zQB86O

Bit count : preprocessor magic vs modern C++

Tags:

c++

bit-manipulation

c-preprocessor

c++11

c++14

Giorgio Gambino

2 Answers

Update

Final Update

Richard Hodges

DAle

Recent Activity

Donate For Us

Bit count : preprocessor magic vs modern C++

Tags:

c++

bit-manipulation

c-preprocessor

c++11

c++14

Giorgio Gambino

2 Answers

Update

Final Update

Richard Hodges

DAle

Related questions

Recent Activity

Donate For Us