Suppose that I want to create a compile time constructed bit count lookup table for 64bit integers in 16 bit chunks. The only way I know to do this is the following code:
#define B4(n) n, n + 1, n + 1, n + 2 #define B6(n) B4(n), B4(n + 1), B4(n + 1), B4(n + 2) #define B8(n) B6(n), B6(n + 1), B6(n + 1), B6(n + 2) #define B10(n) B8(n), B8(n + 1), B8(n + 1), B8(n + 2) #define B12(n) B10(n), B10(n + 1), B10(n + 1), B10(n + 2) #define B14(n) B12(n), B12(n + 1), B12(n + 1), B12(n + 2) #define B16(n) B14(n), B14(n + 1), B14(n + 1), B14(n + 2) #define COUNT_BITS B16(0), B16(1), B16(1), B16(2) unsigned int lookup[65536] = { COUNT_BITS };
Is there a modern (C++11/14) way to obtain the same result?
Why not use the standard library?
#include <bitset> int bits_in(std::uint64_t u) { auto bs = std::bitset<64>(u); return bs.count(); }
resulting assembler (Compiled with -O2 -march=native
):
bits_in(unsigned long): xor eax, eax popcnt rax, rdi ret
It is worth mentioning at this point that not all x86 processors have this instruction so (at least with gcc) you will need to let it know what architecture to compile for.
@tambre mentioned that in reality, when it can, the optimiser will go further:
volatile int results[3]; int main() { results[0] = bits_in(255); results[1] = bits_in(1023); results[2] = bits_in(0x8000800080008000); }
resulting assembler:
main: mov DWORD PTR results[rip], 8 xor eax, eax mov DWORD PTR results[rip+4], 10 mov DWORD PTR results[rip+8], 4 ret
Old-school bit-twiddlers like me need to find new problems to solve :)
Not everyone was happy that the solution relies on cpu help to compute the bitcount. So what if we used an autogenerated table but allowed the developer to configure the size of it? (warning - long compile time for the 16-bit table version)
#include <utility> #include <cstdint> #include <array> #include <numeric> #include <bitset> template<std::size_t word_size, std::size_t...Is> constexpr auto generate(std::integral_constant<std::size_t, word_size>, std::index_sequence<Is...>) { struct popcount_type { constexpr auto operator()(int i) const { int bits = 0; while (i) { i &= i - 1; ++bits; } return bits; } }; constexpr auto popcnt = popcount_type(); return std::array<int, sizeof...(Is)> { {popcnt(Is)...} }; } template<class T> constexpr auto power2(T x) { T result = 1; for (T i = 0; i < x; ++i) result *= 2; return result; } template<class TableWord> struct table { static constexpr auto word_size = std::numeric_limits<TableWord>::digits; static constexpr auto table_length = power2(word_size); using array_type = std::array<int, table_length>; static const array_type& get_data() { static const array_type data = generate(std::integral_constant<std::size_t, word_size>(), std::make_index_sequence<table_length>()); return data; }; }; template<class Word> struct use_table_word { }; template<class Word, class TableWord = std::uint8_t> int bits_in(Word val, use_table_word<TableWord> = use_table_word<TableWord>()) { constexpr auto table_word_size = std::numeric_limits<TableWord>::digits; constexpr auto word_size = std::numeric_limits<Word>::digits; constexpr auto times = word_size / table_word_size; static_assert(times > 0, "incompatible"); auto reduce = [val](auto times) { return (val >> (table_word_size * times)) & (power2(table_word_size) - 1); }; auto const& data = table<TableWord>::get_data(); auto result = 0; for (int i = 0; i < times; ++i) { result += data[reduce(i)]; } return result; } volatile int results[3]; #include <iostream> int main() { auto input = std::uint64_t(1023); results[0] = bits_in(input); results[0] = bits_in(input, use_table_word<std::uint16_t>()); results[1] = bits_in(0x8000800080008000); results[2] = bits_in(34567890); for (int i = 0; i < 3; ++i) { std::cout << results[i] << std::endl; } return 0; }
This version allows the use of any number of bits in the lookup table and supports any input type, even if it's smaller than the number of bits in the lookup table.
It also short-circuits if the high bits are zero.
#include <utility> #include <cstdint> #include <array> #include <numeric> #include <algorithm> namespace detail { template<std::size_t bits, typename = void> struct smallest_word; template<std::size_t bits> struct smallest_word<bits, std::enable_if_t<(bits <= 8)>> { using type = std::uint8_t; }; template<std::size_t bits> struct smallest_word<bits, std::enable_if_t<(bits > 8 and bits <= 16)>> { using type = std::uint16_t; }; template<std::size_t bits> struct smallest_word<bits, std::enable_if_t<(bits > 16 and bits <= 32)>> { using type = std::uint32_t; }; template<std::size_t bits> struct smallest_word<bits, std::enable_if_t<(bits > 32 and bits <= 64)>> { using type = std::uint64_t; }; } template<std::size_t bits> using smallest_word = typename detail::smallest_word<bits>::type; template<class WordType, std::size_t bits, std::size_t...Is> constexpr auto generate(std::index_sequence<Is...>) { using word_type = WordType; struct popcount_type { constexpr auto operator()(word_type i) const { int result = 0; while (i) { i &= i - 1; ++result; } return result; } }; constexpr auto popcnt = popcount_type(); return std::array<word_type, sizeof...(Is)> { {popcnt(Is)...} }; } template<class T> constexpr auto power2(T x) { return T(1) << x; } template<std::size_t word_size> struct table { static constexpr auto table_length = power2(word_size); using word_type = smallest_word<word_size>; using array_type = std::array<word_type, table_length>; static const array_type& get_data() { static const array_type data = generate<word_type, word_size>(std::make_index_sequence<table_length>()); return data; }; template<class Type, std::size_t bits> static constexpr auto n_bits() { auto result = Type(); auto b = bits; while(b--) { result = (result << 1) | Type(1); } return result; }; template<class Uint> int operator()(Uint i) const { constexpr auto mask = n_bits<Uint, word_size>(); return get_data()[i & mask]; } }; template<int bits> struct use_bits { static constexpr auto digits = bits; }; template<class T> constexpr auto minimum(T x, T y) { return x < y ? x : y; } template<class Word, class UseBits = use_bits<8>> int bits_in(Word val, UseBits = UseBits()) { using word_type = std::make_unsigned_t<Word>; auto uval = static_cast<word_type>(val); constexpr auto table_word_size = UseBits::digits; constexpr auto word_size = std::numeric_limits<word_type>::digits; auto const& mytable = table<table_word_size>(); int result = 0; while (uval) { result += mytable(uval); #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wshift-count-overflow" uval >>= minimum(table_word_size, word_size); #pragma clang diagnostic pop } return result; } volatile int results[4]; #include <iostream> int main() { auto input = std::uint8_t(127); results[0] = bits_in(input); results[1] = bits_in(input, use_bits<4>()); results[2] = bits_in(input, use_bits<11>()); results[3] = bits_in(input, use_bits<15>()); for (auto&& i : results) { std::cout << i << std::endl; } auto input2 = 0xabcdef; results[0] = bits_in(input2); results[1] = bits_in(input2, use_bits<4>()); results[2] = bits_in(input2, use_bits<11>()); results[3] = bits_in(input2, use_bits<15>()); for (auto&& i : results) { std::cout << i << std::endl; } auto input3 = -1; results[0] = bits_in(input3); results[1] = bits_in(input3, use_bits<4>()); results[2] = bits_in(input3, use_bits<11>()); results[3] = bits_in(input3, use_bits<15>()); for (auto&& i : results) { std::cout << i << std::endl; } return 0; }
example output:
7 7 7 7 17 17 17 17 32 32 32 32
The resulting assembly output for a call to bits_in(int, use_bits<11>())
for example, becomes:
.L16: mov edx, edi and edx, 2047 movzx edx, WORD PTR table<11ul>::get_data()::data[rdx+rdx] add eax, edx shr edi, 11 jne .L16
Which seems reasonable to me.
This is a C++14 solution, built basically around the usage of constexpr
:
// this struct is a primitive replacement of the std::array that // has no 'constexpr reference operator[]' in C++14 template<int N> struct lookup_table { int table[N]; constexpr int& operator[](size_t i) { return table[i]; } constexpr const int& operator[](size_t i) const { return table[i]; } }; constexpr int bit_count(int i) { int bits = 0; while (i) { i &= i-1; ++bits; } return bits; } template<int N> constexpr lookup_table<N> generate() { lookup_table<N> table = {}; for (int i = 0; i < N; ++i) table[i] = bit_count(i); return table; } template<int I> struct Check { Check() { std::cout << I << "\n"; } }; constexpr auto table = generate<65536>(); int main() { // checks that they are evaluated at compile-time Check<table[5]>(); Check<table[65535]>(); return 0; }
Runnable version: http://ideone.com/zQB86O
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With