I have an array of 4 longs in which I want to count the number of set bits within a given range. This is the function I am currently using (where bitcount(uint64_t) is an inline asm function that returns the number of set bits in its argument):
unsigned count_bits(const uint64_t *long_ptr, size_t begin, size_t end)
{
    uint64_t start_mask = ~((1ULL << (begin & 63)) - 1);
    uint64_t end_mask = ((1ULL << (end & 63)) - 1);
    if (begin < 64) {   /* begin is size_t, so begin >= 0 is always true */
        if (end < 64) {
            return bitcount(long_ptr[0] & start_mask & end_mask);
        } else if (end < 128) {
            return bitcount(long_ptr[0] & start_mask) + bitcount(long_ptr[1] & end_mask);
        } else if (end < 192) {
            return bitcount(long_ptr[0] & start_mask) + bitcount(long_ptr[1]) + bitcount(long_ptr[2] & end_mask);
        } else if (end < 256) {
            return bitcount(long_ptr[0] & start_mask) + bitcount(long_ptr[1]) + bitcount(long_ptr[2]) + bitcount(long_ptr[3] & end_mask);
        } else {
            return bitcount(long_ptr[0] & start_mask) + bitcount(long_ptr[1]) + bitcount(long_ptr[2]) + bitcount(long_ptr[3]);
        }
    } else if (begin < 128) {
        if (end < 128) {
            return bitcount(long_ptr[1] & start_mask & end_mask);
        } else if (end < 192) {
            return bitcount(long_ptr[1] & start_mask) + bitcount(long_ptr[2] & end_mask);
        } else if (end < 256) {
            return bitcount(long_ptr[1] & start_mask) + bitcount(long_ptr[2]) + bitcount(long_ptr[3] & end_mask);
        } else {
            return bitcount(long_ptr[1] & start_mask) + bitcount(long_ptr[2]) + bitcount(long_ptr[3]);
        }
    } else if (begin < 192) {
        if (end < 192) {
            return bitcount(long_ptr[2] & start_mask & end_mask);
        } else if (end < 256) {
            return bitcount(long_ptr[2] & start_mask) + bitcount(long_ptr[3] & end_mask);
        } else {
            return bitcount(long_ptr[2] & start_mask) + bitcount(long_ptr[3]);
        }
    } else if (begin < 256) {
        if (end < 256) {
            return bitcount(long_ptr[3] & start_mask & end_mask);
        } else {
            return bitcount(long_ptr[3] & start_mask);
        }
    } else {
        return 0;
    }
}
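(For anyone reproducing this without my inline asm: a portable stand-in for bitcount, which is my assumption here rather than the actual asm routine, could use the GCC/Clang builtin. With -mpopcnt or -march=native it compiles to a single POPCNT on x86-64:)

#include <stdint.h>

/* Portable stand-in for the inline-asm bitcount; assumes GCC or Clang. */
static inline unsigned bitcount(uint64_t x)
{
    return (unsigned)__builtin_popcountll(x);
}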
I am finding that the performance of this code is quite good, but I am wondering if there is anything I can do to make it faster, or whether a redesign of the algorithm could yield a performance boost.
I have created two different versions with zero branching, and I believe David Wohlferd's comment should be selected as the answer due to its compactness. I do not believe a branch-free version would be any faster: the processor's branch prediction can effectively eliminate the jumps on consistent data, while the branch-free version always counts bits in all four words (unless SSE is used?). I am publishing my second (very short) branch-free version here; the first was complicated by bit computations.
unsigned bitcount2(const uint64_t *long_ptr, size_t begin, size_t end)
{
    /* A sliding window of masks: positioning b_start and b_end over this
       array selects, for each of the four words, either all-zeros,
       all-ones, or the partial begin/end mask -- with no branches. */
    uint64_t mask[] = { 0, 0, 0, ~((1ULL << (begin & 63)) - 1),
                        ~0ULL, ~0ULL, ~0ULL, ((1ULL << (end & 63)) - 1),
                        0, 0, 0 };
    uint64_t *b_start = mask + (3 - begin / 64);
    uint64_t *b_end   = mask + (7 - end / 64);
    return bitcount(long_ptr[0] & b_start[0] & b_end[0]) +
           bitcount(long_ptr[1] & b_start[1] & b_end[1]) +
           bitcount(long_ptr[2] & b_start[2] & b_end[2]) +
           bitcount(long_ptr[3] & b_start[3] & b_end[3]);
}
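For completeness, here is a minimal harness sketch to cross-check bitcount2 against the questioner's count_bits over every valid range (this assumes the portable bitcount stand-in shown above; the test data values are arbitrary):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

int main(void)
{
    const uint64_t data[4] = { 0xDEADBEEFDEADBEEFULL, 0x0123456789ABCDEFULL,
                               0xFFFFFFFF00000000ULL, 0x5555555555555555ULL };
    /* Compare the branching and branch-free versions on the range [begin, end).
       begin stays below 256 because bitcount2 assumes in-range inputs. */
    for (size_t begin = 0; begin < 256; begin++) {
        for (size_t end = begin; end <= 256; end++) {
            unsigned a = count_bits(data, begin, end);
            unsigned b = bitcount2(data, begin, end);
            if (a != b)
                printf("mismatch at [%zu, %zu): %u vs %u\n", begin, end, a, b);
        }
    }
    puts("done");
    return 0;
}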