My task is to check (>trillions checks), does two int contain any of predefined pairs of nibbles (first pair 0x2 0x7; second 0xd 0x8). For example:
bit offset: 12345678
first int: 0x3d542783 first pair of 0x2 second: 0xd
second int: 0x486378d9 nibbles: 0x7 pair: 0x8
^ ^
So, for this example I mark two offsets with needed pairs (offsets are 2 and 5; but not a 7). Actual offsets and number of found pair are not needed in my task.
So, for given two ints the question is: Does them contains the any of these pairs of nibbles at the same offset.
I checked my program, this part is the hottest place (gprof
proven); and it is called a very-very many times (gcov
proven). Actually it is the 3rd or 4th loop (most nested) of nested loops.
My current code is slow (I rewrite it as function, but it is a code from the inner loop):
static inline int nibble_check (uint32_t A, uint32_t B)
__attribute__((always_inline))
{
int i;
for(i=0;i<8;i++)
if( ( ( (A&0xf) ==0xD) && ( (B&0xf) ==0x8) ) // first pair
|| ( ( (A&0xf) ==0x2) && ( (B&0xf) ==0x7) ) ) // second pair
return 1; // nibbles found
else {
A>>=4;
B>>=4;
}
return 0; // nibbles not found
}
The other task is finding this pairs not only at offsets 0,4,8 bits and so on, but at offsets 0,2,4,8,10,... bits:
#define douburu_nibble_check(A,B) (nibble_check(A,B) || nibble_check(A>>2, B>>2) )
Is it possible to rewrite this function and macro in parallel way?
My compiler is gcc452 and cpu is Intel Core2 Solo in 32bit mode (x86).
There are tricks for testing for a zero byte in a word (see e.g. http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord); a fast method is to use this expression:
(x - 0x01010101) & ~x & 0x80808080
which evaluates to some non-zero value if any of the 4 bytes within the 32-bit word are 0, or 0 otherwise.
This method can be adapted to work here:
static inline int nibble_check(uint32_t A, uint32_t B)
{
uint32_t tmp1, tmp2;
tmp1 = (A ^ 0x22222222) | (B ^ 0x77777777);
tmp2 = (A ^ 0xdddddddd) | (B ^ 0x88888888);
return !!(((tmp1 - 0x11111111) & ~tmp1 & 0x88888888) |
((tmp2 - 0x11111111) & ~tmp2 & 0x88888888));
}
The fastest solution is probably to use some kind of lookup table.
How constrained are you on memory? A 16 bit table would be 64K and let you test 4 nibbles at once. So 4 (1 for each nibble) of them would be 256K.
If I understand your problem, I think this will work. It's an 8 bit example -you can expand it to 16 bits. :
/* Look for 0x2 in either nibble - hits on 0x02, 0x20, 0x22 */
char table_0x2[] = {
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x02 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20, 0x22 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
char table_0x7[] = { fill this in };
char table_0xd[] = { fill this in };
char table_0x8[] = { fill this in };
int nibble_check (uint32_t A, uint32_t B)
{
int i;
for (i = 0; i < 4; i++) {
if ((table_0x2[A & 0xff] && table_0x7[B & 0xff]) ||
(table_0xd[A & 0xff] && table_0x8[B & 0xff])) {
/*
* check to see if the A&B hits are in corresponding
* nibbles - return 1 or break
*/
}
A = A >> 8;
B = B >> 8;
}
return 0;
}
Here's a better implementation:
/* 16 bit tables - upper 8 bits are A, lower 8 bits are B */
/* for 0x02, 0x07 */
char *table_2_7;
/* for 0x0d, 0x08 */
char *table_d_8;
void init(void)
{
int i;
int j;
/* error checking eliminated for brevity */
table_2_7 = malloc(64 * 1024);
table_d_8 = malloc(64 * 1024);
memset(table_2_7, 0, 64 * 1024);
memset(table_d_8, 0, 64 * 1024);
for (i = 0 ; i < 16; i++) {
for (j = 0 ; j < 16; j++) {
table_2_7[(i << 12) | (0x2 << 8) | (j << 4) | (0x7 << 0)] = 1;
table_2_7[(0x2 << 12) | (i << 8) | (0x7 << 4) | (j << 0)] = 1;
table_d_8[(i << 12) | (0xd << 8) | (j << 4) | (0x8 << 0)] = 1;
table_d_8[(0xd << 12) | (i << 8) | (0x8 << 4) | (j << 0)] = 1;
}
}
}
int nibble_check(uint32_t A, uint32_t B)
{
int i;
for (i = 0; i < 4; i++) {
if (table_2_7[ ((A & 0xff) << 8) | (B & 0xff) ] ||
table_d_8[ ((A & 0xff) << 8) | (B & 0xff) ]) {
return 1;
}
A = A >> 8;
B = B >> 8;
}
return 0;
}
You could possibly throw out some non-matching candidates earlier:
int nibble_check (uint32_t A, uint32_t B)
{
if ( !(A & B & 0x22222222) && !(A & B & 0x88888888))
return 0;
//rest of checking here...
}
Have you tried unrolling the loop?
if( ( ((A & 0x0000000F) == 0x0000000D) && ((B & 0x0000000F) == 0x00000008) )
|| ( ((A & 0x000000F0) == 0x000000D0) && ((B & 0x000000F0) == 0x00000080) )
|| ( ((A & 0x00000F00) == 0x00000D00) && ((B & 0x00000F00) == 0x00000800) )
|| ( ((A & 0x0000F000) == 0x0000D000) && ((B & 0x0000F000) == 0x00008000) )
// etc
// Then repeat with 2 & 7
I believe unrolling the loop will result in the same number of bitwise and operations, and the same number of comparisons, but you'll save the effort of performing all the right shifts and storing the results.
Edit: (in response to unrolling results in conditional and nonconditional jumps)
This would eliminate any jumps, at the expense of doing additional work. It's been a while since I worked on something that needed this type of optimization, but this should result in no jumps whatsoever. (If it doesn't, try replacing the && with &. The && may be triggering the compiler to produce short-circuiting logic, but & may make it evaluate the second half always, with no jumps.)
bool result = false;
result |= ( ((A & 0x0000000F) == 0x0000000D) && ((B & 0x0000000F) == 0x00000008) )
result |= ( ((A & 0x000000F0) == 0x000000D0) && ((B & 0x000000F0) == 0x00000080) )
result |= ( ((A & 0x00000F00) == 0x00000D00) && ((B & 0x00000F00) == 0x00000800) )
result |= ( ((A & 0x0000F000) == 0x0000D000) && ((B & 0x0000F000) == 0x00008000) )
// etc
return result;
static inline int nibble_check (uint32_t A, uint32_t B)
__attribute__((always_inline))
{
// shift x by n nibbles
#define s(x, n) ((x) << 4 * (n))
// mask the nth nibble of x
#define m(x, n) ((x) & s(0xf, n))
// D^8 and 2^7 both == 5, so check for that first, for speed
// this is equivalent to
// (A_nibble == 0XD && B_nibble == 0x8) || (A_nibble == 0x2 && B_nibble == 0x7)
#define t(n) (m(AB,n) == s(5,n) && (m(B,n) == s(7,n) || m(B,n) == s(8,n))
uint32_t AB x = A ^ B;
return t(0) || t(1) || t(2) || t(3) || t(4) || t(5) || t(6) || t(7);
#undef t
#undef m
#undef s
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With