Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Fast search of some nibbles in two ints at same offset (C, microoptimisation)

My task is to check (>trillions checks), does two int contain any of predefined pairs of nibbles (first pair 0x2 0x7; second 0xd 0x8). For example:

bit offset:   12345678
first int:  0x3d542783     first pair of  0x2    second:   0xd   
second int: 0x486378d9      nibbles:      0x7      pair:   0x8
               ^  ^

So, for this example I mark two offsets with needed pairs (offsets are 2 and 5; but not a 7). Actual offsets and number of found pair are not needed in my task.

So, for given two ints the question is: Does them contains the any of these pairs of nibbles at the same offset.

I checked my program, this part is the hottest place (gprof proven); and it is called a very-very many times (gcov proven). Actually it is the 3rd or 4th loop (most nested) of nested loops.

My current code is slow (I rewrite it as function, but it is a code from the inner loop):

static inline int nibble_check (uint32_t A, uint32_t B)
 __attribute__((always_inline))
{
  int i;
  for(i=0;i<8;i++)

    if(  ( ( (A&0xf) ==0xD) && ( (B&0xf) ==0x8) )     // first pair
      || ( ( (A&0xf) ==0x2) && ( (B&0xf) ==0x7) )  )  // second pair
        return 1; // nibbles found
    else {
        A>>=4;
        B>>=4;
    }

  return 0; // nibbles not found
}

The other task is finding this pairs not only at offsets 0,4,8 bits and so on, but at offsets 0,2,4,8,10,... bits:

#define douburu_nibble_check(A,B) (nibble_check(A,B) || nibble_check(A>>2, B>>2) )

Is it possible to rewrite this function and macro in parallel way?

My compiler is gcc452 and cpu is Intel Core2 Solo in 32bit mode (x86).

like image 925
osgx Avatar asked Mar 03 '11 23:03

osgx


5 Answers

There are tricks for testing for a zero byte in a word (see e.g. http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord); a fast method is to use this expression:

(x - 0x01010101) & ~x & 0x80808080

which evaluates to some non-zero value if any of the 4 bytes within the 32-bit word are 0, or 0 otherwise.

This method can be adapted to work here:

static inline int nibble_check(uint32_t A, uint32_t B)
{
  uint32_t tmp1, tmp2;

  tmp1 = (A ^ 0x22222222) | (B ^ 0x77777777);
  tmp2 = (A ^ 0xdddddddd) | (B ^ 0x88888888);

  return !!(((tmp1 - 0x11111111) & ~tmp1 & 0x88888888) |
            ((tmp2 - 0x11111111) & ~tmp2 & 0x88888888));
}
like image 124
Matthew Slattery Avatar answered Nov 19 '22 21:11

Matthew Slattery


The fastest solution is probably to use some kind of lookup table.

How constrained are you on memory? A 16 bit table would be 64K and let you test 4 nibbles at once. So 4 (1 for each nibble) of them would be 256K.

If I understand your problem, I think this will work. It's an 8 bit example -you can expand it to 16 bits. :

 /* Look for 0x2 in either nibble - hits on 0x02, 0x20, 0x22 */
 char table_0x2[] = {
     0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x02 */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20, 0x22 */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };

 char table_0x7[] = { fill this in };
 char table_0xd[] = { fill this in };
 char table_0x8[] = { fill this in };

 int nibble_check (uint32_t A, uint32_t B)
 {

       int i;

       for (i = 0; i < 4; i++) {
           if ((table_0x2[A & 0xff] && table_0x7[B & 0xff]) ||
               (table_0xd[A & 0xff] && table_0x8[B & 0xff])) {
                  /*
                   * check to see if the A&B hits are in corresponding
                   * nibbles - return 1 or break
                   */
           }

           A = A >> 8;
           B = B >> 8;

       }
       return 0;
   }

Here's a better implementation:

 /* 16 bit tables - upper 8 bits are A, lower 8 bits are B */
 /* for 0x02, 0x07 */
 char *table_2_7;
 /* for 0x0d, 0x08 */
 char *table_d_8;

 void init(void)
 {
     int i;
     int j;

     /* error checking eliminated for brevity */
     table_2_7 = malloc(64 * 1024);
     table_d_8 = malloc(64 * 1024);

     memset(table_2_7, 0, 64 * 1024);
     memset(table_d_8, 0, 64 * 1024);

     for (i = 0 ; i < 16; i++) {
         for (j = 0 ; j < 16; j++) {
             table_2_7[(i << 12)   | (0x2 << 8)  | (j << 4)   | (0x7 << 0)] = 1;
             table_2_7[(0x2 << 12) | (i << 8)    | (0x7 << 4) | (j << 0)] = 1;

             table_d_8[(i << 12)   | (0xd << 8)  | (j << 4)    | (0x8 << 0)] = 1;
             table_d_8[(0xd << 12) | (i << 8)    | (0x8 << 4) | (j << 0)] = 1;
    }
}


 }

 int nibble_check(uint32_t A, uint32_t B)
 {
     int i;

     for (i = 0; i < 4; i++) {
         if (table_2_7[ ((A & 0xff) << 8) | (B & 0xff) ] ||
             table_d_8[ ((A & 0xff) << 8) | (B & 0xff) ]) {
             return 1;
         }

         A = A >> 8;
         B = B >> 8;

     }
     return 0;
 }
like image 25
JayM Avatar answered Nov 19 '22 21:11

JayM


You could possibly throw out some non-matching candidates earlier:

int nibble_check (uint32_t A, uint32_t B) 
{
    if ( !(A & B & 0x22222222) && !(A & B & 0x88888888))
       return 0;
    //rest of checking here...
}
like image 21
AShelly Avatar answered Nov 19 '22 21:11

AShelly


Have you tried unrolling the loop?

if( ( ((A & 0x0000000F) == 0x0000000D) && ((B & 0x0000000F) == 0x00000008) )
 || ( ((A & 0x000000F0) == 0x000000D0) && ((B & 0x000000F0) == 0x00000080) )
 || ( ((A & 0x00000F00) == 0x00000D00) && ((B & 0x00000F00) == 0x00000800) )
 || ( ((A & 0x0000F000) == 0x0000D000) && ((B & 0x0000F000) == 0x00008000) )
// etc
// Then repeat with 2 & 7

I believe unrolling the loop will result in the same number of bitwise and operations, and the same number of comparisons, but you'll save the effort of performing all the right shifts and storing the results.

Edit: (in response to unrolling results in conditional and nonconditional jumps)

This would eliminate any jumps, at the expense of doing additional work. It's been a while since I worked on something that needed this type of optimization, but this should result in no jumps whatsoever. (If it doesn't, try replacing the && with &. The && may be triggering the compiler to produce short-circuiting logic, but & may make it evaluate the second half always, with no jumps.)

bool result = false;
result |= ( ((A & 0x0000000F) == 0x0000000D) && ((B & 0x0000000F) == 0x00000008) )
result |= ( ((A & 0x000000F0) == 0x000000D0) && ((B & 0x000000F0) == 0x00000080) )
result |= ( ((A & 0x00000F00) == 0x00000D00) && ((B & 0x00000F00) == 0x00000800) )
result |= ( ((A & 0x0000F000) == 0x0000D000) && ((B & 0x0000F000) == 0x00008000) )
// etc
return result;
like image 27
David Yaw Avatar answered Nov 19 '22 21:11

David Yaw


static inline int nibble_check (uint32_t A, uint32_t B)
 __attribute__((always_inline))
{
    // shift x by n nibbles
    #define s(x, n) ((x) << 4 * (n))
    // mask the nth nibble of x
    #define m(x, n) ((x) & s(0xf, n))
    // D^8 and 2^7 both == 5, so check for that first, for speed
    // this is equivalent to
    // (A_nibble == 0XD && B_nibble == 0x8) || (A_nibble == 0x2 && B_nibble == 0x7)
    #define t(n) (m(AB,n) == s(5,n) && (m(B,n) == s(7,n) || m(B,n) == s(8,n))

    uint32_t AB x = A ^ B;

    return t(0) || t(1) || t(2) || t(3) || t(4) || t(5) || t(6) || t(7);
    #undef t
    #undef m
    #undef s
}
like image 1
Jim Balter Avatar answered Nov 19 '22 21:11

Jim Balter