struct Node {
    Node *N[SIZE];
    int value;
};

struct Trie {
    Node root;
    Node* findNode(Key *key) {
        Node *C = &root;
        char u;
        while (1) {
            u = key->next();
            if (u < 0) return C;
            // if (C->N[0] == C->N[0]); // this line will speed up execution significantly
            C = C->N[u];
            if (C == 0) return 0;
        }
    }
    void addNode(Key *key, int value) { ... }
};
In this implementation of a prefix tree (aka trie) I found out that 90% of findNode() execution time is taken by a single operation, C = C->N[u];
In my attempt to speed up this code, I randomly added the line that is commented out in the snippet above, and the code became 30% faster! Why is that?
UPDATE
Here is the complete program.
#include <stdio.h>
#include <sys/time.h>

long time1000() {
    timeval val;
    gettimeofday(&val, 0);
    val.tv_sec &= 0xffff;
    return val.tv_sec * 1000 + val.tv_usec / 1000;
}

struct BitScanner {
    void *p;
    int count, pos;
    BitScanner(void *p, int count) {
        this->p = p;
        this->count = count;
        pos = 0;
    }
    int next() {
        int bpos = pos >> 1;
        if (bpos >= count) return -1;
        unsigned char b = ((unsigned char*)p)[bpos];
        if (pos++ & 1) return (b >>= 4);
        return b & 0xf;
    }
};

struct Node {
    Node *N[16];
    __int64_t value;
    Node() : N(), value(-1) { }
};

struct Trie16 {
    Node root;
    bool add(void *key, int count, __int64_t value) {
        Node *C = &root;
        BitScanner B(key, count);
        while (true) {
            int u = B.next();
            if (u < 0) {
                if (C->value == -1) {
                    C->value = value;
                    return true; // value added
                }
                C->value = value;
                return false; // value replaced
            }
            Node *Q = C->N[u];
            if (Q) {
                C = Q;
            } else {
                C = C->N[u] = new Node;
            }
        }
    }
    Node* findNode(void *key, int count) {
        Node *C = &root;
        BitScanner B(key, count);
        while (true) {
            char u = B.next();
            if (u < 0) return C;
            // if (C->N[0] == C->N[1]);
            C = C->N[0+u];
            if (C == 0) return 0;
        }
    }
};

int main() {
    long T = time1000();
    Trie16 trie;
    __int64_t STEPS = 100000, STEP = 500000000, key;
    key = 0;
    for (int i = 0; i < STEPS; i++) {
        key += STEP;
        bool ok = trie.add(&key, 8, key + 222);
    }
    printf("insert time:%li\n", time1000() - T); T = time1000();
    int err = 0;
    key = 0;
    for (int i = 0; i < STEPS; i++) {
        key += STEP;
        Node *N = trie.findNode(&key, 8);
        if (N == 0 || N->value != key + 222) err++;
    }
    printf("find time:%li\n", time1000() - T); T = time1000();
    printf("errors:%i\n", err);
}
This is largely a guess, but from what I have read about the CPU data prefetcher, it only prefetches when it sees multiple accesses to the same region of memory and those accesses match one of its trigger patterns, for example a scan. In your case, if there is only a single access to C->N, the prefetcher is not interested; however, if there are several accesses and it can predict that a later access lands further into the same block of memory, it may prefetch more than one cache line.
If that is happening, then C->N[u] does not have to wait for the data to arrive from RAM and is therefore faster.
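If that is the cause, one way to test it is to make the prefetch explicit instead of relying on the accidental extra read. Below is a minimal sketch of how findNode() could be written, assuming GCC or Clang (__builtin_prefetch is a compiler builtin, not standard C++); it may or may not help on your hardware, so measure it against both of your versions:

Node* findNode(void *key, int count) {
    Node *C = &root;
    BitScanner B(key, count);
    while (true) {
        char u = B.next();
        if (u < 0) return C;
        Node *next = C->N[u];
        if (next == 0) return 0;
        // Hint the CPU to start loading the first cache line of the next
        // node's pointer array now, so the C->N[u] load in the following
        // iteration is more likely to hit cache. Taking &next->N[0] only
        // computes an address; it does not dereference next.
        __builtin_prefetch(&next->N[0], 0 /* read */, 1 /* low temporal locality */);
        C = next;
    }
}

The prefetch can only hide the time spent in B.next() before the next load, so any win will be smaller than a full cache miss.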
It looks like what you are doing is preventing processor stalls by delaying the execution of code until the data is available locally.
Doing it this way is very error prone and unlikely to keep working consistently. The better way is to get the compiler to do this for you. By default most compilers generate code for a generic processor family, but if you look at the available flags you can usually find one for specifying your particular processor, so the compiler can generate code that is more specific to it (including prefetches and stall-avoiding code).
See: GCC: how is march different from mtune? The second answer goes into some detail: https://stackoverflow.com/a/23267520/14065
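For example (hypothetical build commands; the file name trie.cpp is an assumption, and the gains from -march depend on your compiler version and CPU):

# generic build: code scheduled for a generic x86-64 target
g++ -O3 trie.cpp -o trie

# tuned build: let GCC target the machine it is compiled on
g++ -O3 -march=native trie.cpp -o trie

With -march=native GCC also tunes for the host CPU, so a separate -mtune flag is usually unnecessary.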