Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

LogLog and HyperLogLog algorithms for counting of large cardinalities

Where can I find a valid implementation of LogLog algorithm? Have tried to implement it by myself but my draft implementation yields strange results.

Here it is:

function LogLog(max_error, max_count) {     function log2(x)     {          return Math.log(x) / Math.LN2;     }      var m = 1.30 / max_error;     var k = Math.ceil(log2(m * m));     m = Math.pow(2, k);      var k_comp = 32 - k;      var l = log2(log2(max_count / m));     if (isNaN(l)) l = 1; else l = Math.ceil(l);     var l_mask = ((1 << l) - 1) >>> 0;      var M = [];     for (var i = 0; i < m; ++i) M[i] = 0;      function count(hash)     {           if (hash !== undefined)           {                 var j = hash >>> k_comp;                  var rank = 0;                 for (var i = 0; i < k_comp; ++i)                 {                      if ((hash >>> i) & 1)                      {                           rank = i + 1;                           break;                      }                 }                  M[j] = Math.max(M[j], rank & l_mask);           }           else           {                 var c = 0;                 for (var i = 0; i < m; ++i) c += M[i];                 return 0.79402 * m * Math.pow(2, c / m);           }     }      return {count: count}; }  function fnv1a(text) {      var hash = 2166136261;      for (var i = 0; i < text.length; ++i)      {           hash ^= text.charCodeAt(i);           hash += (hash << 1) + (hash << 4) + (hash << 7) +             (hash << 8) + (hash << 24);      }     return hash >>> 0; }  var words = ['aardvark', 'abyssinian', ... ,'zoology']; // about 2 300 words  var log_log = LogLog(0.01, 100000); for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i])); alert(log_log.count()); 

For unknown reason implementation is very sensitive to max_error parameter, it is the main factor that determines the magnitude of the result. I'm sure, there is some stupid mistake :)

UPDATE: This problem is solved in the newer version of algorithm. I will post its implementation later.

like image 761
actual Avatar asked May 13 '11 10:05

actual


People also ask

What is HyperLogLog used for?

A HyperLogLog is a probabilistic data structure used to count unique values — or as it's referred to in mathematics: calculating the cardinality of a set. These values can be anything: for example, IP addresses for the visitors of a website, search terms, or email addresses.

Is HyperLogLog accurate?

The HyperLogLog algorithm is able to estimate cardinalities of > 109 with a typical accuracy (standard error) of 2%, using 1.5 kB of memory. HyperLogLog is an extension of the earlier LogLog algorithm, itself deriving from the 1984 Flajolet–Martin algorithm.


2 Answers

Here it is the updated version of the algorithm based on the newer paper:

var pow_2_32 = 0xFFFFFFFF + 1;  function HyperLogLog(std_error) {      function log2(x)      {           return Math.log(x) / Math.LN2;      }       function rank(hash, max)      {           var r = 1;           while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }           return r;      }       var m = 1.04 / std_error;      var k = Math.ceil(log2(m * m)), k_comp = 32 - k;      m = Math.pow(2, k);       var alpha_m = m == 16 ? 0.673           : m == 32 ? 0.697           : m == 64 ? 0.709           : 0.7213 / (1 + 1.079 / m);       var M = []; for (var i = 0; i < m; ++i) M[i] = 0;       function count(hash)      {           if (hash !== undefined)           {                 var j = hash >>> k_comp;                 M[j] = Math.max(M[j], rank(hash, k_comp));           }           else           {                 var c = 0.0;                 for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);                 var E = alpha_m * m * m / c;                  // -- make corrections                  if (E <= 5/2 * m)                 {                      var V = 0;                      for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;                      if (V > 0) E = m * Math.log(m / V);                 }                 else if (E > 1/30 * pow_2_32)                      E = -pow_2_32 * Math.log(1 - E / pow_2_32);                  // --                  return E;           }     }      return {count: count}; }  function fnv1a(text) {      var hash = 2166136261;      for (var i = 0; i < text.length; ++i)      {           hash ^= text.charCodeAt(i);           hash += (hash << 1) + (hash << 4) + (hash << 7) +             (hash << 8) + (hash << 24);      }      return hash >>> 0; }  var words = ['aardvark', 'abyssinian', ..., 'zoology']; // 2336 words  var seed = Math.floor(Math.random() * pow_2_32); // make more fun  var log_log = HyperLogLog(0.065); for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]) ^ seed); var count = log_log.count(); alert(count + ', error ' +     (count - words.length) / (words.length / 100.0) + '%'); 
like image 152
actual Avatar answered Sep 29 '22 08:09

actual


Here is a slightly modified version which adds the merge operation.

Merge allows you to take the counters from several instances of HyperLogLog, and determine the unique counters overall.

For example, if you have unique visitors collected on Monday, Tuesday and Wednesday, then you can merge the buckets together and count the number of unique visitors over the three day span:

var pow_2_32 = 0xFFFFFFFF + 1;  function HyperLogLog(std_error) {     function log2(x)     {         return Math.log(x) / Math.LN2;     }      function rank(hash, max)     {         var r = 1;         while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }         return r;     }      var m = 1.04 / std_error;     var k = Math.ceil(log2(m * m)), k_comp = 32 - k;     m = Math.pow(2, k);      var alpha_m = m == 16 ? 0.673          : m == 32 ? 0.697          : m == 64 ? 0.709          : 0.7213 / (1 + 1.079 / m);      var M = []; for (var i = 0; i < m; ++i) M[i] = 0;      function merge(other)     {         for (var i = 0; i < m; i++)         M[i] = Math.max(M[i], other.buckets[i]);     }      function count(hash)     {         if (hash !== undefined)         {             var j = hash >>> k_comp;             M[j] = Math.max(M[j], rank(hash, k_comp));         }         else         {             var c = 0.0;             for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);             var E = alpha_m * m * m / c;              // -- make corrections              if (E <= 5/2 * m)             {                  var V = 0;                  for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;                  if (V > 0) E = m * Math.log(m / V);             }             else if (E > 1/30 * pow_2_32)                  E = -pow_2_32 * Math.log(1 - E / pow_2_32);              // --              return E;         }     }      return {count: count, merge: merge, buckets: M}; }  function fnv1a(text) {     var hash = 2166136261;     for (var i = 0; i < text.length; ++i)     {         hash ^= text.charCodeAt(i);         hash += (hash << 1) + (hash << 4) + (hash << 7) +           (hash << 8) + (hash << 24);     }     return hash >>> 0; } 

Then you can do something like this:

// initialize one counter per day var ll_monday = HyperLogLog(0.01); var ll_tuesday = HyperLogLog(0.01); var ll_wednesday = HyperLogLog(0.01);   // add 5000 unique values in each day for(var i=0; i<5000; i++) ll_monday.count(fnv1a('' + Math.random())); for(var i=0; i<5000; i++) ll_tuesday.count(fnv1a('' + Math.random())); for(var i=0; i<5000; i++) ll_wednesday.count(fnv1a('' + Math.random()));  // add 5000 values which appear every day for(var i=0; i<5000; i++) {ll_monday.count(fnv1a(''+i)); ll_tuesday.count(fnv1a('' + i));   ll_wednesday.count(fnv1a('' + i));}   // merge three days together together = HyperLogLog(0.01); together.merge(ll_monday); together.merge(ll_tuesday); together.merge(ll_wednesday);  // report console.log('unique per day: ' + Math.round(ll_monday.count()) + ' ' + Math.round(ll_tuesday.count()) + ' ' + Math.round(ll_wednesday.count())); console.log('unique numbers overall: ' + Math.round(together.count())); 
like image 20
Gary Weiss Avatar answered Sep 29 '22 09:09

Gary Weiss