Where can I find a valid implementation of LogLog algorithm? Have tried to implement it by myself but my draft implementation yields strange results.
Here it is:
function LogLog(max_error, max_count) { function log2(x) { return Math.log(x) / Math.LN2; } var m = 1.30 / max_error; var k = Math.ceil(log2(m * m)); m = Math.pow(2, k); var k_comp = 32 - k; var l = log2(log2(max_count / m)); if (isNaN(l)) l = 1; else l = Math.ceil(l); var l_mask = ((1 << l) - 1) >>> 0; var M = []; for (var i = 0; i < m; ++i) M[i] = 0; function count(hash) { if (hash !== undefined) { var j = hash >>> k_comp; var rank = 0; for (var i = 0; i < k_comp; ++i) { if ((hash >>> i) & 1) { rank = i + 1; break; } } M[j] = Math.max(M[j], rank & l_mask); } else { var c = 0; for (var i = 0; i < m; ++i) c += M[i]; return 0.79402 * m * Math.pow(2, c / m); } } return {count: count}; } function fnv1a(text) { var hash = 2166136261; for (var i = 0; i < text.length; ++i) { hash ^= text.charCodeAt(i); hash += (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); } return hash >>> 0; } var words = ['aardvark', 'abyssinian', ... ,'zoology']; // about 2 300 words var log_log = LogLog(0.01, 100000); for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i])); alert(log_log.count());
For unknown reason implementation is very sensitive to max_error
parameter, it is the main factor that determines the magnitude of the result. I'm sure, there is some stupid mistake :)
UPDATE: This problem is solved in the newer version of algorithm. I will post its implementation later.
A HyperLogLog is a probabilistic data structure used to count unique values — or as it's referred to in mathematics: calculating the cardinality of a set. These values can be anything: for example, IP addresses for the visitors of a website, search terms, or email addresses.
The HyperLogLog algorithm is able to estimate cardinalities of > 109 with a typical accuracy (standard error) of 2%, using 1.5 kB of memory. HyperLogLog is an extension of the earlier LogLog algorithm, itself deriving from the 1984 Flajolet–Martin algorithm.
Here it is the updated version of the algorithm based on the newer paper:
var pow_2_32 = 0xFFFFFFFF + 1; function HyperLogLog(std_error) { function log2(x) { return Math.log(x) / Math.LN2; } function rank(hash, max) { var r = 1; while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; } return r; } var m = 1.04 / std_error; var k = Math.ceil(log2(m * m)), k_comp = 32 - k; m = Math.pow(2, k); var alpha_m = m == 16 ? 0.673 : m == 32 ? 0.697 : m == 64 ? 0.709 : 0.7213 / (1 + 1.079 / m); var M = []; for (var i = 0; i < m; ++i) M[i] = 0; function count(hash) { if (hash !== undefined) { var j = hash >>> k_comp; M[j] = Math.max(M[j], rank(hash, k_comp)); } else { var c = 0.0; for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]); var E = alpha_m * m * m / c; // -- make corrections if (E <= 5/2 * m) { var V = 0; for (var i = 0; i < m; ++i) if (M[i] == 0) ++V; if (V > 0) E = m * Math.log(m / V); } else if (E > 1/30 * pow_2_32) E = -pow_2_32 * Math.log(1 - E / pow_2_32); // -- return E; } } return {count: count}; } function fnv1a(text) { var hash = 2166136261; for (var i = 0; i < text.length; ++i) { hash ^= text.charCodeAt(i); hash += (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); } return hash >>> 0; } var words = ['aardvark', 'abyssinian', ..., 'zoology']; // 2336 words var seed = Math.floor(Math.random() * pow_2_32); // make more fun var log_log = HyperLogLog(0.065); for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]) ^ seed); var count = log_log.count(); alert(count + ', error ' + (count - words.length) / (words.length / 100.0) + '%');
Here is a slightly modified version which adds the merge operation.
Merge allows you to take the counters from several instances of HyperLogLog, and determine the unique counters overall.
For example, if you have unique visitors collected on Monday, Tuesday and Wednesday, then you can merge the buckets together and count the number of unique visitors over the three day span:
var pow_2_32 = 0xFFFFFFFF + 1; function HyperLogLog(std_error) { function log2(x) { return Math.log(x) / Math.LN2; } function rank(hash, max) { var r = 1; while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; } return r; } var m = 1.04 / std_error; var k = Math.ceil(log2(m * m)), k_comp = 32 - k; m = Math.pow(2, k); var alpha_m = m == 16 ? 0.673 : m == 32 ? 0.697 : m == 64 ? 0.709 : 0.7213 / (1 + 1.079 / m); var M = []; for (var i = 0; i < m; ++i) M[i] = 0; function merge(other) { for (var i = 0; i < m; i++) M[i] = Math.max(M[i], other.buckets[i]); } function count(hash) { if (hash !== undefined) { var j = hash >>> k_comp; M[j] = Math.max(M[j], rank(hash, k_comp)); } else { var c = 0.0; for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]); var E = alpha_m * m * m / c; // -- make corrections if (E <= 5/2 * m) { var V = 0; for (var i = 0; i < m; ++i) if (M[i] == 0) ++V; if (V > 0) E = m * Math.log(m / V); } else if (E > 1/30 * pow_2_32) E = -pow_2_32 * Math.log(1 - E / pow_2_32); // -- return E; } } return {count: count, merge: merge, buckets: M}; } function fnv1a(text) { var hash = 2166136261; for (var i = 0; i < text.length; ++i) { hash ^= text.charCodeAt(i); hash += (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); } return hash >>> 0; }
Then you can do something like this:
// initialize one counter per day var ll_monday = HyperLogLog(0.01); var ll_tuesday = HyperLogLog(0.01); var ll_wednesday = HyperLogLog(0.01); // add 5000 unique values in each day for(var i=0; i<5000; i++) ll_monday.count(fnv1a('' + Math.random())); for(var i=0; i<5000; i++) ll_tuesday.count(fnv1a('' + Math.random())); for(var i=0; i<5000; i++) ll_wednesday.count(fnv1a('' + Math.random())); // add 5000 values which appear every day for(var i=0; i<5000; i++) {ll_monday.count(fnv1a(''+i)); ll_tuesday.count(fnv1a('' + i)); ll_wednesday.count(fnv1a('' + i));} // merge three days together together = HyperLogLog(0.01); together.merge(ll_monday); together.merge(ll_tuesday); together.merge(ll_wednesday); // report console.log('unique per day: ' + Math.round(ll_monday.count()) + ' ' + Math.round(ll_tuesday.count()) + ' ' + Math.round(ll_wednesday.count())); console.log('unique numbers overall: ' + Math.round(together.count()));
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With