Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Order array of email addresses by frequency counts

I have an array with some duplicates. An efficient algorithm for sorting the array based on the duplicate count e.g.

['[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]']
=>
['[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]']

Because the counts are as followings [3, 2, 2, 1, 1]

I came up with:

const itemCounts = {}
const ordereditems = []
for (let i = 0; i < allitems.length; i++) {
  let item = allitems[i];
  itemCounts[item] = itemCounts[item] ? itemCounts[item] + 1 : 1
}
const tuples = []
for (let key in itemCounts) {
  tuples.push([key, itemCounts[key]])
}
return tuples.sort((a, b) => a[1] < b[1]).map(x => x[0])

Which is about Θ(3 N + N log N)?

Perhaps something faster with lodash?

Maybe keep a sorted priority queue as I do the counting?

Maybe use a radix sort of some kind?

like image 373
david_adler Avatar asked Jan 30 '18 02:01

david_adler


2 Answers

This is my try, and first attempt to write a snippet :)

var allitems = ['d', 'a', 'e', 'b', 'c', 'a', 'a', 'b', 'e'];

function original_code(allitems) {
  const itemCounts = {}
  const ordereditems = []
  for (let i = 0; i < allitems.length; i++) {
    let item = allitems[i];
    itemCounts[item] = itemCounts[item] ? itemCounts[item] + 1 : 1
  }
  const tuples = []
  for (let key in itemCounts) {
    tuples.push([key, itemCounts[key]])
  }
  return tuples.sort((a, b) => a[1] < b[1]).map(x => x[0])
}

function myTry(allitems) {
  var arr;
  const dic = {};
  arr = [];
  for (let i = 0; i < allitems.length; i++) {
    let item = allitems[i];
    if (!dic.hasOwnProperty(item)) {
      dic[item] = 1;
      arr.push(item);
    } else
      dic[item]++;
  }
  arr.sort((a, b) => dic[b] - dic[a]);
  return arr;
}



//measure attempts
{ //original code
  var t0 = performance.now();
  var res;
  for (let j = 0; j < 10000; j++) {
    res = original_code(allitems);
  }
  var t1 = performance.now();
  console.log("original " + (t1 - t0) + " milliseconds.");
  console.log("original " + res);
}

{ //my try
  var t0 = performance.now();
  var res;
  for (let j = 0; j < 10000; j++) {
    res = myTry(allitems);
  }
  var t1 = performance.now();
  console.log("myTry " + (t1 - t0) + " milliseconds.");
  console.log("myTry " + res);
}



{ //my try
  var t0 = performance.now();
  var res;
  for (let j = 0; j < 10000; j++) {
    res = myTry(allitems);
  }
  var t1 = performance.now();
  console.log("myTry2 " + (t1 - t0) + " milliseconds.");
  console.log("myTry2 " + res);
}

{ //original code
  var t0 = performance.now();
  var res;
  for (let j = 0; j < 10000; j++) {
    res = original_code(allitems);
  }
  var t1 = performance.now();
  console.log("original2 " + (t1 - t0) + " milliseconds.");
  console.log("original2 " + res);
}

Edit
I've tried to make a more reliable measurement. If there is a better way, I'd appreciate if you tell me.

+ Changed the sorting.

like image 61
Koray Avatar answered Nov 07 '22 06:11

Koray


Use array#reduce to create a frequency object with word and frequency of this word in the array. Then based on the object value sort the values and then take out the all the words.

var arr = ['d', 'a', 'e', 'b', 'c', 'a', 'a', 'b', 'e'],
    frequency = arr.reduce((r,val) => {
      r[val] = r[val] || {val, count : 0};
      r[val].count = r[val].count + 1;
      return r;
    },{});
var result = Object.values(frequency).sort((a,b) => {
  return b.count - a.count;
}).map(({val}) => val);
console.log(result);
like image 38
Hassan Imam Avatar answered Nov 07 '22 06:11

Hassan Imam