Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

compare two spreadsheet and output the difference using google app scripts

Well, I'm trying to do what is described in the title. Both spreadsheets have only one sheet, and those sheets are the ones I'm comparing. One spreadsheet is an update of the other, so I'm trying to get only the new content. (If there were a function like fc (the DOS command), this would be easy...)

After doing some searching, I have the following script, which should work in most cases and uses one array per sheet.

/**
 * Reads the first sheet of the old and of the updated spreadsheet stored in
 * the Drive folder 'theFolder', passes both value arrays to getNewData() and
 * logs the length of the result.
 *
 * @throws {Error} When no updated spreadsheet other than the old one is found.
 */
function test() {
  var folderId = DriveApp.getFoldersByName('theFolder').next().getId();

  // Old spreadsheet.
  var oldFiles = DriveApp.searchFiles("fullText contains 'sheet_old' and '" + folderId + "' in parents");
  var old_file = oldFiles.next();
  var old_sheet = SpreadsheetApp.openById(old_file.getId()).getSheets()[0];
  var old_array = old_sheet.getDataRange().getValues();
  Logger.log(old_file.getName() + ' : ' + old_sheet.getName() + ' : ' + old_array.length);

  // Updated spreadsheet. A full-text search for 'sheet' may also return the
  // old file (its search term 'sheet_old' contains 'sheet'), so skip it
  // explicitly instead of trusting the first hit.
  var candidates = DriveApp.searchFiles("fullText contains 'sheet' and '" + folderId + "' in parents");
  var file = null;
  while (file === null && candidates.hasNext()) {
    var candidate = candidates.next();
    if (candidate.getId() !== old_file.getId()) {
      file = candidate;
    }
  }
  if (file === null) {
    throw new Error("No updated spreadsheet matching 'sheet' found in 'theFolder'");
  }
  var sheet = SpreadsheetApp.openById(file.getId()).getSheets()[0];
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheet.getName() + ' : ' + array.length);

  var newarray = getNewData(array, old_array);
  Logger.log('there are ' + newarray.length + ' different rows');
}

/**
 * Computes the rows that are not shared between two 2D arrays.
 *
 * Starting from a copy of array2, every row of array1 either cancels the
 * first remaining row with the same content (rows are compared through their
 * comma-joined string form) or, when no such row is left, is appended.
 *
 * @param {Array[]} array1 Rows to check (e.g. the updated sheet).
 * @param {Array[]} array2 Rows to compare against (e.g. the old sheet); not modified.
 * @returns {Array[]} Unmatched rows of array2 (original order) followed by
 *     the unmatched rows of array1.
 */
function getNewData(array1, array2) {
  // Work on a copy so the caller's array is left untouched.
  var diff = array2.slice();
  // Joined form of every row in diff, kept in sync with it, so each row is
  // joined only once instead of once per comparison.
  var keys = diff.map(function (row) {
    return row.join();
  });

  for (var i = 0; i < array1.length; i++) {
    var key = array1[i].join();
    var j = keys.indexOf(key);
    if (j !== -1) {
      Logger.log('duplicated line found on rows ' + i + ':' + j);
      diff.splice(j, 1);
      keys.splice(j, 1);
    } else {
      Logger.log('not duplicated line found on row ' + i);
      diff.push(array1[i]);
      keys.push(key);
    }
  }
  return diff;
}

The thing is that the files are too big, almost 30000 rows, so the scripts exceed 5 minutes limit for execution.

Is there a way to improve this, for instance by eliminating the inner for loop? Or is there a way to do it in parts, e.g. the first 5000 rows first, and so on?

Regards,

EDIT: after analyzing the spreadsheet a little, I found out that there is an ID for every row, so now I can concentrate the search on only one column of each spreadsheet. So here is my new implementation:

/**
 * Logs the indexes of the rows of the updated spreadsheet whose ID does not
 * occur in the old spreadsheet.
 *
 * Both spreadsheets are looked up in the Drive folder 'theFolder'. In each
 * first sheet the cell containing 'NAME' marks the ID column; the rows below
 * that cell in the updated sheet are checked against the values found in the
 * ID column of the old sheet.
 *
 * @throws {Error} When no updated spreadsheet other than the old one is
 *     found, or when the 'NAME' cell is missing from one of the sheets.
 */
function test() {
  var searchString = 'NAME';

  // Returns {row, column} (0-based) of the first cell equal to `wanted`,
  // scanning row by row, or null when there is no such cell.
  var findCell = function (values, wanted) {
    for (var r = 0; r < values.length; r++) {
      for (var c = 0; c < values[r].length; c++) {
        if (values[r][c] === wanted) {
          return { row: r, column: c };
        }
      }
    }
    return null;
  };

  var folderId = DriveApp.getFoldersByName('theFolder').next().getId();

  // Old spreadsheet.
  var oldFiles = DriveApp.searchFiles("fullText contains 'sheet_old' and '" + folderId + "' in parents");
  var old_file = oldFiles.next();
  var old_sheet = SpreadsheetApp.openById(old_file.getId()).getSheets()[0];
  var old_array = old_sheet.getDataRange().getValues();
  Logger.log(old_file.getName() + ' : ' + old_sheet.getName() + ' : ' + old_array.length);

  // Updated spreadsheet. A full-text search for 'sheet' may also return the
  // old file (its search term 'sheet_old' contains 'sheet'), so skip it
  // explicitly instead of trusting the first hit.
  var candidates = DriveApp.searchFiles("fullText contains 'sheet' and '" + folderId + "' in parents");
  var file = null;
  while (file === null && candidates.hasNext()) {
    var candidate = candidates.next();
    if (candidate.getId() !== old_file.getId()) {
      file = candidate;
    }
  }
  if (file === null) {
    throw new Error("No updated spreadsheet matching 'sheet' found in 'theFolder'");
  }
  var sheet = SpreadsheetApp.openById(file.getId()).getSheets()[0];
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheet.getName() + ' : ' + array.length);

  // The formatting of the files is not under our control, so the ID column
  // is located in each sheet through its 'NAME' indicator cell.
  var oldHeader = findCell(old_array, searchString);
  var header = findCell(array, searchString);
  if (oldHeader === null || header === null) {
    throw new Error("Indicator '" + searchString + "' not found in one of the sheets");
  }
  var Row_old = oldHeader.row + 1;
  var Column_old = oldHeader.column;
  var Row = header.row + 1; // first data row below the indicator
  var Column = header.column;
  Logger.log(Row_old + ':::' + Column_old + '\n' + Row + ':::' + Column);

  // Index every value of the old ID column once, so that each row of the
  // updated sheet needs a single lookup instead of a scan of the old sheet.
  // IDs are compared by their string form; the 'id:' prefix keeps keys from
  // clashing with built-in object property names.
  var knownIds = {};
  for (var k = 0; k < old_array.length; k++) {
    knownIds['id:' + old_array[k][Column_old]] = true;
  }

  var diff_index = [];
  for (var i = Row; i < array.length; i++) {
    if (!Object.prototype.hasOwnProperty.call(knownIds, 'id:' + array[i][Column])) {
      diff_index.push(i);
    }
  }
  Logger.log(diff_index);
}

This still runs out of time... I will now try to incorporate your comments.

like image 376
kurokirasama Avatar asked Jun 06 '17 13:06

kurokirasama


1 Answers

Your script has a few major bottlenecks that slow it down massively:

  • Starting both loops at 0 every time makes its runtime explode
  • splicing every time you find a duplicate requires shifting the rest of the array
  • string concatenating an array on every iteration

We can circumvent these issues by:

  • sorting the second range once
  • I'm sure there's something clever to be done by iteratively binary searching through every column but we'd have to resort every time so we'll binary search the first column and then do a linear search.

We will use ArrayLib for the sorting (I hope it's a fast sorting algorithm).

Let's start with a function to find the first row where the first column matches a value (the first column of the current row):

/**
 * Binary-searches `lookupRange` for `target` in the first column.
 *
 * `lookupRange` is expected to be sorted ascending on its first column.
 *
 * @param {*} target Value to look for in column 0.
 * @param {Array[]} lookupRange 2D array of rows.
 * @returns {number} Index of the first row whose first column equals
 *     `target`, or -1 when the search finds no such row.
 */
function firstRowMatchingCol1(target, lookupRange) {
  var low = 0;
  var high = lookupRange.length - 1;

  while (low <= high) {
    var mid = ((low + high) / 2) | 0; // integer midpoint
    var candidate = lookupRange[mid][0];

    if (candidate < target) {
      low = mid + 1;
      continue;
    }
    if (candidate > target) {
      high = mid - 1;
      continue;
    }

    // Hit: walk back to the first row of the run carrying the same key.
    var first = mid;
    while (first > 0 && lookupRange[first - 1][0] === target) {
      first -= 1;
    }
    return first;
  }

  return -1;
}

Now we can go linearly through every row and check whether the columns match, until the first column doesn't match anymore.

/**
 * Tells whether `lookupRange` contains a row equal to `row`.
 *
 * The first row whose first column equals `row[0]` is located with
 * firstRowMatchingCol1(); from there, consecutive rows sharing that first
 * column are compared to `row` column by column with strict equality.
 * Rows that are only one column wide match on the first column alone.
 *
 * @param {Array} row Row to look for.
 * @param {Array[]} lookupRange 2D array, sorted on its first column.
 * @returns {boolean} true when an identical row is found, false otherwise.
 */
function matchExists(row, lookupRange) {
  var index = firstRowMatchingCol1(row[0], lookupRange);
  if (index === -1) {
    return false;
  }

  for (; index < lookupRange.length && lookupRange[index][0] === row[0]; index++) {
    var candidate = lookupRange[index];
    // Column 0 is already known to match; advance over the remaining
    // columns for as long as they are equal.
    var col = 1;
    while (col < row.length && row[col] === candidate[col]) {
      col++;
    }
    // Reaching the end of the row means every column matched; this also
    // covers rows with a single column, where there is nothing left to compare.
    if (col >= row.length) {
      return true;
    }
  }
  return false;
}

And finally we can get the non-duplicates (the rows of the first range that do not exist in the second) like this:

/**
 * Returns the rows of `r1` for which matchExists() finds no equal row in `r2`.
 *
 * @param {Array[]} r1 Rows to filter.
 * @param {Array[]} r2 Rows to compare against; passed through
 *     ArrayLib.sort(r2, 0, true) before the lookups.
 * @returns {Array[]} The rows of `r1` without a match, in their original order.
 */
function getNonDuplicates(r1, r2) {
  var sortedLookup = ArrayLib.sort(r2, 0, true);

  function hasNoMatch(row) {
    return !matchExists(row, sortedLookup);
  }

  return r1.filter(hasNoMatch);
}

Like mTorres' code this is untested

like image 80
Robin Gertenbach Avatar answered Sep 23 '22 13:09

Robin Gertenbach