well, i'm trying to do what described in title. Both spreadsheets have only one sheet that are the ones i'm comparing. One spreadsheet is and update of the other, so i'm trying to get only new content. (if it were a fc (dos command) like function this would be easy...)
After doing some search, i have the folloing script that should work on most cases, that uses arrays for each sheet.
function test() {
var Folder = DriveApp.getFoldersByName('theFolder').next();
var FolderId =Folder.getId();
//call old_spreadsheet
var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";
var files = DriveApp.searchFiles(searchFor);
var old_file = files.next();
var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
var old_sheet = old_spreadsheet.getSheets()[0];
var old_sheetname = old_sheet.getName();
var old_array = old_sheet.getDataRange().getValues();
Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);
//call spreadsheet
var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";
var files = DriveApp.searchFiles(searchFor);
var file = files.next();
var spreadsheet = SpreadsheetApp.openById(file.getId());
var sheet = spreadsheet.getSheets()[0];
var sheetname = sheet.getName();
var array = sheet.getDataRange().getValues();
Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length);
var newarray = getNewData(array,old_array);
Logger.log('there are ' + newarray.length + 'different rows');
}
function getNewData(array1,array2){
var diff =array2;
for (var i = 0; i<array1.length; i++){
var duplicate = false;
for (var j = 0;j<diff.length;j++){
if (array1[i].join() == diff[j].join()){
Logger.log('duplicated line found on rows ' + i + ':' + j);
diff.splice(j,1);
var duplicate= true;
break;
}
}
if (duplicate==false) {
Logger.log('not duplicated line found on row ' + i);
diff.push(array1[i]);
}
}
return diff;
}
The thing is that the files are too big, almost 30000 rows, so the scripts exceed 5 minutes limit for execution.
Is there a way to improve this, like for instance, eliminate the inner for loop? Or there is a way to do it in parts? like first the first 5000 rows, and so on.
Regards,
EDIT: after analizing the spreadsheet a little, i found out that there is a ID for every row, so now i can concentrate the search only in one column of each spreadsheet. So here is my new implementation:
function test(){
var Folder = DriveApp.getFoldersByName('theFolder').next();
var FolderId =Folder.getId();
//call old_spreadsheet
var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";
var files = DriveApp.searchFiles(searchFor);
var old_file = files.next();
var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
var old_sheet = old_spreadsheet.getSheets()[0];
var old_sheetname = old_sheet.getName();
var old_array = old_sheet.getDataRange().getValues();
Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);
//call spreadsheet
var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";
var files = DriveApp.searchFiles(searchFor);
var file = files.next();
var spreadsheet = SpreadsheetApp.openById(file.getId());
var sheet = spreadsheet.getSheets()[0];
var sheetname = sheet.getName();
var array = sheet.getDataRange().getValues();
Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length);
//The COlumn has an indicator, so i search for that. I don't control the formatting of the files, so i search in both spreadsheet for the indicator
var searchString = 'NAME';
for (var i = 0; i < old_array.length; i++) {
for (var j = 0; j < old_array[i].length; j++) {
if (old_array[i][j] == searchString) {
var Row_old = i+1;
var Column_old = j;
break;
}
}
if (Row_old != undefined){
break;
}
}
for (var i = 0; i < array.length; i++) {
for (var j = 0; j < array[i].length; j++) {
if (array[i][j] == searchString) {
var Row = i+1;
var Column = j;
break;
}
}
if (Row != undefined){
break;
}
}
Logger.log(Row_old+':::'+Column_old+'\n'+Row+':::'+Column);
var diff_index =[];
var row_ind = 0;
for (var i=Row;i<array.length;i++){
Logger.log(i);
var existe = ArrayLib.indexOf(old_array, Column_old, array[i][Column]);
if (existe==-1){
Logger.log(row_ind+'!!!');
diff_index[row_ind]=i;
row_ind++;
}
}
Logger.log(diff_index);
}
This still run out of time... I will now try to incorporate your comments.
Your script has a few major bottlenecks that slow it down massively:
We can circumvent these issues by:
We will use ArrayLib for the sorting (I hope it's a fast sorting algorithm).
Let's start with a function to find the first row where the first column matches a value (the first column of the current row):
function firstRowMatchingCol1(target, lookupRange) {
var min = 0;
var max = lookupRange.length - 1;
var guess;
var guessVal;
while(min <= max) {
guess = (min + max) / 2 | 0;
guessVal = lookupRange[guess][0];
if (guessVal < target) {
min = guess + 1;
} else if (guessVal > target) {
max = guess - 1;
} else {
while (guess > 0 && lookupRange[guess - 1][0] === target) {
guess -= 1;
}
return guess;
}
}
return -1;
}
Now we can go linearly go through every row and check if the columns match until the first column doesn't match anymore.
function matchExists(row, lookupRange) {
var index = firstRowMatchingCol1(row[0], lookupRange);
if (index === -1) {return false;}
while (index < lookupRange.length && lookupRange[index][0] === row[0]) {
for (var col = 1; col < row.length; col++) {
if (row[col] !== lookupRange[index][col]) {break;}
if (col === row.length - 1) {return true;} // This only works if the ranges are at least two columns wide but if they are one column wide you can just check if index > -1
}
index += 1;
}
return false;
}
And finally we can get the duplicates like this:
function getNonDuplicates(r1, r2) {
r2 = ArrayLib.sort(r2, 0, true);
return r1.filter(function(row) {return !matchExists(row, r2);});
}
Like mTorres' code this is untested
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With