I have about 1.5 million documents in an AWS CloudSearch index. It is costing me too much and I wish to migrate off the service. I have been unable to see how I can download or export my documents from the index. Is it possible?
Amazon search is deprecated: Amazon search service is no longer supported. To set up a search functionality on your site(s), configure one of the three built-in search services instead. Amazon Cloud Search is deprecated in Sitefinity 13.3.
In Elasticsearch, searching happens on both index and types using a search API. The search API also includes Faceting and Filtering for searching data. In CloudSearch, users create a search domain that includes sub-services to upload documents. A search service provides the means to search indexed data.
Amazon CloudSearch now provides several popular search engine features available with Apache Solr in addition to the managed search service experience that makes it easy to set up, operate, and scale a search domain.
For a similar need, I had to browse my entire CloudSearch domain (more than the 10000 limit) to generate a file.
I used a Node.js script to handle that, like this:
var AWS = require('aws-sdk');
var fs = require('fs');
// Global SDK configuration. Replace every <placeholder> before running.
// NOTE(review): credentials are hard-coded here; consider supplying them
// through the SDK's environment variables or shared credentials file instead
// of keeping them in the script.
AWS.config.update({
  accessKeyId: '<yourAccessKey>',
  secretAccessKey: '<yourSecretAccessKey>',
  region: '<yourRegion>',
  endpoint: '<YourSearchDomainEndPoint>'
});

var batchSize = 5000; // Number of hits requested per search call (max: 10000).
var compteur = 0; // Running count of the hits collected so far.
var result = []; // Every hit collected across all pages.
// Holder for the most recent search response. Declared explicitly so that
// assigning it never creates an implicit global (which throws in strict mode).
var resultMessage = null;
// NOTE(review): `params` carries no client options, so the client presumably
// picks up its endpoint from AWS.config above - confirm.
var params = {query: ""};
var cloudsearchdomain = new AWS.CloudSearchDomain(params);
/**
 * Requests one page of hits from the search domain and keeps following the
 * cursor returned with each response until a page comes back empty, then
 * passes everything collected to writeTheFile().
 *
 * Uses the module-level `batchSize` and `cloudsearchdomain`, appends every
 * hit to `result` and adds the page size to `compteur`.
 *
 * If a search call fails, the error is reported, the process exit code is set
 * to 1 and paging stops; no file is written, so a failed run cannot be
 * mistaken for a complete export.
 *
 * @param {Object} [theContext] Paging state taken from the previous response
 *   (`cursor`, `start`, `found`). Omit it for the first request.
 */
function launchSearch(theContext) {
  process.stdout.write('Launch AWS.CloudSearch ');
  if (theContext == null) {
    process.stdout.write('initial request ... ');
  } else {
    // Progress display only; it has no effect on paging.
    // NOTE(review): this assumes `hits.start` advances with each page - confirm
    // that it does when a cursor is used.
    var current = (theContext.start / batchSize) + 2;
    var totalRun = (Math.ceil(theContext.found / batchSize * 10) / 10) + 1;
    process.stdout.write('( ' + current + ' / ' + totalRun + ' ) ... ');
  }

  var searchParams = {
    // Presumably a negated term that no document contains, so that every
    // document matches - confirm against the query parser in use.
    query: "-aQueryStringImpossibleToFind",
    cursor: (theContext == null) ? "initial" : theContext.cursor,
    size: batchSize
  };

  // Reuse the shared client instead of building a new one for every page.
  cloudsearchdomain.search(searchParams, function (err, data) {
    if (err) {
      console.error("Failed with params :", JSON.stringify(searchParams));
      console.error(err);
      process.exitCode = 1;
      // Stop here: there is no response to read a cursor from.
      return;
    }

    var hits = data.hits.hit;
    compteur = compteur + hits.length;
    for (var i = 0; i < hits.length; i++) {
      result.push(hits[i]);
    }
    process.stdout.write(hits.length + ' hits found.');

    if (hits.length == 0) {
      // An empty page marks the end of the result set.
      process.stdout.write(' Done.\n\nLet\'s create thte file...\n');
      writeTheFile(result);
      return;
    }

    process.stdout.write('\n');
    launchSearch({
      cursor: data.hits.cursor,
      start: data.hits.start,
      found: data.hits.found
    });
  });
}
/**
 * Serializes the collected hits as JSON and writes them to the file named by
 * the first command-line argument (process.argv[2]).
 *
 * The "DONE" message is printed from the write callback, so it appears only
 * after the file has actually been written. If the write fails, the error is
 * reported and the process exit code is set to 1 instead.
 *
 * @param {Array} myResult Hits to serialize.
 */
function writeTheFile(myResult) {
  var fileName = process.argv[2];
  fs.writeFile(fileName, JSON.stringify(myResult), function (err) {
    if (err) {
      console.error(err);
      process.exitCode = 1;
      return;
    }
    process.stdout.write("DONE : File '" + fileName + "' generated ( " + compteur + " elements ).\n");
  });
}
/* Check parameters: the output filename must be given as the first argument. */
if (process.argv[2]) {
  launchSearch();
} else {
  // Report on stderr and exit non-zero so that shells and calling scripts can
  // detect the failure (a bare process.exit() reports success).
  process.stderr.write('ERROR : the output filename is expected as argumment.\n');
  process.exit(1);
}
This script has to be called from the command line: node script.js fileToCreate.json
Note: I don't know whether this works correctly on a search domain with 1.5 million documents. The risk I foresee is the size of the in-memory JSON variable, so the script may have to be adapted (for example, by writing to the file every 100,000 documents).
Amazon (still) doesn't offer a way to export all the data from a CloudSearch domain; however, it's not difficult to write a utility to do this yourself.
just fixed a couple of things, full credit to @Nek's response https://stackoverflow.com/a/32119407/1894553
$ npm install aws-sdk
Beware: to obtain a full dump with the return: "_all_fields"
parameter, every field must have the "return" flag
enabled in the indexing options of the schema.
var AWS = require('aws-sdk');
var fs = require('fs');
// Paging and accumulation state shared by launchSearch() and writeTheFile().
var batchSize = 10000; // hits requested per search call
var compteur = 0; // hits collected so far
var result = []; // every hit collected across all pages
var resultMessage = []; // overwritten with each successful search response

// Global SDK configuration; replace the 'xx' / 'xxx' placeholders before use.
AWS.config.update({ accessKeyId: 'xx', secretAccessKey: 'xx', region: 'xx', endpoint: 'xxx' });

// NOTE(review): `params` carries no client options, so the client presumably
// takes its endpoint from AWS.config above - confirm.
var params = { query: '' };
var cloudsearchdomain = new AWS.CloudSearchDomain(params);
/**
 * Requests one page of hits from the search domain and keeps following the
 * cursor returned with each response until a page comes back empty, then
 * passes everything collected to writeTheFile().
 *
 * Uses the module-level `batchSize` and `cloudsearchdomain`, appends every
 * hit to `result`, adds the page size to `compteur` and stores the latest
 * response in `resultMessage`.
 *
 * If a search call fails, the error is reported, the process exit code is set
 * to 1 and paging stops; no file is written, so a failed run cannot be
 * mistaken for a complete export.
 *
 * @param {Object} [theContext] Paging state taken from the previous response
 *   (`cursor`, `start`, `found`). Omit it for the first request.
 */
function launchSearch(theContext) {
  process.stdout.write('Launch AWS.CloudSearch ');
  if (theContext == null) {
    process.stdout.write('initial request ... ');
  } else {
    // Progress display only; it has no effect on paging.
    // NOTE(review): this assumes `hits.start` advances with each page - confirm
    // that it does when a cursor is used.
    var current = (theContext.start / batchSize) + 2;
    var totalRun = (Math.ceil(theContext.found / batchSize * 10) / 10) + 1;
    process.stdout.write('( ' + current + ' / ' + totalRun + ' ) ... ');
  }

  // https://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/CloudSearchDomain.html#search-property
  var searchParams = {
    query: "matchall",
    cursor: (theContext == null) ? "initial" : theContext.cursor,
    size: batchSize,
    queryParser: "structured",
    return: "_all_fields"
  };

  // Reuse the shared client instead of building a new one for every page.
  cloudsearchdomain.search(searchParams, function (err, data) {
    if (err) {
      console.error("Failed with params :", JSON.stringify(searchParams));
      console.error(err);
      process.exitCode = 1;
      // Stop here: there is no response to read a cursor from.
      return;
    }

    resultMessage = data;
    var hits = data.hits.hit;
    compteur = compteur + hits.length;
    for (var i = 0; i < hits.length; i++) {
      result.push(hits[i]);
    }
    process.stdout.write(hits.length + ' hits found.');

    if (hits.length == 0) {
      // An empty page marks the end of the result set.
      process.stdout.write(' Done.\n\nLet\'s create thte file...\n');
      writeTheFile(result);
      return;
    }

    process.stdout.write('\n');
    launchSearch({
      cursor: data.hits.cursor,
      start: data.hits.start,
      found: data.hits.found
    });
  });
}
/**
 * Serializes the collected hits as JSON and writes them to the file named by
 * the first command-line argument (process.argv[2]).
 *
 * The "DONE" message is printed from the write callback, so it appears only
 * after the file has actually been written. If the write fails, the error is
 * reported and the process exit code is set to 1 instead.
 *
 * @param {Array} myResult Hits to serialize.
 */
function writeTheFile(myResult) {
  var fileName = process.argv[2];
  fs.writeFile(fileName, JSON.stringify(myResult), function (err) {
    if (err) {
      console.error(err);
      process.exitCode = 1;
      return;
    }
    process.stdout.write("DONE : File '" + fileName + "' generated ( " + compteur + " elements ).\n");
  });
}
/* Check parameters: the output filename must be given as the first argument. */
if (process.argv[2]) {
  launchSearch();
} else {
  // Report on stderr and exit non-zero so that shells and calling scripts can
  // detect the failure (a bare process.exit() reports success).
  process.stderr.write('ERROR : the output filename is expected as argument.\n');
  process.exit(1);
}
#execution
$ node export-all.js all-data.json
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate Us With