Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

AWS CloudSearch export/download data

I have about 1.5 million documents in an AWS CloudSearch index. It is costing me too much and I wish to migrate off the service. I have been unable to see how I can download or export my documents from the index. Is it possible?

like image 521
waigani Avatar asked Dec 16 '13 22:12

waigani


People also ask

Is Amazon CloudSearch deprecated?

Amazon search is deprecated: Amazon search service is no longer supported. To set up a search functionality on your site(s), configure one of the three built-in search services instead. Amazon Cloud Search is deprecated in Sitefinity 13.3.

What is the difference between CloudSearch and ElasticSearch?

In Elasticsearch, searching happens on both index and types using a search API. The search API also includes Faceting and Filtering for searching data. In CloudSearch, users create a search domain that includes sub-services to upload documents. A search service provides the means to search indexed data.

Is CloudSearch based on SOLR?

Amazon CloudSearch now provides several popular search engine features available with Apache Solr in addition to the managed search service experience that makes it easy to set up, operate, and scale a search domain.


3 Answers

For a similar need, I had to browse my entire CloudSearch domain (more than the 10000 limit) to generate a file.

I used a Node.js script to handle that, like this:

var AWS = require('aws-sdk');
var fs = require('fs');

// SDK-wide settings. Replace the four placeholders with your own values;
// the endpoint is the search endpoint of the CloudSearch domain to export.
var sdkSettings = {
    accessKeyId: '<yourAccessKey>',
    secretAccessKey: '<yourSecretAccessKey>',
    region: '<yourRegion>',
    endpoint: '<YourSearchDomainEndPoint>'
};
AWS.config.update(sdkSettings);

// Number of items requested on every search (maximum: 10000).
var batchSize = 5000;

// Running total of hits retrieved so far; reported once the file is written.
var compteur = 0;

// Every hit retrieved so far, kept in memory until the export file is written.
var result = [];

// Placeholder search parameters; launchSearch() replaces them on each call.
var params = { query: "" };

// Client bound to the endpoint configured above.
var cloudsearchdomain = new AWS.CloudSearchDomain(params);

/**
 * Requests one page of hits from the CloudSearch domain, appends them to
 * `result`, and keeps requesting the following page until a page comes back
 * empty; the accumulated hits are then handed to writeTheFile().
 *
 * Paging is cursor based: the first request sends the cursor "initial" and
 * every later request sends the cursor returned with the previous page.
 *
 * If a request fails, the error is logged, the process exit code is set to 1
 * and the export stops (no file is written).
 *
 * @param {Object} [theContext] Paging state taken from the previous response
 *     ({cursor, start, found, retrived}). Omit it for the first request.
 */
function launchSearch(theContext) {
    process.stdout.write('Launch AWS.CloudSearch ');

    if (theContext == null) {
        process.stdout.write('initial request ... ');
    } else {
        // Progress indicator derived from the previous page's start/found values.
        var current  = (theContext.start/batchSize) +2 ;
        var totalRun = (Math.ceil(theContext.found/batchSize  * 10) / 10) + 1;
        process.stdout.write('( ' + current + ' / ' + totalRun + ' )       ... ');
    }

    // Kept local so the module-level `params` is no longer overwritten.
    // NOTE(review): the negated, never-matching term is presumably meant to
    // match every document - confirm against your domain.
    var searchParams = {
        query: "-aQueryStringImpossibleToFind",
        cursor: (theContext == null) ? "initial" : theContext.cursor,
        size: batchSize
    };

    // Reuse the module-level client instead of building a new one per page.
    cloudsearchdomain.search(searchParams, function(err, data) {
        if (err) {
            console.log("Failed with params :");
            console.log(searchParams);
            console.log(err);
            // Stop here: there is no valid page to read a cursor from, and
            // retrying with a stale cursor would loop forever.
            process.exitCode = 1;
            return;
        }

        var hits = data.hits.hit;
        compteur = compteur + hits.length;
        for (var i = 0; i < hits.length; i++) {
            result.push(hits[i]);
        }

        process.stdout.write(hits.length + ' hits found.');

        if (hits.length === 0) {
            // An empty page marks the end of the result set.
            process.stdout.write(' Done.\n\nLet\'s create thte file...\n');
            writeTheFile(result);
        } else {
            process.stdout.write('\n');
            var myContext = {};
            myContext.cursor = data.hits.cursor;
            myContext.start = data.hits.start;
            myContext.found = data.hits.found;
            myContext.retrived = hits.length;
            launchSearch(myContext);
        }
    });
}

/**
 * Serializes the collected hits as JSON and writes them to the file named by
 * the first command-line argument (process.argv[2]).
 *
 * The "DONE" message is printed only after the write has completed
 * successfully; on failure the error is logged and the process exit code is
 * set to 1.
 *
 * @param {Array} myResult Hits to serialize.
 */
function writeTheFile(myResult) {
    var outputPath = process.argv[2];

    fs.writeFile(outputPath, JSON.stringify(myResult), function(err) {
        if (err) {
            console.log(err);
            process.exitCode = 1;
            return;
        }
        // fs.writeFile is asynchronous: only here is the file known to exist.
        process.stdout.write("DONE : File '"+ outputPath + "' generated  ( " + compteur + " elements ).\n");
    });
}



 /*Check parameters*/
if (!process.argv[2]) {
    // Usage error: report it on stderr and exit with a non-zero status so
    // that calling shells and scripts can detect the failure.
    process.stderr.write('ERROR : the output filename is expected as argumment.\n');
    process.exit(1);
} else {
    // Start the export with the initial (cursor-less) request.
    launchSearch();
}

This script has to be called from the command line: node script.js fileToCreate.json

Note: I don't know whether this works correctly on a search domain with 1.5 million documents. The risk I foresee is the size of the in-memory JSON variable, so the script may have to be adapted (for example, by writing to the file every 100,000 documents).

like image 130
Arnaduga Avatar answered Oct 13 '22 01:10

Arnaduga


Amazon (still) doesn't offer a way to export all the data from a CloudSearch domain; however, it's not difficult to write a utility to do this yourself.

like image 43
ur-vogel Avatar answered Oct 13 '22 03:10

ur-vogel


I just fixed a couple of things; full credit goes to @Nek's response: https://stackoverflow.com/a/32119407/1894553


Prerequisites: Node.js and the aws-sdk package

$ npm install aws-sdk

export-all.js

Beware: to obtain a full dump with the return: "_all_fields" parameter, these fields must have the return flag enabled in the indexing options of the schema.

var AWS = require('aws-sdk');
var fs = require('fs');

// SDK-wide settings. Replace the placeholder values with your own; the
// endpoint is the search endpoint of the CloudSearch domain to export.
var sdkSettings = {
    accessKeyId: 'xx',
    secretAccessKey: 'xx',
    region: 'xx',
    endpoint: 'xxx'
};
AWS.config.update(sdkSettings);

// Page size sent as `size` on every search request.
var batchSize = 10000;

// Running total of hits retrieved so far; reported once the file is written.
var compteur = 0;

// Every hit retrieved so far, kept in memory until the export file is written.
var result = [];

// Most recent search response, assigned by launchSearch().
var resultMessage = [];

// Placeholder search parameters; launchSearch() replaces them on each call.
var params = { query: "" };

// Client bound to the endpoint configured above.
var cloudsearchdomain = new AWS.CloudSearchDomain(params);

/**
 * Requests one page of hits from the CloudSearch domain, appends them to
 * `result`, and keeps requesting the following page until a page comes back
 * empty; the accumulated hits are then handed to writeTheFile().
 *
 * Paging is cursor based: the first request sends the cursor "initial" and
 * every later request sends the cursor returned with the previous page.
 *
 * If a request fails, the error is logged, the process exit code is set to 1
 * and the export stops (no file is written).
 *
 * @param {Object} [theContext] Paging state taken from the previous response
 *     ({cursor, start, found, retrived}). Omit it for the first request.
 */
function launchSearch(theContext) {
    process.stdout.write('Launch AWS.CloudSearch ');

    if (theContext == null) {
        process.stdout.write('initial request ... ');
    } else {
        // Progress indicator derived from the previous page's start/found values.
        var current  = (theContext.start/batchSize) +2 ;
        var totalRun = (Math.ceil(theContext.found/batchSize  * 10) / 10) + 1;
        process.stdout.write('( ' + current + ' / ' + totalRun + ' )       ... ');
    }

    // https://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/CloudSearchDomain.html#search-property
    // Kept local so the module-level `params` is no longer overwritten.
    var searchParams = {
        query: "matchall",
        cursor: (theContext == null) ? "initial" : theContext.cursor,
        size: batchSize,
        queryParser: "structured",
        return: "_all_fields"
    };

    // Reuse the module-level client instead of building a new one per page.
    cloudsearchdomain.search(searchParams, function(err, data) {
        if (err) {
            console.log("Failed with params :");
            console.log(searchParams);
            console.log(err);
            // Stop here: there is no valid page to read a cursor from, and
            // retrying with a stale cursor would loop forever.
            process.exitCode = 1;
            return;
        }

        // Still published for any code that reads the module-level variable.
        resultMessage = data;

        var hits = data.hits.hit;
        compteur = compteur + hits.length;
        for (var i = 0; i < hits.length; i++) {
            result.push(hits[i]);
        }

        process.stdout.write(hits.length + ' hits found.');

        if (hits.length === 0) {
            // An empty page marks the end of the result set.
            process.stdout.write(' Done.\n\nLet\'s create thte file...\n');
            writeTheFile(result);
        } else {
            process.stdout.write('\n');
            var myContext = {};
            myContext.cursor = data.hits.cursor;
            myContext.start = data.hits.start;
            myContext.found = data.hits.found;
            myContext.retrived = hits.length;
            launchSearch(myContext);
        }
    });
}

/**
 * Serializes the collected hits as JSON and writes them to the file named by
 * the first command-line argument (process.argv[2]).
 *
 * The "DONE" message is printed only after the write has completed
 * successfully; on failure the error is logged and the process exit code is
 * set to 1.
 *
 * @param {Array} myResult Hits to serialize.
 */
function writeTheFile(myResult) {
    var outputPath = process.argv[2];

    fs.writeFile(outputPath, JSON.stringify(myResult), function(err) {
        if (err) {
            console.log(err);
            process.exitCode = 1;
            return;
        }
        // fs.writeFile is asynchronous: only here is the file known to exist.
        process.stdout.write("DONE : File '"+ outputPath + "' generated  ( " + compteur + " elements ).\n");
    });
}



 /*Check parameters*/
if (!process.argv[2]) {
    // Usage error: report it on stderr and exit with a non-zero status so
    // that calling shells and scripts can detect the failure.
    process.stderr.write('ERROR : the output filename is expected as argument.\n');
    process.exit(1);
} else {
    // Start the export with the initial (cursor-less) request.
    launchSearch();
}

#execution

$ node export-all.js all-data.json
like image 43
wideawakening Avatar answered Oct 13 '22 01:10

wideawakening