amazon-web-services - AWS CloudSearch 导出/下载数据

Question

我在 AWS CloudSearch 索引中有大约 150 万个文档。这对我来说太贵了，我希望从服务中迁移出来。我一直无法看到如何从索引中下载或导出我的文档。可能吗？

score 3 · Accepted Answer

出于类似的需求，我不得不遍历整个 CloudSearch 域（文档数量超过了 10000 条的限制）来生成一个文件。

我使用 nodeJS 脚本来处理它，如下所示：

var AWS = require('aws-sdk');
var fs = require('fs');

// Global AWS SDK configuration. The four values are placeholders: replace them
// with your own credentials, region and search-domain endpoint before running.
var awsSettings = {
    accessKeyId: '<yourAccessKey>',
    secretAccessKey: '<yourSecretAccessKey>',
    region: '<yourRegion>',
    endpoint: '<YourSearchDomainEndPoint>'
};
AWS.config.update(awsSettings);

// State shared by launchSearch() and writeTheFile().
var batchSize = 5000, // number of items requested by every search (max: 10000)
    compteur = 0,     // running total of hits retrieved so far
    result = [];      // every hit collected across all pages

// Initial request parameters and a client built from them. launchSearch()
// creates its own client for each request and does not use this one.
var params = { query: "" };
var cloudsearchdomain = new AWS.CloudSearchDomain(params);

/**
 * Fetches one page of documents from the CloudSearch domain and then calls
 * itself with the returned cursor until a page comes back empty, at which
 * point the accumulated hits are handed to writeTheFile().
 *
 * Every hit is appended to the shared `result` array and counted in the
 * shared `compteur` counter.
 *
 * If a search request fails, the parameters and the error are logged, the
 * process exit code is set to 1 and the export stops: nothing is written,
 * because the collected data would be incomplete.
 *
 * @param {Object} [theContext] - Paging state of the previous page; omit it
 *   for the first request.
 * @param {string} theContext.cursor - Cursor returned by the previous search.
 * @param {number} theContext.start - `hits.start` of the previous response.
 * @param {number} theContext.found - `hits.found` of the previous response.
 * @param {number} theContext.retrived - Number of hits in the previous page.
 */
function launchSearch(theContext) {
    process.stdout.write('Launch AWS.CloudSearch ');

    if (theContext == null) {
        process.stdout.write('initial request ... ');
    } else {
        // Progress indicator only; the formula is kept as originally written
        // and may print fractional values.
        var current = (theContext.start / batchSize) + 2;
        var totalRun = (Math.ceil(theContext.found / batchSize * 10) / 10) + 1;
        process.stdout.write('( ' + current + ' / ' + totalRun + ' )       ... ');
    }

    // A cursor of "initial" starts a new scan; afterwards the cursor returned
    // by the previous page is passed back to continue where it stopped.
    var searchParams = {
        query: "-aQueryStringImpossibleToFind",
        cursor: (theContext == null) ? "initial" : theContext.cursor,
        size: batchSize
    };

    // NOTE(review): the search parameters are also passed as client options,
    // exactly as the original did; confirm whether the client needs them.
    var forCursor = new AWS.CloudSearchDomain(searchParams);

    forCursor.search(searchParams, function(err, data) {
        if (err) {
            console.log("Failed with params :");
            console.log(searchParams);
            console.log(err);
            // Stop here: there is no response to read and no cursor to follow.
            process.exitCode = 1;
            return;
        }

        var pageHits = data.hits.hit;
        compteur = compteur + pageHits.length;
        for (var i = 0; i < pageHits.length; i++) {
            result.push(pageHits[i]);
        }

        process.stdout.write(pageHits.length + ' hits found.');

        if (pageHits.length == 0) {
            // An empty page means the cursor is exhausted.
            process.stdout.write(' Done.\n\nLet\'s create thte file...\n');
            writeTheFile(result);
        } else {
            process.stdout.write('\n');
            launchSearch({
                cursor: data.hits.cursor,
                start: data.hits.start,
                found: data.hits.found,
                retrived: pageHits.length
            });
        }
    });
}

/**
 * Serializes the collected hits to JSON and writes them to the file named by
 * the first command-line argument (process.argv[2]).
 *
 * The "DONE" line is printed from the write callback, so it only appears once
 * the file has actually been written. If the write fails, the error is logged
 * and the process exit code is set to 1 instead.
 *
 * @param {Array} myResult - Hits to serialize.
 */
function writeTheFile(myResult) {
    var outputFile = process.argv[2];

    fs.writeFile(outputFile, JSON.stringify(myResult), function(err) {
        if (err) {
            process.exitCode = 1;
            return console.log(err);
        }
        process.stdout.write("DONE : File '" + outputFile + "' generated  ( " + compteur + " elements ).\n");
    });
}



 /*Check parameters*/
// Entry point: the output filename is required as the first command-line
// argument. Without it, report the usage error on stderr and exit with a
// non-zero status so that shells and scripts can detect the failure.
if (!process.argv[2]) {
    process.stderr.write('ERROR : the output filename is expected as argumment.\n');
    process.exit(1);
} else {
    launchSearch();
}

必须从命令行调用此脚本： node script.js fileToCreate.json

注意：我不知道这是否适用于 150 万个文档搜索域。我预见的风险是 JSON 变量大小。因此，必须修改此脚本（可能每 100 000 个文档写入一个文件？）。

score 2 · Accepted Answer

Amazon（仍然）不提供从 Cloudsearch 域导出所有数据的方法，但是，编写一个实用程序来自己执行此操作并不难。

score 2 · Accepted Answer

刚刚修复了几件事，完全归功于@Nek 的回复https://stackoverflow.com/a/32119407/1894553

先决条件：Node.js + aws-sdk 包

$ npm install aws-sdk

export-all.js

请注意：要通过 return: "_all_fields" 参数获得完整转储，每个字段都必须在架构（schema）的索引选项中启用 return 标志。

var AWS = require('aws-sdk');
var fs = require('fs');

// Global AWS SDK configuration. The 'xx' / 'xxx' values are placeholders:
// replace them with your own credentials, region and search-domain endpoint.
var awsSettings = {
    accessKeyId: 'xx',
    secretAccessKey: 'xx',
    region: 'xx',
    endpoint: 'xxx'
};
AWS.config.update(awsSettings);

// State shared by launchSearch() and writeTheFile().
var batchSize = 10000,  // number of items requested by every search
    compteur = 0,       // running total of hits retrieved so far
    result = [],        // every hit collected across all pages
    resultMessage = []; // overwritten by launchSearch() with each search response

// Initial request parameters and a client built from them. launchSearch()
// creates its own client for each request and does not use this one.
var params = { query: "" };
var cloudsearchdomain = new AWS.CloudSearchDomain(params);

/**
 * Fetches one page of documents (all returnable fields) from the CloudSearch
 * domain and then calls itself with the returned cursor until a page comes
 * back empty, at which point the accumulated hits are handed to
 * writeTheFile().
 *
 * Every hit is appended to the shared `result` array and counted in the
 * shared `compteur` counter.
 *
 * If a search request fails, the parameters and the error are logged, the
 * process exit code is set to 1 and the export stops: nothing is written,
 * because the collected data would be incomplete.
 *
 * @param {Object} [theContext] - Paging state of the previous page; omit it
 *   for the first request.
 * @param {string} theContext.cursor - Cursor returned by the previous search.
 * @param {number} theContext.start - `hits.start` of the previous response.
 * @param {number} theContext.found - `hits.found` of the previous response.
 * @param {number} theContext.retrived - Number of hits in the previous page.
 */
function launchSearch(theContext) {
    process.stdout.write('Launch AWS.CloudSearch ');

    if (theContext == null) {
        process.stdout.write('initial request ... ');
    } else {
        // Progress indicator only; the formula is kept as originally written
        // and may print fractional values.
        var current = (theContext.start / batchSize) + 2;
        var totalRun = (Math.ceil(theContext.found / batchSize * 10) / 10) + 1;
        process.stdout.write('( ' + current + ' / ' + totalRun + ' )       ... ');
    }

    // https://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/CloudSearchDomain.html#search-property
    // A cursor of "initial" starts a new scan; afterwards the cursor returned
    // by the previous page is passed back to continue where it stopped.
    var searchParams = {
        query: "matchall",
        cursor: (theContext == null) ? "initial" : theContext.cursor,
        size: batchSize,
        queryParser: "structured",
        return: "_all_fields"
    };

    // NOTE(review): the search parameters are also passed as client options,
    // exactly as the original did; confirm whether the client needs them.
    var forCursor = new AWS.CloudSearchDomain(searchParams);

    forCursor.search(searchParams, function(err, data) {
        if (err) {
            console.log("Failed with params :");
            console.log(searchParams);
            console.log(err);
            // Stop here: there is no response to read and no cursor to follow.
            process.exitCode = 1;
            return;
        }

        var pageHits = data.hits.hit;
        compteur = compteur + pageHits.length;
        for (var i = 0; i < pageHits.length; i++) {
            result.push(pageHits[i]);
        }

        process.stdout.write(pageHits.length + ' hits found.');

        if (pageHits.length == 0) {
            // An empty page means the cursor is exhausted.
            process.stdout.write(' Done.\n\nLet\'s create thte file...\n');
            writeTheFile(result);
        } else {
            process.stdout.write('\n');
            launchSearch({
                cursor: data.hits.cursor,
                start: data.hits.start,
                found: data.hits.found,
                retrived: pageHits.length
            });
        }
    });
}

/**
 * Serializes the collected hits to JSON and writes them to the file named by
 * the first command-line argument (process.argv[2]).
 *
 * The "DONE" line is printed from the write callback, so it only appears once
 * the file has actually been written. If the write fails, the error is logged
 * and the process exit code is set to 1 instead.
 *
 * @param {Array} myResult - Hits to serialize.
 */
function writeTheFile(myResult) {
    var outputFile = process.argv[2];

    fs.writeFile(outputFile, JSON.stringify(myResult), function(err) {
        if (err) {
            process.exitCode = 1;
            return console.log(err);
        }
        process.stdout.write("DONE : File '" + outputFile + "' generated  ( " + compteur + " elements ).\n");
    });
}



 /*Check parameters*/
// Entry point: the output filename is required as the first command-line
// argument. Without it, report the usage error on stderr and exit with a
// non-zero status so that shells and scripts can detect the failure.
if (!process.argv[2]) {
    process.stderr.write('ERROR : the output filename is expected as argument.\n');
    process.exit(1);
} else {
    launchSearch();
}

# 执行

$ node export-all.js all-data.json

amazon-web-services - AWS CloudSearch 导出/下载数据

3 回答 3

先决条件：Node.js + aws-sdk 包

export-all.js

Related

Reference