javascript - node js azure SDK getBlobToStream uses lots of memory

Question

I am writing a backup script that simply downloads all the blobs in all the blob containers of a specific Azure account.

The script uses async.js to make sure only a limited number of downloads can run at the same time, so it doesn't overload the server. When I run this script it works fine, but when it hits large files it runs out of memory. I'm guessing the download runs faster than the disk can write, so the in-memory buffer keeps growing until I run out of memory entirely, but so far I have been unable to pin down the exact cause.

The specific function which appears to use a lot of memory is called as follows:

// Download the blob into a file on disk, then record its last-modified date
// in a sibling "<file>.date" marker.
blobService.getBlobToStream(containerName, blob.name, fs.createWriteStream(fullPath), function(error) {
  if (error) {
    // Download failed: report it, but still finish the queue item so the run continues.
    console.log("Failed writing " + blob.name + " (" + error + ")");
    callback();
    return;
  }

  // Download succeeded: persist the last-modified date; a failure here is only logged.
  fs.writeFile(fullPath + ".date", blobLastModified, function(dateErr) {
    if (dateErr) {
      console.log("Couldn't write .date file: " + dateErr);
    }
  });

  // The queue item is finished right away, without waiting for the .date write.
  callback();
});

Even a single 700MB download will easily fill up 1GB of memory on my side.

Is there any way around this? Am I missing a parameter which magically prevents the Azure SDK from buffering everything and the kitchen sink?

Full code:

#!/usr/bin/env node

//Requires
var azure = require('azure');
var fs    = require('fs');
var mkdirp = require('mkdirp');
var path  = require('path');
var async = require('async');

// Maximum number of blobs of one container that async.js processes at the same
// time (these are concurrent async tasks, not threads).
var maxconcurrency = 1;

// NOTE(review): called without arguments; presumably the SDK picks up the
// storage account credentials from the environment - confirm.
var blobService = azure.createBlobService();

// Root directory of the backup. Target paths are built by plain string
// concatenation, so this must always end with a '/'.
// Declared with `var` so it is no longer an implicit global (which throws in strict mode).
var backupPrefix = '/backups/azurebackup/';

//Main flow of the script is near the bottom of the file.
/**
 * Queue of container names to back up; containers are processed one at a time.
 *
 * For every container the worker lists its blobs and pushes them through a
 * second queue (at most `maxconcurrency` blobs at once). Each blob is downloaded
 * to backupPrefix + containerName + '/' + blob.name, and a sibling
 * "<file>.date" marker holding the blob's last-modified date is written so an
 * unchanged blob is skipped on the next run.
 *
 * The worker signals completion (containerDone) only after the listing failed
 * or every blob of the container has been handled; without that signal
 * async.queue would never start the next container.
 */
var containerProcessingQueue = async.queue(
  function getBlobsAndSaveThem(containerName, containerDone) {
    console.log(containerName); //DEBUG
    blobService.listBlobs(containerName, function(error, blobs) {
      if (error) {
        console.log("An error occurred listing the blobs: " + error);
        containerDone();
        return;
      }
      if (!blobs || blobs.length === 0) {
        // Empty container: nothing to download, move on to the next one.
        containerDone();
        return;
      }

      var blobProcessingQueue = async.queue(function(blob, callback) {
        console.log(blob); //DEBUG
        var fullPath = backupPrefix + containerName + '/' + blob.name;
        var dateMarkerPath = fullPath + ".date";
        var targetDir = path.dirname(fullPath);
        // Kept as a string: this is exactly the text stored in the .date
        // marker, so it can be compared with strict equality below.
        var blobLastModified = String(new Date(blob.properties['last-modified']));

        // mkdirp is a no-op for an existing directory, so no existence check is
        // needed. The sync variant takes no callback and throws on failure; it
        // is used so concurrent workers do not race to create the same directory.
        try {
          mkdirp.sync(targetDir);
        } catch (dirError) {
          console.log('Failed to create directory ' + targetDir + " ("+ dirError + ")");
          callback();
          return;
        }

        // Skip blobs whose recorded last-modified date is unchanged.
        if (fs.existsSync(dateMarkerPath) &&
            fs.readFileSync(dateMarkerPath).toString() === blobLastModified) {
          callback();
          return;
        }

        // NOTE(review): the surrounding question reports very high memory use
        // here for large blobs; downloading in byte ranges is the suggested
        // workaround.
        blobService.getBlobToStream(
          containerName,
          blob.name,
          fs.createWriteStream(fullPath),
          function(downloadError) {
            if (downloadError) {
              // Log the failure but finish the queue item so the backup continues.
              console.log("Failed writing " + blob.name + " (" + downloadError + ")");
              callback();
              return;
            }
            // Record the last-modified date, and only then finish the queue item.
            fs.writeFile(dateMarkerPath, blobLastModified, function(dateError) {
              if (dateError) console.log("Couldn't write .date file: " + dateError);
              callback();
            });
          });
      }, maxconcurrency);

      // Count finished blobs through the per-task callback so the container is
      // reported done exactly once, after its last blob.
      var remaining = blobs.length;
      blobs.forEach(function(blob) {
        blobProcessingQueue.push(blob, function() {
          remaining -= 1;
          if (remaining === 0) {
            containerDone();
          }
        });
      });
    });
  }, 1);

// Entry point: list every container of the account and queue each one for backup.
blobService.listContainers(function(err, result){
        if (err) {
                // Without this check a failed listing would crash on result.length below.
                console.log("An error occurred listing the containers: " + err);
                return;
        }
        for(var i=0;i<result.length;i++) {
                containerProcessingQueue.push(result[i].name);
        }
});

score 2 · Accepted Answer

给现在看到这个问题的人补充一点：起始和结束位置的选项名称已经变更，现在直接叫 rangeStart 和 rangeEnd。更多信息请参阅 Azure Node.js SDK 文档：http://dl.windowsazure.com/nodestoragedocs/BlobService.html

score 1 · Accepted Answer

一种可行的做法是：每次只把一个数据块（而不是整个 blob 的数据）读取到流中，把它追加到文件，然后再读取下一个块。Blob 存储服务支持这种做法。如果您查看 getBlobToStream 的源代码（https://github.com/WindowsAzure/azure-sdk-for-node/blob/master/lib/services/blob/blobservice.js），会发现可以通过 rangeStartHeader 和 rangeEndHeader 选项指定起始/结束字节。看看这是否有帮助。

我拼凑了一段示例代码（从代码中可以看出，我对 node.js 的了解还很初级 :)）。[请仅用这段代码来了解如何进行分块下载，因为我认为它仍然存在一些问题]

var azure = require('azure');
var fs = require('fs');

// Connection settings and download state shared with doDownload().
var blobService = azure.createBlobService("account", "accountkey");
var containerName = "container name";
var blobName = "blob name";
var blobSize;                 // total blob size in bytes, filled in once the properties arrive
var chunkSize = 512 * 1024;   // bytes fetched per request (512 KB)
var startPos = 0;             // offset of the next byte to download
var fullPath = "D:\\node\\";  // target directory; the blob name is appended below

// Look up the blob's size first, then kick off the chunked download.
var blobProperties = blobService.getBlobProperties(containerName, blobName, null, function (error, blob) {
    if (error) {
        throw error;
    }
    blobSize = blob.contentLength;
    fullPath += blobName;
    console.log(fullPath);
    doDownload();
});

/**
 * Downloads the next chunk of the blob and appends it to the target file, then
 * calls itself again until the whole blob has been written.
 *
 * Reads the shared variables containerName, blobName, fullPath, chunkSize and
 * blobSize, and advances startPos by one chunk per successful request.
 * Throws the SDK error from inside the completion callback if a chunk fails.
 *
 * NOTE(review): per the other answer, newer SDK versions name the range options
 * rangeStart / rangeEnd instead of rangeStartHeader / rangeEndHeader - confirm
 * against the SDK version in use.
 */
function doDownload() {
    if (startPos >= blobSize) {
        // Nothing left to fetch. For an empty blob, still create an empty file
        // rather than requesting the invalid byte range 0..-1.
        if (startPos === 0) {
            fs.createWriteStream(fullPath, {flags: 'w'}).end();
        }
        return;
    }

    var endPos = startPos + chunkSize;
    if (endPos > blobSize) {
        endPos = blobSize;
    }

    // The first chunk truncates any file left over from an earlier run; later
    // chunks are appended.
    var stream = fs.createWriteStream(fullPath, {flags: startPos === 0 ? 'w' : 'a'});
    console.log("Downloading " + (endPos - startPos) + " bytes starting from " + startPos + " marker.");

    // Use the same container the properties were read from (not a hard-coded
    // name). The end of the range is inclusive, hence endPos - 1.
    blobService.getBlobToStream(containerName, blobName, stream,
        { "rangeStartHeader": startPos, "rangeEndHeader": endPos - 1 }, function(error) {
        if (error) {
            throw error;
        }
        startPos = endPos;
        if (startPos < blobSize) {
            doDownload();
        }
    });
}

javascript - node js azure SDK getBlobToStream uses lots of memory

2 回答 2

Related

Reference