I am writing a backup script that simply downloads all the blobs in all the blob containers of a specific Azure account.
The script uses async.js to make sure only so many threads can run at the same time, so it doesn't overload the server. When I run this script it works fine, but when it hits large files it runs out of memory. I'm guessing the download runs faster than the disk can write, and it eventually fills up the in-memory buffer so badly that I run out of memory entirely, but debugging the exact cause has been impossible so far.
The specific function which appears to use a lot of memory is called as follows:
// Download one blob into fullPath; on success, record its last-modified
// date in "<fullPath>.date". `callback` finishes the current queue item and
// is called exactly once on every path.
blobService.getBlobToStream(
  containerName,
  blob.name,
  fs.createWriteStream(fullPath),
  function(error) {
    if (error) { //Something went wrong, write it to the console but finish the queue item and continue.
      console.log("Failed writing " + blob.name + " (" + error + ")");
      callback();
      return;
    }
    //Write the last modified date, then finish the queue item.
    //String(...) is explicit because fs.writeFile does not accept a Date object
    //on current Node versions; the text written is the Date's toString() form.
    fs.writeFile(fullPath + ".date", String(blobLastModified), function(err) {
      if (err) console.log("Couldn't write .date file: " + err);
      //Only report completion once the marker write has been attempted.
      callback();
    });
  });
Even a single 700MB download will easily fill up 1GB of memory on my side.
Is there any way around this? Am I missing a parameter which magically prevents the Azure SDK from buffering everything and the kitchen sink?
Full code:
#!/usr/bin/env node
//Requires
var azure = require('azure');
var fs = require('fs');
var mkdirp = require('mkdirp');
var path = require('path');
var async = require('async');
var maxconcurrency = 1; //Max amount of simultaneous running threads of getBlobsAndSaveThem() running through async.js.
var blobService = azure.createBlobService();
var backupPrefix = '/backups/azurebackup/'; //Always end with a '/'!! It is joined to the container and blob names by plain string concatenation.
//Main flow of the script is near the bottom of the file.
/**
 * Queue that backs up one blob container per task (one container at a time).
 *
 * Each task is a container name. The worker lists the container's blobs and
 * feeds their indices into an inner queue (concurrency `maxconcurrency`) that
 * downloads every blob to backupPrefix + containerName + '/' + blob.name.
 * After a successful download the blob's last-modified date is stored next to
 * the file as "<file>.date"; a blob whose stored date matches is skipped.
 *
 * The container task is reported as finished only after all of its blobs have
 * been processed (or listing failed), so the next container can start.
 *
 * NOTE(review): this does not change how getBlobToStream buffers data, so the
 * memory use seen with very large blobs is not addressed here.
 */
var containerProcessingQueue = async.queue(
  function getBlobsAndSaveThem(containerName, containerDone) {
    console.log(containerName); //DEBUG
    blobService.listBlobs(containerName, function(error, blobs) {
      if (error) {
        console.log("An error occurred listing the blobs: " + error);
        containerDone(); //Still finish the task, otherwise the container queue stalls.
        return;
      }
      if (!blobs || blobs.length === 0) {
        containerDone(); //Nothing to download in this container.
        return;
      }

      var blobProcessingQueue = async.queue(function(index, callback) {
        var blob = blobs[index];
        console.log(blob); //DEBUG
        var fullPath = backupPrefix + containerName + '/' + blob.name;
        var datePath = fullPath + ".date";
        var directory = path.dirname(fullPath);
        //Kept as a string: this is the text stored in, and compared against, the .date file.
        var blobLastModified = String(new Date(blob.properties['last-modified']));

        //Create the directory synchronously so concurrent workers don't race on it.
        if (!fs.existsSync(directory)) {
          try {
            //mkdirp.sync takes no callback; it reports failure by throwing.
            mkdirp.sync(directory);
          } catch (err) {
            console.log('Failed to create directory ' + directory + " ("+ err + ")");
            callback(); //Skip this blob; it cannot be written without its directory.
            return;
          }
        }

        //If the stored date matches, the blob is unmodified: finish the item without downloading.
        if (fs.existsSync(datePath) && fs.readFileSync(datePath).toString() === blobLastModified) {
          callback();
          return;
        }

        blobService.getBlobToStream(
          containerName,
          blob.name,
          fs.createWriteStream(fullPath),
          function(error) {
            if (error) { //Something went wrong, write it to the console but finish the queue item and continue.
              console.log("Failed writing " + blob.name + " (" + error + ")");
              callback();
              return;
            }
            //Write the last modified date, then finish the queue item.
            fs.writeFile(datePath, blobLastModified, function(err) {
              if (err) console.log("Couldn't write .date file: " + err);
              callback();
            });
          });
      }, maxconcurrency);

      //Count finished blobs via the per-task push callback, so the container
      //task completes exactly once, after its last blob.
      var remaining = blobs.length;
      var blobFinished = function() {
        remaining -= 1;
        if (remaining === 0) {
          containerDone();
        }
      };
      //Index loop instead of for...in, so only real array elements are queued.
      for (var blobIndex = 0; blobIndex < blobs.length; blobIndex++) {
        blobProcessingQueue.push(blobIndex, blobFinished);
      }
    });
  }, 1);
//Main flow: list every container in the account and queue each one for backup.
blobService.listContainers(function(err, result){
  if (err || !result) {
    //Without this guard a failed listing would crash on result.length.
    console.log("An error occurred listing the containers: " + err);
    return;
  }
  for (var i = 0; i < result.length; i++) {
    containerProcessingQueue.push(result[i].name);
  }
});