I have a weird problem. If I call this code to make an http request in the main execution line:
var request = require('request');
request('http://www.google.com', function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body) // Print the google web page.
}
})
The Google page HTML is printed, as expected.
However, I am doing a batch download script/crawler, so I am parsing a very large JSON file and then performing a request for each of the URLs I produce from that file.
To do the parsing, I am using the JSONStream parser. Here is the code:
parser.on('data', function (obj) {
console.log("Found uri");
console.log(obj);
});
The code is being run correctly, as the URI's are being printed in my console.
However, if I make the request inside the parsing block, the request callback is never executed.... Here is the code:
parser.on('data', function (obj) {
console.log("Found uri");
console.log(obj);
var identifierArray = obj['dc.identifier'];
if(identifierArray != null && identifierArray instanceof Array)
{
for(var i = 0; i < identifierArray.length; i++)
{
var dryadIdentifier = identifierArray[i];
if(dryadIdentifier.indexOf("dryad") != -1)
{
var fullUrl = "http://datadryad.org/resource/"+dryadIdentifier+"/mets.xml"
//var fileDestination = __dirname +"/"+downloadSubDir+"/"+dryadIdentifier.replace("/","_")+".xml"
var fileDestination = __dirname +"/"+downloadSubDir+"/"+fileCounter+".xml";
fileCounter++;
console.log("Sending request to "+ fullUrl + " ...");
//REQUEST SENT HERE; SAME CODE AS ABOVE.
var request = require('request');
request('http://www.google.com', function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body) // Print the google web page.
}
})
sleep.usleep(500000); //dont hammer the server
}
}
}
});
The log shows
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.s737f/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.s737f/1/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.1fd83/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.1fd83/1/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.4vk6d/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.c3k8m/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.5410v/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.492r0/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.m6g1b/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.m6g1b/1/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.4dm30/mets.xml ...
But no html is printed (it should print the google homepage many times, as I am not using the url's I parse from the json yet, to rule out problems with the intended server.
Sorry for the long letter, but I am at a loss at this behaviour (still learning nodejs... :-O)