0

I have a weird problem. If I call this code to make an http request in the main execution line:

var request = require('request');

// Fetch the Google home page; the HTML is printed only for an error-free
// response whose status code is 200.
request('http://www.google.com', function onResponse(error, response, body) {
    if (error || response.statusCode !== 200) {
        return;
    }

    console.log(body);
});

The Google page HTML is printed, as expected.

However, I am doing a batch download script/crawler, so I am parsing a very large JSON file and then performing a request for each of the URLs I produce from that file.

To do the parsing, I am using the JSONStream parser. Here is the code:

// Trace every object the parser emits through its 'data' event.
parser.on('data', function logParsedItem(parsedItem) {
    console.log("Found uri");
    console.log(parsedItem);
});

This code runs correctly: the URIs are printed in my console.

However, if I make the request inside the parsing block, the request callback is never executed. Here is the code:

// For each parsed record, log it, then walk its 'dc.identifier' entries and
// fire one HTTP request per identifier that mentions "dryad".
parser.on('data', function (obj) {
    console.log("Found uri");
    console.log(obj);

    var identifiers = obj['dc.identifier'];

    // Records without an array of identifiers are skipped entirely.
    if (!(identifiers instanceof Array)) {
        return;
    }

    for (var idx = 0; idx < identifiers.length; idx++) {
        var identifier = identifiers[idx];

        // Only Dryad identifiers are of interest.
        if (identifier.indexOf("dryad") === -1) {
            continue;
        }

        var fullUrl = "http://datadryad.org/resource/" + identifier + "/mets.xml";

        // Files are numbered sequentially; an alternative would be to name
        // them after the identifier with "/" replaced by "_".
        var fileDestination = __dirname + "/" + downloadSubDir + "/" + fileCounter + ".xml";
        fileCounter++;

        console.log("Sending request to " + fullUrl + "   ...");

        // Request issued here. It deliberately targets the Google home page
        // instead of fullUrl, to rule out problems with the intended server.
        var request = require('request');
        request('http://www.google.com', function onResponse(error, response, body) {
            if (error || response.statusCode !== 200) {
                return;
            }

            console.log(body);
        });

        // Throttle so the server is not hammered.
        // NOTE(review): if usleep blocks synchronously, no request callback
        // can run while this handler is sleeping - confirm against the
        // `sleep` package documentation.
        sleep.usleep(500000);
    }
});

The log shows

Sending request to http://datadryad.org/resource/doi:10.5061/dryad.s737f/mets.xml   ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.s737f/1/mets.xml   ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.1fd83/mets.xml   ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.1fd83/1/mets.xml   ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.4vk6d/mets.xml   ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.c3k8m/mets.xml   ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.5410v/mets.xml   ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.492r0/mets.xml   ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.m6g1b/mets.xml   ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.m6g1b/1/mets.xml   ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.4dm30/mets.xml   ...

But no HTML is printed. (It should print the Google homepage many times, as I am not yet using the URLs I parse from the JSON, to rule out problems with the intended server.)

Sorry for the long letter, but I am at a loss to explain this behaviour (still learning nodejs... :-O)

4

1 回答 1

1

似乎这个问题与“睡眠”调用有关,所以我用信号量库实现了一个基本的连接队列。我现在指定最多 10 个同时连接,这是我的代码:

/**
 * Downloads `url` and writes the response body to `absolutePath`, holding a
 * slot of the shared `sem` semaphore for the duration of the transfer.
 *
 * The slot is released on every outcome (transport error, non-200 status,
 * write failure, success). Releasing it only after a successful write would
 * leak one slot per failed download until the queue stalls.
 *
 * @param url - Address to download; handed to `request` unchanged.
 * @param absolutePath - Destination handed to `fs.writeFile`.
 */
var makeRequestAndSaveToFile = function(url, absolutePath)
{
    sem.take(function(){
        console.log("Sending request to "+ url + "   ... and saving to file "+absolutePath);
        request(url, function(error, response, body) {
            if (error) {
                // Transport-level failure: free the slot and report it.
                sem.leave();
                console.log(error);
                return;
            }

            if (response.statusCode != 200) {
                // Nothing worth saving: free the slot and report the status.
                sem.leave();
                console.log("Unexpected status code " + response.statusCode + " for " + url);
                return;
            }

            fs.writeFile(absolutePath, body, function(err) {
                // The slot is freed whether or not the write succeeded.
                sem.leave();

                if(err) {
                    console.log(err);
                } else {
                    console.log("The file was saved!");
                }
            });
        });
    });
};

我为要下载的每个链接调用此函数。

请注意,这不会处理大量下载,因为没有管道,并且链接将以 Slavo 在他的评论中所说的无序方式下载。

于 2013-09-16T14:59:35.447 回答