我基于 cheerio.js 和 request.js 编写了一个小型爬虫脚本,用来从预订机构的网站上获取联系信息(URL 和电子邮件)。脚本确实可以运行,并返回了我需要的所有信息,但在运行时我连续 5 次收到以下警告:
(node) warning: possible EventEmitter memory leak detected. 11 listeners added. Use emitter.setMaxListeners() to increase limit.
Trace
at Socket.EventEmitter.addListener (events.js:160:15)
at Socket.Readable.on (_stream_readable.js:689:33)
at Socket.EventEmitter.once (events.js:179:8)
at Request.onResponse (/home/max/Desktop/scraping/node_modules/request/request.js:625:25)
at ClientRequest.g (events.js:175:14)
at ClientRequest.EventEmitter.emit (events.js:95:17)
at HTTPParser.parserOnIncomingClient [as onIncoming] (http.js:1689:21)
at HTTPParser.parserOnHeadersComplete [as onHeadersComplete] (http.js:120:23)
at Socket.socketOnData [as ondata] (http.js:1584:20)
at TCP.onread (net.js:525:27)
我的直觉告诉我,之所以会收到这个警告,可能是因为我把一个请求嵌套在了另一个请求之中。虽然我不能确定,但我知道的是:每当 getArtistInfo() 函数里某个 Cheerio.js 的 .each() 循环中的嵌套请求被调用时,警告就会立即出现。(请查看下面的代码以了解我的意思)
这是我的爬虫(scraper)的代码:
var request = require('request');
var cheerio = require('cheerio');
// Site root: the crawl starts from this address, and the relative artist and
// agent links found on the pages are appended to it to build absolute URLs.
var url = 'http://www.primarytalent.com/';
/**
 * Fetches the roster page at `url` and starts a scrape of every artist linked
 * from it.
 *
 * For each anchor matching '#rosterlists div li a' the href is turned into an
 * absolute URL, logged, and handed to getArtistInfo().
 *
 * NOTE(review): getArtistInfo() is called for every roster link right away,
 * with no limit on how many requests are in flight. Presumably this fan-out is
 * related to the "possible EventEmitter memory leak" warning - confirm before
 * raising setMaxListeners().
 *
 * @param {string} url - Address of the roster page; also used as the prefix
 *     for artist links (assumes it ends with '/' and that hrefs start with
 *     '/' - TODO confirm against the site markup).
 * @throws {Error} Rethrows the request error from inside the response
 *     callback, so the caller cannot catch it.
 */
var getManyArtistsInfo = function(url) {
  request(url, function(err, resp, body) {
    if (err) {
      throw err;
    }

    // Keep the parsed document local to this response; a shared global `$`
    // would be overwritten by every other response callback.
    var $ = cheerio.load(body);

    $('#rosterlists div li a').each(function(index, element) {
      // Wrap the element explicitly rather than relying on what `this` is
      // bound to, which differs between cheerio versions.
      var href = $(element).attr('href');
      if (!href) {
        return; // anchor without a target: nothing to follow
      }

      // Drop the href's first character before appending it to the base.
      var artistURL = url.concat(href.slice(1));
      console.log(artistURL);
      getArtistInfo(artistURL);
    });
  });
};
/**
 * Decides which social network a link belongs to by substring match.
 *
 * @param {string} link - URL taken from an artist's link list.
 * @returns {string} 'facebook', 'twitter', 'soundcloud', 'bandcamp' or
 *     'myspace' (the first of these found in the link, checked in that
 *     order), or 'site' when none of them occurs.
 */
function classifySocialURL(link) {
  var networks = ['facebook', 'twitter', 'soundcloud', 'bandcamp', 'myspace'];
  for (var i = 0; i < networks.length; i++) {
    if (link.indexOf(networks[i]) !== -1) {
      return networks[i];
    }
  }
  return 'site';
}

/**
 * Requests one agent page and logs what it finds there: the agent's name, the
 * target of the first details link, the text of the last details item, and
 * the number of artists listed.
 *
 * @param {string} agentURL - Absolute URL of the agent page.
 * @param {string} artistName - Artist whose page linked to this agent; only
 *     used in the log output.
 * @throws {Error} Rethrows the request error from inside the response
 *     callback, so the caller cannot catch it.
 */
function scrapeAgentPage(agentURL, artistName) {
  // request() takes (uri, callback); the artist name reaches the callback
  // through the closure and must not sit in the options position.
  request(agentURL, function(err, resp, body) {
    if (err) {
      throw err;
    }

    var $ = cheerio.load(body);
    console.log("NOW SCRAPING AGENT'S PAGE");

    var agentName = $('#content #col3-1 #details li h1').text();
    console.log(agentName + ' reps ' + artistName);

    // The details link is presumably a "mailto:" address - strip that prefix
    // only when it is really there, and tolerate a page without the link.
    var mailHref = $('#content #col3-1 #details li a').attr('href') || '';
    var agentEmail = mailHref.indexOf('mailto:') === 0 ? mailHref.slice(7) : mailHref;
    console.log(agentEmail);

    // Text of the last details item (treated as the phone number).
    var agentPhone = $('#content #col3-1 #details li').last().text();
    console.log(agentPhone);

    var agentArtistList = [];
    $('#content #col3-1 #artists li a').each(function(index, element) {
      agentArtistList.push($(element).text());
    });
    console.log(agentName + ' represents ' + agentArtistList.length + ' artists!');
  });
}

/**
 * Fetches one artist page, logs the artist's name and links, and starts a
 * scrape of every agent page linked from the contacts list.
 *
 * NOTE(review): each contact link starts another request immediately, nested
 * inside this response callback and without any concurrency limit. Presumably
 * this fan-out is related to the "possible EventEmitter memory leak" warning -
 * confirm before raising setMaxListeners().
 *
 * @param {string} artistURL - Absolute URL of the artist page.
 * @throws {Error} Rethrows the request error from inside the response
 *     callback, so the caller cannot catch it.
 */
var getArtistInfo = function(artistURL) {
  request(artistURL, function(err, resp, body) {
    if (err) {
      throw err;
    }

    // Keep the parsed document local to this response; a shared global `$`
    // would be overwritten by every other response callback.
    var $ = cheerio.load(body);
    console.log("NOW SCRAPING artist's PAGE");

    // If several headings match, the last one wins.
    var artistName = '';
    $('#content #col3-1 h1').each(function(index, element) {
      artistName = $(element).text();
      console.log(artistName);
    });

    // Links grouped by network for this artist. Nothing consumes the record
    // yet beyond the per-link log below.
    var socialLinks = {
      site: '',
      facebook: '',
      twitter: '',
      soundcloud: '',
      bandcamp: '',
      myspace: ''
    };
    $('#content #col3-1 #links li a').each(function(index, element) {
      var socialURL = $(element).attr('href');
      if (!socialURL) {
        return; // anchor without a target
      }
      socialLinks[classifySocialURL(socialURL)] = socialURL;
      console.log(socialURL);
    });

    // Follow every agent linked from the contacts list.
    $('#content #col3-1 .contacts li a').each(function(index, element) {
      var href = $(element).attr('href');
      if (!href) {
        return; // anchor without a target
      }
      // `url` is the file-level site root; the href's first character is
      // dropped before it is appended.
      var agentURL = url + href.slice(1);
      console.log("Agent url is : " + agentURL);
      scrapeAgentPage(agentURL, artistName);
    });
  });
};
// Kick off the crawl from the site root.
getManyArtistsInfo(url);
我是不是把这里写成了一团糟的"意大利面条式"代码?
如何阻止此 EventEmitter 内存泄漏问题的发生?