2

我在cheerio.js 和request.js 之上编写了一个小爬虫脚本,以从预订机构的网站上获取联系信息(URL 和电子邮件)。虽然它确实运行并返回了我需要的所有信息,但在运行它时我连续 5 次收到以下警告:

(node) warning: possible EventEmitter memory leak detected. 11 listeners added. Use emitter.setMaxListeners() to increase limit.
Trace
    at Socket.EventEmitter.addListener (events.js:160:15)
    at Socket.Readable.on (_stream_readable.js:689:33)
    at Socket.EventEmitter.once (events.js:179:8)
    at Request.onResponse (/home/max/Desktop/scraping/node_modules/request/request.js:625:25)
    at ClientRequest.g (events.js:175:14)
    at ClientRequest.EventEmitter.emit (events.js:95:17)
    at HTTPParser.parserOnIncomingClient [as onIncoming] (http.js:1689:21)
    at HTTPParser.parserOnHeadersComplete [as onHeadersComplete] (http.js:120:23)
    at Socket.socketOnData [as ondata] (http.js:1584:20)
    at TCP.onread (net.js:525:27)

我的直觉告诉我,我可能会收到这个警告,因为我将一个请求嵌套在另一个请求中。虽然我不确定,但我所知道的是,当调用位于getArtistInfo()函数的 Cheerio.js.each()循环之一中的嵌套请求时,警告会立即显示。(查看下面的代码以了解我的意思)

这是我的刮刀的代码:

var request = require('request');
var cheerio = require('cheerio');

var url = 'http://www.primarytalent.com/';

var getManyArtistsInfo = function(url){
  request(url, (function(){
    return function(err, resp, body) {
      if(err)
        throw err;
      $ = cheerio.load(body);

      // TODO: scraping goes here
      $('#rosterlists div li a').each(function(){
          var urlCap = this[0]['attribs']['href'].slice(1);
          var artistURL = url.concat(urlCap);
          console.log(artistURL);

          getArtistInfo(artistURL);
      });
    }
  })());
}

var getArtistInfo = function(artistURL){
  request(artistURL, (function(){
    return function(err, resp, body) {
      if(err)
        throw err;
      $ = cheerio.load(body);

      console.log("NOW SCRAPING artist's PAGE")

      var artistName = "";
      $('#content #col3-1 h1').each(function(){
        artistName = this.text();
        console.log(artistName);
      });
      $('#content #col3-1 #links li a').each(function(){
        var socialURL = this.attr('href');
        var siteURL = "";
        var facebookURL = "";
        var twitterURL = "";
        var soundcloudURL = "";
        var bandcampURL = "";
        var myspaceURL = "";

        switch(socialURL) {
          case socialURL.indexOf("facebook"):
            facebookURL = socialURL;
            console.log(facebookURL);
            break;
          case socialURL.indexOf("twitter"):
            twitterURL = socialURL;
            console.log(twitterURL);
            break;
          case socialURL.indexOf("soundcloud"):
            soundcloudURL = socialURL;
            console.log(soundcloudURL);
            break;
          case socialURL.indexOf("bandcamp"):
            bandcampURL = socialURL;
            console.log(bandcampURL);
            break;
          case socialURL.indexOf("myspace"):
            myspaceURL = socialURL;
            console.log(myspaceURL);
            break;
          default: 
            siteURL = socialURL;
            console.log(siteURL)
        }
      });

      // get agentURL
      $('#content #col3-1 .contacts li a').each(function(){
        var agentURL = url + this.attr('href').slice(1);
        console.log("Agent url is : " + agentURL);

        request(agentURL, artistName, (function(){
          return function(err, resp, body) {
            if(err)
              throw err;
            $ = cheerio.load(body);

            console.log("NOW SCRAPING AGENT'S PAGE")

            var agentName = $('#content #col3-1 #details li h1').text();
            console.log(agentName + ' reps ' + artistName);

            var agentEmail = $('#content #col3-1 #details li a').attr("href").slice(7);
            console.log(agentEmail);

            var agentPhone = $('#content #col3-1 #details li').last().text();
            console.log(agentPhone);

            var agentArtistList = []; 
            $('#content #col3-1 #artists li a').each(function(){
              agentArtistList.push(this.text());
            }); 
            console.log(agentName + ' represents ' + agentArtistList.length + ' artists!');

          }
        })(agentURL, artistName));
      });
    }
  })(artistURL));
}

getManyArtistsInfo(url);

我在这里把意大利面弄得一团糟吗?

如何阻止此 EventEmitter 内存泄漏问题的发生?

4

1 回答 1

0

没有理由在请求时使用 IIFE。我想知道它是否可能导致此错误:

  request(artistURL, function(err, resp, body) {
      if(err)
        throw err;
      $ = cheerio.load(body);
于 2013-09-16T02:28:45.623 回答