2

我正在构建一个涵盖 200 多个站点的网络爬虫。我当前的代码运行在一个由十几个站点构建的外部 JSON 文件之上。样本:

[
  {
    "company": "My Company",
    "url": "http://example.com/jobs/",
    "query": "div.job-listings>dt a",
    "link": "div.job-listings>dt a"
  },
  {
    "company": "Another Company",
    "url": "http://anothercompany.com/careers/",
    "query": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a",
    "link": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a"
  }
]

当我尝试async .each 时,它会在函数顶部记录所有原始对象,然后再尝试进入噩梦实例然后返回error Nothing responds to "goto"。然后我尝试了async .eachSeries,它打印出正确的结果,但在第一次迭代后停止。

var async = require ('async');
var data = require('./input.json')
var Nightmare = require('nightmare');
var nightmare = Nightmare({ show: false })

function crawl(data, cb) {
  console.log(data) // When async.each will iterate all items then error
  var nightmare = new Nightmare()
  nightmare
    .goto(data.url) // go to JSON specified url
    .wait(data.query) // wait until CSS selector loads
    .evaluate(function (data) {
      positionsArr = []
      obj = {}
      obj.company = data.company
      query = document.querySelectorAll(data.query)
      link = document.querySelectorAll(data.link)
    /* Set query and link equal to all elements with selector
    itearte through appending text (innerText) from each element
    with job url to obj*/
      var i;
      for (i = 0; i < query.length; i++) {
    positionsObj = {}
    positionsObj.title = query[i].innerText.trim()
      // if each position has individual page
      if (data.link !== null) {
        positionsObj.url = link[i].href
      } else {
          positionsObj.url = data.url
      }
    positionsArr.push(positionsObj)
      }
      obj.positions = positionsArr
      return obj
    }, data)
  .end()
  .then(function (obj) {
    console.log(obj)
    console.log('done')
  })
  .catch(function (error) {
    console.error('error', error);
  });
}


async.eachSeries(data, crawl, function (err){
    console.log('done!');
})

我怎样才能完成这项工作而不必为每个文件编写单独的文件?或者有没有更好的方法来抓取这么多网站?

源代码

4

1 回答 1

1

cb如果要执行第二步,则必须使用回调 ( ),依此类推:

.end()
.then(function (obj) {
    console.log(obj);
    console.log('done');
    cb();
})
.catch(function (error) {
    console.error('error', error);
    cb(error);
});
于 2016-08-07T09:08:41.703 回答