I'm building a web crawler that covers 200+ sites. My current code runs off an external JSON file built from a dozen or so of those sites. Sample:
[
  {
    "company": "My Company",
    "url": "http://example.com/jobs/",
    "query": "div.job-listings>dt a",
    "link": "div.job-listings>dt a"
  },
  {
    "company": "Another Company",
    "url": "http://anothercompany.com/careers/",
    "query": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a",
    "link": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a"
  }
]
When I try async.each, it logs all of the raw objects at the top of the function before ever entering the Nightmare instance, and then returns the error Nothing responds to "goto". I then tried async.eachSeries, which prints the correct result but stops after the first iteration.
var async = require('async');
var Nightmare = require('nightmare');
var data = require('./input.json');

function crawl(data, cb) {
  console.log(data); // with async.each this logs every item up front, then errors
  var nightmare = new Nightmare({ show: false });
  nightmare
    .goto(data.url)   // go to the URL specified in the JSON
    .wait(data.query) // wait until the CSS selector loads
    .evaluate(function (data) {
      var obj = {};
      obj.company = data.company;
      // Grab every element matching the selectors, then iterate,
      // appending each element's text (innerText) and job URL to obj
      var query = document.querySelectorAll(data.query);
      var link = document.querySelectorAll(data.link);
      var positionsArr = [];
      for (var i = 0; i < query.length; i++) {
        var positionsObj = {};
        positionsObj.title = query[i].innerText.trim();
        // if each position has an individual page
        if (data.link !== null) {
          positionsObj.url = link[i].href;
        } else {
          positionsObj.url = data.url;
        }
        positionsArr.push(positionsObj);
      }
      obj.positions = positionsArr;
      return obj;
    }, data)
    .end()
    .then(function (obj) {
      console.log(obj);
      console.log('done');
    })
    .catch(function (error) {
      console.error('error', error);
    });
}

async.eachSeries(data, crawl, function (err) {
  console.log('done!');
});
How can I make this work without having to write a separate script for every site? Or is there a better way to scrape this many sites?
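Update, in case it narrows things down: my current suspicion is that crawl never invokes its cb, so async has no way of knowing an iteration finished; the async docs say the iteratee must call its callback once it completes, which would explain eachSeries stalling after the first site. Below is a sketch of the version I'm about to try, not a confirmed fix: the same crawl logic with cb() wired into the .then/.catch handlers, and eachLimit (the limit of 3 is an arbitrary guess) to bound how many Electron instances run at once:

var async = require('async');
var Nightmare = require('nightmare');
var sites = require('./input.json');

function crawl(site, cb) {
  var nightmare = new Nightmare({ show: false });
  nightmare
    .goto(site.url)
    .wait(site.query)
    .evaluate(function (site) {
      var positions = [];
      var titles = document.querySelectorAll(site.query);
      var links = document.querySelectorAll(site.link);
      for (var i = 0; i < titles.length; i++) {
        positions.push({
          title: titles[i].innerText.trim(),
          url: site.link !== null ? links[i].href : site.url
        });
      }
      return { company: site.company, positions: positions };
    }, site)
    .end()
    .then(function (result) {
      console.log(result);
      cb(); // signal async that this site is done, so the next one can start
    })
    .catch(function (error) {
      console.error('error', error);
      cb(); // report and move on, so one bad site doesn't abort the whole run
    });
}

// Bounded parallelism: at most 3 Nightmare instances at a time.
async.eachLimit(sites, 3, crawl, function (err) {
  console.log('done!');
});

If the missing callback really is the issue, eachSeries should also work once the cb() calls are added; eachLimit is only there so 200+ sites don't have to run strictly one at a time.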