我正在尝试创建一个价格抓取应用程序,它将获取一组 URL,执行请求,提取所需的数据(价格),并在完成所有 URL 后,执行 res.send 以使用抓取的数据填充浏览器.
下面的代码抱怨无法访问 url、list 和 count。这怎么可能?
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
// Initiate Express GET Handler
app.use('/', function(req, res){
//URL to Crawl
var urls = [
"http://www.ecommerce.com/products/test",
"http://www.ecommerce.com/products/test1",
"http://www.ecommerce.com/products/test2",
"http://www.ecommerce.com/products/test3",
"http://www.ecommerce.com/products/test4",
"http://www.ecommerce.com/products/test5"
];
var list = [];
var count = 0;
scrape(function(){
//check if the loop has completed and all request data returned.
if(count < urls.length){
res.send(list);
}
});
});
function scrape(callback){
for(var i = 0; i < urls.length; i++){
request(urls[i], function(error, response, html){
if(!error && response.statusCode == 200){
// LOAD Cherio (jQuery) on the webpage
var $ = cheerio.load(html);
var name = $(".name").text();
var mpn = $(".specs.block").contents().get(6).nodeValue.trim();
var jsontemp = {"MPN": "", "Name": "", "PriceList": {}};
jsontemp.MPN = mpn;
jsontemp.Name = name;
// Traverse the DOM to get tr tags and extract info
$(".wide-table tbody tr").each(function (i, row) {
var $row = $(row),
merchant = $row. attr("class").trim(),
total = $row.children(".total").text();
jsontemp.PriceList[merchant] = merchant;
jsontemp.PriceList[merchant] = total;
list.push(jsontemp);
count++;
callback();
});
}
else{
console.log(error);
}
//callback();
});
}
}
app.listen('8000')
console.log('Server is running on Port 8000');
exports = module.exports = app;