我正在尝试使用 Node.js 从电子商务网站上抓取数据。我使用 Request 来检索页面的 DOM,并使用 Cheerio 进行服务器端的 DOM 选择。
const cheerio = require('cheerio');
const request = require('request');
/**
 * Fetches a page and extracts the price text from it.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<{url: string, price: string}>} Resolves with the URL and
 *   the text of the price element; rejects with the request error when the
 *   request itself fails.
 */
const scrapePage = (url) => {
  return new Promise((resolve, reject) => {
    request(url, (error, resp, body) => {
      if (error) {
        reject(error);
        // Stop here: without this return the callback would go on to parse a
        // body that was never received and then call resolve() redundantly.
        return;
      }
      const $ = cheerio.load(body);
      const price = $('#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description').text();
      resolve({ url, price });
    });
  });
};
// Starts a scrape for every URL in arrayOfURLs (defined elsewhere; it holds
// hundreds of URLs). Nothing here waits between iterations, so all requests
// are issued back to back.
for (const url of arrayOfURLs) {
  scrapePage(url)
    .then((obj) => {
      // write to a file
    })
    .catch((error) => {
      // Report the failure instead of discarding it, so a failed request
      // does not go unnoticed.
      console.error('There was an error scraping ' + url + ':', error);
    });
}
问题是,我发送请求的服务器有时会返回空白数据,我猜测是因为我在没有任何停顿的情况下发送了太多请求。由于 JS 的异步特性,我很难弄清楚如何在循环的每次迭代之间添加有效的延迟。仅以同步方式添加一个 setTimeout 是不够的,因为 setTimeout 本身是异步的;而且我是在服务器上运行这段代码,所以没有 Window 对象。
编辑
上面的代码是我正在处理的简化版本。整个代码是这样的:
app.js
// File-system module; used below to read the URL list and append results.
const fs = require('fs');
// File whose contents are read and parsed as JSON into the array of URLs.
const path = 'urls.txt';
// File that each scraped result is appended to.
const path2 = 'results.txt';
// Local module providing scrapePage(url), whose result is consumed with .then().
const scraper = require('./scraper');
/**
 * Scrapes a single URL and appends the result to the results file.
 *
 * The returned promise settles only after the scrape and the file append have
 * finished, so callers can wait on it (for example through Promise.all or a
 * promise chain). Failures are logged rather than propagated.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<Object|undefined>} Resolves with the scraped object, or
 *   with undefined when scraping failed.
 */
const scrapePage = (url) => {
  // Return the chain: without it the caller receives undefined and has
  // nothing to wait on.
  return scraper.scrapePage(url)
    .then((obj) => new Promise((resolve) => {
      fs.appendFile(path2, JSON.stringify(obj) + ', ', (error) => {
        if (error) {
          console.log(error);
        }
        // Resolve in both cases: a failed write is logged, not fatal.
        resolve(obj);
      });
    }))
    .catch((error) => {
      console.log('There was an error scraping obj: ');
      console.log(error);
    });
};
// Read the URL list from disk and decode it as JSON.
fs.readFile(path, 'utf8', (err, data) => {
  if (err) throw err;
  const urlArray = JSON.parse(data);
  // Attempt 1: fails with an "Unexpected identifier" error.
  // NOTE(review): presumably because `await` is used in this callback, which
  // is not an async function; confirm.
  // const results = await Promise.all(urlArray.map(scrapePage));
  // Attempt 2: fails with an "Unexpected token function" error.
  // NOTE(review): suggests the Node version in use does not parse
  // `async function`; confirm.
  // async function scrapePages(){
  // const results = await Promise.all(urlArray.map(scrapePage));
  // };
});
scraper.js
const request = require('request');
const cheerio = require('cheerio');
exports.scrapePage = (url) => {
return new Promise((resolve, reject) => {
request(url, (error, resp, body) => {
if(error){
reject(error);
};
let $ = cheerio.load(body);
let $url = url;
let $price = $('#rt-mainbody > div > div.details > div.itemData > div:nth-child(4) > div.description').text();
let obj = {
url: $url,
price: $price
}
resolve(obj);
})
})
}