我正在从N
网址中提取文本。首先我得到 N 个 url,linksOnPage
然后我运行一个doOnPage
函数来从每个 url 中获取文本。当我运行代码时,只有 N 个 url 中的 1 个通过该函数得到处理。我认为这是因为处理功能是异步运行的。如何将它们堆叠在队列中并全部运行/有什么更好的方法来做到这一点?
这是主要的JS代码:
var nodeio, linksOnPage, lyricsFromLink, db;
nodeio = require('node.io');
db = require('./db');
db.loadDB();
var loadSong = function(artist, title, lyrics){
console.log("loadSong being called");
var newSongObj = {};
newSongObj['artist'] = artist;
newSongObj['title'] = title;
newSongObj['lyrics'] = lyrics;
//store the lyrics in a mongo table
var newSong = new db.Song(newSongObj);
newSong.save(function(err) {
if(err){
throw err;
} else{
console.log("saved with no errors!");
}
});
};
// generic utility for getting links on a page and running a function on each one
exports.linksOnPage = function(pageObj, linkSelector, doOnPage, contentSelector) {
nodeio.scrape(function(){
this.getHtml(pageObj.pageUrl, function(err, $) {
var links = [];
var i = 0;
$(linkSelector).each(function(link) {
var fullLink = pageObj.rootUrl + link.attribs.href
links.push(fullLink);
//run a function on each link
console.log('getting lyrics for song: ', i);
doOnPage(pageObj.artist, fullLink, contentSelector);
i = i+1;
});
//this.emit(links);
});
});
}
// get the lyrics for a specific song
exports.lyricsFromLink = function(artist, pageUrl, lyricsSelector) {
nodeio.scrape(function(){
this.getHtml(pageUrl, function(err, $) {
var lyrics = "";
console.log('before each statement');
$(lyricsSelector).each(function(lyricParagraph) {
lyrics = lyrics + " " + lyricParagraph.text;
});
console.log('after each statement');
loadSong(artist, pageUrl, lyrics);
this.emit(lyrics)
});
});
}