以下代码是对 soupselect 演示示例的修改。它基本上是获取一些 HTML,打印其中的链接列表,并将这些链接存储在一个变量中:
/**
 * Fetch "/" from `host` over HTTP (port 80), select every `a.title` element
 * in the returned HTML, print each link and collect the `href` values.
 *
 * The result is produced inside event handlers that run after this function
 * has already returned, so the collected links cannot be handed back with a
 * plain `return`. They are delivered through `callback` instead.
 *
 * @param {string} host - Host name to connect to; also sent as the Host header.
 * @param {function} [callback] - Optional. Called at most once, as
 *   `callback(err)` when the request or the HTML parsing fails, or as
 *   `callback(null, newPages)` where `newPages` is the array of collected
 *   `href` values.
 * @returns {undefined} Nothing; use `callback` to receive the links.
 */
var crawl = function (host, callback) {
  var select = require('soupselect').select;
  var htmlparser = require("htmlparser");
  var http = require('http');
  var sys = require('sys');

  var newPages = [];
  var finished = false;

  // Report the outcome exactly once, even if several events fire.
  function finish(err) {
    if (finished) {
      return;
    }
    finished = true;
    if (typeof callback !== 'function') {
      return;
    }
    if (err) {
      callback(err);
    } else {
      callback(null, newPages);
    }
  }

  // fetch some HTML...
  var client = http.createClient(80, host);
  client.on('error', function (err) {
    sys.debug("Error: " + err);
    finish(err);
  });

  var request = client.request('GET', '/', {'host': host});

  request.on('response', function (response) {
    response.setEncoding('utf8');

    var body = "";
    response.on('data', function (chunk) {
      body = body + chunk;
    });

    response.on('end', function () {
      // now we have the whole body, parse it and select the nodes we want...
      var handler = new htmlparser.DefaultHandler(function (err, dom) {
        if (err) {
          sys.debug("Error: " + err);
          finish(err);
          return;
        }

        // soupselect happening here...
        var titles = select(dom, 'a.title');
        sys.puts("Top stories from reddit");
        titles.forEach(function (title) {
          // An anchor without child nodes has no text to print.
          var hasText = title.children && title.children.length > 0;
          var text = hasText ? title.children[0].raw : "";
          sys.puts("- " + text + " [" + title.attribs.href + "]\n");
          newPages.push(title.attribs.href);
        });

        // Only at this point is newPages complete.
        finish(null);
      });

      var parser = new htmlparser.Parser(handler);
      parser.parseComplete(body);
    });
  });

  request.end();
};
我真正想要的是让这个函数返回 newPages,也就是说,我希望能够这样写:newPages = crawl(host);
麻烦的是,我不确定这样做是否有意义,也不知道应该把 return 语句放在哪里。我看到 newPages 在请求结束之前存在,但在请求结束后为空。
我该如何让这个函数把 newPages 作为返回值返回?