21

现在我们的应用程序会把 nodejs.org 的源代码输出到控制台。我们希望它改为输出 nodejs.org 上的所有超链接。也许只需要一行代码,就可以从 body 中提取这些链接。

app.js:

var http = require('http');

// Start a minimal HTTP server on 127.0.0.1:1337 that answers every request
// with a plain-text greeting, then announce the address on the console.
const server = http.createServer((_req, reply) => {
    reply.writeHead(200, {'Content-Type': 'text/plain'});
    reply.end('Hello World\n');
});
server.listen(1337, '127.0.0.1');
console.log('Server running at http://127.0.0.1:1337/');

var request = require("request");



// Download the nodejs.org home page, then print the response body on success
// or the error object on failure.
request("http://nodejs.org/", (error, response, body) => {
    if (error) {
        console.log(error);
        return;
    }
    console.log(body);
});
4

2 回答 2

65

您可能正在寻找 jsdom、jquery 或 cheerio。您所做的事情称为屏幕抓取(screen scraping),即从站点中提取数据。jsdom/jquery 提供了一套完整的工具,但 cheerio 更快。

这是一个cheerio示例:

var request = require('request');
var cheerio = require('cheerio');
const searchTerm = 'screen+scraping';
const url = 'http://www.bing.com/search?q=' + searchTerm;

// Fetch the search results page and print the text and href of every anchor.
request(url, function (err, resp, body) {
  // Bail out on a request error instead of trying to parse a missing body.
  if (err) {
    console.error(err);
    return;
  }

  // Keep `$` local to the callback rather than leaking an implicit global.
  const $ = cheerio.load(body);

  // $('a') selects all hyperlinks (jQuery-style selector).
  $('a').each(function (i, link) {
    const anchor = $(link);
    console.log(anchor.text() + ':\n  ' + anchor.attr('href'));
  });
});

你选择最适合你的。

于 2013-03-11T17:08:23.733 回答
0

package.json

    {
      "name": "url_extractor",
      "version": "1.0.0",
      "description": "tool to extract all urls from website",
      "main": "index.js",
      "scripts": {
        "start": "node index.js",
        "test": "echo \"Error: no test specified\" && exit 1"
      },
      "author": "sandip shelke",
      "license": "ISC",
      "dependencies": {
        "axios": "^0.24.0",
        "cheerio": "^1.0.0-rc.10"
      }
    }

index.js

        const axios = require('axios');
        var cheerio = require('cheerio');

        // Placeholder: replace with the base URL of the site to scan.
        const baseUrl = 'target website base url';

        // Entry point: collect the links found at baseUrl and print them;
        // any error raised along the way is printed instead.
        async function main() {
            try {
                const links = await getLinksFromURL(baseUrl);
                console.log(links);
            } catch (err) {
                console.log(err);
            }
        }

        main();



        /**
         * Download a page and collect every anchor (<a>) element found in it.
         *
         * Errors are deliberately not caught here: a failed download or parse
         * rejects the returned promise, so the caller can tell a failure apart
         * from a page that simply contains no links.
         *
         * @param {string} url - Address of the page to download.
         * @returns {Promise<Array<{text: string, href: *}>>} One entry per
         *   anchor, holding its text content and the value of its href attribute.
         * @throws Rejects with the error raised by the HTTP request or by parsing.
         */
        async function getLinksFromURL(url) {
            const httpResponse = await axios.get(url);
            const $ = cheerio.load(httpResponse.data);

            // Turn the selection into a plain array so Array.prototype.map applies.
            return $('a')
                .toArray()
                .map((element) => {
                    const anchor = $(element);
                    return {
                        text: anchor.text(), // visible link text
                        href: anchor.attr('href'), // link target
                    };
                });
        }

此代码只会获取主页上的链接;若要获取整个网站中的所有链接,请对得到的链接递归地运行它。

假设您已经安装了 Node.js,先运行 npm install,再运行 npm start,即可执行上面的代码。

于 2021-11-03T06:54:13.850 回答