package.json
{
"name": "url_extractor",
"version": "1.0.0",
"description": "Tool to extract all URLs from a website",
"main": "index.js",
"scripts": {
"start": "node index.js",
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "sandip shelke",
"license": "ISC",
"dependencies": {
"axios": "^0.24.0",
"cheerio": "^1.0.0-rc.10"
}
}
index.js
const axios = require('axios');
var cheerio = require('cheerio');
// Placeholder value: replace with the base URL of the website to scan.
const baseUrl = 'target website base url';

// Entry point: fetch the links found on the page at baseUrl and print them.
// Anything thrown along the way is printed instead of being left unhandled.
(async function main() {
  try {
    const links = await getLinksFromURL(baseUrl);
    console.log(links);
  } catch (err) {
    console.log(err);
  }
})();
/**
 * Fetches the page at `url` and collects every hyperlink (`<a>` element) on it.
 *
 * The response body is parsed with cheerio; each anchor contributes one entry
 * holding its text content and its raw `href` attribute exactly as cheerio
 * reports it (the value is not resolved against `url`).
 *
 * Any failure while fetching or parsing is logged and reported as an empty
 * array, so the returned promise never rejects and callers can always iterate
 * over the result.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<Array<{text: string, href: (string|undefined)}>>} One
 *   entry per selected anchor; an empty array if the page could not be loaded.
 */
async function getLinksFromURL(url) {
  try {
    const httpResponse = await axios.get(url);
    const $ = cheerio.load(httpResponse.data);

    // toArray() yields the raw matched nodes, so each one is re-wrapped with
    // $() to read its text and attributes.
    return $('a')
      .toArray()
      .map((element) => {
        const anchor = $(element);
        return {
          text: anchor.text(),
          href: anchor.attr('href'),
        };
      });
  } catch (e) {
    console.log(e);
    // Without an explicit return the function would resolve to undefined here,
    // which breaks any caller that iterates over the result.
    return [];
  }
}
此代码仅从主页获取链接;若要获取整个网站中的所有链接,请对返回的每个链接递归调用 getLinksFromURL。
假设您已经安装了 Node.js,先运行 npm install 安装依赖,再运行 npm start 即可执行上面的代码。