这并不完全是你所问的,但通过以下方式或许可以实现足够接近的效果:
- 安装无头 Chrome(headless Chrome)
- 设置 nodejs 以及几个模块
- 在客户端事件加载并解析完成后获取页面的 outerHTML
- 在 perl 代码中读取该 outerHTML
如下:
# Perl one-liner: run scrap-html.js under node, capture the HTML it prints,
# parse it with Mojo::DOM and print the text of every "html body a.scroll" element.
perl -MMojo::DOM -e 'my $html = `node scrap-html.js`; print $_->text for Mojo::DOM->new($html)->find("html body a.scroll")->each;'
其中 scrap-html.js 的代码如下:
// file: scrap-html.js src: https://gist.github.com/magician11/a979906401591440bd6140bd14260578
//
// Loads a page in headless Chrome, waits for client-side JavaScript to run,
// and prints the rendered document's outerHTML to stdout.
// Usage: node scrap-html.js [url]
const CDP = require('chrome-remote-interface');
const chromeLauncher = require('chrome-launcher');

// Page scraped when no URL is given on the command line.
const DEFAULT_URL = 'https://qto.fi/qto/view/readme_doc';
// Grace period after the load event so client-side scripts can finish rendering.
const SETTLE_DELAY_MS = 4000;

// Resolves after `ms` milliseconds.
const timeout = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

(async function () {
  const url = process.argv[2] || DEFAULT_URL;
  let chrome;
  let protocol;
  try {
    chrome = await chromeLauncher.launch({
      chromeFlags: ['--disable-gpu', '--headless', '--blink-settings=imagesEnabled=false'],
    });
    protocol = await CDP({ port: chrome.port });

    // See API docs: https://chromedevtools.github.io/devtools-protocol/
    const { Page, Runtime, DOM } = protocol;
    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()]);

    // Subscribe before navigating so the load event cannot be missed.
    const loaded = Page.loadEventFired();
    const { errorText } = await Page.navigate({ url });
    if (errorText) {
      // Navigation failed outright; the load event would never fire.
      throw new Error(`Navigation to ${url} failed: ${errorText}`);
    }
    await loaded;
    await timeout(SETTLE_DELAY_MS);

    // Fetch the whole DOM tree and serialize it from the root node.
    const { root } = await DOM.getDocument({ depth: -1 });
    const { outerHTML } = await DOM.getOuterHTML({ nodeId: root.nodeId });
    console.log(outerHTML);
  } catch (err) {
    // Report on stderr so that stdout only ever carries HTML for the consumer.
    console.error(err);
    process.exitCode = 1;
  } finally {
    // Always release the debugging connection and the browser process,
    // even when scraping failed part-way through.
    try {
      if (protocol) await protocol.close();
    } finally {
      if (chrome) await chrome.kill();
    }
  }
})();
//eof file: scrap-html.js
在 Ubuntu 上的完整安装示例:
# start install chromium-headless
sudo apt-get update
sudo apt-get install -y software-properties-common
sudo apt-get install -y chromium-browser
sudo apt-get update
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# dpkg may stop on unmet dependencies; the next command pulls them in.
sudo dpkg -i google-chrome-stable_current_amd64.deb
sudo apt-get install -y --fix-broken
# stop install chromium-headless
# start installing the nodejs + node modules
# npm is installed explicitly because the commands below call it
# (assumes the stock Ubuntu packages, where npm is separate from nodejs).
sudo apt-get install -y nodejs npm
sudo npm install -g chrome-remote-interface
sudo npm install -g chrome-launcher
# Let node resolve the globally installed modules, wherever npm put them.
export NODE_PATH="$(npm root -g)"
# stop installing the nodejs + modules