0

我正在构建一个爬虫,需要爬取 iframe 中的内容,但 chrome-remote-interface 不会转储 iframe 的内容。有什么办法可以做到吗?

代码

     // Open a new target for `url`, attach a CDP client to it, wait for the page
     // load event, then hand the evaluated result to `cb` as `{html: ...}`.
     // `url` and `cb` come from the surrounding scope, which this snippet omits.
     CDP.New({'url':url},(err,target) => {
                if(!err){
                    CDP({target},(client) => {
                        const {Network, Page, Runtime} = client;
                        Network.setUserAgentOverride({'userAgent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'});
                        Network.enable();
                        Page.enable();
                        Runtime.enable();
                        Page.navigate({url});
                            // Evaluate only after the page has fired its load event.
                            Page.loadEventFired(() => {
                                Runtime.evaluate({
//I have no idea what to do ..
                                    // NOTE(review): this expression yields a DOM node of the top-level
                                    // document, not an HTML string, and it does not reach into iframes.
                                    expression:'document.documentElement',
                                    // NOTE(review): the protocol parameter is presumably `returnByValue`,
                                    // not `returnValue` - confirm against the Runtime.evaluate reference.
                                    returnValue:true})
                                    .then(({result}) => {
                                        cb(null,{html:result.value})
                                        // Close the target that CDP.New opened above.
                                        CDP.Close({id:target.id})
                                    })
                            })


                    })
                    // NOTE(review): the snippet stops here; the closing braces of the `if`
                    // block and of the CDP.New callback are not shown, and no visible
                    // branch reports `err` through `cb`.
4

2 回答 2

1

我认为这是禁止的,您可以做的是使用DOM.getDocument获取整个树,但不能将其直接转换为 HTML 字符串:

// depth: -1 requests the entire subtree instead of only the first level;
// pierce: true makes the traversal descend into iframes and shadow roots.
const {root} = await DOM.getDocument({depth: -1, pierce: true});
于 2017-08-10T16:26:00.250 回答
0

使用 Chrome 启动器

https://www.npmjs.com/package/chrome-launcher

如果您使用 chromeLauncher 并传入 '--disable-web-security' 标志,这实际上很容易做到。下面是一个设置示例。

const chromeLauncher = require('chrome-launcher');
const CDP = require('chrome-remote-interface');


/**
 * Launch Chrome through chrome-launcher with `--disable-web-security`, the
 * flag this example relies on to query inside iframes.
 *
 * A launch failure is logged and then rethrown. Previously the error was
 * swallowed and the promise resolved with `undefined`, which only surfaced
 * later as an unrelated TypeError when the caller read `chrome.port`.
 *
 * @returns {Promise<object>} Whatever `chromeLauncher.launch()` resolves with
 *   (the caller reads `port` and calls `kill()` on it).
 * @throws The original launch error, after logging it.
 */
const launchChrome = async () => {
  console.log('launchChrome..');
  try {
    return await chromeLauncher.launch({
      chromeFlags: [
        '--disable-web-security', // Query within iframes
      ],
      logLevel: 'error',
    });
  } catch (e) {
    console.log(`Error launching chrome: ${e}`);
    throw e;
  }
};

/**
 * Launch Chrome, connect a chrome-remote-interface client to its debugging
 * port, enable the Page and Runtime domains and override the user agent.
 *
 * If anything fails after Chrome has been launched, the client (when one was
 * opened) is closed and the Chrome process is killed before the error is
 * rethrown, so a failed initialisation does not leave a browser running.
 *
 * @returns {Promise<{chrome: object, protocol: object, Page: object, Runtime: object}>}
 *   The launcher handle, the CDP client, and its Page and Runtime domains.
 * @throws {Error} When the launcher yields no Chrome instance, or when
 *   connecting or enabling a domain fails (the original error is rethrown).
 */
const initChrome = async () => {
  console.log('initChrome..');
  const chrome = await launchChrome();
  if (!chrome) {
    // Fail with a clear message instead of a TypeError on `chrome.port`.
    throw new Error('Chrome failed to launch');
  }

  let protocol;
  try {
    protocol = await CDP({port: chrome.port});

    const {Page, Runtime, Network} = protocol;
    const userAgent = 'Mozilla/5.0 (X11; Linux x86_64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.39 Safari/537.36';
    await Promise.all([Page.enable(), Runtime.enable(), Network.setUserAgentOverride({userAgent})]);

    return {chrome, protocol, Page, Runtime};
  } catch (err) {
    // Best-effort cleanup: close the client first, then kill the browser.
    // Cleanup failures are logged so they never mask the original error.
    const releases = [
      () => (protocol ? protocol.close() : undefined),
      () => chrome.kill(),
    ];
    for (const release of releases) {
      try {
        await release();
      } catch (cleanupErr) {
        console.log(`Error during cleanup: ${cleanupErr}`);
      }
    }
    throw err;
  }
};

/**
 * Demo entry point: start Chrome, load https://www.example.com/, wait for
 * the load event, and log the result of evaluating
 * `document.querySelector('iframe')` in the page.
 *
 * Errors raised while navigating or evaluating are logged, not rethrown.
 * The CDP client is closed and Chrome is killed in a `finally` block, and
 * the close is awaited so the browser is not killed while the client is
 * still shutting down.
 *
 * @returns {Promise<void>}
 * @throws Whatever `initChrome()` or the cleanup calls reject with.
 */
const run = async () => {
  const {chrome, protocol, Page, Runtime} = await initChrome();

  try {
    await Page.navigate({url: 'https://www.example.com/'});
    await Page.loadEventFired();

    // With web security disabled, the page context may query inside iframes.
    // NOTE(review): a DOM element presumably does not serialise usefully with
    // returnByValue; to dump iframe markup, an expression such as
    // `document.querySelector('iframe').contentDocument.documentElement.outerHTML`
    // is probably what is wanted - confirm.
    console.log(await Runtime.evaluate({expression: `document.querySelector('iframe')`, returnByValue: true}));

    console.log('..Finished');
  } catch (err) {
    console.log(err);
  } finally {
    try {
      await protocol.close();
    } finally {
      // Kill the browser even if closing the client failed.
      await chrome.kill();
    }
  }
};
于 2018-02-10T21:32:48.380 回答