0

我正在阅读一个 csv 文件,每一行都包含一个网站 url。我有这个功能来读取文件:

function readCSV(csv){

  var lines=csv.split("\n");

  var result = [];

  var headers=lines[0].split(",");

  // for every line of the file I call check_page function to check the policies (csp and xfo)
  Promise.all(
    lines.map(line => {
      var obj = {};
      var currentline=line.split(",");
      console.log("currentline: "+currentline[1])  
      return check_page("https://www."+currentline[1])
    })
  ).then(() => console.log('it worked')).catch(err => console.log(err));
}

该函数调用内部的另一个函数以通过 http-request 获取 csp 和 xfo 标头。

async function check_page(web_page){

    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto(web_page)
    
    
      
    console.log("MAIN: "+page.mainFrame().url())
    /* I  send for every iframe an http request for retrieve the policies from http header */
    var XMLHttpRequest = require("xmlhttprequest").XMLHttpRequest;       
    var req = new XMLHttpRequest();
    console.log("FACCIO LA GET: "+page.mainFrame().url())
    req.open('GET', page.mainFrame().url(), false);
    req.send(null)
    var headers = req.getAllResponseHeaders().toLowerCase();       
    var arr = headers.trim().split(/[\r\n]+/);
        // Create a map of header names to values
    var headerMap = {};
    arr.forEach(function (line) {
      var parts = line.split(': ');
      var header = parts.shift();
      var value = parts.join(': ');
      headerMap[header] = value;
    });
        
         

    await browser.close();
  
}

如果我的行数很少,我的代码就可以工作:但是如果我有一个包含 100 行的文件,我会遇到这个错误:

(node:1076) MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 exit listeners added to [process]. Use emitter.setMaxListeners() to increase limit
(node:1076) MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 SIGINT listeners added to [process]. Use emitter.setMaxListeners() to increase limit
(node:1076) MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 SIGTERM listeners added to [process]. Use emitter.setMaxListeners() to increase limit
(node:1076) MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 SIGHUP listeners added to [process]. Use emitter.setMaxListeners() to increase limit

我想我必须以某种方式分配工作,但我不知道如何。

4

1 回答 1

1

您有一堆 URL 需要检查,并且您想查看响应标头。

没有必要为此使用任何东西puppeteer,更不用说为您的每个 URL 启动一个完整的浏览器了。这是完全没有意义的,而且非常浪费。每个 URL 发送一个 HTTP 请求就足够了。

使用request-promise模块,这是一项非常简单的任务。

const request = require('request-promise');

function readCSV(csv) {                               // -> 'a,b,c\na,b,c'
    var lines = csv.split("\n");                      // -> ['a,b,c', 'a,b,c']
    var table = lines.map(line => line.split(","));   // -> [['a', 'b', 'c'], ['a', 'b' ,'c']]
    var requests = table.map(row => request({         // -> [request, request]
        method: 'GET',
        uri: "https://www." + row[1],
        resolveWithFullResponse: true
    }));

    return Promise.all(requests).then(responses => {  // -> [response, response]
        console.log('it worked');
        responses.forEach(response => {
            var hrds = response.headers;
            // hrds is an object. print it, extract info from it, whatever
            // don't forget to look at the other properties of `response`, as well
        });
    }).catch(err => console.log(err));
}

想一个比 更好的名字readCSV,因为读取 CSV 不是函数的作用。

于 2020-11-09T10:46:34.620 回答