我编写了一个简单的 Node.js 程序,用来解析公司 ERP 返回的 Excel 格式 HTML 表格,提取其中的数据,并将其保存为 JSON。

该程序使用 fs 模块打开文件,并使用 Cheerio 提取数据。


我遇到问题的数据文件是 38MB,大约有 30,000 行数据。

问题1:这不应该更快吗?问题2:我只能让一条 console.log 语句产生输出。把这条语句放在任何位置都能正常工作,但如果我添加多条,只有第一条会输出内容。

var fs = require('fs');                             // for file system streaming

/**
 * Parse an Excel-style HTML export and pass its rows to `callback` as JSON.
 *
 * The file is read asynchronously. The last <table> in the document is treated
 * as the data table: the text of its 'tr td b' elements becomes the column
 * headers, and the text of its 'tr td nobr' elements becomes the cell values,
 * grouped into one object per row.
 *
 * @param {string} file - Path of the HTML file to parse.
 * @param {function(string)} callback - Called once, after parsing succeeds,
 *     with a JSON string encoding an array of row objects (header -> cell text).
 *     It is not called when the file cannot be read.
 * @returns {undefined} The read is asynchronous, so `myError` has not been set
 *     by the time this function returns; the return statement is kept only so
 *     existing callers that capture the result keep working.
 */
function oracleParse(file, callback) {

    var cheerio = require('cheerio');
    var headers = [];                               // data table column headers, used as row object keys
    var myError;                                    // module error holder
    var XMLdata = [];                               // parsed row objects to be handed to the callback

    // open relevant file
    fs.readFile(file, function (err, data) {

        if (err) {
            // Report the failure instead of dropping it silently; the
            // synchronous return value below can never carry this error.
            myError = err;
            console.error('Unable to read ' + file + ': ' + err.message);
            return;
        }

        // `$` is declared locally so it does not leak as an implicit global
        var $ = cheerio.load(data);

        // Per the original author's notes, the export contains a variable
        // number of tables and the last one always holds the data. Later
        // queries are scoped to that table with find() rather than feeding
        // the selection back into cheerio.load().
        var dataTable = $('table').last();

        // Column headers sit under 'tr td b' elements; collect their text to
        // use as keys of the row objects.
        var headerCells = dataTable.find('tr td b');
        headerCells.each(function (i, elem) {
            headers.push($(elem).text());
        });

        // remove the headers from the table so that they don't interfere with the data
        headerCells.remove();

        var headersLength = headers.length;
        var dataObj = {};
        var headerNum;

        // Cell values sit under 'tr td nobr' elements. Per the original
        // author's notes, selecting row by row ('tr') was very slow, so all
        // cells are selected at once and grouped into rows by taking the
        // cell index modulo the number of headers.
        // Without headers the modulus would be NaN and no row could ever be
        // completed, so the scan is skipped and an empty array is reported.
        if (headersLength > 0) {
            dataTable.find('tr td nobr').each(function (i, elem) {

                headerNum = i % headersLength;                   // column this cell belongs to

                dataObj[headers[headerNum]] = $(elem).text();    // build the row object

                // the last column closes the row: store it and start a new one
                if (headerNum === headersLength - 1) {
                    XMLdata.push(dataObj);
                    dataObj = {};
                }
            });
        }

        // once all the data in the file has been parsed, run the callback passed in
        callback(JSON.stringify(XMLdata));
    });

    return myError;
}

// Parse the signups export and write the resulting JSON next to it.
// The write happens inside the oracleParse callback, i.e. only after the
// parser has handed over its data.
var file = './data/Oracle/signups_01.html';
var output = './data/Oracle/signups_01.JSON';
//var file = './data/Oracle/detailed_data.html';
//var output = './data/Oracle/detailed_data.JSON';
var test = oracleParse(file, function (data) {
    fs.writeFile(output, data, function (err) {
        if (err) throw err;
        console.log('File write complete: ' + output);
    });
});


您可能想看看流式解析方案,例如 substack 的 trumpet,或者(厚着脸皮自荐一下)cornet。否则,您将多次遍历整个文档,这总是需要一些时间。

我的猜测是 Chrome 智能地推迟了繁重的工作——你可能只关心前几行,所以这就是你得到的。尝试包含 jQuery 并运行您的代码,这仍然需要一些时间。公平地说,Chrome 的 DOM 不会被垃圾回收,因此总是会胜过 Cheerio。

