5

对 Node 的文件系统解析感到困惑。这是我的代码:

var fs = require('fs');
var xml2js = require('xml2js');

var parser = new xml2js.Parser();

// Read the XML file as a stream of UTF-8 text chunks.
var stream = fs.createReadStream('xml/bigXML.xml');
stream.setEncoding('utf8');

// Callback handed to parseString: prints whatever result was produced.
// Note that `err` is never inspected here.
function onParsed(err, result) {
    console.dir(result);
    console.log('Done');
}

// Every chunk is passed to the parser on its own, exactly as it arrives.
function onData(chunk) {
    parser.parseString(chunk, onParsed);
}

// The file has been read to the end; follow-up work would go here.
function onEnd() {
    console.log("IT'S OVER");
}

stream.on('data', onData);
stream.on('end', onEnd);

结果……什么都没有发生。XML2JS 解析器根本没有任何输出。当我尝试 console.log(chunk) 时,发现这些块(chunk)似乎只是按字节大小切分的,并没有按任何有意义的边界输出。其中一个“块”的输出是:

<?xml version="1.0" encoding="UTF-8"?>
    <merchandiser xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="merchandiser.xsd">
    <header><merchantId>1237</merchantId><merchantName>NORDSTROM.com</merchantName><createdOn>12/13/2013 23:50:57</createdOn></header>
    <product product_id="52863929">// product info</product>
    <product product_id="26537849">// product info</product>
    <product product_id="25535647">// product info</product>

这个块里包含 XML 中的很多 <product> 条目。该块会在某个 <product> 条目的中间结束,而下一个块则从中断处继续。

主要问题是:如何让 createReadStream 输出以 <product 开头、以 </product> 结尾的块?

编辑:为了说明期望的输出,下面是 XML 从文件开头到第一个 <product> 结束的样子:

<?xml version="1.0" encoding="UTF-8" ?>
<merchandiser xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="merchandiser.xsd">
  <header>
    <merchantId>1237</merchantId>
    <merchantName>NORDSTROM.com</merchantName>
    <createdOn>12/13/2013 23:50:57</createdOn>
  </header>
  <product product_id="52863929" name="Teva 'Psyclone' Print Sandal (Baby, Walker &amp; Toddler) Camo/ Dark Olive 6 M" sku_number="52863929" manufacturer_name="Teva" part_number="1001701">
    <category>
      <primary>Toddler Unisex</primary>
      <secondary>Shoes~~Sandals/Slides</secondary>
    </category>
    <URL>
      <product>http://click.linksynergy.com/link?id=LUyP0GcLCGc&amp;offerid=276223.52863929&amp;type=15&amp;murl=http%3A%2F%2Fshop.nordstrom.com%2FS%2F3297406%3Fcm_cat%3Ddatafeed%26cm_pla%3Dshoes%3Asandals%252fslides%26cm_ite%3Dteva_%2527psyclone%2527_print_sandal_%2528baby%252c_walker_%2526_toddler%2529%3A503158_1%26cm_ven%3DLinkshare</product>
      <productImage>http://content.nordstrom.com/imagegallery/store/product/large/0/_6880020.jpg</productImage>
      <buy></buy>
    </URL>
    <description>
      <short>Rugged construction and stylish good looks define a sporty sandal, with the added convenience and security of hook-and-loop closures across the toe and at the instep.Rugged construction and stylish good looks define a sporty sandal, with the added
        convenience and security of h...</short>
      <long>Rugged construction and stylish good looks define a sporty sandal, with the added convenience and security of hook-and-loop closures across the toe and at the instep.Rugged construction and stylish good looks define a sporty sandal, with the added
        convenience and security of hook-and-loop closures across the toe and at the instep. Color(s): camo/ dark olive, daisy blue. Brand: Teva. Style Name: Teva 'Psyclone' Print Sandal (Baby, Walker &amp; Toddler). Style Number: 503158_1.</long>
    </description>
    <discount currency="USD">
      <amount></amount>
      <type>amount</type>
    </discount>
    <price currency="USD">
      <sale begin_date="" end_date="">24.95</sale>
      <retail>24.95</retail>
    </price>
    <brand>Teva</brand>
    <shipping>
      <cost currency="USD">
        <amount>0.00</amount>
        <currency>USD</currency>
      </cost>
      <information></information>
      <availability>Y</availability>
    </shipping>
    <keywords></keywords>
    <upc>737872649135</upc>
    <m1>503158_1.</m1>
    <pixel>http://ad.linksynergy.com/fs-bin/show?id=LUyP0GcLCGc&amp;bids=276223.52863929&amp;type=15&amp;subid=0</pixel>
    <attributeClass class_id="60">
      <Misc></Misc>
      <Product_Type>Shoes</Product_Type>
      <Size>6 M</Size>
      <Material></Material>
      <Color>CAMO/ DARK OLIVE</Color>
      <Gender>Unisex</Gender>
      <Style></Style>
      <Age></Age>
    </attributeClass>
  </product>
4

2 回答 2

9

你有两种可能性来解决你的问题。

如 Damhat 所述,XML2JS 需要完整的 XML 内容才能解析数据。但您拿到的是一个文件流,它是逐块传输数据的。第一种解决方案是把这个数据流转换成一个大的 Buffer,再交给 XML2JS。为此,您可以使用 stream-to(npm i stream-to)把文件流转换为缓冲区数组,然后用 Buffer.concat 将它们连接成一个缓冲区,如下所示:

var fs = require('fs')
var streamTo = require('stream-to')
var xml2js = require('xml2js')

// Print the parsed <product> entries, or the error message if parsing failed.
function printProducts(parseErr, doc) {
    if (parseErr) {
        console.log(parseErr.message)
        return
    }
    console.log(doc.merchandiser.product)
}

// Join the collected buffers into a single one and parse it in one pass.
function parseCollected(readErr, buffers) {
    if (readErr) {
        console.log(readErr.message)
        return
    }
    var merged = Buffer.concat(buffers)
    var xmlParser = new xml2js.Parser()
    xmlParser.parseString(merged, printProducts)
}

// Collect the whole file stream into an array of buffers first.
var file = fs.createReadStream('input.xml')
streamTo.array(file, parseCollected)

这种方法可以正常工作,但由于需要把整个文件读入内存,如果输入文件非常大就行不通了。要处理非常大的文件,您需要使用流式 XML 解析器,例如 sax。然而,sax 不会创建 JavaScript 对象,它是一个 EventEmitter,使用起来也更困难一些,因为您必须处理所有相关事件才能动态构建对象。

例如,您可以使用 SaXPath 库,它支持一小部分 XPath 语法。该库在每次匹配到 XPath 模式时都会发出一个 match 事件。这是一个例子:

var saxpath = require('saxpath')
var fs = require('fs')
var sax = require('sax')

// XPath expression selecting each <product> directly under <merchandiser>.
var PRODUCT_PATH = '/merchandiser/product'

// sax stream (created with strict = true) that the file is piped into.
var saxParser = sax.createStream(true)
var streamer = new saxpath.SaXPath(saxParser, PRODUCT_PATH)

// Print the XML handed to each 'match' event.
function printMatch(xml) {
    console.log(xml)
}
streamer.on('match', printMatch)

var input = fs.createReadStream('input.xml')
input.pipe(saxParser)

然后你有两个选择:

  1. 由于您现在每次拿到的是只包含一个产品的 XML,因此可以用 xml2js 一次解析一个产品
  2. SaXPath 支持多种记录器(recorder):默认记录器侦听 sax 事件并重新生成相应的 XML(第一种做法用的就是它),但您也可以编写自己的记录器,让它侦听 sax 事件并即时构建 JavaScript 对象。
于 2013-12-16T13:56:30.893 回答
0

xml2js 适用于已完整加载的 XML。

在您的情况下,请使用 sax,它是一个流式解析器:

// 安装

npm install sax

// 此代码用于打印所有 product_id

var fs = require('fs');
var sax = require('sax');

// Loose-mode sax stream (no `strict` argument). In this mode sax upper-cases
// tag and attribute names, which is why the handler below compares against
// 'PRODUCT' and reads 'PRODUCT_ID'.
var saxStream = sax.createStream();

// Print the product_id of every catalogue <product> element.
saxStream.onopentag = function (node) {
    // The feed also nests a <product> element (holding a link) inside <URL>.
    // That element has no product_id attribute, so skip it instead of
    // printing "undefined".
    if (node.name === 'PRODUCT' && node.attributes.PRODUCT_ID !== undefined) {
        console.log(node.attributes.PRODUCT_ID);
    }
};

fs.createReadStream('xml/bigXML.xml').pipe(saxStream);

输出:

52863929
26537849
25535647
于 2013-12-16T04:25:59.727 回答