4

我正在尝试使用 pdf-parse 这个 npm 包解析一个 300 多页的 PDF，但我的应用程序在解析该 PDF 时崩溃了。我的问题是：可以一次只解析一页吗？下面是我尝试过的代码。

/**
 * Page render callback: produces the raw text of a single PDF page.
 *
 * @param {object} pageData - page object exposing `getTextContent(options)`.
 * @returns {Promise<string>} the `str` of every text item on the page,
 *   concatenated with no separator between items.
 */
function render_page(pageData) {
  // Option reference: https://mozilla.github.io/pdf.js/
  const textOptions = {
    // When enabled, every whitespace occurrence becomes a standard space (0x20).
    // Kept at its default of `false`.
    normalizeWhitespace: false,
    // When enabled, TextItems on the same line are not merged.
    // Kept at its default of `false`.
    disableCombineTextItems: false,
  };

  // Reduce the page's text items to one string (the page text).
  const toPageText = (textContent) => {
    const fragments = textContent.items.map((item) => item.str);
    return fragments.join('');
  };

  return pageData.getTextContent(textOptions).then(toPageText);
}
  // Alternative for the page renderer, keeping a visible marker between text items:
  //   textContent.items.map(function (s) { return s.str; }).join('{newline}');

  // Read the whole PDF file into a buffer before handing it to the parser.
  const dataBuffer = fs.readFileSync('male.pdf');
  const options = {
    // internal page parser callback
    // you can set this option, if you need another format except raw text
    pagerender: render_page,
    // max page number to parse
    max: 4,
    //check https://mozilla.github.io/pdf.js/getting_started/
    version: 'v1.10.100'
  };
  pdf(dataBuffer, options)
    .then(function (data) {
      res.send(data);
    })
    .catch(function (err) {
      // Without this handler a parse failure becomes an unhandled promise
      // rejection and the request is left without a response.
      console.error('PDF parsing failed:', err);
      // NOTE(review): assumes `res` is an Express-style response exposing
      // status(); confirm against the surrounding request handler.
      res.status(500).send({ error: String(err && err.message ? err.message : err) });
    });
4

0 回答 0