我正在尝试使用 pdf-parse 这个 npm 包解析一个 300 多页的 PDF 文件,但是我的应用程序在解析过程中崩溃了。我的问题是:我可以一次只解析一页吗?下面是我尝试过的代码。
/**
 * Page-render callback: extracts the plain text of a single PDF page.
 *
 * Requests the page's text content and concatenates the `str` field of every
 * returned item with no separator between items.
 *
 * @param {object} pageData - Page object exposing `getTextContent(options)`,
 *   which resolves to an object with an `items` array.
 * @returns {Promise<string>} Resolves to the concatenated text of the page.
 * @see https://mozilla.github.io/pdf.js/
 */
function render_page(pageData) {
  const textContentOptions = {
    // When true, every whitespace character is replaced by a standard
    // space (0x20). Kept at the default, `false`.
    normalizeWhitespace: false,
    // When true, TextItems on the same line are not combined.
    // Kept at the default, `false`.
    disableCombineTextItems: false
  };

  // Collect each item's string and glue them together without a separator.
  const joinItemStrings = ({ items }) => items.map(({ str }) => str).join('');

  return pageData.getTextContent(textContentOptions).then(joinItemStrings);
}
// Alternative: to keep the text items separated, join them with a delimiter instead, e.g.
// textContent.items.map(function (s) { return s.str; }).join('{newline}'); // value page text
// Load the entire PDF file into memory as a Buffer (synchronous read).
const dataBuffer = fs.readFileSync('male.pdf');

const options = {
  // internal page parser callback
  // you can set this option, if you need another format except raw text
  pagerender: render_page,
  // max page number to parse
  max: 4,
  //check https://mozilla.github.io/pdf.js/getting_started/
  version: 'v1.10.100'
};

// Parse the buffer and send the parsed result back as the response.
pdf(dataBuffer, options)
  .then(function (data) {
    res.send(data);
  })
  .catch(function (err) {
    // Without a rejection handler a parse failure becomes an unhandled
    // promise rejection and the request never receives a response.
    console.error('Failed to parse male.pdf:', err);
    // NOTE(review): assumes `res` is an Express-style response (status/send)
    // - confirm against the enclosing route handler.
    // Only reply if nothing was sent yet, e.g. when res.send itself threw.
    if (!res.headersSent) {
      res.status(500).send({ error: 'Failed to parse PDF' });
    }
  });