Context
I'm using Tabula to parse PDF tables. I have a lot of PDFs with varying page counts, anywhere from a few pages to several hundred. I'm trying to point my node.js program at an input folder and have it produce CSVs of all the tables defined by regions I've specified.
Because of memory constraints, I can't just loop over the files, loop over the pages, and let tabula parse every table asynchronously all at once. So I looked around for how to do batched tasks, and async.mapLimit kept coming up. It seems like it should do what I want, but for whatever reason it only completes the first batch of size limit, then "finishes" without moving on to the subsequent batches and doing the same work.
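For context, this is the usage pattern from the async docs that I was trying to follow (a minimal sketch of the documented contract, not my actual code):

const async = require('async');

// the iteratee receives each item plus a callback; mapLimit only
// considers an item finished once that callback has been invoked
async.mapLimit([1, 2, 3, 4, 5, 6], 2, (item, callback) => {
  setTimeout(() => callback(null, item * 2), 100); // stand-in for async work
}, (err, results) => {
  if (err) return console.error(err);
  console.log(results); // [2, 4, 6, 8, 10, 12]
});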
Code
const tabula = require('tabula-js');
const PDFJS = require('pdfjs-dist');
const fs = require('fs');
const path = require('path');
const async = require('async');
// regions in every page that I care about
const region1 = "5.94,121.275,35.64,788.535";
const region2 = "38.61,159.885,85.14,788.535";
const region3 = "64.35,9.405,149.49,157.905";
const region4 = "87.12,159.885,146.52,789.525";
const region5 = "148.5,186.615,314.82,791.505";
const region6 = "151.47,7.425,313.83,181.665";
const region7 = "318.78,6.435,383.13,788.535";
const region8 = "386.1,10.395,479.16,216.315";
const region9 = "385.11,218.295,595.98,374.715";
const region10 = "386.1,377.685,481.14,791.505";
const region11 = "481.14,10.395,595.98,214.335";
const region12 = "483.12,376.695,596.97,778.635";
// handler for when tabula-js is done extracting a table to csv
const handleTableParse = (err, data, title) => {
  if (err) {
    console.log(err);
    return;
  }
  if (!fs.existsSync('./output/')) {
    fs.mkdirSync('./output/');
  }
  if (fs.existsSync(`./output/${title}.csv`)) {
    fs.unlinkSync(`./output/${title}.csv`);
  }
  data.forEach(line => {
    fs.appendFileSync(`./output/${title}.csv`, `${line}\n`);
  });
};
// parse the tables found on each page in the pdf provided
const parseTablesInPDF = (pdf, numPages) => {
  const pageNumbers = Array.from(new Array(numPages), (x, i) => i + 1);
  const ext = path.extname(pdf);
  const filename = path.basename(pdf, ext)
    .split(' ')
    .join('_');
  // let it do 5 pages at a time for memory management reasons
  async.mapLimit(pageNumbers, 5, pageNumber => {
    const region1Data = tabula(pdf, { pages: `${pageNumber}`, area: region1 });
    region1Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region1_page${pageNumber}`));
    const region2Data = tabula(pdf, { pages: `${pageNumber}`, area: region2 });
    region2Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region2_page${pageNumber}`));
    const region3Data = tabula(pdf, { pages: `${pageNumber}`, area: region3 });
    region3Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region3_page${pageNumber}`));
    const region4Data = tabula(pdf, { pages: `${pageNumber}`, area: region4 });
    region4Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region4_page${pageNumber}`));
    const region5Data = tabula(pdf, { pages: `${pageNumber}`, area: region5 });
    region5Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region5_page${pageNumber}`));
    const region6Data = tabula(pdf, { pages: `${pageNumber}`, area: region6 });
    region6Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region6_page${pageNumber}`));
    const region7Data = tabula(pdf, { pages: `${pageNumber}`, area: region7 });
    region7Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region7_page${pageNumber}`));
    const region8Data = tabula(pdf, { pages: `${pageNumber}`, area: region8 });
    region8Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region8_page${pageNumber}`));
    const region9Data = tabula(pdf, { pages: `${pageNumber}`, area: region9 });
    region9Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region9_page${pageNumber}`));
    const region10Data = tabula(pdf, { pages: `${pageNumber}`, area: region10 });
    region10Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region10_page${pageNumber}`));
    const region11Data = tabula(pdf, { pages: `${pageNumber}`, area: region11 });
    region11Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region11_page${pageNumber}`));
    const region12Data = tabula(pdf, { pages: `${pageNumber}`, area: region12 });
    region12Data.extractCsv((err, data) => handleTableParse(err, data, `${filename}_region12_page${pageNumber}`));
  });
};
// start the process of parsing pdfs, 5 at a time
async.mapLimit(fs.readdirSync('./input/2016'), 5, file => {
  const data = new Uint8Array(fs.readFileSync(`./input/2016/${file}`));
  PDFJS.getDocument(data).then(document => {
    parseTablesInPDF(`./input/2016/${file}`, document.numPages);
  });
}, () => console.log('DONE!'));
Problem
At the end of the run I only get 5 pages per pdf, even though I know several of them have more than 100 pages. Each pdf file does appear to get processed, but for whatever reason I only get 5 pages out of each one.
This is my first time using some of these libraries. What am I doing wrong? Thanks in advance!
My thoughts
Could it be that I'm doing an async.mapLimit inside of an async.mapLimit? A console.log(pageNumber) inside the parseTablesInPDF call does show that I only ever get 1 through 5... Is async.mapLimit failing silently? How would I know?
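The only way I can think of to check (a minimal sketch, assuming async's error-first convention; iteratee is a placeholder for the per-page work above) is to pass a final callback that receives any error:

// sketch: an error-first final callback should at least surface failures
async.mapLimit(pageNumbers, 5, iteratee, (err, results) => {
  if (err) {
    console.error('mapLimit failed:', err);
    return;
  }
  console.log(`finished ${results.length} pages`);
});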
Update
I've since realized that everything I'm asking the node app to do runs in a single process. I've been trying to put together a working example so I can update this question to involve forking child_processes, but I'm not sure how to control how many concurrent child_processes run at a time. The send({...}) method appears to be asynchronous, and I'm not sure how to "wait" on it.
How can I control the concurrency of forked child processes in node?
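To make the question concrete, this is roughly what I have in mind (an untested sketch; worker.js is a hypothetical script that would parse one pdf and then exit). The idea is to treat each forked child as one task and only invoke the mapLimit callback once that child exits:

const { fork } = require('child_process');
const fs = require('fs');
const async = require('async');

// sketch: cap concurrency at 5 forked workers by only invoking the
// mapLimit callback once each child process exits
async.mapLimit(fs.readdirSync('./input/2016'), 5, (file, callback) => {
  const worker = fork('./worker.js'); // hypothetical worker script
  worker.send({ file: `./input/2016/${file}` }); // tell the child which pdf to parse
  worker.once('exit', code => {
    callback(code === 0 ? null : new Error(`worker exited with code ${code}`));
  });
}, err => {
  if (err) return console.error(err);
  console.log('DONE!');
});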