我正在使用 PDF.js 从 PDF 文档中提取纯文本,并使用 Firebase Cloud Functions 调度提取任务。一切都很正常,直到有一天,
部分 PDF(主要来自两个域名)在提取时开始报出 `certificate has expired` 错误。
我检查过,受影响的域名仍然持有有效的 SSL 证书,并且在本地机器上运行纯文本提取代码没有任何问题。但一旦将其部署到 Firebase Cloud Functions,就会抛出 `certificate has expired`
错误。
Error
at BaseExceptionClosure (/srv/node_modules/pdfjs-dist/build/pdf.js:666:29)
at Object.<anonymous> (/srv/node_modules/pdfjs-dist/build/pdf.js:669:2)
at __w_pdfjs_require__ (/srv/node_modules/pdfjs-dist/build/pdf.js:52:30)
at Object.defineProperty.value (/srv/node_modules/pdfjs-dist/build/pdf.js:129:23)
at __w_pdfjs_require__ (/srv/node_modules/pdfjs-dist/build/pdf.js:52:30)
at pdfjsVersion (/srv/node_modules/pdfjs-dist/build/pdf.js:116:18)
at /srv/node_modules/pdfjs-dist/build/pdf.js:119:10
at webpackUniversalModuleDefinition (/srv/node_modules/pdfjs-dist/build/pdf.js:25:20)
at Object.<anonymous> (/srv/node_modules/pdfjs-dist/build/pdf.js:32:3)
at Module._compile (module.js:653:30)
at Object.Module._extensions..js (module.js:664:10)
at Module.load (module.js:566:32)
at tryModuleLoad (module.js:506:12)
at Function.Module._load (module.js:498:3)
at Module.require (module.js:597:17)
at require (internal/module.js:11:18)
at Object.<anonymous> (/srv/pdf/pdf.js:7:18)
at Module._compile (module.js:653:30)
at Object.Module._extensions..js (module.js:664:10)
at Module.load (module.js:566:32)
at tryModuleLoad (module.js:506:12)
at Function.Module._load (module.js:498:3)
message: 'certificate has expired',
name: 'UnknownErrorException',
details: 'UnknownErrorException: certificate has expired' }"
代码:
const pdfjslib = require('pdfjs-dist');
const functions = require('firebase-functions');
module.exports = functions.https.onRequest((req, res) => {
let url = req.query.url
return extractPlainTextFromPdf(url)
.then(pb => {
return res.send(pb)
})
.catch(err => {
console.log(err)
return res.send("Err occured")
})
});
/**
 * Extracts the plain text of the PDF located at `pdfUrl`.
 *
 * @param {string} pdfUrl - Location of the PDF, forwarded to setupPdfOptions.
 * @returns {Promise<*>} Resolves with the value produced by getPlainBody.
 *   Rejects with the original error when loading or extraction fails.
 */
function extractPlainTextFromPdf(pdfUrl) {
  const options = setupPdfOptions(pdfUrl);
  return getPlainBody(options).catch((err) => {
    console.log("Err plainBody", err);
    // Re-throw after logging: swallowing the error here would make the
    // promise resolve with `undefined`, hiding the failure from the caller.
    throw err;
  });
}
/**
 * Loads the PDF described by `options` and runs text extraction over it.
 *
 * @param {object} options - Loader options, forwarded to getDocument.
 * @returns {Promise<*>} Resolves with the result of
 *   extractTexts(doc, doc.numPages); rejects if loading or extraction fails.
 */
async function getPlainBody(options) {
  const pdfDocument = await getDocument(options);
  return extractTexts(pdfDocument, pdfDocument.numPages);
}
/**
 * Starts loading a PDF through pdfjslib and exposes the loading task's
 * promise.
 *
 * @param {object} options - Passed unchanged to pdfjslib.getDocument.
 * @returns {Promise<*>} Resolves with the value the loading task's promise
 *   resolves to.
 */
function getDocument(options) {
  const { promise: documentReady } = pdfjslib.getDocument(options);
  return documentReady.then((pdfDocument) => pdfDocument);
}
/**
 * Builds the loader options object for a PDF URL.
 *
 * @param {string} url - Location of the PDF.
 * @returns {{url: string, httpHeaders: Object}} Options carrying the URL and
 *   a fixed "User-Agent" request header.
 */
function setupPdfOptions(url) {
  const httpHeaders = { "User-Agent": "MY-USER-AGENT" };
  return { url, httpHeaders };
}
这是面临上述问题的两个示例pdf。
编辑:
//package.json
{
"name": "functions",
"description": "Cloud Functions for Firebase",
"scripts": {
"serve": "firebase serve --only functions",
"shell": "firebase functions:shell",
"start": "npm run shell",
"deploy": "firebase deploy --only functions",
"logs": "firebase functions:log"
},
"engines": {
"node": "8"
},
"dependencies": {
"@google-cloud/functions-framework": "^1.5.1",
"@google-cloud/vision": "^1.11.0",
"aws-sdk": "^2.667.0",
"axios": "^0.19.2",
"cheerio": "^1.0.0-rc.3",
"diff-match-patch": "^1.0.4",
"firebase-admin": "^8.11.0",
"firebase-functions": "^3.6.1",
"moment": "^2.25.0",
"nodemailer": "^6.4.6",
"pdfjs-dist": "^2.3.200",
"request": "^2.88.2",
"request-promise": "^4.2.5"
},
"devDependencies": {
"firebase-functions-test": "^0.1.6"
},
"private": true
}