我(基于之前的回答)写了一个解析器,它可以遍历 PDF 的对象层次结构,并将其输出为 JSON。
// Parse the PDF at `pdfFileURL` and write its object hierarchy as JSON to `jsonFileURL`.
PDFParser.parse(pdfUrl: pdfFileURL, into: jsonFileURL)
// Or parse the same PDF into an in-memory Swift dictionary instead of a file.
let pdf: [String:Any?] = PDFParser.parse(pdfUrl: pdfFileURL)
得到的结果如下:
{
"Catalog" : {
"Pages<Dictionary>" : {
"MediaBox<Array>" : [
0,
0,
612,
792
],
"Type<Name>" : "Pages",
"Kids<Array>" : [
{
"Rotate<Integer>" : 0,
"MediaBox<Array>" : [
0,
0,
595.27499999999998,
841.88999999999999
],
"Parent<Dictionary>" : "<PARENT_NOT_SERIALIZED>",
"Resources<Dictionary>" : {
"ColorSpace<Dictionary>" : {
"Cs1<Array>" : [
"ICCBased",
{
"N<Integer>" : 3,
"Filter<Name>" : "FlateDecode",
"Alternate<Name>" : "DeviceRGB",
"Length<Integer>" : 2612
}
]
}
...
若要直接从 CGPDFDocument 解析(如原始问题中那样):
// Open the PDF and get its document catalog; bail out if either step fails.
guard
let document = CGPDFDocument(pdfFileURL as CFURL),
let catalog = document.catalog
else { return }
// Parse the catalog into a dictionary (`value(from:)` returns `Any?`).
let catalogDictionary = PDFParser.value(from: catalog)
这样会得到一个普通的 Swift 字典。控制台输出:
Optional(["Pages<Dictionary>": Optional({
"Count<Integer>" = 1;
"Kids<Array>" = (
{
"ArtBox<Array>" = (
"28.3465",
"325.193",
"393.389",
"813.543"
);
"Contents<Stream>" = {
Data = "q Q q 0 0 595.276 841.89 re W n 1 0 1 0 k /Gs1 gs 201.8862 420.9449 m 201.8862\n473.8269 244.7562 516.6959 297.6372 516.6959 c 350.5192 516.6959 393.3892\n473.8269 393.3892 420.9449 c 393.3892 368.0629 350.5192 325.1939 297.6372\n325.1939 c 244.7562 325.1939 201.8862 368.0629 201.8862 420.9449 c f Q q 28.346 530.078 283.464 283.465\nre W n 0 0 0 1 k /Gs1 gs BT 12 0 0 12 28.3467 803.499 Tm /Tc1 1 Tf [ (h) 4\n(ttp://epp) 7 (z.eu) ] TJ ET Q";
"Filter<Name>" = FlateDecode;
"Length<Integer>" = 237;
};
"MediaBox<Array>" = (
0,
0,
"595.2760000000001",
"841.89"
);
"Parent<Dictionary>" = "<PARENT_NOT_SERIALIZED>";
"Resources<Dictionary>" = {
"ExtGState<Dictionary>" = {
"Gs1<Dictionary>" = {
"OPM<Integer>" = 1;
"Type<Name>" = ExtGState;
};
};
...
PDFParser.swift:
//
// PDFParser.swift
// PDFParser
//
// Copyright (c) 2020 Geri Borbás http://www.twitter.com/_eppz
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
import Foundation
import PDFKit
/// Walks the object hierarchy of a PDF document through the Core Graphics `CGPDF*`
/// APIs and converts it into plain Swift / Foundation values that can be
/// serialized to JSON.
///
/// Dictionary keys are emitted as `"<key><<type name>>"`, for example
/// `"MediaBox<Array>"`, using the type names in `namesForTypes`.
class PDFParser {

    /// Shorthand for type strings, used as the `<...>` suffix of every dictionary key.
    static let namesForTypes: [CGPDFObjectType: String] = [
        .null: "Null",
        .boolean: "Boolean",
        .integer: "Integer",
        .real: "Real",
        .name: "Name",
        .string: "String",
        .array: "Array",
        .dictionary: "Dictionary",
        .stream: "Stream",
        CGPDFObjectTypeObject: "Object",
    ]

    /// Placeholder strings written into the output where a real value is not available.
    struct Message {
        static let parentNotSerialized = "<PARENT_NOT_SERIALIZED>"
        static let couldNotParseValue = "<COULD_NOT_PARSE_VALUE>"
        static let couldNotGetStreamData = "<COULD_NOT_GET_STREAM_DATA>"
        static let unknownStreamDataFormat = "<UNKNOWN_STREAM_DATA_FORMAT>"
    }

    /// Parse a PDF file into a JSON file.
    ///
    /// Failures are reported with `print` and are not propagated to the caller.
    ///
    /// - Parameters:
    ///   - pdfUrl: Location of the PDF to read.
    ///   - jsonURL: Location the pretty-printed JSON is written to.
    static func parse(pdfUrl: URL, into jsonURL: URL) {
        let pdf = PDFParser.parse(pdfUrl: pdfUrl)

        // `JSONSerialization.data(withJSONObject:)` raises an Objective-C exception
        // (which `catch` cannot intercept) for invalid input, so validate first.
        guard JSONSerialization.isValidJSONObject(pdf) else {
            print("Cannot serialize PDF to JSON.")
            return
        }

        do {
            let data = try JSONSerialization.data(withJSONObject: pdf, options: .prettyPrinted)
            try data.write(to: jsonURL, options: [])
        } catch {
            print(error)
        }
    }

    /// Parse a PDF file into a dictionary.
    ///
    /// - Parameter pdfUrl: Location of the PDF to read.
    /// - Returns: A dictionary with a `"Catalog"` entry and, when the document has an
    ///   Info dictionary, an `"Info"` entry. Empty when the document cannot be opened
    ///   or has no catalog.
    static func parse(pdfUrl: URL) -> [String: Any?] {
        // Only the document and its catalog are required; the Info dictionary is
        // optional, so its absence must not reject the whole document.
        guard
            let document = CGPDFDocument(pdfUrl as CFURL),
            let catalog = document.catalog
        else {
            print("Cannot open PDF.")
            return [:]
        }

        // `catalog` and `info` are dictionary references, so collect them directly
        // instead of routing them through `value(from:)`, which expects an object reference.
        var result: [String: Any?] = ["Catalog": collectedDictionary(from: catalog)]
        if let info = document.info {
            result.updateValue(collectedDictionary(from: info), forKey: "Info")
        }
        return result
    }

    /// Converts a single PDF object into a Swift value.
    ///
    /// - Parameter object: The PDF object to convert.
    /// - Returns: `Bool`, `Int`, `Double`, `String`, `[Any]` or `[String: Any?]`
    ///   depending on the object type; `nil` for a PDF null or when the value
    ///   cannot be read.
    static func value(from object: CGPDFObjectRef) -> Any? {
        switch CGPDFObjectGetType(object) {
        case .null:
            return nil

        case .boolean:
            var boolean: CGPDFBoolean = 0
            if CGPDFObjectGetValue(object, .boolean, &boolean) {
                return boolean != 0
            }

        case .integer:
            var integer: CGPDFInteger = 0
            if CGPDFObjectGetValue(object, .integer, &integer) {
                return integer as Int
            }

        case .real:
            var real: CGPDFReal = 0.0
            if CGPDFObjectGetValue(object, .real, &real) {
                return Double(real)
            }

        case .name:
            var namePointerOrNil: UnsafePointer<Int8>? = nil
            if
                CGPDFObjectGetValue(object, .name, &namePointerOrNil),
                let namePointer = namePointerOrNil,
                let name = String(cString: namePointer, encoding: .isoLatin1)
            {
                return name
            }

        case .string:
            var stringRefOrNil: CGPDFStringRef? = nil
            if
                CGPDFObjectGetValue(object, .string, &stringRefOrNil),
                let stringRef = stringRefOrNil,
                let text = CGPDFStringCopyTextString(stringRef)
            {
                return text as String
            }

        case .array:
            var arrayRefOrNil: CGPDFArrayRef? = nil
            if
                CGPDFObjectGetValue(object, .array, &arrayRefOrNil),
                let arrayRef = arrayRefOrNil
            {
                var elements: [Any] = []
                for index in 0 ..< CGPDFArrayGetCount(arrayRef) {
                    var elementRefOrNil: CGPDFObjectRef? = nil
                    // Elements that are null or cannot be read are skipped, so the
                    // resulting array can be shorter than the PDF array.
                    if
                        CGPDFArrayGetObject(arrayRef, index, &elementRefOrNil),
                        let elementRef = elementRefOrNil,
                        let element = PDFParser.value(from: elementRef)
                    {
                        elements.append(element)
                    }
                }
                return elements
            }

        case .stream:
            var streamRefOrNil: CGPDFStreamRef? = nil
            if
                CGPDFObjectGetValue(object, .stream, &streamRefOrNil),
                let streamRef = streamRefOrNil,
                let streamDictionaryRef = CGPDFStreamGetDictionary(streamRef)
            {
                // Start from the stream's own dictionary ...
                var streamDictionary = collectedDictionary(from: streamDictionaryRef)

                // ... and add the stream contents under the extra "Data" key.
                var dataString: String? = Message.couldNotGetStreamData
                var streamDataFormat: CGPDFDataFormat = .raw
                if let streamData: CFData = CGPDFStreamCopyData(streamRef, &streamDataFormat) {
                    let data = streamData as Data
                    switch streamDataFormat {
                    case .raw:
                        // `nil` when the bytes are not valid UTF-8.
                        dataString = String(data: data, encoding: .utf8)
                    case .jpegEncoded, .JPEG2000:
                        dataString = data.base64EncodedString()
                    @unknown default:
                        dataString = Message.unknownStreamDataFormat
                    }
                }
                streamDictionary["Data"] = dataString
                return streamDictionary
            }

        case .dictionary:
            var dictionaryRefOrNil: CGPDFDictionaryRef? = nil
            if
                CGPDFObjectGetValue(object, .dictionary, &dictionaryRefOrNil),
                let dictionaryRef = dictionaryRefOrNil
            {
                return collectedDictionary(from: dictionaryRef)
            }

        @unknown default:
            // Kept for backward compatibility: callers may hand a dictionary reference
            // (e.g. `document.catalog`) to this function, in which case the reported
            // type is not one of the known cases and the reference is treated as a dictionary.
            // NOTE(review): this relies on the object and dictionary reference types being
            // interchangeable at the Swift level - prefer passing real object references.
            return collectedDictionary(from: object)
        }

        // A known type whose value could not be read.
        return nil
    }

    /// Adds every entry of a PDF dictionary to an `NSMutableDictionary`.
    ///
    /// The `"Parent"` entry is replaced with `Message.parentNotSerialized` instead of
    /// being followed. Entries whose value is null or cannot be read are stored as
    /// `Message.couldNotParseValue`.
    ///
    /// NOTE(review): only `Parent` is excluded; other entries that point back up the
    /// hierarchy would presumably recurse without end - confirm against real documents.
    ///
    /// - Parameters:
    ///   - dictionaryRef: The PDF dictionary to enumerate.
    ///   - dictionaryPointer: Pointer to a variable holding the `NSMutableDictionary`
    ///     that receives the entries (pass `&dictionary`).
    static func collectObjects(from dictionaryRef: CGPDFDictionaryRef, into dictionaryPointer: UnsafeMutableRawPointer?) {
        CGPDFDictionaryApplyFunction(
            dictionaryRef,
            {
                (eachKeyPointer, eachObject, eachContextOrNil: UnsafeMutableRawPointer?) -> Void in

                // Unwrap dictionary.
                guard let dictionary = eachContextOrNil?.assumingMemoryBound(to: NSMutableDictionary.self).pointee
                else {
                    print("Could not unwrap dictionary.")
                    return
                }

                // Unwrap key.
                guard let eachKey = String(cString: UnsafePointer<CChar>(eachKeyPointer), encoding: .isoLatin1)
                else {
                    print("Could not unwrap key.")
                    return
                }

                // Type.
                guard let eachTypeName = PDFParser.namesForTypes[CGPDFObjectGetType(eachObject)]
                else {
                    print("Could not unwrap type.")
                    return
                }

                // Assemble the "<key><<type>>" output key.
                let eachDictionaryKey = "\(eachKey)<\(eachTypeName)>" as NSString

                // Skip parent, so the walk does not climb back up to where it came from.
                guard eachKey != "Parent"
                else {
                    dictionary.setObject(PDFParser.Message.parentNotSerialized, forKey: eachDictionaryKey)
                    return
                }

                // Parse value. A null or unreadable value is recorded with a placeholder
                // and enumeration continues with the next entry.
                guard let eachValue = PDFParser.value(from: eachObject)
                else {
                    dictionary.setObject(PDFParser.Message.couldNotParseValue, forKey: eachDictionaryKey)
                    return
                }

                // Set.
                dictionary.setObject(eachValue, forKey: eachDictionaryKey)
            },
            dictionaryPointer
        )
    }

    /// Collects a PDF dictionary into a Swift dictionary.
    private static func collectedDictionary(from dictionaryRef: CGPDFDictionaryRef) -> [String: Any?] {
        var collected = NSMutableDictionary()
        collectObjects(from: dictionaryRef, into: &collected)
        // Every key is an `NSString` set by `collectObjects`, so this conversion is
        // expected to succeed; fall back to an empty dictionary rather than trapping.
        return collected as? [String: Any?] ?? [:]
    }
}