我正在尝试使用 PDFrw 阅读示例 PDF。PDFHello Matthew
在坐标的左下角包含短语(100, 100)
。当我尝试输出文本时(如果可以的话?)我得到一个数据流。我似乎无法弄清楚如何将其作为文本获取。
>>> import pdfrw
>>> file_object = pdfrw.PdfReader("Hello.pdf")
>>> file_object
{'/ID': ['<f643bc0910dfb67725d53e11054f4609>', '<f643bc0910dfb67725d53e11054f4609>'], '/Info': (5, 0), '/Root': {'/Outl
ines': (8, 0), '/PageMode': '/UseNone', '/Pages': {'/Count': '1', '/Kids': [{'/Contents': (7, 0), '/MediaBox': ['0', '0
', '595.2756', '841.8898'], '/Parent': {...}, '/Resources': {'/Font': (1, 0), '/ProcSet': ['/PDF', '/Text', '/ImageB',
'/ImageC', '/ImageI']}, '/Rotate': '0', '/Trans': {}, '/Type': '/Page'}], '/Type': '/Pages'}, '/Type': '/Catalog'}, '/S
ize': '9'}
>>> file_object.pages[0]
{'/Contents': (7, 0), '/MediaBox': ['0', '0', '595.2756', '841.8898'], '/Parent': {'/Count': '1', '/Kids': [{...}], '/T
ype': '/Pages'}, '/Resources': {'/Font': (1, 0), '/ProcSet': ['/PDF', '/Text', '/ImageB', '/ImageC', '/ImageI']}, '/Rot
ate': '0', '/Trans': {}, '/Type': '/Page'}
>>> file_object.pages[0].keys()
['/Contents', '/MediaBox', '/Parent', '/Resources', '/Rotate', '/Trans', '/Type']
>>> file_object.pages[0].Contents
{'/Filter': ['/ASCII85Decode', '/FlateDecode'], '/Length': '102'}
>>> file_object.pages[0].Contents.stream
'GapQh0E=F,0U\\H3T\\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CW4KISi90EC-p>QkRte=<%V"lI7]P)Rn29neZ[Kb,htEWn&q7Q2"V~>'