python - 通过 ElementTree提取文本

Question

score 0 · Accepted Answer

你可以先获取该元素的子元素，然后从每个子元素的 `text` 属性中取得文本。

# Print the id and the combined text of every OCR word element.
# The text of a word may live directly in the element or inside direct
# child elements (e.g. <strong>), so both places are collected.
for word in ocr_word:

    # Main text as a one-item list; `.text` is None when the element has
    # no leading text, so fall back to an empty string before stripping.
    text_main = [(word.text or "").strip()]

    # Text of each direct child. Iterate the element itself:
    # Element.getchildren() was removed in Python 3.9.
    text_children = [(child.text or "").strip() for child in word]

    # Join all parts into one string; the outer strip() drops the leading
    # space left behind when the main text is empty.
    text = " ".join(text_main + text_children).strip()

    # result
    print(word.get('id'), text)

最小可运行示例

from xml.etree import ElementTree as ET

# Sample hOCR (XHTML) output used as input for the example below.
data = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html
xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.03' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "2001ABI-7.png"; bbox 0 0 2550 3300; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 762 112 1394 161">
<p class='ocr_par' dir='ltr' id='par_1_1' title="bbox 762 112 1394 161">
<span class='ocr_line' id='line_1_1' title="bbox 762 112 1394 161; baseline 0 -1">
<span class='ocrx_word' id='word_1_1' title='bbox 762 112 1034 161; x_wconf 91' lang='eng' dir='ltr'>STATION</span>
<span class='ocrx_word' id='word_1_2' title='bbox 1056 112 1394 161; x_wconf 91' lang='eng' dir='ltr'>LOCATION</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_2' title="bbox 1192 182 1818 318">
<p class='ocr_par' dir='ltr' id='par_1_2' title="bbox 1203 205 1611 307">
<span class='ocr_line' id='line_1_2' title="bbox 1373 205 1611 221; baseline 0 -1">
<span class='ocrx_word' id='word_1_3' title='bbox 1373 205 1507 221; x_wconf 80' lang='eng' dir='ltr'>ELEVATION</span>
<span class='ocrx_word' id='word_1_4' title='bbox 1536 205 1611 221; x_wconf 80' lang='eng' dir='ltr'>ABOVE</span>
</span>
<span class='ocr_line' id='line_1_3' title="bbox 1218 264 1581 281; baseline 0.006 -2">
<span class='ocrx_word' id='word_1_5' title='bbox 1218 264 1262 280; x_wconf 88' lang='eng' dir='ltr'>SEA</span>
<span class='ocrx_word' id='word_1_6' title='bbox 1493 265 1581 281; x_wconf 85' lang='eng' dir='ltr'>GROUND</span>
</span>
<span class='ocr_line' id='line_1_4' title="bbox 1203 292 1276 307; baseline 0 0">
<span class='ocrx_word' id='word_1_7' title='bbox 1203 292 1276 307; x_wconf 90' lang='eng' dir='ltr'>LEVEL</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_3' title="bbox 131 211 1057 1378">
<p class='ocr_par' dir='ltr' id='par_1_3' title="bbox 131 211 1057 1378">
<span class='ocr_line' id='line_1_5' title="bbox 1012 211 1028 229; baseline 0 0">
<span class='ocrx_word' id='word_1_8' title='bbox 1012 211 1028 229; x_wconf 92' lang='eng' dir='ltr'>L</span>
</span>
<span class='ocr_line' id='line_1_6' title="bbox 1011 236 1027 254; baseline 0 0">
<span class='ocrx_word' id='word_1_9' title='bbox 1011 236 1027 254; x_wconf 88' lang='eng' dir='ltr'>A</span>
</span>
<span class='ocr_line' id='line_1_7' title="bbox 1013 261 1027 279; baseline 0 0">
<span class='ocrx_word' id='word_1_10' title='bbox 1013 261 1027 279; x_wconf 97' lang='eng' dir='ltr'>
<strong>T</strong>
</span>
</span>
<span class='ocr_line' id='line_1_8' title="bbox 1012 286 1020 304; baseline 0 0">
<span class='ocrx_word' id='word_1_11' title='bbox 1012 286 1020 304; x_wconf 97' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
</span>
<span class='ocr_line' id='line_1_9' title="bbox 1013 311 1027 329; baseline 0 0">
<span class='ocrx_word' id='word_1_12' title='bbox 1013 311 1027 329; x_wconf 97' lang='eng' dir='ltr'>T</span>
</span>
<span class='ocr_line' id='line_1_10' title="bbox 1012 335 1027 354; baseline 0 0">
<span class='ocrx_word' id='word_1_13' title='bbox 1012 335 1027 354; x_wconf 92' lang='eng' dir='ltr'>U</span>
</span>
<span class='ocr_line' id='line_1_11' title="bbox 621 360 1030 387; baseline 0.002 -7">
<span class='ocrx_word' id='word_1_14' title='bbox 621 383 624 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
<span class='ocrx_word' id='word_1_15' title='bbox 761 383 764 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
<span class='ocrx_word' id='word_1_16' title='bbox 849 362 922 381; x_wconf 68' lang='eng' dir='ltr'>Aﬁﬁne</span>
<span class='ocrx_word' id='word_1_17' title='bbox 1012 360 1030 378; x_wconf 88' lang='eng' dir='ltr'>D</span>
</span>
</p>
</div>
</div>
</body>
</html>'''

# ET.fromstring() returns the root Element directly (not an ElementTree),
# so there is no getroot() step.
tree = ET.fromstring(data)

line = tree

# The document declares the XHTML default namespace, so the tag name in the
# path must be qualified with it.
ocr_word = line.findall(".//{http://www.w3.org/1999/xhtml}span[@class='ocrx_word']")

for word in ocr_word:
    # `.text` is None when an element has no leading text; fall back to "".
    text_main = [(word.text or "").strip()]
    # Iterate the element itself to visit its direct children:
    # Element.getchildren() was removed in Python 3.9.
    text_children = [(child.text or "").strip() for child in word]
    # The outer strip() drops the leading space left when the main text is
    # empty (e.g. the word's text is wrapped in <strong>).
    text = " ".join(text_main + text_children).strip()
    print(word.get('id'), text)

python - 通过 ElementTree提取文本

1 回答 1

Related

Reference