或者,您可以使用 XML 解析器(例如 lxml 或 BeautifulSoup),并使用 spaCy 作为分词器。
pip install lxml
pip install spacy
python3 -m spacy download en_core_web_sm
一个例子:
from lxml import etree
from io import StringIO
import re
import spacy
# Names of spaCy pipeline components passed as ``disable=`` on every
# ``nlp(...)`` call in this script. The script only reads the tokens and
# ``doc.sents``; "parser" is listed here, and a 'sentencizer' pipe is added
# after the model is loaded.
DISABLED = [
"ner", "tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]
def type_to_iob(enttype, idx):
    """Return the IOB label for one token of an annotated span.

    Args:
        enttype: Tag name of the annotated span (e.g. ``"tag1"``).
        idx: Zero-based position of the token inside the span.

    Returns:
        ``"B_<LABEL>"`` when ``idx == 0`` (first token of the span) and
        ``"I_<LABEL>"`` for every later token. ``<LABEL>`` comes from the
        tag-to-label mapping below; a tag that is not in the mapping falls
        back to its upper-cased name.
    """
    mapping = {
        "tag1": "TAG1",
        "tag2": "TAG2",
        "tag3": "TAG3",
    }
    label = mapping.get(enttype)
    if label is None:
        # Without this fallback an unmapped tag would be formatted as the
        # literal text "None", yielding labels such as "B_None".
        label = str(enttype).upper()
    iob = 'B' if idx == 0 else 'I'
    return '{}_{}'.format(iob, label)
def transform_to_iob(item):
    """Tokenize the text of an annotated element and label every token.

    Args:
        item: Element-like object. Its ``text`` attribute is tokenized with
            the module-level ``nlp`` pipeline and its ``tag`` attribute is
            passed to ``type_to_iob`` to build the label.

    Returns:
        A list of ``(token, label)`` tuples in token order. The list is
        empty when the element has no text.
    """
    text = item.text
    if not text:
        # An element without text (e.g. ``<tag1/>``) has ``text`` set to
        # None; there is nothing to tokenize, so skip the pipeline call.
        return []
    return [
        (ent, type_to_iob(item.tag, idx))
        for idx, ent in enumerate(nlp(text, disable=DISABLED))
    ]
# Sample input: plain text mixed with inline annotation elements.
xmltext = """<doc>
Some example of <tag1>annotated text</tag1> in <tag2>XML</tag2>.
Some other sample of <tag3>another annotated text</tag3> in <tag2>XML</tag2>!
</doc>"""

nlp = spacy.load("en_core_web_sm")
# "parser" is in DISABLED, so the sentencizer is what provides doc.sents.
nlp.add_pipe('sentencizer')

tree = etree.parse(StringIO(xmltext))
# '/doc/node()' yields the children of <doc> in document order: text nodes
# (as str subclasses) interleaved with the annotation elements.
for item in tree.getroot().xpath('/doc/node()'):
    if isinstance(item, str):
        # Un-annotated text: every token gets the "O" (outside) label.
        # Line breaks become spaces so that text on adjacent lines is not
        # fused together (".\nSome" must not turn into ".Some").
        doc = nlp(str(item).replace("\n", " ").strip(), disable=DISABLED)
        for sentence in doc.sents:
            for token in sentence:
                if re.match(r'\s*$', str(token)):
                    # Whitespace-only token: emit an empty line instead of
                    # a labelled row.
                    print()
                    continue
                print(f"{token} O")
    elif etree.iselement(item):
        # Annotated span: B_/I_ labels derived from the element's tag.
        for token, label in transform_to_iob(item):
            print(f'{token} {label}')
结果:
❯ python3 test.py
Some O
example O
of O
annotated B_TAG1
text I_TAG1
in O
XML B_TAG2
. O
Some O
other O
sample O
of O
another B_TAG3
annotated I_TAG3
text I_TAG3
in O
XML B_TAG2
! O