如果有一天有人需要更好的版本:
from typing import Tuple
from polyglot.text import Text, Sentence, Chunk
# Sample input for the demo. Note the leading space: character offsets are
# reported against this exact string, whitespace included.
doc = " Apple is looking at buying Samsung for $1 billion and Donald Trump isnt happy. Second sentence with this time Joe Biden."
# Wrap the sample in a polyglot Text, passing "en" as the language hint.
text = Text(doc, hint_language_code="en")
def get_position_in_text(sentence: "Text", entity: "Chunk") -> Tuple[int, int]:
    """Return the character span of ``entity`` inside ``sentence.raw``.

    The tokens of ``sentence.words`` are located one after another in the raw
    text, so the search for the entity starts exactly where the token before
    it ends. Estimating that point from the summed token lengths alone ignores
    whitespace, and the resulting underestimate lets a repeated mention be
    resolved to an earlier occurrence of the same word.

    Args:
        sentence: Object exposing the original string as ``raw`` and its
            tokens as ``words``.
        entity: Sequence of tokens carrying ``start`` and ``end`` indices
            into ``sentence.words`` (``end`` is exclusive).

    Returns:
        ``(start, end)`` character offsets into ``sentence.raw`` with ``end``
        exclusive, or ``(-1, -1)`` when the entity is empty or its first or
        last token cannot be found.
    """
    raw = sentence.raw

    # Nothing to locate; avoids an IndexError on entity[0] below.
    if len(entity) == 0:
        return -1, -1

    def advance(tokens, cursor: int) -> int:
        """Move ``cursor`` past each of ``tokens`` in order of appearance."""
        for token in tokens:
            found = raw.find(token, cursor)
            # A token that does not occur in the raw text is skipped instead
            # of aborting the whole lookup.
            if found != -1:
                cursor = found + len(token)
        return cursor

    # Consume everything that precedes the entity so the next search cannot
    # match an earlier occurrence of the same word.
    cursor = advance(sentence.words[:entity.start], 0)

    first, last = entity[0], entity[-1]
    start_pos = raw.find(first, cursor)
    if start_pos == -1:
        return -1, -1
    cursor = start_pos + len(first)

    # A single-word entity ends where its only token ends.
    if len(entity) == 1:
        return start_pos, cursor

    # Step over the interior tokens, then anchor the end on the last token.
    cursor = advance(sentence.words[entity.start + 1:entity.end - 1], cursor)
    last_pos = raw.find(last, cursor)
    if last_pos == -1:
        return -1, -1
    return start_pos, last_pos + len(last)
# Echo the untouched input first, followed by a blank line, so the reported
# offsets can be checked against it by eye.
print(text.raw, end="\n\n")

# Polyglot does not expose the character position of an entity, so it is
# recovered for each one with get_position_in_text.
for entity in text.entities:
    start_pos, end_pos = get_position_in_text(text, entity)
    print(entity.tag, entity, "start", start_pos, "end", end_pos)
这个版本更好,因为之前的版本是逐句给出位置的,而每个句子首尾的空格都会被去掉,导致偏移量很容易出错。
这个版本改用 text.raw,它会原样保留文本中的空格等字符。