python - 字符串列表到字符串的对齐索引

Question

我需要一个函数：给定一个较长的字符串和由它切分得到的字符串列表（token），返回这些 token 在原字符串中对齐位置的索引。

例如：

给定字符串：

text = 'Kir4.3 is a inwardly-rectifying potassium channel. Dextran-sulfate is useful in glucose-mediated channels.'

和字符串列表：

tok = ['Kir4.3', 'is', 'a', 'inwardly-rectifying', 'potassium', 'channel','.', 'Dextran-sulfate', 'is', 'useful' ,'in', 'glucose','-', 'mediated', 'channels','.']

能否创建一个函数来产生如下结果（即除第一个 token 之外，每个 token 在 text 中的起始位置）：

indices = [7, 10, 12, 32, 42, 49, 51, 67, 70, 77, 80, 87, 88, 97, 105]

这是我创建的一个脚本来说明这一点：

from re import split
from numpy import vstack, zeros
import numpy as np

# I need a function which takes a string and the tokenized list 
# and returns the indices for which the tokens were split at
def index_of_split(text_str, list_of_strings):
    """Return the offsets in ``text_str`` where each token after the first begins.

    The tokens are located left to right; the search for a token resumes at
    the end of the previous one, so repeated tokens (e.g. two ``'is'``) and
    tokens contained in their predecessor are matched to distinct positions.
    Characters between tokens (such as spaces) are skipped.

    Args:
        text_str: The full text the tokens were produced from.
        list_of_strings: The tokens, in the order they appear in ``text_str``.

    Returns:
        A list with ``max(len(list_of_strings) - 1, 0)`` integers: the start
        offset of every token except the first, suitable as split points for
        ``numpy.split``.

    Raises:
        ValueError: If a token cannot be found after the previous token.
    """
    indices = []
    cursor = 0  # first position not yet consumed by an earlier token
    for position, token in enumerate(list_of_strings):
        start = text_str.find(token, cursor)
        if start == -1:
            raise ValueError(
                'token %r (index %d) not found in text at or after offset %d'
                % (token, position, cursor))
        # The first token's start is not a split point, so it is not reported.
        if position:
            indices.append(start)
        cursor = start + len(token)
    return indices

# The text string, string token list, and character binary annotations
# are all given
text = 'Kir4.3 is a inwardly-rectifying potassium channel. Dextran-sulfate is useful in glucose-mediated channels.'
tok = ['Kir4.3', 'is', 'a', 'inwardly-rectifying', 'potassium', 'channel','.', 'Dextran-sulfate', 'is', 'useful' ,'in', 'glucose','-', 'mediated', 'channels','.']
# (This binary array labels the following terms ['Kir4.3', 'Dextran-sulfate', 'glucose'])
bin_ann = [1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]

# Here we would apply our function
indices = index_of_split(text, tok)
# This list is the desired output
#indices = [7, 10, 12, 32, 42, 49, 51, 67, 70, 77, 80, 87, 88, 97, 105]

# We could now split the binary array based on these indices
# (len(indices) split points yield len(indices) + 1 pieces, one per token)
bin_ann_toked = np.split(bin_ann, indices)

# Combine with the tokenized list. The pieces have different lengths, so the
# (token, annotation) table is filled into an explicit object array instead
# of being stacked: recent NumPy releases refuse to build an array from
# ragged sequences implicitly.
tokenized_strings = np.empty((len(tok), 2), dtype=object)
for i, (token, labels) in enumerate(zip(tok, bin_ann_toked)):
    tokenized_strings[i, 0] = token
    # Remove the trailing entries, which are likely caused by spaces
    # or other non tokenized text following the token
    tokenized_strings[i, 1] = labels[:len(token)]
print(tokenized_strings)

假设函数按描述工作，这将提供以下输出：

[['Kir4.3' array([1, 1, 1, 1, 1, 1])]
 ['is' array([0, 0])]
 ['a' array([0])]
 ['inwardly-rectifying'
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])]
 ['potassium' array([0, 0, 0, 0, 0, 0, 0, 0, 0])]
 ['channel' array([0, 0, 0, 0, 0, 0, 0])]
 ['.' array([0])]
 ['Dextran-sulfate' array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]
 ['is' array([0, 0])]
 ['useful' array([0, 0, 0, 0, 0, 0])]
 ['in' array([0, 0])]
 ['glucose' array([1, 1, 1, 1, 1, 1, 1])]
 ['-' array([0])]
 ['mediated' array([0, 0, 0, 0, 0, 0, 0, 0])]
 ['channels' array([0, 0, 0, 0, 0, 0, 0, 0])]
 ['.' array([0])]]

score 1 · Accepted Answer

text = 'Kir4.3 is a inwardly-rectifying potassium channel. Dextran-sulfate is useful in glucose-mediated channels.'

tok = ['Kir4.3', 'is', 'a', 'inwardly-rectifying', 'potassium', 'channel','.', 'Dextran-sulfate', 'is', 'useful' ,'in', 'glucose','-', 'mediated', 'channels','.']


# ind[0] is a placeholder; ind[k + 1] is the start offset of tok[k].
ind = [0]
# First offset in text that no earlier token has consumed. Resuming the
# search here (rather than at the previous token's start) keeps repeated or
# nested tokens from being matched to the same position twice.
cursor = 0
for substring in tok:
    start = text.find(substring, cursor)
    if start == -1:
        # str.find signals "not found" with -1; fail loudly instead of
        # letting -1 become the next search position.
        raise ValueError('token %r not found at or after offset %d'
                         % (substring, cursor))
    ind.append(start)
    cursor = start + len(substring)

# Skip the placeholder and the first token's start: the remaining offsets
# are the split points.
print(ind[2:])

结果是

[7, 10, 12, 32, 42, 49, 51, 67, 70, 77, 80, 87, 88, 97, 105]

score 1 · Accepted Answer

这是一种基于 numpy 的暴力方法：先找出每个 token 在文本中的所有最佳匹配位置，再对这些位置的所有组合逐一评分，对相邻 token 之间的间隔和重叠（乱序）给予惩罚，得分最高的组合即为对齐结果。

import numpy as np
from scipy import signal

def pen(l, r):
    """Return the offset penalty between positions ``l`` and ``r``.

    When ``l <= r`` the penalty is the plain distance ``r - l``; when
    ``l > r`` the (negative) distance is multiplied by -3, i.e. the result
    is three times the size of the backward step.
    """
    backwards = l > r
    # Weight is 1 for forward/equal steps and -3 for backward ones.
    weight = 1 - 4 * backwards
    return weight * (r - l)

class template:
    """Locate tokens inside a fixed reference string and align token lists to it.

    The reference string is stored as an array of Unicode code points so that
    a token can be compared against every window of the text at once.
    """

    def __init__(self, template):
        """Store ``template`` as code points together with their squares.

        Args:
            template: The reference text that tokens are matched against.
        """
        # The 'utf32' codec emits a 4-byte byte-order mark followed by one
        # native-endian 32-bit code point per character; offset=4 skips the
        # mark. Widening to int64 keeps the squared values and their window
        # sums from overflowing 32 bits.
        self.template = np.frombuffer(template.encode('utf32'), offset=4,
                                      dtype=np.int32).astype(np.int64)
        self.normalise = self.template * self.template

    def match(self, other):
        """Find the windows of the reference text closest to ``other``.

        The distance of a window is the sum of squared code-point
        differences, which is zero only for an exact match. (Comparing the
        cross term with the window energy alone lets differences cancel,
        e.g. 'ec' would score as a perfect match for 'dd'.)

        Args:
            other: The token to look for.

        Returns:
            A tuple ``(positions, distance)``: the start offsets of every
            window that attains the minimum distance, and that distance.

        Raises:
            ValueError: If ``other`` is empty or longer than the reference
                text, since no window of the text can be compared with it.
        """
        other = np.frombuffer(other.encode('utf32'), offset=4,
                              dtype=np.int32).astype(np.int64)
        if not 0 < other.size <= self.template.size:
            raise ValueError('token must be non-empty and no longer than '
                             'the reference text')
        # Convolving with the reversed token is a sliding dot product:
        # cross[i] = sum_k text[i + k] * token[k].
        cross = signal.convolve(self.template, other[::-1], 'valid')
        # energy[i] = sum_k text[i + k] ** 2 over the same windows.
        energy = signal.convolve(self.normalise, np.ones_like(other), 'valid')
        # sum (text - token)^2 = sum text^2 - 2 * sum text*token + sum token^2
        delta = energy - 2 * cross + int(np.dot(other, other))
        md = delta.min()
        return np.where(delta == md)[0], md

    def brute(self, tok):
        """Score every combination of best-match positions for ``tok``.

        Each candidate is a list ``[score, (token, start, end), ...]`` with
        one tuple per token. The score subtracts each token's match distance
        and the ``pen`` offset penalty between the end of the previous token
        and the start of the next one. Candidates that step backwards are
        kept and penalised; a stricter variant could discard them instead.

        The number of candidates is the product of the per-token match
        counts, so this is only practical for short inputs.

        Args:
            tok: The tokens, in reading order.

        Returns:
            All candidates sorted by ascending score, so the best alignment
            is the last element. An empty list if ``tok`` is empty.
        """
        if len(tok) == 0:
            return []
        first = tok[0]
        ms, md = self.match(first)
        matches = [[-md, (first, s, s + len(first))] for s in ms]
        for t in tok[1:]:
            ms, md = self.match(t)
            # mo[-1][-1] is the end offset of the previously placed token.
            matches = [[mo[0] - md - pen(mo[-1][-1], mn)] + mo[1:]
                       + [(t, mn, mn + len(t))] for mn in ms for mo in matches]
        return sorted(matches, key=lambda x: x[0])

# Demo input: the sentence and its tokens in reading order.
text = 'Kir4.3 is a inwardly-rectifying potassium channel. Dextran-sulfate is useful in glucose-mediated channels.'
tok = [
    'Kir4.3', 'is', 'a', 'inwardly-rectifying', 'potassium', 'channel', '.',
    'Dextran-sulfate', 'is', 'useful', 'in', 'glucose', '-', 'mediated',
    'channels', '.',
]

tx = template(text)
matches = tx.brute(tok)
# brute() returns candidates sorted by ascending score, so the best
# alignment is the last element.
print(matches[-1])

# [-11, ('Kir4.3', 0, 6), ('is', 7, 9), ('a', 10, 11), ('inwardly-rectifying', 12, 31), ('potassium', 32, 41), ('channel', 42, 49), ('.', 49, 50), ('Dextran-sulfate', 51, 66), ('is', 67, 69), ('useful', 70, 76), ('in', 77, 79), ('glucose', 80, 87), ('-', 87, 88), ('mediated', 88, 96), ('channels', 97, 105), ('.', 105, 106)]

python - 字符串列表到字符串的对齐索引

2 个回答

Related

Reference