概述
您可以使用以下代码提取名称,只需传入 [david, bob 等] 名称列表:
有没有一种简单的方法,可以从 Python 中不含空格的句子生成可能的单词列表?
然后使用 collections.Counter
统计各名称出现的频率。
代码
from Bio import trie
import string
from collections import Counter
def get_trie(words):
    """Build a trie keyed by each entry of ``words``.

    Every key is stored with its own length as the associated value.

    Args:
        words: Iterable of strings to insert.

    Returns:
        The populated ``trie.trie`` object.
    """
    lookup = trie.trie()
    for entry in words:
        lookup[entry] = len(entry)
    return lookup
def get_trie_word(tr, s):
    """Split the longest prefix of ``s`` that is a key of ``tr`` off the front.

    Args:
        tr: Lookup object exposing ``has_key(str)``.
        s: String to examine.

    Returns:
        ``(prefix, remainder)`` for the longest matching prefix, or
        ``(None, s)`` when no non-empty prefix of ``s`` is a key of ``tr``.
    """
    # Try cut points from the whole string down to a single character so
    # that the first hit is also the longest match.
    for cut in range(len(s), 0, -1):
        prefix = s[:cut]
        if tr.has_key(prefix):
            return prefix, s[cut:]
    return None, s
def get_trie_words(s, names=None):
    """Yield the known names found in ``s``, scanning left to right.

    Args:
        s: String to split; names are expected to be run together with no
            separators.
        names: Optional iterable of names to look for. When omitted, the
            built-in list ('david', 'bob', 'karl', 'joe', 'mike') is used.

    Yields:
        Each matched name, in order of appearance. At every position the
        longest matching name is taken.
    """
    if names is None:
        names = ['david', 'bob', 'karl', 'joe', 'mike']
    tr = get_trie(names)
    while s:
        word, rest = get_trie_word(tr, s)
        if word is None:
            # No name starts at this position. Drop one character and
            # retry; otherwise ``s`` never shrinks and the generator
            # yields None forever.
            s = s[1:]
            continue
        s = rest
        yield word
def main(urls):
    """Count how often each known name occurs across ``urls``.

    Every character that is not an ASCII lowercase letter is removed from
    a URL before it is split into names.

    Args:
        urls: Iterable of strings to scan.

    Returns:
        A ``collections.Counter`` mapping each name to its total number of
        occurrences.
    """
    d = Counter()
    # ``string.lowercase`` exists only on Python 2 and varies with the
    # locale; ``ascii_lowercase`` is available on both 2 and 3 and is fixed.
    allowed = frozenset(string.ascii_lowercase)
    for url in urls:
        cleaned = "".join(ch for ch in url if ch in allowed)
        d.update(get_trie_words(cleaned))
    return d
if __name__ == '__main__':
    # Sample inputs: names run together, with stray digits mixed in.
    urls = [
        "davidbobmike1joe",
        "mikejoe2bobkarl",
        "joemikebob",
        "bobjoe",
    ]
    # The parenthesized call form runs under both Python 2 and Python 3;
    # the bare ``print x`` statement is a syntax error on Python 3.
    print(main(urls))
结果
Counter({'bob': 4, 'joe': 4, 'mike': 3, 'karl': 1, 'david': 1})