我昨晚在你问这个问题后不久就开始了这个,但直到现在才开始完善它。这是我的解决方案,它基本上是一个修改过的 trie,直到今天我才知道!
class Node(object):
__slots__ = ('words', 'letter', 'child', 'sib')
def __init__(self, letter, sib=None):
self.words = []
self.letter = letter
self.child = None
self.sib = sib
def get_child(self, letter, create=False):
child = self.child
if not child or child.letter > letter:
if create:
self.child = Node(letter, child)
return self.child
return None
return child.get_sibling(letter, create)
def get_sibling(self, letter, create=False):
node = self
while node:
if node.letter == letter:
return node
sib = node.sib
if not sib or sib.letter > letter:
if create:
node.sib = Node(letter, sib)
node = node.sib
return node
return None
node = sib
return None
def __repr__(self):
return '<Node({}){}{}: {}>'.format(chr(self.letter), 'C' if self.child else '', 'S' if self.sib else '', self.words)
def add_word(root, word):
word = word.lower().strip()
letters = [ord(c) for c in sorted(word)]
node = root
for letter in letters:
node = node.get_child(letter, True)
node.words.append(word)
def find_max_word(root, word):
word = word.lower().strip()
letters = [ord(c) for c in sorted(word)]
words = []
def grab_words(root, letters):
last = None
for idx, letter in enumerate(letters):
if letter == last: # prevents duplication
continue
node = root.get_child(letter)
if node:
words.extend(node.words)
grab_words(node, letters[idx+1:])
last = letter
grab_words(root, letters)
return words
root = Node(0)
with open('/path/to/dict/file', 'rt') as f:
for word in f:
add_word(root, word)
测试:
>>> def nonrepeating_words():
... return find_max_word(root, 'abcdefghijklmnopqrstuvwxyz')
...
>>> sorted(nonrepeating_words(), key=len)[-10:]
['ambidextrously', 'troublemakings', 'dermatoglyphic', 'hydromagnetics', 'hydropneumatic', 'pyruvaldoxines', 'hyperabductions', 'uncopyrightable', 'dermatoglyphics', 'endolymphaticus']
>>> len(nonrepeating_words())
67590
我想我更喜欢 dermatoglyphics 最长的词不可版权,我自己。性能方面,使用约 500k 单词词典(来自此处),
>>> import timeit
>>> timeit.timeit(nonrepeating_words, number=100)
62.8912091255188
>>>
因此,平均而言,6/10 秒(在我的 i5-2500 上)可以找到所有 67000 个不包含重复字母的单词。
这个实现和 trie 之间的最大区别(这使得它比一般的 DAWG 更远)是:单词存储在 trie 中,与它们的排序字母相关。所以'dog'这个词和'god'存储在同一个路径下:dgo。第二位是find_max_word
算法,它通过不断地砍掉它的头并重新运行搜索来确保访问每个可能的字母组合。
哦,只是为了咯咯笑:
>>> sorted(tree.find_max_word('RAEPKWAEN'), key=len)[-5:]
['wakener', 'rewaken', 'reawake', 'reawaken', 'awakener']