我最近发现了 difflib 模块的强大功能,
我认为它会对你有帮助:
import difflib
# Sample input: several near-duplicate spellings of a handful of names.
datab = [
    'Pnk Flooyd', 'John Marvulli',
    'Ld Zeppelin', 'John Michael Marvulli',
    'Led Zepelin', 'Beetles', 'Pink Fl',
    'Beatlez', 'Beatles', 'Poonk LLoyds',
    'Pook Loyds',
]

# Single-argument print(...) produces the same output on Python 2 and 3,
# unlike the print statement, which is a SyntaxError on Python 3.
print(datab)
print('')

# Groups collected so far; each group is a list of strings from ``datab``.
li = []

# One matcher instance is reused for every comparison in the script.
s = difflib.SequenceMatcher()
def yield_ratios(s, iterable):
    """Lazily yield the match ratio of each item against the matcher's seq2.

    The function never sets the second sequence itself: the caller must
    have called ``s.set_seq2(...)`` beforehand.  Each item of *iterable*
    is installed as the first sequence in turn and ``s.ratio()`` is
    yielded for it.

    Args:
        s: A matcher exposing ``set_seq1`` and ``ratio`` (the script uses
            ``difflib.SequenceMatcher``).  It is mutated: after the
            generator advances, its first sequence is the item most
            recently consumed.
        iterable: The candidate sequences to compare, one at a time.

    Yields:
        The value of ``s.ratio()`` for each item, in iteration order.
    """
    for x in iterable:
        s.set_seq1(x)
        yield s.ratio()
# Single-pass grouping: each string joins the first existing group that
# has at least one member with a match ratio above 0.45; if no group
# qualifies, the string starts a new group of its own.
for text_item in datab:
    # The string being placed is seq2; group members are tried as seq1.
    s.set_seq2(text_item)
    for gathered in li:
        # any() stops at the first member whose ratio exceeds the threshold.
        if any(r > 0.45 for r in yield_ratios(s, gathered)):
            gathered.append(text_item)
            break
    else:
        # for/else: runs only when the loop finished without ``break``,
        # i.e. no existing group accepted the string.
        li.append([text_item])

# Show one group per line.
for el in li:
    print(el)
运行结果(分组部分):
['Pnk Flooyd', 'Pink Fl', 'Poonk LLoyds', 'Pook Loyds']
['John Marvulli', 'John Michael Marvulli']
['Ld Zeppelin', 'Led Zepelin']
['Beetles', 'Beatlez', 'Beatles']