我自己为你写了一个实现(Python 2.7):
from __future__ import division
import time
from itertools import izip
from fuzzywuzzy import fuzz
# Sample inputs for the demo below.
# NOTE(review): "simliar" looks like a deliberate transposition of "similar"
# so that only a fuzzy comparison can match the two -- confirm.
one = "different simliar"
two = "similar"
def compare(first, second):
    """Return a character-weighted percentage match between two strings.

    Both strings are split on whitespace.  A word of the shorter string
    matches a word of the longer string when the two words are made of the
    same set of letters (so transposed letters such as "simliar"/"similar"
    still match).  Every matched word contributes its length, and the total
    is expressed as a percentage of the length of the longer string.

    Each word of the longer string can be matched at most once, and the
    "shorter"/"longer" decision is taken after whitespace normalisation, so
    the result never exceeds 100.

    :param first: first string to compare.
    :param second: second string to compare.
    :return: a float in (0, 100] when at least one word matches, otherwise
        the integer 0.
    """
    from collections import Counter

    # Collapse runs of whitespace before deciding which string is longer;
    # otherwise padding could make the denominator smaller than the numerator.
    normalized = [' '.join(text.split()) for text in (first, second)]
    smaller, bigger = sorted(normalized, key=len)

    # Multiset of letter-sets still available in the longer string.  Hashing
    # frozensets gives O(1) lookups instead of scanning a list of sets.
    available = Counter(frozenset(word) for word in bigger.split())

    counter = 0
    for word in smaller.split():
        letters = frozenset(word)
        if available[letters] > 0:
            # Consume the match so repeated words are not credited twice
            # against one single word of the longer string.
            available[letters] -= 1
            counter += len(word)

    if counter:
        # float() keeps this a true division even without
        # ``from __future__ import division``.
        return float(counter) / len(bigger) * 100  # percentage match
    # No match (this also covers empty input and avoids dividing by zero).
    return counter
# Time one call to compare().  The print on the next line sits between the two
# time.time() readings, so console output is part of the measured interval.
start_time = time.time()
print "match: ", compare(one, two)
compare_time = time.time() - start_time
print "compare: --- %s seconds ---" % (compare_time)
# Time one call to fuzz.ratio() in exactly the same way.
start_time = time.time()
print "match: ", fuzz.ratio(one, two)
fuzz_time = time.time() - start_time
print "fuzzy: --- %s seconds ---" % (fuzz_time)
print
# Hand-computed expectation: 7 is the number of characters in "similar",
# divided by the length of the longer input string `one`.
print "<simliar or similar>/<length of bigger>*100%"
print 7/len(one)*100
print
# Check that compare() reproduces the hand-computed value exactly.
print "Equals?"
print 7/len(one)*100 == compare(one, two)
print
# NOTE(review): each timing is a single sample that includes print I/O;
# consider the timeit module before drawing conclusions about speed.
print "Faster than fuzzy?"
print compare_time < fuzz_time
所以我认为我的实现更快,但对你来说是否更准确?这由你来决定。
编辑:
现在它不仅更快,而且更准确。
结果:
match: 41.1764705882
compare: --- 4.19616699219e-05 seconds ---
match: 50
fuzzy: --- 7.39097595215e-05 seconds ---
<simliar or similar>/<length of bigger>*100%
41.1764705882
Equals?
True
Faster than fuzzy?
True
当然,如果你想像 fuzzywuzzy 那样按单词进行比较,可以这样写:
from __future__ import division
from itertools import izip
import time
from fuzzywuzzy import fuzz
# Sample inputs for the demo below.
# NOTE(review): "simliar" looks like a deliberate transposition of "similar"
# so that only a fuzzy comparison can match the two -- confirm.
one = "different simliar"
two = "similar"
def compare(first, second):
    """Return a word-count percentage match between two strings.

    Both strings are split on whitespace.  A word of the shorter string
    (shorter by character count) matches a word of the longer string when the
    two words are made of the same set of letters, so transposed letters such
    as "simliar"/"similar" still match.  The number of matched words is
    expressed as a percentage of the number of words in the longer string.

    Each word of the longer string can be matched at most once, so the result
    never exceeds 100 even when the shorter string repeats a word.

    :param first: first string to compare.
    :param second: second string to compare.
    :return: a float in (0, 100] when at least one word matches, otherwise
        the integer 0.
    """
    from collections import Counter

    smaller, bigger = sorted([first, second], key=len)
    bigger_words = bigger.split()

    # Multiset of letter-sets still available in the longer string.  Hashing
    # frozensets gives O(1) lookups instead of scanning a list of sets.
    unmatched = Counter(frozenset(word) for word in bigger_words)

    counter = 0
    for word in smaller.split():
        letters = frozenset(word)
        if unmatched[letters] > 0:
            # Consume the match so repeated words are not credited twice
            # against one single word of the longer string.
            unmatched[letters] -= 1
            counter += 1

    if counter:
        # float() keeps this a true division even without
        # ``from __future__ import division``.
        return float(counter) / len(bigger_words) * 100  # percentage match
    # No match (this also covers empty input and avoids dividing by zero).
    return counter
# Time one call to compare().  The print on the next line sits between the two
# time.time() readings, so console output is part of the measured interval.
start_time = time.time()
print "match: ", compare(one, two)
compare_time = time.time() - start_time
print "compare: --- %s seconds ---" % (compare_time)
# Time one call to fuzz.ratio() in exactly the same way.
start_time = time.time()
print "match: ", fuzz.ratio(one, two)
fuzz_time = time.time() - start_time
print "fuzzy: --- %s seconds ---" % (fuzz_time)
print
# Check whether compare() and fuzz.ratio() give the same score for this input.
print "Equals?"
print fuzz.ratio(one, two) == compare(one, two)
print
# NOTE(review): each timing is a single sample that includes print I/O;
# consider the timeit module before drawing conclusions about speed.
print "Faster than fuzzy?"
print compare_time < fuzz_time
结果:
match: 50.0
compare: --- 7.20024108887e-05 seconds ---
match: 50
fuzzy: --- 0.000125169754028 seconds ---
Equals?
True
Faster than fuzzy?
True