我在 Python 中运行下面这段使用 MinHash 和 shingle 检测重复文本的代码,但在绘图时出现了错误,提示“float() 参数必须是字符串或数字,而不是 'dict_keys'”。我尝试把键和值转换成列表,但绘制出来的图形不正确。另外,字典的键和值都是数字。
from datasketch import MinHash, LeanMinHash, MinHashLSH
import xxhash
import kshingle as ks
def minhash(content, num_perm):
    """Build a LeanMinHash signature from an iterable of string shingles.

    Args:
        content: Iterable of strings; each one is UTF-8 encoded and fed
            into the signature.
        num_perm: Number of permutations passed to ``MinHash``.

    Returns:
        A ``LeanMinHash`` constructed from the populated ``MinHash``.
    """
    # xxhash's 64-bit integer digest is used as the hash function.
    sketch = MinHash(num_perm=num_perm, hashfunc=xxhash.xxh64_intdigest)
    for shingle in content:
        encoded = shingle.encode('utf8')
        sketch.update(encoded)
    lean_sketch = LeanMinHash(sketch)
    return lean_sketch
def get_minhashes(corpus, n_shingle, num_perm):
    """Compute one MinHash signature per text in ``corpus``.

    Args:
        corpus: Iterable of texts.
        n_shingle: Shingle size forwarded to ``ks.shingleset_k``.
        num_perm: Number of permutations forwarded to ``minhash``.

    Returns:
        A list of signatures, in the same order as ``corpus``.
    """
    signatures = []
    for text in corpus:
        shingles = ks.shingleset_k(text, n_shingle)
        signatures.append(minhash(shingles, num_perm))
    return signatures
# Signature parameters for the baseline run.
num_perm = 128
n_shingle = 5
minhashes = get_minhashes(content, n_shingle, num_perm)

# Value of jaccard() for every unordered pair of documents, keyed by
# (i, j) with i < j. Starting j at i + 1 visits the same pairs, in the same
# order, as scanning all (i, j) and keeping only those with i < j.
mh_distances = {
    (i, j): minhashes[i].jaccard(minhashes[j])
    for i in range(len(content))
    for j in range(i + 1, len(content))
}
mh_distances
# Keep only the pairs whose score is strictly above the 0.5 threshold.
near_duplicates = {
    pair: score
    for pair, score in mh_distances.items()
    if score > 0.5
}
# Every retained pair counts as one near-duplicate.
count = len(near_duplicates)
print(count)
print(near_duplicates)
# Count, for each shingle size, how many document pairs score above 0.5.
shingle_size = [1, 5, 10, 15, 20]
num_perm = 128
count_dup_shingle = {}
for size in shingle_size:
    minhashes = get_minhashes(content, size, num_perm)
    # Pairwise jaccard() values for this shingle size, keyed by (i, j), i < j.
    mh_distances = {
        (i, j): minhashes[i].jaccard(minhashes[j])
        for i in range(len(content))
        for j in range(i + 1, len(content))
    }
    count_dup_shingle[size] = sum(
        1 for score in mh_distances.values() if score > 0.5
    )
count_dup_shingle
import matplotlib.pyplot as plt
# Plot the number of duplicate pairs against shingle size.
# dict views (dict_keys / dict_values) are not sequences, and passing them
# straight to plt.plot can raise "float() argument must be a string or a
# number, not 'dict_keys'". Build real lists instead, sorted by shingle size
# so the line is drawn left to right, with each count matched to its key.
sizes = sorted(count_dup_shingle)
duplicate_counts = [count_dup_shingle[size] for size in sizes]
plt.plot(sizes, duplicate_counts, marker="o")
# Put a tick at each shingle size that was actually evaluated.
plt.xticks(sizes)
plt.xlabel("shingle_size")
plt.ylabel("Number of duplicates")
plt.title("plot dependency of duplicates on shingle size for minhash 64")
plt.show()