0

我在 Python 中运行下面这段使用 MinHash 和 shingle 检测重复文本的代码,但在绘图时出现了错误,提示“float() 参数必须是字符串或数字,而不是 'dict_keys'”。我尝试把它转换成列表,但绘制出来的图不正确。另外,字典的键和值都是数字。

 from datasketch import MinHash, LeanMinHash, MinHashLSH
import xxhash
import kshingle as ks

def minhash(content, num_perm):
    """Build a MinHash sketch from an iterable of string shingles.

    Args:
        content: Iterable of strings; each one is UTF-8 encoded and fed to
            the sketch.
        num_perm: Number of permutations passed to ``MinHash``.

    Returns:
        A ``LeanMinHash`` constructed from the populated ``MinHash``.
    """
    # NOTE(review): datasketch appears to document ``hashfunc`` as returning
    # a value that fits in 32 bits, while xxh64 yields 64-bit digests --
    # confirm this is intended.
    sketch = MinHash(num_perm=num_perm, hashfunc=xxhash.xxh64_intdigest)
    encoded_shingles = (shingle.encode('utf8') for shingle in content)
    for raw in encoded_shingles:
        sketch.update(raw)
    return LeanMinHash(sketch)
def get_minhashes(corpus, n_shingle, num_perm):
    """Return one MinHash sketch per text in ``corpus``, in corpus order.

    Args:
        corpus: Iterable of texts.
        n_shingle: Shingle-size argument forwarded to ``ks.shingleset_k``.
        num_perm: Number of permutations forwarded to ``minhash``.

    Returns:
        A list with the result of ``minhash`` for each text.
    """
    sketches = []
    for text in corpus:
        shingles = ks.shingleset_k(text, n_shingle)
        sketches.append(minhash(shingles, num_perm))
    return sketches
# Sketch parameters for the baseline run.
num_perm = 128
n_shingle = 5
minhashes = get_minhashes(content, n_shingle, num_perm)

# Estimated Jaccard similarity for every unordered pair of documents,
# keyed by (i, j) with i < j, in the same order as a nested index loop.
mh_distances = {
    (i, j): minhashes[i].jaccard(minhashes[j])
    for i in range(len(content))
    for j in range(i + 1, len(content))
}

# Bare expression: displays the dict when run in a notebook/REPL cell.
mh_distances

# Keep only the pairs whose estimated similarity is strictly above 0.5.
near_duplicates = {
    pair: score for pair, score in mh_distances.items() if score > 0.5
}
# Keys are unique, so the number of entries equals the number of matches.
count = len(near_duplicates)
print(count)
print(near_duplicates)

# Count near-duplicate pairs (estimated Jaccard similarity > 0.5) for each
# shingle size. The pairwise comparison must run inside the ``for size``
# loop so that every size gets its own count; otherwise only the sketches
# of the last size are ever compared.
shingle_size = [1, 5, 10, 15, 20]
num_perm = 128
count_dup_shingle = {}
for size in shingle_size:
    minhashes = get_minhashes(content, size, num_perm)
    mh_distances = {}
    count_dup_shingle[size] = 0
    for i in range(len(content)):
        # Start at i + 1 so each unordered pair is visited exactly once.
        for j in range(i + 1, len(content)):
            mh_distances[(i, j)] = minhashes[i].jaccard(minhashes[j])
            if mh_distances[(i, j)] > 0.5:
                count_dup_shingle[size] += 1
# Bare expression: displays the dict when run in a notebook/REPL cell.
count_dup_shingle

import matplotlib.pyplot as plt

# Materialize the dict views as lists before plotting: passing
# ``dict_keys`` / ``dict_values`` objects directly can fail with
# "float() argument must be a string or a number, not 'dict_keys'".
# Both lists follow the dict's insertion order, so x and y stay aligned.
sizes = list(count_dup_shingle.keys())
duplicate_counts = list(count_dup_shingle.values())
plt.plot(sizes, duplicate_counts)
plt.xlabel("shingle_size")
plt.ylabel("Number of duplicates")
plt.title("plot dependency of duplicates on shingle size for minhash 64")
plt.show()
4

0 回答 0