import jieba
from utils import hash_funcs
import numpy as np
# Two sample sentences used to exercise the SimHash code below. They are
# identical except for the leading word ("攻击者" in string1, "黑客" in string2).
string1 = "攻击者利用微软MSHTML漏洞窃取谷歌和instagram凭证信息"
string2 = "黑客利用微软MSHTML漏洞窃取谷歌和instagram凭证信息"
def cut_segment(string):
    """Tokenise ``string`` with jieba and return the tokens as a list."""
    # ``lcut`` is jieba's list-returning counterpart of the ``cut`` generator.
    return jieba.lcut(string)
def signature(word):
    """Return the binary fingerprint of ``word`` as a string of '0'/'1'.

    The value returned by ``hash_funcs.md5(word)`` is parsed as a base-16
    integer, rendered in base 2 and left-padded with zeros to at least
    128 characters.
    """
    digest_value = int(hash_funcs.md5(word), 16)
    bit_string = format(digest_value, "b")
    return bit_string.zfill(128)
def merge(signatures, weights):
    """Turn bit-string signatures into a matrix of signed, weighted votes.

    Each signature becomes one row: every character is converted to an
    integer, zeros are replaced by -1, and the row is multiplied by the
    weight at the same position in ``weights``.

    Args:
        signatures: Iterable of equal-length bit strings (e.g. "0110...").
        weights: Iterable of numeric weights, one per signature.

    Returns:
        A 2-D NumPy array with one row per signature and one column per bit.
        For empty input the result has shape ``(0, 0)`` so that a later
        column-wise sum still produces an (empty) array instead of a scalar.

    Raises:
        ValueError: If ``signatures`` and ``weights`` differ in length.
    """
    signatures = list(signatures)
    weights = list(weights)
    if len(signatures) != len(weights):
        # zip() would silently drop the unmatched tail and skew the hash.
        raise ValueError(
            "signatures and weights must have the same length "
            f"({len(signatures)} != {len(weights)})"
        )
    if not signatures:
        return np.empty((0, 0), dtype=int)

    bits = np.array([[int(ch) for ch in sig] for sig in signatures], dtype=int)
    # A 0 bit votes against the position (-1); any other value is kept as is.
    votes = np.where(bits == 0, -1, bits)
    # Broadcast one weight per row across all bit columns.
    return votes * np.asarray(weights)[:, np.newaxis]
def reduce(arr):
    """Collapse a matrix of weighted votes into a 0/1 fingerprint.

    The rows of ``arr`` are summed column by column; a column whose total is
    strictly positive yields 1, every other column (including ties at zero)
    yields 0.

    Args:
        arr: 2-D array-like with one row per signature. A 1-D input is
            treated as a single row, so an empty array gives an empty
            fingerprint instead of failing on a scalar sum.

    Returns:
        A 1-D NumPy array of 0/1 values with the dtype of the column sums.
    """
    # Promote to 2-D so the column sum is always an array, never a scalar.
    totals = np.atleast_2d(arr).sum(axis=0)
    return (totals > 0).astype(totals.dtype)
def simhash(segments, weights=None):
    """Compute the SimHash fingerprint of a sequence of tokens.

    Each token is converted to a bit-string signature, the signatures are
    combined into weighted votes by ``merge`` and the votes are collapsed
    into a 0/1 array by ``reduce``.

    Args:
        segments: Sequence of tokens (must support ``len``).
        weights: Optional sequence with one numeric weight per token.
            When omitted, every token gets weight 1.

    Returns:
        The array returned by ``reduce``: one 0/1 entry per signature bit.

    Raises:
        ValueError: If ``weights`` is given and its length differs from
            ``segments``.
    """
    if weights is None:
        weights = [1] * len(segments)
    elif len(weights) != len(segments):
        raise ValueError(
            "weights must have one entry per segment "
            f"({len(weights)} != {len(segments)})"
        )
    signatures = [signature(token) for token in segments]
    return reduce(merge(signatures, weights))
# Tokenisations recorded by the original author (kept for reference):
# segments_1 ['攻击者', '利用', '微软', 'MSHTML', '漏洞', '窃取', '谷歌', '和', 'instagram', '凭证', '信息']
# segments_2 ['攻击', '利用', '微软', 'MSHTML', '漏洞', '窃取', '谷歌', '和', 'instagram', '凭证', '信息']
# NOTE(review): the recorded segments_2 starts with '攻击' although string2
# starts with "黑客" -- the sample above may come from an older input; re-check.
# The token lists were never assigned in code, so build them here.
segments_1 = cut_segment(string1)
segments_2 = cut_segment(string2)
s1_hash = simhash(segments_1)
s2_hash = simhash(segments_2)
# Hamming distance between the two fingerprints (number of differing bits).
# NOTE(review): the original author reported 18 for the recorded token lists.
# That is plausible rather than a bug: with roughly ten shared unit-weight
# tokens, many bit positions are tied, so one differing token can flip them.
# The "distance < 3" rule of thumb is usually quoted for 64-bit fingerprints
# of long documents -- confirm against the article being followed.
dif = np.logical_xor(s1_hash, s2_hash).sum()
我正在尝试实现 simhash，并打算把它用在我们最近的工作中。我写了上面这段代码，它可以运行，但得到的差异值让我感到困惑：两个句子只有一个词不同，计算出的差异（dif）却是 18。为什么会这么大？有些文章说 diff < 3 才算相似。
抱歉，我的英语不太好。谢谢~