0
import jieba
from utils import hash_funcs
import numpy as np


# Two sample sentences that differ only in their leading word
# ("攻击者" in string1 versus "黑客" in string2); the rest of the text is identical.
string1 = "攻击者利用微软MSHTML漏洞窃取谷歌和instagram凭证信息"
string2 = "黑客利用微软MSHTML漏洞窃取谷歌和instagram凭证信息"


def cut_segment(string):
    """Return the tokens that ``jieba.cut`` yields for ``string`` as a list."""
    return [*jieba.cut(string)]


def signature(word):
    """Return the hash of ``word`` as a binary-digit string of at least 128 chars.

    The value returned by ``hash_funcs.md5(word)`` is parsed as a base-16
    integer, rendered in binary without the ``0b`` prefix, and left-padded
    with zeros to a width of 128.
    """
    digest_hex = hash_funcs.md5(word)
    digest_value = int(digest_hex, 16)
    binary_digits = bin(digest_value)[2:]  # drop the leading "0b"
    return binary_digits.zfill(128)


def merge(signatures, weights):
    """Turn digit-string signatures into a stack of weighted integer rows.

    Each signature is split into characters and parsed as integers; every
    0 entry is replaced by -1 and the row is multiplied by the matching
    weight. Signatures and weights are paired with ``zip``, so extra items
    in the longer sequence are ignored.

    Returns:
        A NumPy array built from the list of weighted rows (one row per
        signature/weight pair).
    """
    digit_rows = (np.array(list(bits), dtype=int) for bits in signatures)
    weighted_rows = [
        np.where(row == 0, -1, row) * factor  # 0 -> -1, other digits unchanged
        for row, factor in zip(digit_rows, weights)
    ]
    return np.array(weighted_rows)

def reduce(arr):
    """Collapse the rows of ``arr`` into a single 0/1 vector.

    The rows are summed along axis 0; every strictly positive total becomes
    1 and every remaining total below 1 (i.e. zero or negative) becomes 0.
    The result keeps the dtype of the summed array.
    """
    column_totals = arr.sum(axis=0)
    # Order matters: positives are clamped to 1 first, so the second pass only
    # zeroes the totals that were not positive.
    np.putmask(column_totals, column_totals > 0, 1)
    np.putmask(column_totals, column_totals < 1, 0)
    return column_totals

def simhash(segments):
    """Compute the simhash bit vector for a sequence of tokens.

    Every token gets a weight of 1, is converted to its binary signature via
    ``signature``, and the signatures are combined with ``merge`` and then
    collapsed to a 0/1 vector with ``reduce``.
    """
    unit_weights = [1] * len(segments)  # assume weight is 1
    token_signatures = list(map(signature, segments))
    weighted_rows = merge(token_signatures, unit_weights)
    return reduce(weighted_rows)
# Tokenise both sample sentences. These names were previously mentioned only in
# comments, so the simhash calls below failed with NameError.
segments_1 = cut_segment(string1)
segments_2 = cut_segment(string2)
# Token lists as recorded by the author:
# segments_1 ['攻击者', '利用', '微软', 'MSHTML', '漏洞', '窃取', '谷歌', '和', 'instagram', '凭证', '信息']
# segments_2 ['攻击', '利用', '微软', 'MSHTML', '漏洞', '窃取', '谷歌', '和', 'instagram', '凭证', '信息']
# NOTE(review): the recorded first token of segments_2 ('攻击') does not occur in
# string2, which starts with '黑客' — confirm which input produced that listing.
s1_hash = simhash(segments_1)
s2_hash = simhash(segments_2)
# Number of bit positions where the two hash vectors differ.
dif = np.logical_xor(s1_hash, s2_hash).sum()  # author reports 18

我正在尝试实现 simhash,并打算将其用于我们最近的工作。我编写了上面这段代码,它可以运行,但我对得到的差异值(18)感到困惑:为什么会这么大?有些文章说 diff < 3 才算相似。

抱歉,我的英语不太好。谢谢~

4

0 回答 0