我编写了这段代码来使用 Siamese 方法来计算两个文档的相似度。我想使用矢量化方法嵌入两个单独文档的矢量化层(嵌入是使用 Google 新闻数据集执行的),然后将其提供给 LSTM,LSTM 的输出进入余弦函数以测量两个文档的相似性。
#importing libraries
from __future__ import print_function
import gensim
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import csv
import re
import pandas as pd
from pandas import DataFrame
import pandas as pd
nltk.download('punkt')
from tensorflow import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, TimeDistributed
from tensorflow.keras import layers
#Loading pre=trained word2vec model
from gensim.models.keyedvectors import KeyedVectors
# You need to dowload google pre-trained model using below link
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
#Change the path according to your directory
model_path = 'D:\GoogleNews_vectors_negative300\GoogleNews_vectors_negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)
#Setting Parameters for model
class DocSim(object):
def __init__(self, w2v_model , stopwords=[]):
self.w2v_model = w2v_model
self.stopwords = stopwords
def vectorize(self, doc):
"""Identify the vector values for each word in the given document"""
doc = doc.lower()
words = [w for w in doc.split(" ") if w not in self.stopwords]
word_vecs = []
for word in words:
try:
vec = self.w2v_model[word]
word_vecs.append(vec)
except KeyError:
# Ignore, if the word doesn't exist in the vocabulary
pass
# Assuming that document vector is the mean of all the word vectors
vector = np.mean(word_vecs, axis=0)
return vector
def Siamese_cosine_sim(self, vectorA, vectorB):
model = Sequential()
model.add(LSTM(20, return_sequences=True),input_shape=[vectorA,vectorB])
model.compile(loss='binary_crossentropy', optimizer='adam')
outputs = layers.Dense(1, activation="sigmoid")(left_doc,right_doc)
"""Find the cosine similarity distance between two vectors."""
csim = np.dot(left_doc, right_doc) / (np.linalg.norm(left_doc) * np.linalg.norm(right_doc))
if np.isnan(np.sum(csim)):
return 0
return csim
def calculate_similarity(self, withdigits_source_rules, withdigits_target_rules=[], threshold=0.8):
"""Calculates & returns similarity scores between given source rules & all
the target rules"""
if isinstance(withdigits_target_rules, str):
withdigits_target_rules = [withdigits_target_rules]
source_vec = self.vectorize(withdigits_source_rules)
results = []
for rule in withdigits_target_rules:
target_vec = self.vectorize(rule)
sim_score = self.Siamese_cosine_sim (source_vec, target_vec)
if sim_score > threshold:
results.append({
'Siamese Sim Score':sim_score,
'Target Rule':rule
})
# Sort results by score in desc order
results.sort(key=lambda k : k['Siamese Sim Score'] , reverse=True)
return results
ds = DocSim(w2v_model)
#Two documents data
withdigits_source_rules =set(["2.1 Separation of trains","2.3.1.2 Level crossing obstruction","2.2.1.1 Safety is compromised if a train proceeds without a movement autority","Principle: The method of signalling must maintain a space interval between trains that is safe.","2.1.1 Context"])
#Calculate the similarity score between a source rule & a target rule.
if isinstance(withdigits_source_rules, str):
withdigits_source_rules = [withdigits_source_rules]
# This will return one target rules text with a similarity score
for rule in withdigits_source_rules:
sim_scores= ds.calculate_similarity(rule, withdigits_target_rules)
# Printing the output in text file
print("Source rule: {} \n\nSimilarity with Target Rule is \n\n {}\n".format(rule, sim_scores) , file=open("output.txt", "a"))
print("\n")
# Printing output in Jupyter
print("Source rule: {} \n\nSimilarity with Target Rule is \n\n {}\n".format(rule, sim_scores) )
print("\n")
如果有人可以帮助我解决这个问题以及 LSTM 输入功能,我会收到以下错误吗?
TypeError: add() got an unexpected keyword argument 'input_shape'