python - 使用 keras 进行文档相似性的 Siamese LSTM 给出输入错误

Question

我编写了这段代码来使用 Siamese 方法来计算两个文档的相似度。我想使用矢量化方法嵌入两个单独文档的矢量化层（嵌入是使用 Google 新闻数据集执行的），然后将其提供给 LSTM，LSTM 的输出进入余弦函数以测量两个文档的相似性。

#importing libraries
from __future__ import print_function
import gensim
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import csv
import re
import pandas as pd
from pandas import DataFrame
import pandas as pd
nltk.download('punkt')

from tensorflow import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, TimeDistributed
from tensorflow.keras import layers

#Loading pre=trained word2vec model

from gensim.models.keyedvectors import KeyedVectors

# You need to dowload google pre-trained model using below link
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
#Change the path according to your directory

model_path = 'D:\GoogleNews_vectors_negative300\GoogleNews_vectors_negative300.bin'   
w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

#Setting Parameters for model

class DocSim(object):
    def __init__(self, w2v_model , stopwords=[]):
        self.w2v_model = w2v_model
        self.stopwords = stopwords
        
    def vectorize(self, doc):
        """Identify the vector values for each word in the given document"""
        doc = doc.lower()
        words = [w for w in doc.split(" ") if w not in self.stopwords]
        word_vecs = []
        for word in words:
            try:
                vec = self.w2v_model[word]
                word_vecs.append(vec)
            except KeyError:
                # Ignore, if the word doesn't exist in the vocabulary
                pass

        # Assuming that document vector is the mean of all the word vectors

        vector = np.mean(word_vecs, axis=0)
        return vector
        
        
    def Siamese_cosine_sim(self, vectorA, vectorB):
        model = Sequential()
        model.add(LSTM(20, return_sequences=True),input_shape=[vectorA,vectorB])
        model.compile(loss='binary_crossentropy', optimizer='adam')
        outputs = layers.Dense(1, activation="sigmoid")(left_doc,right_doc)

        """Find the cosine similarity distance between two vectors."""
        csim = np.dot(left_doc, right_doc) / (np.linalg.norm(left_doc) * np.linalg.norm(right_doc))
        if np.isnan(np.sum(csim)):
            return 0
        return csim
 

    def calculate_similarity(self, withdigits_source_rules, withdigits_target_rules=[], threshold=0.8):
            """Calculates & returns similarity scores between given source rules & all
            the target rules"""
            if isinstance(withdigits_target_rules, str):
                withdigits_target_rules = [withdigits_target_rules]


            source_vec = self.vectorize(withdigits_source_rules)
            results = []

            for rule in withdigits_target_rules:
                target_vec = self.vectorize(rule)
                sim_score = self.Siamese_cosine_sim (source_vec, target_vec)
                if sim_score > threshold:
                    results.append({
                        'Siamese Sim Score':sim_score,
                        'Target Rule':rule
                    })


                # Sort results by score in desc order
                results.sort(key=lambda k : k['Siamese Sim Score'] , reverse=True)

            return results

ds = DocSim(w2v_model)

#Two documents data
withdigits_source_rules =set(["2.1 Separation of trains","2.3.1.2 Level crossing obstruction","2.2.1.1 Safety is compromised if a train proceeds without a movement autority","Principle: The method of signalling must maintain a space interval between trains that is safe.","2.1.1 Context"])

#Calculate the similarity score between a source rule & a target rule.


if isinstance(withdigits_source_rules, str):
    withdigits_source_rules = [withdigits_source_rules]
   

# This will return one target rules text with a similarity score

for rule in withdigits_source_rules:
    sim_scores= ds.calculate_similarity(rule, withdigits_target_rules)

    

    
    # Printing the output in text file
    
    print("Source rule: {} \n\nSimilarity with Target Rule is \n\n {}\n".format(rule, sim_scores) , file=open("output.txt", "a"))
    print("\n")
    
    
    # Printing output in Jupyter
    
    print("Source rule: {} \n\nSimilarity with Target Rule is \n\n {}\n".format(rule, sim_scores) )
    print("\n")

如果有人可以帮助我解决这个问题以及 LSTM 输入功能，我会收到以下错误吗？

TypeError: add() got an unexpected keyword argument 'input_shape'

score 0 · Accepted Answer

请参阅此处的文档以将层添加到顺序模型。该add方法只接受一个参数 - layer。如果传递的参数不是层实例，它会引发TypeError，这正是它抛出的错误。我猜，您想将input_shape参数传递给LSTM层（创建顺序模型后的行）。只需将其移动到 LSTM 层内，它应该可以正常工作。

python - 使用 keras 进行文档相似性的 Siamese LSTM 给出输入错误

1 回答 1

Related

Reference