我是 Keras 的新手,我正在尝试在 Keras 中使用 NN 解决句子相似性的任务。我使用 word2vec 作为词嵌入,然后使用连体网络来预测两个句子的相似程度。Siamese 网络的基础网络是 LSTM,为了合并这两个基础网络,我使用了带有余弦相似度度量的 Lambda 层。作为数据集,我使用的是 SICK 数据集,它为每对句子打分,从 1(不同)到 5(非常相似)。
我创建了网络并且它运行了,但我有很多疑问:首先我不确定我用句子喂 LSTM 的方式是否合适。我对每个单词进行 word2vec 嵌入,每个句子只创建一个数组,用零填充到 seq_len 以获得相同的长度数组。然后我以这种方式重塑它: data_A = embedding_A.reshape((len(embedding_A), seq_len, feature_dim))
此外我不确定我的连体网络是否正确,因为不同对的很多预测都是相等的,并且损失没有太大变化(从 0.3300 到 0.2105 在 10 个时期内,并且在 100 个时期内变化不大时代)。
有人可以帮助我找到并理解我的错误吗?非常感谢(对不起我的英语不好)
对我的代码感兴趣的部分
def cosine_distance(vecs):
#I'm not sure about this function too
y_true, y_pred = vecs
y_true = K.l2_normalize(y_true, axis=-1)
y_pred = K.l2_normalize(y_pred, axis=-1)
return K.mean(1 - K.sum((y_true * y_pred), axis=-1))
def cosine_dist_output_shape(shapes):
shape1, shape2 = shapes
print((shape1[0], 1))
return (shape1[0], 1)
def contrastive_loss(y_true, y_pred):
margin = 1
return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
def create_base_network(feature_dim,seq_len):
model = Sequential()
model.add(LSTM(100, batch_input_shape=(1,seq_len,feature_dim),return_sequences=True))
model.add(Dense(50, activation='relu'))
model.add(Dense(10, activation='relu'))
return model
def siamese(feature_dim,seq_len, epochs, tr_dataA, tr_dataB, tr_y, te_dataA, te_dataB, te_y):
base_network = create_base_network(feature_dim,seq_len)
input_a = Input(shape=(seq_len,feature_dim,))
input_b = Input(shape=(seq_len,feature_dim))
processed_a = base_network(input_a)
processed_b = base_network(input_b)
distance = Lambda(cosine_distance, output_shape=cosine_dist_output_shape)([processed_a, processed_b])
model = Model([input_a, input_b], distance)
adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=adam, loss=contrastive_loss)
model.fit([tr_dataA, tr_dataB], tr_y,
batch_size=128,
epochs=epochs,
validation_data=([te_dataA, te_dataB], te_y))
pred = model.predict([tr_dataA, tr_dataB])
tr_acc = compute_accuracy(pred, tr_y)
for i in range(len(pred)):
print (pred[i], tr_y[i])
return model
def padding(max_len, embedding):
for i in range(len(embedding)):
padding = np.zeros(max_len-embedding[i].shape[0])
embedding[i] = np.concatenate((embedding[i], padding))
embedding = np.array(embedding)
return embedding
def getAB(sentences_A,sentences_B, feature_dim, word2idx, idx2word, weights,max_len_def=0):
#from_sentence_to_array : function that transforms natural language sentences
#into vectors of real numbers. Each word is replaced with the corrisponding word2vec
#embedding, and words that aren't in the embedding are replaced with zeros vector.
embedding_A, max_len_A = from_sentence_to_array(sentences_A,word2idx, idx2word, weights)
embedding_B, max_len_B = from_sentence_to_array(sentences_B,word2idx, idx2word, weights)
max_len = max(max_len_A, max_len_B,max_len_def*feature_dim)
#padding to max_len
embedding_A = padding(max_len, embedding_A)
embedding_B = padding(max_len, embedding_B)
seq_len = int(max_len/feature_dim)
print(seq_len)
#rashape
data_A = embedding_A.reshape((len(embedding_A), seq_len, feature_dim))
data_B = embedding_B.reshape((len(embedding_B), seq_len, feature_dim))
print('A,B shape: ',data_A.shape, data_B.shape)
return data_A, data_B, seq_len
FEATURE_DIMENSION = 100
MIN_COUNT = 10
WINDOW = 5
if __name__ == '__main__':
data = pd.read_csv('data\\train.csv', sep='\t')
sentences_A = data['sentence_A']
sentences_B = data['sentence_B']
tr_y = 1- data['relatedness_score']/5
if not (os.path.exists(EMBEDDING_PATH) and os.path.exists(VOCAB_PATH)):
create_embeddings(embeddings_path=EMBEDDING_PATH, vocab_path=VOCAB_PATH, size=FEATURE_DIMENSION, min_count=MIN_COUNT, window=WINDOW, sg=1, iter=25)
word2idx, idx2word, weights = load_vocab_and_weights(VOCAB_PATH,EMBEDDING_PATH)
tr_dataA, tr_dataB, seq_len = getAB(sentences_A,sentences_B, FEATURE_DIMENSION,word2idx, idx2word, weights)
test = pd.read_csv('data\\test.csv', sep='\t')
test_sentences_A = test['sentence_A']
test_sentences_B = test['sentence_B']
te_y = 1- test['relatedness_score']/5
te_dataA, te_dataB, seq_len = getAB(test_sentences_A,test_sentences_B, FEATURE_DIMENSION,word2idx, idx2word, weights, seq_len)
model = siamese(FEATURE_DIMENSION, seq_len, 10, tr_dataA, tr_dataB, tr_y, te_dataA, te_dataB, te_y)
test_a = ['this is my dog']
test_b = ['this dog is mine']
a,b,seq_len = getAB(test_a,test_b, FEATURE_DIMENSION,word2idx, idx2word, weights, seq_len)
prediction = model.predict([a, b])
print(prediction)
一些结果:
my prediction | true label
0.849908 0.8
0.849908 0.8
0.849908 0.74
0.849908 0.76
0.849908 0.66
0.849908 0.72
0.849908 0.64
0.849908 0.8
0.849908 0.78
0.849908 0.8
0.849908 0.8
0.849908 0.8
0.849908 0.8
0.849908 0.74
0.849908 0.8
0.849908 0.8
0.849908 0.8
0.849908 0.66
0.849908 0.8
0.849908 0.66
0.849908 0.56
0.849908 0.8
0.849908 0.8
0.849908 0.76
0.847546 0.78
0.847546 0.8
0.847546 0.74
0.847546 0.76
0.847546 0.72
0.847546 0.8
0.847546 0.78
0.847546 0.8
0.847546 0.72
0.847546 0.8
0.847546 0.8
0.847546 0.78
0.847546 0.8
0.847546 0.78
0.847546 0.78
0.847546 0.46
0.847546 0.72
0.847546 0.8
0.847546 0.76
0.847546 0.8
0.847546 0.8
0.847546 0.8
0.847546 0.8
0.847546 0.74
0.847546 0.8
0.847546 0.72
0.847546 0.68
0.847546 0.56
0.847546 0.8
0.847546 0.78
0.847546 0.78
0.847546 0.8
0.852975 0.64
0.852975 0.78
0.852975 0.8
0.852975 0.8
0.852975 0.44
0.852975 0.72
0.852975 0.8
0.852975 0.8
0.852975 0.76
0.852975 0.8
0.852975 0.8
0.852975 0.8
0.852975 0.78
0.852975 0.8
0.852975 0.8
0.852975 0.78
0.852975 0.8
0.852975 0.8
0.852975 0.76
0.852975 0.8