I am trying to use PyTorch to reproduce the results of a model proposed in a paper. The model uses an attention mechanism to perform relation prediction in knowledge graphs, and consists of two parts: a graph attention network (GAT) and a convolutional network. The code for training the GAT is as follows:
def train_gat(args):

    # Creating the gat model here.
    ####################################

    current_batch_2hop_indices = torch.tensor([])
    if(args.use_2hop):
        current_batch_2hop_indices = Corpus_.get_batch_nhop_neighbors_all(
            args, Corpus_.unique_entities_train, node_neighbors_2hop)

    if CUDA:
        current_batch_2hop_indices = Variable(
            torch.LongTensor(current_batch_2hop_indices)).cuda()
    else:
        current_batch_2hop_indices = Variable(
            torch.LongTensor(current_batch_2hop_indices))

    print("Defining model")
    print("\nModel type -> GAT layer with {} heads used , Initial Embeddings training".format(
        args.nheads_GAT[0]))
    model_gat = SpKBGATModified(entity_embeddings, relation_embeddings,
                                args.entity_out_dim, args.entity_out_dim,
                                args.drop_GAT, args.alpha, args.nheads_GAT)

    if CUDA:
        model_gat.cuda()
        model_gat = torch.nn.DataParallel(model_gat, device_ids=device_ids)

    optimizer = torch.optim.Adam(
        model_gat.parameters(), lr=args.lr, weight_decay=args.weight_decay_gat)

    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=500, gamma=0.5, last_epoch=-1)

    gat_loss_func = nn.MarginRankingLoss(margin=args.margin)

    epoch_losses = []  # losses of all epochs
    print("Number of epochs {}".format(args.epochs_gat))

    for epoch in range(args.epochs_gat):
        print("\nepoch-> ", epoch)
        random.shuffle(Corpus_.train_triples)
        Corpus_.train_indices = np.array(
            list(Corpus_.train_triples)).astype(np.int32)

        model_gat.train()  # getting in training mode
        start_time = time.time()
        epoch_loss = []

        if len(Corpus_.train_indices) % args.batch_size_gat == 0:
            num_iters_per_epoch = len(
                Corpus_.train_indices) // args.batch_size_gat
        else:
            num_iters_per_epoch = (
                len(Corpus_.train_indices) // args.batch_size_gat) + 1

        for iters in range(num_iters_per_epoch):
            start_time_iter = time.time()
            train_indices, train_values = Corpus_.get_iteration_batch(iters)

            if CUDA:
                train_indices = Variable(
                    torch.LongTensor(train_indices)).cuda()
                train_values = Variable(torch.FloatTensor(train_values)).cuda()
            else:
                train_indices = Variable(torch.LongTensor(train_indices))
                train_values = Variable(torch.FloatTensor(train_values))

            # forward pass
            entity_embed, relation_embed = model_gat(
                Corpus_, Corpus_.train_adj_matrix, train_indices,
                current_batch_2hop_indices)

            optimizer.zero_grad()

            loss = batch_gat_loss(
                gat_loss_func, train_indices, entity_embed, relation_embed)

            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.data.item())

            end_time_iter = time.time()
            print("Iteration-> {0} , Iteration_time-> {1:.4f} , Iteration_loss {2:.4f}".format(
                iters, end_time_iter - start_time_iter, loss.data.item()))

        scheduler.step()
        print("Epoch {} , average loss {} , epoch_time {}".format(
            epoch, sum(epoch_loss) / len(epoch_loss), time.time() - start_time))
        epoch_losses.append(sum(epoch_loss) / len(epoch_loss))

        save_model(model_gat, args.data, epoch, args.output_folder)
The code that computes the GAT loss is as follows:
def batch_gat_loss(gat_loss_func, train_indices, entity_embed, relation_embed):
    # the first len_pos_triples rows of the batch are positive triples,
    # the remaining rows are the sampled negatives
    len_pos_triples = int(
        train_indices.shape[0] / (int(args.valid_invalid_ratio_gat) + 1))

    pos_triples = train_indices[:len_pos_triples]
    neg_triples = train_indices[len_pos_triples:]

    # repeat each positive triple once per negative sample so the two
    # score tensors align row-wise
    pos_triples = pos_triples.repeat(int(args.valid_invalid_ratio_gat), 1)

    source_embeds = entity_embed[pos_triples[:, 0]]
    relation_embeds = relation_embed[pos_triples[:, 1]]
    tail_embeds = entity_embed[pos_triples[:, 2]]

    # TransE-style energy ||h + r - t||_1 for the positive triples
    x = source_embeds + relation_embeds - tail_embeds
    pos_norm = torch.norm(x, p=1, dim=1)

    source_embeds = entity_embed[neg_triples[:, 0]]
    relation_embeds = relation_embed[neg_triples[:, 1]]
    tail_embeds = entity_embed[neg_triples[:, 2]]

    # same energy for the negative triples
    x = source_embeds + relation_embeds - tail_embeds
    neg_norm = torch.norm(x, p=1, dim=1)

    y = -torch.ones(int(args.valid_invalid_ratio_gat) * len_pos_triples).cuda()

    loss = gat_loss_func(pos_norm, neg_norm, y)
    return loss
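For context on the y = -torch.ones(...) line: nn.MarginRankingLoss(margin=m) computes mean(max(0, -y * (x1 - x2) + m)), so with y = -1 the loss becomes mean(max(0, pos_norm - neg_norm + m)), i.e. a TransE-style margin loss that pushes the energy of positive triples below that of negatives. A minimal self-contained check (the numbers are made up for illustration):

import torch
import torch.nn as nn

pos_norm = torch.tensor([1.0, 3.0])   # energies of positive triples (lower is better)
neg_norm = torch.tensor([2.0, 2.5])   # energies of negative triples
y = -torch.ones(2)                    # y = -1 means "first argument should rank lower"

loss = nn.MarginRankingLoss(margin=1.0)(pos_norm, neg_norm, y)

# hand computation: mean(max(0, pos_norm - neg_norm + margin)) = 0.75
manual = torch.clamp(pos_norm - neg_norm + 1.0, min=0).mean()
assert torch.allclose(loss, manual)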
The datasets used to train the model in the paper are FB15k-237, NELL-995, and UMLS; their training sets contain 272,115, 149,678, and 5,216 triples respectively. I successfully reproduced the results on all three datasets. However, when I try to train the model on a new dataset, YAGO3-10 (1,079,040 training triples), I get a GPU out-of-memory error. The error message is as follows:
Traceback (most recent call last):
  File "main.py", line 366, in <module>
    train_gat(args)
  File "main.py", line 240, in train_gat
    gat_loss_func, train_indices, entity_embed, relation_embed)
  File "main.py", line 149, in batch_gat_loss
    x = source_embeds + relation_embeds - tail_embeds
RuntimeError: CUDA out of memory. Tried to allocate 1.61 GiB (GPU 0; 15.77 GiB total capacity; 14.24 GiB already allocated; 120.25 MiB free; 440.29 MiB cached)
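For what it's worth, the size of the failed allocation is consistent with the loss computation materializing one float32 row per (repeated) triple. A back-of-the-envelope sketch, assuming the GAT stage scores all training triples in one batch, a negative-sampling ratio of 2, and a 200-dimensional entity embedding (all three are assumptions about the config, not values I verified):

# all values below are illustrative assumptions, not the verified config
num_train_triples = 1079040        # YAGO3-10 training triples
neg_ratio = 2                      # assumed args.valid_invalid_ratio_gat
embed_dim = 200                    # assumed final entity embedding width
bytes_per_float32 = 4

rows = num_train_triples * neg_ratio          # pos_triples after .repeat(neg_ratio, 1)
gib = rows * embed_dim * bytes_per_float32 / 2**30
print("{:.2f} GiB per intermediate tensor".format(gib))   # -> 1.61 GiB

Under these assumptions, a single intermediate such as source_embeds + relation_embeds - tail_embeds is already about 1.61 GiB, and several such intermediates (plus their gradients) are alive at once, which matches the traceback.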
I have tried the following approaches:
1. Reducing the batch size. With a smaller batch, the OOM simply moves to the back-propagation step. The error message is as follows:
Traceback (most recent call last): File "main.py", line 366, in <module> train_gat(args) File "main.py", line 242, in train_gat loss.backward() File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/tensor.py", line 107, in backward torch.autograd.backward(self, gradient, retain_graph, create_graph) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/autograd/__init__.py", line 93, in backward allow_unreachable=True) # allow_unreachable flag RuntimeError: CUDA out of memory. Tried to allocate 2.60 GiB (GPU 0; 15.77 GiB total capacity; 11.86 GiB already allocated; 2.42 GiB free; 511.96 MiB cached)
2. Calling torch.cuda.empty_cache() before back-propagation to get rid of no-longer-needed variables. However, the peak memory usage does not seem to change.
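As far as I understand, empty_cache() only hands blocks that PyTorch's caching allocator is no longer using back to the driver; it cannot free tensors that are still referenced (e.g. by the autograd graph), which would explain the unchanged peak. A small self-contained demonstration of that behaviour:

import torch

x = torch.randn(1024, 1024, device="cuda")   # a live ~4 MiB tensor
print(torch.cuda.memory_allocated())          # counts memory held by live tensors

torch.cuda.empty_cache()                      # returns *unused* cached blocks to the driver;
                                              # x is still referenced, so nothing is freed here
print(torch.cuda.memory_allocated())          # unchanged

del x                                         # memory is released by dropping the last reference;
torch.cuda.empty_cache()                      # only then can empty_cache() also shrink the cache
print(torch.cuda.memory_allocated())          # back to 0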
3. Using DataParallel to train the model on two GPUs, but it did not run successfully; the failure seems to be because some model parameters end up on different GPUs. The error message is as follows:
Traceback (most recent call last): File "main.py", line 377, in <module> train_gat(args) File "main.py", line 244, in train_gat Corpus_, Corpus_.train_adj_matrix_dim1, Corpus_.train_adj_matrix_dim00, Corpus_.train_adj_matrix_dim01 ,train_indices, current_batch_2hop_indices) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__ result = self.forward(*input, **kwargs) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 152, in forward outputs = self.parallel_apply(replicas, inputs, kwargs) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply raise output File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker output = module(*input, **kwargs) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__ result = self.forward(*input, **kwargs) File "/home/user1/cxy/KBAT-2/models.py", line 148, in forward edge_list, edge_type, edge_embed, edge_list_nhop, edge_type_nhop) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__ result = self.forward(*input, **kwargs) File "/home/user1/cxy/KBAT-2/models.py", line 55, in forward for att in self.attentions], dim=1) File "/home/user1/cxy/KBAT-2/models.py", line 55, in <listcomp> for att in self.attentions], dim=1) File "/home/user1/anaconda3/envs/stw/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__ result = self.forward(*input, **kwargs) File "/home/user1/cxy/KBAT-2/layers.py", line 140, in forward edge_m = self.a.mm(edge_h) RuntimeError: arguments are located on different GPUs at /opt/conda/conda-bld/pytorch_1556653183467/work/aten/src/THC/generic/THCTensorMathBlas.cu:255
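My current understanding of this failure (which may be wrong): nn.DataParallel replicates the module's parameters onto every GPU and splits only the direct tensor arguments of forward() along dimension 0; tensors reached through non-tensor objects (such as Corpus_ or the adjacency structure) are passed by reference and stay on the original device, so a replica on GPU 1 multiplies its own parameters against tensors still on GPU 0. A toy sketch of the same error pattern, with hypothetical names, requiring two GPUs to reproduce:

import torch
import torch.nn as nn

class GraphHolder:
    # plain Python object: DataParallel passes it by reference and does
    # not move the tensors inside it to each replica's device
    def __init__(self, adj):
        self.adj = adj

class ToyLayer(nn.Module):
    def __init__(self):
        super(ToyLayer, self).__init__()
        self.a = nn.Parameter(torch.randn(4, 4))   # replicated onto each GPU

    def forward(self, holder, node_feats):
        # node_feats (a direct tensor argument) is scattered along dim 0,
        # but holder.adj stays on cuda:0; on the cuda:1 replica, self.a is
        # on cuda:1 -> "arguments are located on different GPUs"
        h = self.a.mm(holder.adj)
        return h.mm(node_feats.t())

model = nn.DataParallel(ToyLayer().cuda(), device_ids=[0, 1])
holder = GraphHolder(torch.randn(4, 4).cuda())   # lives on cuda:0 only
out = model(holder, torch.randn(8, 4).cuda())    # raises on the cuda:1 replica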
4. Using the apex package for float16 computation. However, some parameters end up as NaN, which is probably a consequence of using float16.
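For reference, the standard apex recipe, as I understand it from the apex documentation, is roughly the following (a minimal self-contained sketch on a toy model, not my actual training code):

import torch
import torch.nn as nn
from apex import amp

model = nn.Linear(16, 1).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# O1 keeps fp32 master weights and casts ops to fp16 where considered safe;
# dynamic loss scaling skips optimizer steps whose gradients overflow, which
# is the usual defence against fp16 NaNs
model, optimizer = amp.initialize(model, optimizer, opt_level="O1",
                                  loss_scale="dynamic")

x = torch.randn(8, 16).cuda()
loss = model(x).pow(2).mean()

optimizer.zero_grad()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()

One thing I have not verified is whether the large L1-norm reductions in batch_gat_loss stay in fp32 under O1; if not, they would be a plausible source of the NaNs.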
The last two approaches might work, but I have not managed to get the code running. Has anyone had experience with the methods above, or does anyone have a better suggestion?
Because of space limits I cannot show all of the code here. The model's code is available at: https://github.com/deepakn97/relationPrediction