I am trying to train a deep learning model in an Azure Notebook, using the GPU of a DSVM (Ubuntu 18.04) on a Standard NC6 (6 vCPUs, 56 GiB memory), and I get the following error:

RuntimeError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 11.17 GiB total capacity; 10.76 GiB already allocated; 50.31 MiB free; 10.84 GiB reserved in total by PyTorch)

I have searched for this and could not find a working solution in any of the questions online. The "10.84 GiB reserved in total by PyTorch" part of the error message caught my attention: can this be configured to a lower value? I would appreciate any input on this. Thank you.
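For reference, as far as I understand, "already allocated" is the memory held by live tensors, while "reserved in total by PyTorch" is what the caching allocator keeps around (allocated plus cached blocks). Below is a minimal sketch of how these two numbers can be inspected at runtime with the standard torch.cuda API (device is assumed to be the same cuda:0 as in my training code):

import torch

device = torch.device("cuda:0")

# Memory currently held by live tensors.
print(f"allocated: {torch.cuda.memory_allocated(device) / 1024**2:.1f} MiB")
# Memory held by PyTorch's caching allocator (the "reserved in total" figure).
print(f"reserved:  {torch.cuda.memory_reserved(device) / 1024**2:.1f} MiB")

# Returns cached, unused blocks to the driver; it does not free live tensors.
torch.cuda.empty_cache()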
This is my fine-tuning / training code:
for epoch in range(EPOCHS):
    for idx, article in tqdm_notebook(enumerate(article_loader)):
        # Tokenize the article (capped at 1024 tokens) and move it to the GPU.
        article_tens = torch.tensor(tokenizer.encode(article[0], max_length=1024)).unsqueeze(0).to(device)

        # Forward pass; the inputs double as labels for language-model fine-tuning.
        outputs = model(article_tens, labels=article_tens)
        train_loss, prediction_scores = outputs[:2]
        train_loss.backward()
        train_sum_loss = train_sum_loss + train_loss.detach().data

        iteration_count = idx
        article_count = article_count + 1

        # Gradient accumulation: step the optimizer once every BATCH_SIZE articles.
        if article_count == BATCH_SIZE:
            article_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()
The full stack trace of the error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-11-2c74a22e42f7> in <module>
20 article_tens = torch.tensor(tokenizer.encode(article[0], max_length=1024)).unsqueeze(0).to(device)
21
---> 22 outputs = model(article_tens, labels=article_tens)
23
24 train_loss, prediction_scores = outputs[:2]
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/modeling_gpt2.py in forward(self, input_ids, past, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, use_cache)
602 head_mask=head_mask,
603 inputs_embeds=inputs_embeds,
--> 604 use_cache=use_cache,
605 )
606 hidden_states = transformer_outputs[0]
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/modeling_gpt2.py in forward(self, input_ids, past, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, use_cache)
486 attention_mask=attention_mask,
487 head_mask=head_mask[i],
--> 488 use_cache=use_cache,
489 )
490
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/modeling_gpt2.py in forward(self, x, layer_past, attention_mask, head_mask, use_cache)
240
241 x = x + a
--> 242 m = self.mlp(self.ln_2(x))
243 x = x + m
244
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/modeling_gpt2.py in forward(self, x)
215
216 def forward(self, x):
--> 217 h = self.act(self.c_fc(x))
218 h2 = self.c_proj(h)
219 return self.dropout(h2)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/activations.py in gelu_new(x)
27 Also see https://arxiv.org/abs/1606.08415
28 """
---> 29 return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
30
31
RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 11.17 GiB total capacity; 10.74 GiB already allocated; 320.00 KiB free; 10.89 GiB reserved in total by PyTorch)
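The only memory-related change I can think of so far is something like the sketch below: shorter input sequences, the loss scaled for gradient accumulation, and references dropped at the end of every step so the graph can be freed. This is only a rough idea under assumptions (MAX_LEN = 512 is arbitrary; everything else uses the same names as my loop above), not something I have verified fixes the error:

MAX_LEN = 512  # assumption: half of the 1024 used above, roughly halving activation memory

for epoch in range(EPOCHS):
    article_count = 0
    for idx, article in tqdm_notebook(enumerate(article_loader)):
        article_tens = torch.tensor(
            tokenizer.encode(article[0], max_length=MAX_LEN)
        ).unsqueeze(0).to(device)

        outputs = model(article_tens, labels=article_tens)
        train_loss = outputs[0] / BATCH_SIZE  # scale so accumulated gradients match a full batch
        train_loss.backward()

        article_count += 1
        if article_count == BATCH_SIZE:
            article_count = 0
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Drop references so the activations/graph from this step can be freed promptly.
        del article_tens, outputs, train_loss

    # Hand cached-but-unused blocks back to the driver between epochs.
    torch.cuda.empty_cache()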