下面的代码在单个 GPU 上工作,但在使用多个 GPU 时抛出错误 RuntimeError: grad can be implicitly created only for scalar outputs
代码
def forward(
    self,
    input_ids,
    attention_mask=None,
    decoder_input_ids=None,
    decoder_attention_mask=None,
    lm_labels=None
):
    """Delegate to the wrapped model.

    Passes the encoder/decoder tensors straight through, renaming
    ``lm_labels`` to the ``labels`` keyword expected by ``self.model``
    (a HuggingFace seq2seq model, presumably T5/BART — confirm).
    Returns whatever ``self.model`` returns (loss first when labels are given).
    """
    model_kwargs = {
        "attention_mask": attention_mask,
        "decoder_input_ids": decoder_input_ids,
        "decoder_attention_mask": decoder_attention_mask,
        "labels": lm_labels,
    }
    return self.model(input_ids, **model_kwargs)
def _step(self, batch):
lm_labels = batch["target_ids"]
# lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100
outputs = self(
input_ids=batch["source_ids"],
attention_mask=batch["source_mask"],
lm_labels=lm_labels,
decoder_attention_mask=batch['target_mask']
)
loss = outputs[0]
return loss
def training_step(self, batch, batch_idx):
    """Compute the training loss for one batch.

    Fix for the multi-GPU error in the question: under DataParallel
    (``accelerator='dp'``) Lightning gathers one loss per device, so
    ``_step`` returns a 1-D tensor and ``loss.backward()`` raises
    ``RuntimeError: grad can be implicitly created only for scalar
    outputs``. Reducing with ``.mean()`` averages the per-device losses;
    it is a no-op for the 0-dim single-GPU case, so single-GPU behavior
    is unchanged.
    """
    loss = self._step(batch)
    if loss.dim() > 0:  # per-device losses gathered by DataParallel
        loss = loss.mean()
    return {"loss": loss}
损失值是一个标量: tensor(12.8875, device='cuda:1', grad_fn=&lt;NllLossBackward&gt;)。这个错误背后的原因是什么?
回溯(traceback,已从机器翻译的乱码中还原为原始英文):

Traceback (most recent call last):
  File "training_trial.py", line 390, in <module>
    trainer.fit(model)
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 510, in fit
    results = self.accelerator_backend.train()
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 57, in train
    return self.train_or_test()
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in train_or_test
    results = self.trainer.train()
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in train
    self.train_loop.run_training_epoch()
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 549, in run_training_epoch
    batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 704, in run_training_batch
    self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 490, in optimizer_step
    using_lbfgs=is_lbfgs,
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1296, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 286, in step
    self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 144, in __optimizer_step
    optimizer.step(closure=closure, *args, **kwargs)
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/torch/optim/lr_scheduler.py", line 67, in wrapper
    return wrapped(*args, **kwargs)
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/transformers/optimization.py", line 318, in step
    loss = closure()
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 699, in train_step_and_backward_closure
    self.trainer.hiddens
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 802, in training_step_and_backward
    self.backward(result, optimizer, opt_idx)
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 829, in backward
    result.closure_loss, optimizer, opt_idx, *args, **kwargs
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 109, in backward
    model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs)
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1162, in backward
    loss.backward(*args, **kwargs)
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/torch/autograd/__init__.py", line 126, in backward
    grad_tensors_ = _make_grads(tensors, grad_tensors_)
  File "/home/nvarshn2/.conda/envs/pytorch_lightning_new_env/lib/python3.7/site-packages/torch/autograd/__init__.py", line 50, in _make_grads
    raise RuntimeError("grad can be implicitly created only for scalar outputs")
RuntimeError: grad can be implicitly created only for scalar outputs