In my Jupyter notebook I can train my model with batch_size=8, but whenever I use wandb the process is always killed after 9 iterations and the kernel restarts. Stranger still, the same code runs fine on Colab, yet on my own GPU (an RTX 3080) I can never get through training.
Does anyone know how to get around this?
Edit: I noticed that the kernel dies every time wandb tries to log the gradients. Can this be fixed?
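For reference, wandb.watch accepts log and log_freq arguments, so the gradient hooks can be throttled or disabled without touching the rest of the pipeline. Note that the training code below calls wandb.watch with log_freq=10, which would put the first gradient dump right around the iteration where the kernel dies. A sketch of the variants (mirroring the call in the training code below):

# log only weights, skipping the gradient histograms entirely
wandb.watch(model, criterion, log="parameters", log_freq=10)

# or keep gradients but log them far less often
wandb.watch(model, criterion, log="all", log_freq=1000)

# or register no hooks at all
wandb.watch(model, criterion, log=None)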
Code with wandb:
def train_batch(images, labels, model, optimizer, criterion):
    images, labels = images.to(device), labels.to(device)

    # Forward pass ➡
    outputs = model(images)
    loss = criterion(outputs, labels)

    # Backward pass ⬅
    optimizer.zero_grad()
    loss.backward()

    # Step with optimizer
    optimizer.step()

    size = images.size(0)
    del images, labels

    return loss, size
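One small change that sometimes helps with creeping GPU memory (my sketch, not the original code): return the detached scalar instead of the live tensor, so no reference to the autograd graph escapes the function. The callers' loss.item() calls would then be dropped.

def train_batch(images, labels, model, optimizer, criterion):
    images, labels = images.to(device), labels.to(device)
    outputs = model(images)
    loss = criterion(outputs, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # .item() copies the value to a Python float, letting the graph be freed
    return loss.item(), images.size(0)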
import torch
import wandb

from loss import YoloLoss
# train the model
def train(model, train_dl, criterion, optimizer, config, is_one_batch):
    # Tell wandb to watch what the model gets up to: gradients, weights, and more!
    wandb.watch(model, criterion, log="all", log_freq=10)

    example_ct = 0  # number of examples seen
    batch_ct = 0

    # enumerate epochs
    for epoch in range(config.epochs):
        running_loss = 0.0

        if not is_one_batch:
            for i, (inputs, _, targets) in enumerate(train_dl):
                loss, batch_size = train_batch(inputs, targets, model, optimizer, criterion)
                running_loss += loss.item() * batch_size
        else:
            # for one batch only
            loss, batch_size = train_batch(train_dl[0], train_dl[2], model, optimizer, criterion)
            running_loss += loss.item() * batch_size

        # note: len(train_dl) counts batches (and is the tuple length in one-batch mode), not samples
        epoch_loss = running_loss / len(train_dl)
        # loss_values.append(epoch_loss)
        wandb.log({"epoch": epoch, "avg_batch_loss": epoch_loss})
        # wandb.log({"epoch": epoch, "loss": loss}, step=example_ct)
        print("Average epoch loss {}".format(epoch_loss))
def make(config, is_one_batch, data_predefined=True):
    optimizers = {
        "Adam": torch.optim.Adam,
        "SGD": torch.optim.SGD
    }

    if data_predefined:
        train_dl, test_dl = train_dl_predef, test_dl_predef
    else:
        train_dl, test_dl = dataset.prepare_data()

    if is_one_batch:
        train_dl = next(iter(train_dl))
        test_dl = train_dl

    # Make the model
    model = architecture.darknet(config.batch_norm)
    model.to(device)

    # Make the loss and optimizer
    criterion = YoloLoss()
    # note: torch.optim.Adam does not accept a momentum argument,
    # so this call only works as written for config.optimizer == "SGD"
    optimizer = optimizers[config.optimizer](
        model.parameters(),
        lr=config.learning_rate,
        momentum=config.momentum
    )

    return model, train_dl, test_dl, criterion, optimizer
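If both optimizers need to stay selectable, one option (a sketch; the config key names follow the code above) is to build the keyword arguments conditionally so momentum is only passed to SGD:

def build_optimizer(model, config):
    optimizers = {"Adam": torch.optim.Adam, "SGD": torch.optim.SGD}
    kwargs = {"lr": config.learning_rate}
    if config.optimizer == "SGD":
        # momentum is an SGD-only argument; Adam would raise a TypeError
        kwargs["momentum"] = config.momentum
    return optimizers[config.optimizer](model.parameters(), **kwargs)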
def model_pipeline(hyp, is_one_batch=False, device=device):
    with wandb.init(project="YOLO-recreated", entity="bindas1", config=hyp):
        config = wandb.config

        # make the model, data, and optimization problem
        model, train_dl, test_dl, criterion, optimizer = make(config, is_one_batch)

        # and use them to train the model
        train(model, train_dl, criterion, optimizer, config, is_one_batch)

    return model
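To separate wandb's network/sync path from the gradient hooks themselves, a run can also be forced offline (this is standard wandb API, not part of the original code; the placeholder config is mine). If the kernel still dies offline, the crash is in the hooks rather than in uploading:

import wandb

# offline mode writes logs to disk only; `wandb sync` can upload them later
with wandb.init(project="YOLO-recreated", entity="bindas1",
                config={"epochs": 1}, mode="offline") as run:
    run.log({"smoke_test": 1.0})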
Code without wandb:
from torch.optim import SGD
from tqdm import tqdm

def train_model(train_dl, model, is_one_batch=False):
    # define the optimization
    criterion = YoloLoss()
    optimizer = SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

    # for loss plotting
    loss_values = []

    # enumerate epochs
    for epoch in tqdm(range(EPOCHS)):
        if epoch % 10 == 0:
            print(epoch)
        running_loss = 0.0

        if not is_one_batch:
            # enumerate mini batches
            for i, (inputs, _, targets) in enumerate(train_dl):
                inputs = inputs.to(device)
                targets = targets.to(device)
                # clear the gradients
                optimizer.zero_grad()
                # compute the model output
                yhat = model(inputs)
                # calculate loss
                loss = criterion(yhat, targets)
                # credit assignment
                loss.backward()
                # print(loss)
                running_loss += loss.item() * inputs.size(0)
                # update model weights
                optimizer.step()
        else:
            # for one batch only
            with torch.autograd.detect_anomaly():
                inputs, targets = train_dl[0].to(device), train_dl[2].to(device)
                optimizer.zero_grad()
                # compute the model output
                yhat = model(inputs)
                # calculate loss
                loss = criterion(yhat, targets)
                # credit assignment
                loss.backward()
                print(loss)
                running_loss += loss.item() * inputs.size(0)
                # update model weights
                optimizer.step()

        loss_values.append(running_loss / len(train_dl))

    plot_loss(loss_values)
model = architecture.darknet()
model.to(device)
optimizer = SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

train_dl_main, test_dl_main = train_dl_predef, test_dl_predef
one_batch = next(iter(train_dl_main))

train_model(one_batch, model, is_one_batch=True)
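Since the kernel dies after a fixed number of iterations, it looks like GPU memory creep. A quick way to confirm (a sketch using standard torch.cuda counters, not part of the original code) is to print allocated memory once per training iteration:

import torch

def report_gpu_memory(tag=""):
    # allocated = live tensors; peak = high-water mark since last reset
    alloc = torch.cuda.memory_allocated() / 1024**2
    peak = torch.cuda.max_memory_allocated() / 1024**2
    print(f"{tag} allocated={alloc:.1f} MiB, peak={peak:.1f} MiB")

# e.g. call report_gpu_memory(f"iter {i}") inside the batch loop;
# steadily rising numbers point at references (logged tensors, stored
# losses) keeping the graph alive rather than at the model itself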