由于我是 Pytorch 的新手,这个问题可能是一个非常微不足道的问题,但我想请你帮忙解决这个问题。
我已经从一篇论文中实现了一个网络,并使用了论文中描述的所有超参数和所有层。
但是开始训练后,即使我把学习率设为 0.001 并配置了学习率衰减,误差也没有下降:在整整 100 个 epoch 中,训练误差一直停留在 3.3~3.4 左右,测试误差停留在 3.5~3.6 左右……!
我可以更改超参数以改进模型,但由于论文中提供了确切的数字,我想看看我实现的训练代码中是否存在错误。
下面的代码是我用于训练的代码。
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import json
import torch
import math
import time
import os
# Build the model: PointNet feature backbone + 40-way classification head
# (ModelNet40 has 40 classes).
model = nn.Sequential(Baseline(), Classification(40)).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
# Paper schedule: halve the learning rate every 20 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

batch = 32
train_path = '/content/mtrain'
train_data = os.listdir(train_path)
test_path = '/content/mtest'
test_data = os.listdir(test_path)
# NOTE: the DataLoader batches *filenames*; the JSON samples themselves are
# loaded lazily, batch by batch, in _load_batch below.
train_loader = torch.utils.data.DataLoader(train_data, batch, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch, shuffle=True)


def _load_batch(dir_path, filenames):
    """Read a batch of JSON sample files and return (points, labels) tensors.

    Each file holds {'pts': [[x, y, z], ...], 'label': int}.  Points are
    returned channel-first — shape (B, 3, N), presumably what the model
    expects (TODO confirm against Baseline) — labels have shape (B,).
    """
    pts_batch, label_batch = [], []
    for fname in filenames:
        with open(os.path.join(dir_path, fname), 'r') as f:
            sample = json.load(f)
        pts_batch.append(sample['pts'])
        label_batch.append(sample['label'])
    return (torch.tensor(pts_batch).transpose(1, 2).to(device),
            torch.tensor(label_batch).to(device))


train_loss, val_loss = [], []
epochs = 100
start = time.time()  # overall wall-clock start (see final print)
now = time.time()    # per-report interval timer
print('training start!')
for epoch in range(epochs):
    running_loss = 0.0
    for bidx, trainb32 in enumerate(train_loader):
        bpts, blabel = _load_batch(train_path, trainb32)
        net_input = data_aug(bpts).to(device)  # renamed: don't shadow builtin `input`
        optimizer.zero_grad()
        y_pred, feat_stn, glob_feat = model(net_input)
        # NLL classification loss + orthogonality regularizer on the feature STN.
        loss = F.nll_loss(y_pred, blabel) + 0.001 * regularizer(feat_stn)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Validate and report on every 10th training batch.
        if bidx % 10 == 9:
            vrunning_loss = 0.0
            vacc = 0.0
            model.eval()
            with torch.no_grad():
                for testb32 in test_loader:
                    vpts, vlabel = _load_batch(test_path, testb32)
                    vy_pred, vfeat_stn, vglob_feat = model(data_aug(vpts).to(device))
                    vloss = (F.nll_loss(vy_pred, vlabel)
                             + 0.001 * regularizer(vfeat_stn))
                    vy_max = vy_pred.argmax(dim=1)
                    # BUG FIX: divide by the *actual* batch size via .mean(),
                    # not the fixed `batch` — the last batch may be smaller.
                    vacc += (vy_max == vlabel).float().mean().item()
                    # .item() so we accumulate a plain float, not a tensor.
                    vrunning_loss += vloss.item()
            # BUG FIX: average the stored train loss over the same 10 batches
            # it was accumulated over (the original divided by len(train_loader),
            # disagreeing with the printed value).
            train_loss.append(running_loss / 10)
            val_loss.append(vrunning_loss / len(test_loader))
            print(f"Epoch {epoch+1}/{epochs} {bidx}/{len(train_loader)}.. "
                  f"Train loss: {running_loss / 10:.3f}.."
                  f"Val loss: {vrunning_loss / len(test_loader):.3f}.."
                  f"Val Accuracy: {vacc/len(test_loader):.3f}.."
                  f"Time: {time.time() - now}")
            now = time.time()
            running_loss = 0.0
            model.train()  # back to training mode after eval
    # BUG FIX: the StepLR scheduler was constructed but never stepped, so the
    # learning rate never decayed.  Step it once per epoch.
    scheduler.step()

# BUG FIX: `now` is reset at every report, so `time.time() - now` was only the
# time since the last report; use the separate `start` timestamp instead.
print(f'training finish! training time is {time.time() - start}')
savePath = '/content/modelpath.pth'
torch.save(model.state_dict(), savePath)
抱歉问这么基础的问题。如果这段训练代码没有错误,请告诉我;如果有错误,请给我一些解决问题的提示。
我实现的是 PointNet,完整代码可在 https://github.com/RaraKim/PointNet/blob/master/PointNet_pytorch.ipynb 查看。
谢谢!