我正在尝试使用一个类似 ImageNet 的数据集从头开始训练 ResNet-50,并编写了如下训练循环:
def _accuracy(correct, seen):
    """Return ``correct / seen`` as a 0-dim float tensor (0.0 if nothing was seen)."""
    # ``correct`` is still the plain int 0 when the loader yielded no batches;
    # ``as_tensor`` lets both the int and the tensor case go through one path.
    return torch.as_tensor(correct).float() / max(seen, 1)


def train_network(epochs, train_loader, val_loader, optimizer, network):
    """Train ``network`` and restore the weights with the best validation accuracy.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free evaluation pass over ``val_loader``. Whenever the validation
    accuracy improves, a deep copy of the model's ``state_dict`` is kept; the
    best copy is loaded back into ``network`` before returning.

    NOTE(review): this function relies on the module-level names ``device``
    and ``loss`` which are defined outside this block -- presumably a
    ``torch.device`` and a classification criterion; confirm at the call site.

    Args:
        epochs: Number of passes over the training data.
        train_loader: Iterable yielding ``(inputs, targets)`` training batches.
        val_loader: Iterable yielding ``(inputs, targets)`` validation batches.
        optimizer: Optimizer already bound to ``network``'s parameters.
        network: Model to train; predictions are taken as the argmax over
            dimension 1 of its output.

    Returns:
        A tuple ``(network, train_acc_history, val_acc_history)`` where the
        histories hold one 0-dim float tensor (accuracy in ``[0, 1]``) per
        epoch. ``network`` is left in train mode with the best weights loaded.
    """
    since = time.time()
    train_acc_history = []
    val_acc_history = []

    best_model_weights = copy.deepcopy(network.state_dict())
    best_accuracy = 0.0

    for epoch in range(epochs):
        # Do not rely on the caller's mode: a model passed in after .eval()
        # would otherwise train its first epoch with frozen BatchNorm
        # statistics and disabled dropout.
        network.train()

        correct_train = 0
        seen_train = 0
        for x, t in train_loader:
            x = x.to(device)
            t = t.to(device)

            optimizer.zero_grad()
            z = network(x)
            J = loss(z, t)
            J.backward()
            optimizer.step()

            y = z.argmax(dim=1)
            correct_train += (y == t).sum()
            seen_train += t.size(0)

        correct_val = 0
        seen_val = 0
        network.eval()
        with torch.no_grad():
            for x_val, t_val in val_loader:
                x_val = x_val.to(device)
                t_val = t_val.to(device)

                y_val = network(x_val).argmax(dim=1)
                correct_val += (y_val == t_val).sum()
                seen_val += t_val.size(0)
        # Leave the model in train mode after validation, as before.
        network.train()

        # Divide by the samples actually processed rather than
        # len(loader.dataset): the two differ when the loader uses a sampler
        # or drop_last, which would silently under-report accuracy.
        train_accuracy = _accuracy(correct_train, seen_train)
        val_accuracy = _accuracy(correct_val, seen_val)

        print(
            f"Epoch: {epoch + 1} train_accuracy: {(train_accuracy.item() * 100):.3f}% val_accuracy: {(val_accuracy.item() * 100):.3f}%",
            flush=True)

        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_model_weights = copy.deepcopy(network.state_dict())

        train_acc_history.append(train_accuracy)
        val_acc_history.append(val_accuracy)

    print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    # '{:.3f}' (three decimals) was intended; '{:3f}' is a minimum width of 3.
    print('Best Validation Accuracy: {:.3f}'.format(float(best_accuracy) * 100))

    network.load_state_dict(best_model_weights)
    return network, train_acc_history, val_acc_history
但是,训练准确率和验证准确率都极低,如下所示:
> Epoch: 1 train_accuracy: 3.573% val_accuracy: 3.481%
> Epoch: 2 train_accuracy: 3.414% val_accuracy: 3.273%
> Epoch: 3 train_accuracy: 3.515% val_accuracy: 4.039%
> Epoch: 4 train_accuracy: 3.567% val_accuracy: 4.195%
在谷歌上搜索后,我发现从头开始训练时,准确率通常不会这么差(实际上一般从 40%–50% 左右起步)。我很难看出问题可能出在哪里。如果有人能帮我找出可能出错的地方,那就太好了。
谢谢