I implemented a policy-gradient method to learn an unknown function (here, a 10-loop summation function), but the model does not update. The training data consists of inputs and targets. func_mulsum wraps the MLP model used to predict the target number. The code is below:
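
For context, the "unknown" function that the data-generation code below implements simply adds sum(range(n_loop)) to the input (i.e. x + 45 for n_loop = 10). A plain-Python sketch of the target mapping:

def target(x, n_loop=10):
    # add 0 + 1 + ... + (n_loop - 1) to the input
    ans = x
    for j in range(n_loop):
        ans += j
    return ans  # equals x + 45 when n_loop == 10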

import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as opt

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1, 8)
        self.fc2 = nn.Linear(8, 8)
        self.fc3 = nn.Linear(8, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))  # final sigmoid squashes the output into (0, 1)
        return x

def assertEqual(label, pre_number):
    # negative squared error between target and prediction, used as the reward
    reward = -torch.sum((label - pre_number).pow(2))
    return reward


def func_mulsum(model, s, scalar, n_loop):
    # feed the model's output back into itself n_loop times; scalar is currently unused
    c = s
    for i in range(n_loop):
        c = model(c)
    return c

def train_test_data(n_loop):
    nums = 100
    rate = 0.7
    train_data = np.zeros((int(nums * rate), 2))
    test_data = np.zeros((int(nums * (1 - rate)), 2))
    data = random.sample(range(nums), nums)  # shuffled integers 0..99

    train = data[:int(nums * rate)]
    test = data[int(nums * rate):]

    train_data[:, 0] = train
    test_data[:, 0] = test

    # target = input + 0 + 1 + ... + (n_loop - 1)
    for i, ans in enumerate(train):
        for j in range(n_loop):
            ans += j
        train_data[i, 1] = ans

    for i, ans1 in enumerate(test):
        for j in range(n_loop):
            ans1 += j
        test_data[i, 1] = ans1
    return train_data, test_data


if __name__ == '__main__':
    n_loop = 10
    iterations = 10
    learn_data, test_case = train_test_data(n_loop)
    model = MLP()
    optim = opt.SGD(model.parameters(), lr=0.05)

    learn_data = torch.FloatTensor(learn_data)  # convert once, outside the training loop

    for i in range(iterations):
        reward_sum = 0

        for j, data in enumerate(learn_data[:, 0]):
            data = data.unsqueeze(0)

            label = learn_data[j, 1]
            optim.zero_grad()
            pre = func_mulsum(model, data, 255, n_loop)

            # treat the prediction as a sample from an almost deterministic Gaussian policy
            p_norm = torch.normal(pre, std=0.000000001)
            reward = assertEqual(p_norm, label)

            # print(p_norm, label)
            # REINFORCE-style surrogate: reward times negative log "probability"
            loss = reward * -torch.log(p_norm)

            loss.backward()

            reward_sum += loss.item()
            optim.step()
            # print the last parameter to see whether the weights change
            print(list(model.parameters())[-1])

        print('reward_mean............................', reward_sum)

I cannot figure out why the gradients are all 0. This problem has had me stuck for 2 days. Can anyone help me?
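
For reference, this is how I check the gradients, right after loss.backward() and before optim.step() (a minimal sketch using PyTorch's standard named_parameters()):

for name, p in model.named_parameters():
    # p.grad is None if the parameter never entered the graph,
    # and an all-zero tensor if the gradient genuinely vanished
    print(name, None if p.grad is None else p.grad.abs().max().item())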
