
I have already experimented with this in stable_baselines with good results, and I have been wanting to try it in stable_baselines3.

I am using an A2C model to train on a stock environment, with a custom environment that currently works under stable_baselines. I have seen instability during training and would like to migrate to stable_baselines3 just in case.

import time

import config  # project-local settings module (TRAINED_MODEL_DIR, PREV_DATA_POINTS)
from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy
from stable_baselines3.common.sb2_compat.rmsprop_tf_like import RMSpropTFLike

def train_A2C(env_train, model_name, timesteps=50000, i=0):
    start = time.time()
    # policy_kwargs = dict(net_arch=[128, 128])
    # TF1-style RMSprop for closer parity with stable_baselines
    policy_kwargs = dict(optimizer_class=RMSpropTFLike)
    model = A2C(MlpPolicy, env_train, verbose=1, tensorboard_log='./tensorboard/tensorboard_A2C/',
                learning_rate=0.0001, vf_coef=0.05, ent_coef=0.005, policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=timesteps, tb_log_name=f"A2C_{i}")
    end = time.time()

    model.save(f'{config.TRAINED_MODEL_DIR}/{model_name}')
    print(f'Training Time A2C : {(end - start) / 60} minutes')
    return model
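
For context, this is roughly how I invoke it (a minimal sketch; the CSV path and model name here are placeholders for my actual setup):

import pandas as pd

from stable_baselines3.common.vec_env import DummyVecEnv

df = pd.read_csv('data/train.csv')                      # hypothetical path to the training data
env_train = DummyVecEnv([lambda: StockEnv_Train(df)])   # wrap the custom env for SB3
model = train_A2C(env_train, model_name='a2c_stock', timesteps=50000, i=0)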

The custom environment is below. When debugging, the error pops up immediately after the first step in the environment, so it may be an issue with the return values. I did change the return type of the state to np.array, and that did not solve the problem. (The smoke test I run while debugging is sketched after the class.)

import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gym import spaces
from gym.utils import seeding

STOCK_DIM = 422  # INITIAL_BALANCE, Amount_of_shares, 60 days * (Norm_close, 1m/2m/3m/1y ret, macd, rsi)
INITIAL_ACCOUNT_BALANCE = 1000000
TRANSACTION_FEE_PERCENT = 0.01
REWARD_SCALING = 1e-5
HMAX_NORMALIZE = 100
N = config.PREV_DATA_POINTS


class StockEnv_Train(gym.Env):
    def __init__(self, df, day=374):
        # day defaults to 374 since we need to account for the annual returns
        # that only start around day 252 (macd only starts from day 254),
        # plus 63 further days to cover the historical look-back data
        self.day = day
        self.df = df
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,))
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(STOCK_DIM,))
        self.terminal = False

        self.state = [INITIAL_ACCOUNT_BALANCE] + [0] + self.df.iloc[self.day-N:self.day,1].tolist() + \
                        self.df.iloc[self.day-N:self.day,2].tolist() + self.df.iloc[self.day-N:self.day,3].tolist() + \
                        self.df.iloc[self.day-N:self.day,4].tolist() + self.df.iloc[self.day-N:self.day,5].tolist() + \
                        self.df.iloc[self.day-N:self.day,6].tolist() + self.df.iloc[self.day-N:self.day,7].tolist()

        self.reward = 0
        self.asset_memory = [INITIAL_ACCOUNT_BALANCE]
        self.cost = 0
        self.rewards_memory = []
        self.trades = 0

        self._seed()

    
    def _sell_stock(self, action):
        # only sell if we actually hold shares
        if self.state[1] > 0:
            self.state[0] += (self.df.iloc[self.day, -1] * min(abs(action), self.state[1]) *
                              (1 - TRANSACTION_FEE_PERCENT)).item()
            self.cost += self.df.iloc[self.day, -1] * min(abs(action), self.state[1]) * \
                         TRANSACTION_FEE_PERCENT
            self.state[1] -= min(abs(action), self.state[1])
            self.trades += 1

    def _buy_stock(self, action):
        # perform buy action based on the sign of the action
        available_amount = self.state[0] // self.df.iloc[self.day, -1]
        # print('available_amount:{}'.format(available_amount))

        # update balance
        self.state[0] -= (self.df.iloc[self.day, -1] * min(available_amount, action) *
                          (1 + TRANSACTION_FEE_PERCENT)).item()

        self.state[1] += min(available_amount, action)

        self.cost += self.df.iloc[self.day, -1] * min(available_amount, action) * \
                     TRANSACTION_FEE_PERCENT
        self.trades += 1

    def step(self,action):
        self.terminal = self.day >= len(self.df.Date.unique()) - 1

        if self.terminal:
            plt.plot(self.asset_memory,'r')
            plt.savefig('results/account_value_train.png')
            plt.close()

            df_total_value = pd.DataFrame(self.asset_memory)
            df_total_value.to_csv('results/account_value_train.csv')

            df_total_value.columns = ['account_value']
            df_total_value['daily_return'] = df_total_value.pct_change(1)
            
            df_rewards = pd.DataFrame(self.rewards_memory)
            
            return self.state, self.reward*REWARD_SCALING,self.terminal, {}

        else:
            # print(np.array(self.state[1:29]))

            action = action * HMAX_NORMALIZE
            # actions = (actions.astype(int))

            begin_total_asset = self.state[0] + self.state[1] * self.df.iloc[self.day, -1]

            if action < 0:
                self._sell_stock(action)
            else:
                self._buy_stock(action)

            self.day += 1        
            #load next state
            # print("stock_shares:{}".format(self.state[29:]))

            self.state = [self.state[0]] + [self.state[1]] + self.df.iloc[self.day-N:self.day,1].tolist() + \
                        self.df.iloc[self.day-N:self.day,2].tolist() + self.df.iloc[self.day-N:self.day,3].tolist() + \
                        self.df.iloc[self.day-N:self.day,4].tolist() + self.df.iloc[self.day-N:self.day,5].tolist() + \
                        self.df.iloc[self.day-N:self.day,6].tolist() + self.df.iloc[self.day-N:self.day,7].tolist()
            
            end_total_asset = self.state[0] + self.state[1] * self.df.iloc[self.day, -1]

            try:
                self.asset_memory.append(end_total_asset.item())
            except AttributeError:
                # end_total_asset is already a plain Python float
                self.asset_memory.append(end_total_asset)

            #print("end_total_asset:{}".format(end_total_asset))
            
            self.reward = end_total_asset - begin_total_asset            
            self.rewards_memory.append(self.reward)
            self.reward = self.reward*REWARD_SCALING
            
        return np.array([self.state]),np.array([self.reward]),self.terminal,{}

    
    def reset(self):
        self.asset_memory = [INITIAL_ACCOUNT_BALANCE]
        self.day = 374
        self.cost = 0
        self.trades = 0
        self.terminal = False
        self.rewards_memory = []

        self.state = [INITIAL_ACCOUNT_BALANCE] + [0] + self.df.iloc[self.day-N:self.day,1].tolist() + \
                        self.df.iloc[self.day-N:self.day,2].tolist() + self.df.iloc[self.day-N:self.day,3].tolist() + \
                        self.df.iloc[self.day-N:self.day,4].tolist() + self.df.iloc[self.day-N:self.day,5].tolist() + \
                        self.df.iloc[self.day-N:self.day,6].tolist() + self.df.iloc[self.day-N:self.day,7].tolist()
        return self.state

    def render(self,mode='human'):
        return self.state

    def _seed(self,seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
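
For reference, this is the smoke test I run while debugging (a minimal sketch; df is the same DataFrame used to build the environment, and check_env comes from SB3's env_checker):

from stable_baselines3.common.env_checker import check_env

env = StockEnv_Train(df)
check_env(env)   # validates reset()/step() return types against the Gym API

obs = env.reset()
print(type(obs))   # SB3 expects an np.ndarray matching observation_space

obs, reward, done, info = env.step(np.array([0.5]))
print(type(obs), type(reward), type(info))   # info must be a plain dict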

The error I get occurs during model.learn:

Exception has occurred: AttributeError: 'list' object has no attribute 'get'
