I have trained an RL agent using the DQN algorithm. After 20,000 episodes my rewards converged. Now, when I test this agent, it always takes the same action, irrespective of the state. I find this strange. Can someone help me with this? Is there any reason anyone can think of why the agent would behave this way?
Rewards plot:
When I test the agent:
state = env.reset()
print('State: ', state)
state_encod = np.reshape(state, [1, state_size])
q_values = model.predict(state_encod)
action_key = np.argmax(q_values)
print(action_key)
print(index_to_action_mapping[action_key])
print(q_values[0][0])
print(q_values[0][action_key])
q_values_plotting = []
for i in range(0, action_size):
    q_values_plotting.append(q_values[0][i])
plt.plot(np.arange(0, action_size), q_values_plotting)
Every time it gives the same q_values plot, even though the state it is initialised with is different every time. Below is the q_values plot.
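To show what I mean, here is a minimal check (a sketch only, assuming the same model, env, state_size and index_to_action_mapping objects as above): reset the environment several times and compare the predicted Q-values and the greedy action across the different initial states.

# Minimal sketch: compare predictions across several random initial states.
# Assumes model, env, state_size and index_to_action_mapping are in scope.
import numpy as np

for trial in range(5):
    state = env.reset()                      # a different random initial state each time
    state_encod = np.reshape(state, [1, state_size])
    q_values = model.predict(state_encod)[0]
    action_key = int(np.argmax(q_values))
    print('Trial', trial,
          '| state:', np.round(state, 3),
          '| greedy action:', index_to_action_mapping[action_key],
          '| Q-values:', np.round(q_values, 4))

In my case the printed Q-values and the greedy action come out identical for every trial.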
Testing:
Code:
from copy import deepcopy

test_rewards = []
for episode in range(1000):
    terminal_state = False
    state = env.reset()
    episode_reward = 0
    while not terminal_state:
        print('State: ', state)
        state_encod = np.reshape(state, [1, state_size])
        q_values = model.predict(state_encod)
        action_key = np.argmax(q_values)
        action = index_to_action_mapping[action_key]
        print('Action: ', action)
        next_state, reward, terminal_state = env.step(state, action)
        print('Next_state: ', next_state)
        print('Reward: ', reward)
        print('Terminal_state: ', terminal_state, '\n')
        print('----------------------------')
        episode_reward += reward
        state = deepcopy(next_state)
    print('Episode Reward: ' + str(episode_reward))
    test_rewards.append(episode_reward)
plt.plot(test_rewards)
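For reference, a small variation of the loop above (the action_counts counter is only illustrative, not part of my actual script) tallies how often each greedy action is chosen during testing; a single dominant entry makes the "always the same action" behaviour easy to see:

# Illustrative only: count how often each greedy action is picked at test time.
from collections import Counter
from copy import deepcopy

action_counts = Counter()
for episode in range(100):
    terminal_state = False
    state = env.reset()
    while not terminal_state:
        state_encod = np.reshape(state, [1, state_size])
        q_values = model.predict(state_encod)
        action_key = int(np.argmax(q_values))
        action_counts[action_key] += 1
        next_state, reward, terminal_state = env.step(state, index_to_action_mapping[action_key])
        state = deepcopy(next_state)
print(action_counts)   # one dominant key means the policy has collapsed onto a single action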
Thank you.
Adding the environment:
import gym
import rom_vav_150mm_polyreg as rom
import numpy as np
import random
class VAVenv(gym.Env):
    def __init__(self):
        # Zone temperature set point and limits
        self.temp_sp = 24
        self.temp_sp_max = 24.5
        self.temp_sp_min = 23.7
        # no. of hours in an episode and time interval for each step
        self.MAXSTEPS = 11
        self.time_interval = 5. / 60.  # in hrs
        # constants
        self.zone_volume = 775

    def step(self, state, action):
        # state  -> Time, Volume, Load, SAT, RAT
        # action -> CFM
        action_cfm = action[0]
        # damper_opening = state[2]
        load = state[2]
        sat = state[3]
        current_temp = state[4]
        # input
        inputs_rat = np.array([load, action_cfm, self.zone_volume, current_temp, sat])
        '''
        AFTER 5 MINUTES
        '''
        # output
        output = [self.KStep + self.time_interval, self.zone_volume,
                  rom.load(self.KStep + self.time_interval),
                  sat, rom.rat(inputs_rat)]
        # reward calculation
        thermal_coefficient = -0.1
        zone_temperature = output[4]
        if zone_temperature < self.temp_sp_min:
            temp_penalty = self.temp_sp_min - zone_temperature
        elif zone_temperature > self.temp_sp_max:
            temp_penalty = zone_temperature - self.temp_sp_max
        else:
            temp_penalty = -10
        reward = thermal_coefficient * temp_penalty
        # create next state
        next_state = np.array(output)
        # increment simulation step count
        self.KStep += self.time_interval
        # done - end of one episode, when KStep reaches the maximum steps in an episode
        done = False
        if self.KStep > self.MAXSTEPS:
            done = True
        return next_state, reward, done

    def reset(self):
        self.KStep = 0
        # initialize all the values of a state
        initial_rat = random.uniform(23, 27)
        initial_sat = random.uniform(12, 14)
        # return a state
        return np.array([self.KStep, self.zone_volume,
                         rom.load(self.KStep), initial_sat, initial_rat])
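To clarify how the reward in step() behaves: it is flat inside the [temp_sp_min, temp_sp_max] band (thermal_coefficient * -10 = +1.0) and a small negative value proportional to the violation outside it. Below is a stand-alone sketch of that same logic, using only the constants from the class above:

# Worked example of the reward in VAVenv.step(), using temp_sp_min = 23.7,
# temp_sp_max = 24.5 and thermal_coefficient = -0.1 from the class above.
def example_reward(zone_temperature,
                   temp_sp_min=23.7, temp_sp_max=24.5, thermal_coefficient=-0.1):
    if zone_temperature < temp_sp_min:
        temp_penalty = temp_sp_min - zone_temperature   # degrees below the band
    elif zone_temperature > temp_sp_max:
        temp_penalty = zone_temperature - temp_sp_max   # degrees above the band
    else:
        temp_penalty = -10                              # inside the band
    return thermal_coefficient * temp_penalty

print(example_reward(23.0))   # 0.7 below the band -> reward -0.07
print(example_reward(24.0))   # inside the band    -> reward  1.0
print(example_reward(25.0))   # 0.5 above the band -> reward -0.05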