我正在尝试用 Python 实现马尔可夫决策过程(MDP)的值迭代算法。我已经有了一个实现,但它算出的效用值(utility)中有很多是重复的。我的转移矩阵非常稀疏,问题可能就出在这里,不过我不确定这个猜测是否正确。我应该如何修正?代码可能写得比较粗糙,我对值迭代还很陌生,所以请帮我找出代码中存在的问题。我参考的代码在这里:http://carlo-hamalainen.net/stuff/mdpnotes/ ,使用的是其中的 ipod_mdp.py 代码文件。下面是我的实现片段:
# Value iteration for a cost-minimising MDP, written as a flat script.
#
# Relies on names defined elsewhere in the file:
#   state_index   - maps a state (and, below, an action) to an integer index
#   actions       - sequence of available actions
#   actions_index - maps an action to its index in `cost`
#   cost          - indexed as cost[s, action_index]
#   transitions   - container keyed by (s, index, w) holding the non-zero
#                   transition probabilities
#
# After the loop finishes, `V1` holds the most recent value estimates and
# `policy[s]` the index of the minimising action found for state `s`.

DISCOUNT = 0.9                   # discount factor applied to future cost
MISSING_TRANSITION_PROB = 0.001  # weight used when (s, a, w) is not stored
CONVERGENCE_TOL = 0.001          # stop once no value moved more than this

num_of_states = 470  # total number of states

V1 = [0.25] * num_of_states
get_target_index = state_index[(u'48.137654', u'11.579949')]  # each state is a location
# NOTE(review): this only seeds the starting estimate for the target state;
# the sweep below overwrites it like any other state. If the target is meant
# to stay the cheapest state, that presumably has to come from `cost` --
# confirm.
V1[get_target_index] = -100  # assigning least cost to the target state
V2 = [0.0] * num_of_states
policy = [0.0] * num_of_states
count = 0.0

if not actions:
    raise ValueError("actions must not be empty")

while True:
    # Largest change between the two most recent sweeps, computed once.
    max_delta = max(abs(new - old) for new, old in zip(V1, V2))
    if max_delta <= CONVERGENCE_TOL:
        break
    print(max_delta)
    print(count)

    for s in range(num_of_states):  # for each state
        min_action = None
        min_action_cost = None

        for a in actions:
            # NOTE(review): the cost lookup uses actions_index[a] while the
            # transition lookup uses state_index[a]; kept as in the original,
            # but verify that both really refer to the same action index.
            transition_action = state_index[a]
            this_cost = cost[s, actions_index[a]]

            for w in range(num_of_states):
                key = (s, transition_action, w)
                if key in transitions:
                    # transition exists in the matrix - non-zero value
                    this_cost += DISCOUNT * transitions[key] * V1[w]
                else:
                    # Not stored: use a small probability instead of 0.0.
                    # NOTE(review): this adds probability mass without
                    # renormalising, so a row's weights can total more than
                    # 1 -- confirm that this smoothing is intended.
                    this_cost += DISCOUNT * MISSING_TRANSITION_PROB * V1[w]

            # Strict '<' keeps the first action on ties, as before.
            if min_action_cost is None or this_cost < min_action_cost:
                min_action = actions_index[a]
                min_action_cost = this_cost

        V2[s] = min_action_cost
        policy[s] = min_action

    V1, V2 = V2, V1  # swap: V1 is now the newest sweep, V2 the previous one
    count += 1