我正在尝试使用 Python 为马尔可夫决策过程(库存问题)编写一个小程序。我无法弄清楚为什么程序会输出两个相同的矩阵(用于利润和决策矩阵)。编程本身也有一些问题,因为最后两列都是零,这是不应该发生的。对程序本身的任何帮助也将非常有帮助!
import math
import numpy as np
def salvageCost(b):
return 5 * b
def orderingCost(b):
if b == 0:
return 0
else:
return 4 + 2 * b
def holdingCost(b):
return 1.5 * b
def revenue(b):
return 8 * b
M = 10
N = 4
u = np.zeros((M+1,N))
T = np.array([4,3,2,1])
S = np.array(range(M+1))
A = np.array(range(M+1))
u[:,0] = S*5
d = np.zeros((11,4))
probs = np.array([0.05, 0.1, 0.15, 0.2, 0.2, 0.15, 0.1, 0.05, 0, 0, 0])
demands = np.array(range(11))
candidate = [0]*11
d = u
for i in T[1:N]:
for j in S:
for a in range(M-j+1):
candidate[a] = -holdingCost(j) - orderingCost(a) \
+ np.array([probs[k]for k in range(j+a)]).dot(8*np.array([demands[k]for k in range(j+a)])) \
+ np.array([probs[p] for p in range(min(j+a+1,M+1),M+1)]).dot(np.array(range(min(j+a+1,M+1),M+1)))*80\
+ probs.dot(u[:,i-1])
u[j,i] = max(candidate)
d[j,i] = candidate.index(max(candidate))
print(d)
print(u)
这是输出
[[ 0. 0. 0. 0.]
[ 5. 0. 0. 0.]
[10. 0. 0. 0.]
[15. 0. 0. 0.]
[20. 0. 0. 0.]
[25. 0. 0. 0.]
[30. 0. 0. 0.]
[35. 0. 0. 0.]
[40. 0. 0. 0.]
[45. 0. 0. 0.]
[50. 0. 0. 0.]]
[[ 0. 0. 0. 0.]
[ 5. 0. 0. 0.]
[10. 0. 0. 0.]
[15. 0. 0. 0.]
[20. 0. 0. 0.]
[25. 0. 0. 0.]
[30. 0. 0. 0.]
[35. 0. 0. 0.]
[40. 0. 0. 0.]
[45. 0. 0. 0.]
[50. 0. 0. 0.]]