在强化学习中,一个典型的例子是有 风的网格世界
我面临着一个新变化的风网格世界,它还有一堵墙和随机风,我被这两个新东西困住了
图 1 显示了一个标准的网格世界,具有起始 (S) 和目标 (G) 单元格,但有两个不同之处:代理无法越过一堵墙(由黑色单元格表示),以及向下和向左的侧风在网格的右边缘。每个单元格中可用的动作是国王的举动| 每个单元总共有 8 个动作。如果有任何动作会将您带到网格世界之外或与墙壁发生碰撞,您最终会进入最近的单元格(例如,在左上角的单元格中向东北移动会将您带到右侧的一个单元格)。在右侧区域,生成的下一个单元格由随机的“风”向左下移,其平均强度逐列变化。风的平均强度在每列下方给出,以向左下移的单元格数表示.
由于随机性,风有时会与每列给出的平均值相差 1(平均值为 0 时除外)。也就是说,三分之一的时间你会根据列下方指示的值准确地向下移动,三分之一的时间你会向下和向左移动一个单元格,另外三分之一的时间你会移动一个细胞小于平均值。例如,如果你在墙的一排并且在开口的中间并且你向上移动,那么有三分之一的时间你会在那个单元格以西的一排结束,三分之一的时间你会结束向西两列,在该单元格以南一列,三分之一的时间您最终在该单元格以北的同一列。风会影响你所在的牢房,而不是你要去的牢房。
用 = 0:1, = 0:9 和初始 Q(s; a) = 0 对所有 s 实现上述问题中的 Q 学习算法 2;一个。每个动作都会产生 rs = 1 的奖励,但立即导致目标单元格 (rg = 10) 的动作除外。使用: - 贪心动作选择方法 = 0:2。初始 Q(s,a) > 0 且初始 Q(s,a) < 0 的贪心动作选择方法。
我的 matlab 代码可以工作。
我真正的问题是 函数 nextPos = GiveNextPos(curPos, actionIndex, windpowers, gridCols, gridRows),代理将在其中决定一个动作,然后进入下一步。但是影响下一步的因素很多,比如随机风墙
所以第一个问题是关于随机风的,我如何在matlab中编程说在1/3的机会中是3,在另外1/3的机会中是1......
第二个问题是关于撞墙?我是不是应该先计算国王的步行和风的下一步,然后用这个下一步的值来检查我是否撞墙???)
function WindyGridWorldQLearning()
fprintf('WindyGridWorldQLearning\n');
gamma = 0.9;
alpha = 0.1;
epsilon = 0.2;
gridcols = 10;
gridrows = 7;
windpowers = [0 0 0 0 1 1 2 2 1 1];
fontsize = 16;
showTitle = 1;
episodeCount = 900;
selectedEpisodes = [900];
isKing = 1;
canHold = 0;
start.row = 7;
start.col = 1;
goal.row = 1;
goal.col = 1;
selectedEpIndex = 1;
actionCount = 8;
% initialize Q with zeros
Q = zeros(gridrows, gridcols, actionCount);
a = 0; % an invalid action
% loop through episodes
for ei = 1:episodeCount,
%disp(sprintf('Running episode %d', ei));
curpos = start;
nextpos = start;
%epsilon or greedy
if(rand > epsilon) % greedy
[qmax, a] = max(Q(curpos.row,curpos.col,:));
else
a = IntRand(1, actionCount);
end
while(PosCmp(curpos, goal) ~= 0)
% take action a, observe r, and nextpos
nextpos = GiveNextPos(curpos, a, windpowers, gridcols, gridrows);
if(PosCmp(nextpos, goal) ~= 0), r = -1; else r = 10; end
% choose a_next from nextpos
[qmax, a_next] = max(Q(nextpos.row,nextpos.col,:));
if(rand <= epsilon) % explore
a_next = IntRand(1, actionCount);
end
% update Q:
curQ = Q(curpos.row, curpos.col, a);
nextQ = qmax; %Q(nextpos.row, nextpos.col, a_next);
Q(curpos.row, curpos.col, a) = curQ + alpha*(r + gamma*nextQ - curQ);
curpos = nextpos; a = a_next;
end % states in each episode
% if the current state of the world is going to be drawn ...
if(selectedEpIndex <= length(selectedEpisodes) && ei == selectedEpisodes(selectedEpIndex))
curpos = start;
rows = []; cols = []; acts = [];
for i = 1:(gridrows + gridcols) * 10,
[qmax, a] = max(Q(curpos.row,curpos.col,:));
nextpos = GiveNextPos(curpos, a, windpowers, gridcols, gridrows);
rows = [rows curpos.row];
cols = [cols curpos.col];
acts = [acts a];
if(PosCmp(nextpos, goal) == 0), break; end
curpos = nextpos;
end % states in each episode
%figure;
figure('Name',sprintf('Episode: %d', ei), 'NumberTitle','off');
DrawWindyEpisodeState(rows, cols, acts, start.row, start.col, goal.row, goal.col, windpowers, gridrows, gridcols, fontsize);
if(showTitle == 1),
title(sprintf('Windy grid-world SARSA - episode %d - (\\epsilon: %3.3f), (\\alpha = %3.4f), (\\gamma = %1.1f)', ei, epsilon, alpha, gamma));
end
selectedEpIndex = selectedEpIndex + 1;
end
end % episodes loop
function c = PosCmp(pos1, pos2)
c = pos1.row - pos2.row;
if(c == 0)
c = c + pos1.col - pos2.col;
end
function nextPos = GiveNextPos(curPos, actionIndex, windpowers, gridCols, gridRows)
nextPos = curPos;
switch actionIndex
case 1 % east
nextPos.col = curPos.col + 1;
case 2 % south
nextPos.row = curPos.row + 1;
if(nextPos.row ==4 && nextPos.col <= 4 ) nextPos.row = curPos.row; end
case 3 % west
nextPos.col = curPos.col - 1;
if(nextPos.row ==4 && nextPos.col <= 4 ) nextPos.col = curPos.col; end
case 4 % north
nextPos.row = curPos.row - 1;
if(nextPos.row ==4 && nextPos.col <= 4 ) nextPos.row = curPos.row; end
case 5 % northeast
nextPos.col = curPos.col + 1;
nextPos.row = curPos.row - 1;
if(nextPos.row ==4 && nextPos.col <= 4 ) nextPos.row = curPos.row; end
case 6 % southeast
nextPos.col = curPos.col + 1;
nextPos.row = curPos.row + 1;
if(nextPos.row ==4 && nextPos.col <= 4 ) nextPos.row = curPos.row; end
case 7 % southwest
nextPos.col = curPos.col - 1;
nextPos.row = curPos.row + 1;
if(nextPos.row ==4 && nextPos.col <= 4 ) nextPos.row = curPos.row; end
case 8 % northwest
nextPos.col = curPos.col - 1;
nextPos.row = curPos.row - 1;
if(nextPos.row ==4 && nextPos.col <= 4 ) nextPos.row = curPos.row; end
case 9 % hold
nextPos = curPos;
otherwise
disp(sprintf('invalid action index: %d', actionIndex))
end
if(curPos.col > 4)
nextPos.row = nextPos.row - windpowers(nextPos.col);
nextPos.col = nextPos.col - windpowers(nextPos.col);
end
if(nextPos.col <= 0), nextPos.col = 1; end
if(nextPos.col > gridCols), nextPos.col = gridCols; end
if(nextPos.row <= 0), nextPos.row = 1; end
if(nextPos.row > gridRows), nextPos.row = gridRows; end
function n = IntRand(lowerBound, upperBound)
n = floor((upperBound - lowerBound) * rand + lowerBound);
function DrawWindyEpisodeState(rows, cols, acts, SRow, SCol, GRow, GCol, windpowers, gridrows, gridcols, fontsize)
DrawGrid(gridrows, gridcols);
DrawTextOnCell('S', 0, SRow, SCol, gridrows, gridcols, fontsize);
DrawTextOnCell('G', 0, GRow, GCol, gridrows, gridcols, fontsize);
for i=1:length(rows),
DrawActionOnCell(acts(i), rows(i), cols(i), gridrows, gridcols, fontsize);
end
for i=1:gridcols,
[xc, yc] = FindColBaseCenter(i, gridrows, gridcols);
text(xc, yc, sprintf('%d',windpowers(i)), 'FontSize', fontsize, 'Rotation', 0);
end
function DrawEpisodeState(rows, cols, acts, SRow, SCol, GRow, GCol, gridrows, gridcols, fontsize)
DrawGrid(gridrows, gridcols);
DrawTextOnCell('S', 0, SRow, SCol, gridrows, gridcols, fontsize);
DrawTextOnCell('G', 0, GRow, GCol, gridrows, gridcols, fontsize);
for i=1:length(rows),
DrawActionOnCell(acts(i), rows(i), cols(i), gridrows, gridcols, fontsize);
end
function DrawGrid(gridrows, gridcols)
xsp = 1 / (gridcols + 2);
ysp = 1 / (gridrows + 2);
x = zeros(1, 2*(gridcols + 1));
y = zeros(1, 2*(gridcols + 1));
i = 1;
for xi = xsp:xsp:1 - xsp,
x(2*i - 1) = xi; x(2*i) = xi;
if(mod(i , 2) == 0)
y(2*i - 1) = ysp;y(2*i) = 1-ysp;
else
y(2*i - 1) = 1 - ysp;y(2*i) = ysp;
end
i = i + 1;
end
x2 = zeros(1, 2*(gridrows + 1));
y2 = zeros(1, 2*(gridrows + 1));
i = 1;
for yi = ysp:ysp:1 - ysp,
y2(2*i - 1) = yi; y2(2*i) = yi;
if(mod(i , 2) == 0)
x2(2*i - 1) = xsp;x2(2*i) = 1-xsp;
else
x2(2*i - 1) = 1 - xsp;x2(2*i) = xsp;
end
i = i + 1;
end
plot(x, y, '-');
hold on
plot(x2, y2, '-');
axis([0 1 0 1]);
axis off
set(gcf, 'color', 'white');
function DrawTextOnCell(theText, rotation, row, col, gridrows, gridcols, fontsize)
[xc, yc] = FindCellCenter(row, col, gridrows, gridcols);
text(xc, yc, theText, 'FontSize', fontsize, 'Rotation', rotation);
function DrawActionOnCell(actionIndex, row, col, gridrows, gridcols, fontsize)
rotation = 0;
textToDraw = 'o';
switch actionIndex
case 1 % east
textToDraw = '\rightarrow';
rotation = 0;
case 2 % south
textToDraw = '\downarrow';
rotation = 0;
case 3 % west
textToDraw = '\leftarrow';
rotation = 0;
case 4 % north
textToDraw = '\uparrow';
rotation = 0;
case 5 % northeast
textToDraw = '\rightarrow';
rotation = 45;
case 6 % southeast
textToDraw = '\downarrow';
rotation = 45;
case 7 % southwest
textToDraw = '\leftarrow';
rotation = 45;
case 8 % northwest
textToDraw = '\uparrow';
rotation = 45;
otherwise
disp(sprintf('invalid action index: %d', actionIndex))
end
DrawTextOnCell(textToDraw, rotation, row, col, gridrows, gridcols, fontsize);
function [x,y] = FindCellCenter(row, col, gridrows, gridcols)
xsp = 1 / (gridcols + 2);
ysp = 1 / (gridrows + 2);
x = ((2*col + 1) / 2) * xsp;
y = 1 - (((2*row + 1) / 2) * ysp);
x = x - xsp/5;
function [x,y] = FindColBaseCenter(col, gridrows, gridcols)
row = gridrows + 1;
xsp = 1 / (gridcols + 2);
ysp = 1 / (gridrows + 2);
x = ((2*col + 1) / 2) * xsp;
y = 1 - (((2*row + 1) / 2) * ysp);
x = x - xsp/5;