我使用以下代码在 Unity 中实现了 Q-learning:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using UnityEngine;
namespace QLearner
{
    /// <summary>
    /// Tabular Q-learning agent for a two-component state that is snapped to a
    /// grid of width <c>simInterval</c> before being used as a table key.
    /// </summary>
    /// <remarks>
    /// Typical use: call <see cref="main"/> once per simulation step with the
    /// state the environment is now in and the reward earned by the previously
    /// returned action; the return value is the index of the action to take next.
    /// </remarks>
    public class QLearnerScript
    {
        // Parallel tables: row i of QActions holds the action values for the
        // state stored in row i of QStates. They must always have equal length.
        private readonly List<float[]> QStates;   // discretised states seen so far
        private readonly List<float[]> QActions;  // one value per action, per state

        private float[] initialState;      // discretised state the last action was chosen in
        private int initialActionIndex;    // action index returned by the last call to main
        private float[] outcomeState;      // discretised state the last action led to
        private float outcomeActionValue;  // best known action value in outcomeState (0 if unseen)
        private bool firstIteration;       // true until main has chosen its first action

        private readonly int possibleActions;
        private readonly float learningRate;    // denoted by alpha, between 0 and 1
        private readonly float discountFactor;  // denoted by gamma
        private readonly float simInterval;     // grid width used to discretise states

        // Fully qualified on purpose: files that import UnityEngine would
        // otherwise find the name Random ambiguous.
        private readonly System.Random r = new System.Random();

        /// <summary>Creates an agent with an empty Q-table.</summary>
        /// <param name="possActs">Number of discrete actions available in every state.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="possActs"/> is zero or negative.
        /// </exception>
        public QLearnerScript(int possActs)
        {
            if (possActs <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(possActs), "At least one action is required.");
            }

            QStates = new List<float[]>();
            QActions = new List<float[]>();
            possibleActions = possActs;
            learningRate = .5f; // Between 0 and 1
            discountFactor = 1f;
            simInterval = 1f;
            firstIteration = true;
        }

        /// <summary>
        /// Learns from the transition that ended in <paramref name="currentState"/>
        /// and then selects the next action.
        /// </summary>
        /// <param name="currentState">Raw state; only the first two components are used.</param>
        /// <param name="reward">Reward received for the previously returned action.</param>
        /// <returns>
        /// The index of the highest-valued action when the discretised state is
        /// already in the table (first index on ties); otherwise a uniformly
        /// random action index, after the state has been added to the table.
        /// </returns>
        public int main(float[] currentState, float reward)
        {
            // Update the value of the previous (state, action) pair before
            // the "previous" state is overwritten below.
            QLearning(currentState, reward);

            initialState = Discretise(currentState, nameof(currentState));
            firstIteration = false;

            int row = FindStateRow(initialState);
            if (row >= 0)
            {
                float[] actions = QActions[row];
                initialActionIndex = Array.IndexOf(actions, MaxFloat(actions));
                return initialActionIndex;
            }

            // Unseen state: register it in BOTH tables so their rows stay aligned.
            // A freshly allocated float[] is already filled with zeros.
            QStates.Add(initialState);
            QActions.Add(new float[possibleActions]);

            initialActionIndex = r.Next(0, possibleActions);
            return initialActionIndex;
        }

        /// <summary>
        /// Applies one Q-learning update to the value of the most recently
        /// chosen action:
        /// <c>Q(s,a) += alpha * (reward + gamma * max Q(s',·) - Q(s,a))</c>.
        /// </summary>
        /// <param name="outcomeStateFeed">Raw state the previous action led to (s').</param>
        /// <param name="reward">Reward received for the previous action.</param>
        /// <remarks>
        /// Does nothing to the table until <see cref="main"/> has chosen a first
        /// action. An outcome state that is not in the table yet contributes a
        /// future value of zero.
        /// </remarks>
        public void QLearning(float[] outcomeStateFeed, float reward)
        {
            outcomeState = Discretise(outcomeStateFeed, nameof(outcomeStateFeed));

            if (firstIteration)
            {
                // No previous (state, action) pair exists yet, so there is nothing to update.
                return;
            }

            // Read the bootstrap value BEFORE modifying the table, in case the
            // outcome state and the previous state share the same row.
            int outcomeRow = FindStateRow(outcomeState);
            outcomeActionValue = outcomeRow >= 0 ? MaxFloat(QActions[outcomeRow]) : 0f;

            int previousRow = FindStateRow(initialState);
            if (previousRow < 0)
            {
                return;
            }

            float[] values = QActions[previousRow];
            float target = reward + discountFactor * outcomeActionValue;
            values[initialActionIndex] += learningRate * (target - values[initialActionIndex]);
        }

        /// <summary>Returns the number of distinct discretised states stored in the Q-table.</summary>
        public int getQtableCount()
        {
            return QStates.Count;
        }

        /// <summary>Returns the largest element of a non-empty array.</summary>
        float MaxFloat(float[] numbers)
        {
            float max = numbers[0];
            for (int i = 1; i < numbers.Length; i++)
            {
                if (max < numbers[i])
                {
                    max = numbers[i];
                }
            }
            return max;
        }

        /// <summary>
        /// Returns the table row holding <paramref name="state"/>, or -1 when the
        /// state has not been stored.
        /// </summary>
        private int FindStateRow(float[] state)
        {
            for (int i = 0; i < QStates.Count; i++)
            {
                float[] known = QStates[i];
                // Exact comparison: both sides come out of the same
                // discretisation, so equal grid cells yield identical floats.
                if (known[0] == state[0] && known[1] == state[1])
                {
                    return i;
                }
            }
            return -1;
        }

        /// <summary>
        /// Snaps the first two components of a raw state to the nearest multiple
        /// of <c>simInterval</c> (Math.Round's default midpoint handling applies).
        /// </summary>
        private float[] Discretise(float[] rawState, string paramName)
        {
            if (rawState == null)
            {
                throw new ArgumentNullException(paramName);
            }
            if (rawState.Length < 2)
            {
                throw new ArgumentException("State must contain at least two components.", paramName);
            }

            return new float[2] { Snap(rawState[0]), Snap(rawState[1]) };
        }

        /// <summary>Rounds one state component to the nearest grid point.</summary>
        private float Snap(float value)
        {
            return (float)Math.Round((double)value / simInterval) * simInterval;
        }
    }
}
这适用于我的环境。但是,我也在尝试实现 SARSA 来测试这两种算法。我知道 Q-learning 是 off-policy,而 SARSA 是 on-policy,这意味着我必须实现一个策略来获得下一个动作,而不是简单地调用