In a VBasedPolicy, the neural network approximator tries to learn the V value of a state. So its first (input) layer should have as many neurons as the state size, and I believe its last (output) layer should have size 1, since it produces a single scalar for the input state.
But when I use such a network architecture, I get a BoundsError from the trajectory. Details are below.
I have looked at the various example experiments in the library's GitHub repo, but all the BasicDQN examples use QBasedPolicy, where the last layer of the network has size equal to the number of actions. That makes sense to me, because given a state, the network has to output a Q value for every action.
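To make the contrast concrete, here is a small sketch I put together with Flux (sizes are just for illustration, not taken from the library's examples): a Q-network emits one value per action, while a V-network emits a single scalar per state.

# Minimal illustration of the two output conventions
using Flux

state_size, n_actions = 2, 7

q_model = Chain(Dense(state_size, 128, relu), Dense(128, n_actions))  # QBasedPolicy style
v_model = Chain(Dense(state_size, 128, relu), Dense(128, 1))          # VBasedPolicy style

s = rand(Float32, state_size)
size(q_model(s))   # (7,) -> one Q value per action
size(v_model(s))   # (1,) -> a single state value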
I went through the code on GitHub, and the error occurs exactly at line 79 of that page (basic_dqn.jl), but I have not been able to resolve it.
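From the stacktrace, my understanding (an assumption reconstructed from the trace, not copied from the library's source) is that line 79 selects the predicted Q value of the action actually taken for every sample in the batch, roughly like this:

# Rough reconstruction of the indexing I believe happens at basic_dqn.jl:79
# (inferred from the stacktrace, not the library's exact code)
batch_size = 50
actions = rand(1:7, batch_size)                 # actions stored in the trajectory
q = rand(Float32, 7, batch_size)                # a Q-network's output: one row per action
q[CartesianIndex.(actions, 1:batch_size)]       # works: picks Q(s, a) for each sample

v = rand(Float32, 1, batch_size)                # my V-network's output: a single row
v[CartesianIndex.(actions, 1:batch_size)]       # BoundsError, just like in the trace below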
Code for single agent policy (VBasedPolicy)
using ReinforcementLearning
using Flux

# env is my custom AdSpendEnv (definition not shown here)
STATE_SIZE = length(env.channels)   # 2
ACTION_SIZE = length(action_set)    # 7

# Value network: state in, a single scalar out
model = Chain(
    Dense(STATE_SIZE, 128, relu),
    Dense(128, 128, relu),
    Dense(128, 128, relu),
    Dense(128, 128, relu),
    Dense(128, 128, relu),
    Dense(128, 128, relu),
    Dense(128, 128, relu),
    Dense(128, 128, relu),
    Dense(128, 128, relu),
    Dense(128, 1)
) |> cpu

# Optimizer
η = 1f-3        # learning rate
η_decay = 1f-4
opt = Flux.Optimiser(ADAM(η), InvDecay(η_decay))

# Map a state to an action by evaluating V on every legal child state
function value_action_mapping(env, value_learner; explorer = EpsilonGreedyExplorer(0.4))
    A = legal_action_space(env)
    println("legal action space: ", A)
    V = map(A) do a
        value_learner(child(env, a))
    end
    println("V values: ", V)
    c = A[explorer(V)]
    println("Chosen action: ", c)
    println("Action with max V val: ", findmax(V))
    return c
end

single_agent_policy = Agent(
    policy = VBasedPolicy(;
        learner = BasicDQNLearner(;
            approximator = NeuralNetworkApproximator(;
                model = model,
                optimizer = opt
            ),
            min_replay_history = 50,
            batch_size = 50,
            γ = 0.99
        ),
        mapping = value_action_mapping
    ),
    trajectory = CircularArraySARTTrajectory(;
        capacity = 100,
        state = Array{Float64} => (STATE_SIZE,)
    )
)
Here is the error
BoundsError: attempt to access 1×50 Matrix{Float32} at index [CartesianIndex{2}[CartesianIndex(3, 1), CartesianIndex(1, 2), CartesianIndex(3, 3), CartesianIndex(4, 4), CartesianIndex(5, 5), CartesianIndex(4, 6), CartesianIndex(4, 7), CartesianIndex(3, 8), CartesianIndex(4, 9), CartesianIndex(4, 10) … CartesianIndex(5, 41), CartesianIndex(4, 42), CartesianIndex(3, 43), CartesianIndex(4, 44), CartesianIndex(5, 45), CartesianIndex(5, 46), CartesianIndex(6, 47), CartesianIndex(4, 48), CartesianIndex(4, 49), CartesianIndex(1, 50)]]
Stacktrace:
[1] throw_boundserror(A::Matrix{Float32}, I::Tuple{Vector{CartesianIndex{2}}})
@ Base .\abstractarray.jl:651
[2] checkbounds
@ .\abstractarray.jl:616 [inlined]
[3] _getindex
@ .\multidimensional.jl:831 [inlined]
[4] getindex
@ .\abstractarray.jl:1170 [inlined]
[5] adjoint
@ C:\Users\vchou\.julia\packages\Zygote\i1R8y\src\lib\array.jl:31 [inlined]
[6] _pullback(__context__::Zygote.Context, 496::typeof(getindex), x::Matrix{Float32}, inds::Vector{CartesianIndex{2}})
@ Zygote C:\Users\vchou\.julia\packages\ZygoteRules\OjfTt\src\adjoint.jl:57
[7] _pullback
@ C:\Users\vchou\.julia\packages\ReinforcementLearningZoo\M308M\src\algorithms\dqns\basic_dqn.jl:79 [inlined]
[8] _pullback(::Zygote.Context, ::ReinforcementLearningZoo.var"#52#54"{BasicDQNLearner{NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}, typeof(Flux.Losses.huber_loss), Random._GLOBAL_RNG}, Matrix{Float64}, Vector{Bool}, Vector{Float32}, Matrix{Float64}, typeof(Flux.Losses.huber_loss), Float32, NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}})
@ Zygote C:\Users\vchou\.julia\packages\Zygote\i1R8y\src\compiler\interface2.jl:0
[9] pullback(f::Function, ps::Zygote.Params)
@ Zygote C:\Users\vchou\.julia\packages\Zygote\i1R8y\src\compiler\interface.jl:250
[10] gradient(f::Function, args::Zygote.Params)
@ Zygote C:\Users\vchou\.julia\packages\Zygote\i1R8y\src\compiler\interface.jl:58
[11] update!(learner::BasicDQNLearner{NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}, typeof(Flux.Losses.huber_loss), Random._GLOBAL_RNG}, batch::NamedTuple{(:state, :action, :reward, :terminal, :next_state), Tuple{Matrix{Float64}, Vector{Int64}, Vector{Float32}, Vector{Bool}, Matrix{Float64}}})
@ ReinforcementLearningZoo C:\Users\vchou\.julia\packages\ReinforcementLearningZoo\M308M\src\algorithms\dqns\basic_dqn.jl:78
[12] update!(learner::BasicDQNLearner{NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}, typeof(Flux.Losses.huber_loss), Random._GLOBAL_RNG}, traj::CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Float64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}})
@ ReinforcementLearningZoo C:\Users\vchou\.julia\packages\ReinforcementLearningZoo\M308M\src\algorithms\dqns\basic_dqn.jl:65
[13] update!
@ C:\Users\vchou\.julia\packages\ReinforcementLearningCore\NWrFY\src\policies\q_based_policies\learners\abstract_learner.jl:35 [inlined]
[14] update!
@ C:\Users\vchou\.julia\packages\ReinforcementLearningCore\NWrFY\src\policies\v_based_policies.jl:31 [inlined]
[15] (::Agent{VBasedPolicy{BasicDQNLearner{NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}, typeof(Flux.Losses.huber_loss), Random._GLOBAL_RNG}, typeof(value_action_mapping)}, CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Float64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}})(stage::PreActStage, env::AdSpendEnv, action::Int64)
@ ReinforcementLearningCore C:\Users\vchou\.julia\packages\ReinforcementLearningCore\NWrFY\src\policies\agents\agent.jl:74
[16] _run(policy::Agent{VBasedPolicy{BasicDQNLearner{NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}, typeof(Flux.Losses.huber_loss), Random._GLOBAL_RNG}, typeof(value_action_mapping)}, CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Float64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}}, env::AdSpendEnv, stop_condition::StopAfterEpisode{ProgressMeter.Progress}, hook::ComposedHook{Tuple{RewardPerStep, ActionsPerStep, TotalRewardPerEpisode, NeuralOutputPerStep, StatePerStep}})
@ ReinforcementLearningCore C:\Users\vchou\.julia\packages\ReinforcementLearningCore\NWrFY\src\core\run.jl:28
[17] run(policy::Agent{VBasedPolicy{BasicDQNLearner{NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}, typeof(Flux.Losses.huber_loss), Random._GLOBAL_RNG}, typeof(value_action_mapping)}, CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Float64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}}, env::AdSpendEnv, stop_condition::StopAfterEpisode{ProgressMeter.Progress}, hook::ComposedHook{Tuple{RewardPerStep, ActionsPerStep, TotalRewardPerEpisode, NeuralOutputPerStep, StatePerStep}})
@ ReinforcementLearningCore C:\Users\vchou\.julia\packages\ReinforcementLearningCore\NWrFY\src\core\run.jl:10
[18] top-level scope
@ In[1211]:2
[19] eval
@ .\boot.jl:360 [inlined]
[20] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
@ Base .\loading.jl:1094
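For what it's worth, the shape mismatch can also be reproduced directly with the model defined above, outside of the learner:

# The value network returns a single row per batch, so action-based row indices fail
batch = rand(Float32, STATE_SIZE, 50)   # a fake batch of 50 states
out = model(batch)                      # 1×50 Matrix{Float32}
out[CartesianIndex(3, 1)]               # BoundsError: only row 1 exists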