
In a VBasedPolicy, the neural-network approximator tries to learn the V value of a state. So its first (input) layer should have as many neurons as the state size, and I believe its last (output) layer should have size 1, since it produces a single scalar for the input state.

But when I use a network with this architecture, I get a BoundsError from the trajectory. The details are below.

I looked at the various example experiments in the library's GitHub repo, but all the BasicDQN examples use a QBasedPolicy, where the last layer of the network has a size equal to the number of actions. That makes sense to me, because given a state the network has to output a Q value for each action.
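For comparison, the network shape in those QBasedPolicy + BasicDQN examples looks roughly like this (my own sketch of what I saw there, not my actual code; STATE_SIZE and ACTION_SIZE are the same values as in my code below):

q_model = Chain(
    Dense(STATE_SIZE, 128, relu),
    Dense(128, 128, relu),
    Dense(128, ACTION_SIZE)   # one Q value per action, so indexing by action id works
) |> cpu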

I went through the code on GitHub, and the error occurs at line 79 of basic_dqn.jl (linked from the stack trace below). But I haven't been able to resolve it.
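As far as I can tell from the error message, the learner indexes the network output for a batch (a 1×50 matrix, since my batch_size is 50 and my model has a single output) with one CartesianIndex per sample, where the row is the stored action id (up to 7). My understanding of the failing indexing, as a minimal snippet (variable names are mine, just for illustration):

q = rand(Float32, 1, 50)              # what my 1-output model produces for a batch of 50
actions = rand(1:7, 50)               # stored action ids, 7 possible actions
inds = CartesianIndex.(actions, 1:50) # how I believe the q-values get indexed at line 79
q[inds]                               # BoundsError: rows 2:7 don't exist in a 1×50 matrix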

Code for the single-agent policy (VBasedPolicy):

STATE_SIZE = length(env.channels) # 2
ACTION_SIZE = length(action_set)    # 7    
model = Chain(
            Dense(STATE_SIZE, 128, relu),
            Dense(128, 128, relu),
            Dense(128, 128, relu),
            Dense(128, 128, relu),
            Dense(128, 128, relu),
            Dense(128, 128, relu),
            Dense(128, 128, relu),
            Dense(128, 128, relu),
            Dense(128, 128, relu),
            Dense(128, 1)
        ) |> cpu

# optimizer 
η = 1f-3 # Learning rate
η_decay = 1f-4
opt = Flux.Optimiser(ADAM(η), InvDecay(η_decay))

# choose an action by evaluating the value learner on each legal child state
function value_action_mapping(env, value_learner; explorer = EpsilonGreedyExplorer(0.4))
    A = legal_action_space(env)
    println("legal action space: ", A)
    V = map(A) do a
        value_learner(child(env, a))
    end
    println("V values: ", V)
    c = A[explorer(V)]
    println("Chosen action: ", c)
    println("Action with max V val: ", findmax(V))
    return c
end

single_agent_policy = Agent(
            policy = VBasedPolicy(;
                    learner = BasicDQNLearner(;
                        approximator = NeuralNetworkApproximator(;
                            model = model,
                            optimizer = opt
                        ),
                        min_replay_history = 50,
                        batch_size = 50,
                        γ = 0.99
                    ),
                    mapping = value_action_mapping
                ),
                trajectory = CircularArraySARTTrajectory(;
                            capacity = 100,
                            state=Array{Float64} => (STATE_SIZE)
                        )
                )

Here is the error:
BoundsError: attempt to access 1×50 Matrix{Float32} at index [CartesianIndex{2}[CartesianIndex(3, 1), CartesianIndex(1, 2), CartesianIndex(3, 3), CartesianIndex(4, 4), CartesianIndex(5, 5), CartesianIndex(4, 6), CartesianIndex(4, 7), CartesianIndex(3, 8), CartesianIndex(4, 9), CartesianIndex(4, 10)  …  CartesianIndex(5, 41), CartesianIndex(4, 42), CartesianIndex(3, 43), CartesianIndex(4, 44), CartesianIndex(5, 45), CartesianIndex(5, 46), CartesianIndex(6, 47), CartesianIndex(4, 48), CartesianIndex(4, 49), CartesianIndex(1, 50)]]

Stacktrace:
  [1] throw_boundserror(A::Matrix{Float32}, I::Tuple{Vector{CartesianIndex{2}}})
    @ Base .\abstractarray.jl:651
  [2] checkbounds
    @ .\abstractarray.jl:616 [inlined]
  [3] _getindex
    @ .\multidimensional.jl:831 [inlined]
  [4] getindex
    @ .\abstractarray.jl:1170 [inlined]
  [5] adjoint
    @ C:\Users\vchou\.julia\packages\Zygote\i1R8y\src\lib\array.jl:31 [inlined]
  [6] _pullback(__context__::Zygote.Context, 496::typeof(getindex), x::Matrix{Float32}, inds::Vector{CartesianIndex{2}})
    @ Zygote C:\Users\vchou\.julia\packages\ZygoteRules\OjfTt\src\adjoint.jl:57
  [7] _pullback
    @ C:\Users\vchou\.julia\packages\ReinforcementLearningZoo\M308M\src\algorithms\dqns\basic_dqn.jl:79 [inlined]
  [8] _pullback(::Zygote.Context, ::ReinforcementLearningZoo.var"#52#54"{BasicDQNLearner{NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}, typeof(Flux.Losses.huber_loss), Random._GLOBAL_RNG}, Matrix{Float64}, Vector{Bool}, Vector{Float32}, Matrix{Float64}, typeof(Flux.Losses.huber_loss), Float32, NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}})
    @ Zygote C:\Users\vchou\.julia\packages\Zygote\i1R8y\src\compiler\interface2.jl:0
  [9] pullback(f::Function, ps::Zygote.Params)
    @ Zygote C:\Users\vchou\.julia\packages\Zygote\i1R8y\src\compiler\interface.jl:250
 [10] gradient(f::Function, args::Zygote.Params)
    @ Zygote C:\Users\vchou\.julia\packages\Zygote\i1R8y\src\compiler\interface.jl:58
 [11] update!(learner::BasicDQNLearner{NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}, typeof(Flux.Losses.huber_loss), Random._GLOBAL_RNG}, batch::NamedTuple{(:state, :action, :reward, :terminal, :next_state), Tuple{Matrix{Float64}, Vector{Int64}, Vector{Float32}, Vector{Bool}, Matrix{Float64}}})
    @ ReinforcementLearningZoo C:\Users\vchou\.julia\packages\ReinforcementLearningZoo\M308M\src\algorithms\dqns\basic_dqn.jl:78
 [12] update!(learner::BasicDQNLearner{NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}, typeof(Flux.Losses.huber_loss), Random._GLOBAL_RNG}, traj::CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Float64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}})
    @ ReinforcementLearningZoo C:\Users\vchou\.julia\packages\ReinforcementLearningZoo\M308M\src\algorithms\dqns\basic_dqn.jl:65
 [13] update!
    @ C:\Users\vchou\.julia\packages\ReinforcementLearningCore\NWrFY\src\policies\q_based_policies\learners\abstract_learner.jl:35 [inlined]
 [14] update!
    @ C:\Users\vchou\.julia\packages\ReinforcementLearningCore\NWrFY\src\policies\v_based_policies.jl:31 [inlined]
 [15] (::Agent{VBasedPolicy{BasicDQNLearner{NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}, typeof(Flux.Losses.huber_loss), Random._GLOBAL_RNG}, typeof(value_action_mapping)}, CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Float64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}})(stage::PreActStage, env::AdSpendEnv, action::Int64)
    @ ReinforcementLearningCore C:\Users\vchou\.julia\packages\ReinforcementLearningCore\NWrFY\src\policies\agents\agent.jl:74
 [16] _run(policy::Agent{VBasedPolicy{BasicDQNLearner{NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}, typeof(Flux.Losses.huber_loss), Random._GLOBAL_RNG}, typeof(value_action_mapping)}, CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Float64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}}, env::AdSpendEnv, stop_condition::StopAfterEpisode{ProgressMeter.Progress}, hook::ComposedHook{Tuple{RewardPerStep, ActionsPerStep, TotalRewardPerEpisode, NeuralOutputPerStep, StatePerStep}})
    @ ReinforcementLearningCore C:\Users\vchou\.julia\packages\ReinforcementLearningCore\NWrFY\src\core\run.jl:28
 [17] run(policy::Agent{VBasedPolicy{BasicDQNLearner{NeuralNetworkApproximator{Chain{Tuple{Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(relu), Matrix{Float32}, Vector{Float32}}, Dense{typeof(identity), Matrix{Float32}, Vector{Float32}}}}, Flux.Optimise.Optimiser}, typeof(Flux.Losses.huber_loss), Random._GLOBAL_RNG}, typeof(value_action_mapping)}, CircularArraySARTTrajectory{NamedTuple{(:state, :action, :reward, :terminal), Tuple{CircularArrayBuffers.CircularArrayBuffer{Float64, 2}, CircularArrayBuffers.CircularVectorBuffer{Int64}, CircularArrayBuffers.CircularVectorBuffer{Float32}, CircularArrayBuffers.CircularVectorBuffer{Bool}}}}}, env::AdSpendEnv, stop_condition::StopAfterEpisode{ProgressMeter.Progress}, hook::ComposedHook{Tuple{RewardPerStep, ActionsPerStep, TotalRewardPerEpisode, NeuralOutputPerStep, StatePerStep}})
    @ ReinforcementLearningCore C:\Users\vchou\.julia\packages\ReinforcementLearningCore\NWrFY\src\core\run.jl:10
 [18] top-level scope
    @ In[1211]:2
 [19] eval
    @ .\boot.jl:360 [inlined]
 [20] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
    @ Base .\loading.jl:1094
