I am trying to train an agent in a racing simulator. I built a custom model following the instructions here, and I train it with the Python API shown in the docs. When I print the weights (via weights = trainer.workers.foreach_worker(lambda ev: ev.get_policy().get_weights())), I find that a large portion of the weights do not change during training.
Custom model:
class AtariNet(Model):
    @override(Model)
    def _build_layers_v2(self, input_dict, num_outputs, options):
        with tf.name_scope("atari_net"):
            # three conv layers, then a dense "state" layer that is shared with the value branch
            conv1 = tf.layers.conv2d(input_dict["obs"], 32, 8, 4, activation=tf.nn.relu, padding="valid", name="conv1")
            conv2 = tf.layers.conv2d(conv1, 64, 4, 2, activation=tf.nn.relu, padding="valid", name="conv2")
            conv3 = tf.layers.conv2d(conv2, 64, 3, 1, activation=tf.nn.relu, padding="valid", name="conv3")
            conv_flatten = tf.layers.flatten(conv3)
            state = tf.layers.dense(conv_flatten, 512, activation=tf.nn.relu, name="state")
            act_output = tf.layers.dense(state, num_outputs, name="act_output")
            return act_output, state

    @override(Model)
    def value_function(self):
        # value head built on top of the last shared layer
        val = tf.layers.dense(self.last_layer, 1)
        return tf.reshape(val, [-1])
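As a sanity check on the architecture alone (independent of RLlib), here is a minimal standalone sketch that rebuilds the same conv stack in plain TF 1.x on a dummy observation; the 84x84x4 input shape and num_outputs = 9 are assumptions, not my simulator's real sizes:

import numpy as np
import tensorflow as tf

obs = tf.placeholder(tf.float32, [None, 84, 84, 4], name="obs")  # hypothetical observation shape
conv1 = tf.layers.conv2d(obs, 32, 8, 4, activation=tf.nn.relu, padding="valid", name="conv1")
conv2 = tf.layers.conv2d(conv1, 64, 4, 2, activation=tf.nn.relu, padding="valid", name="conv2")
conv3 = tf.layers.conv2d(conv2, 64, 3, 1, activation=tf.nn.relu, padding="valid", name="conv3")
state = tf.layers.dense(tf.layers.flatten(conv3), 512, activation=tf.nn.relu, name="state")
logits = tf.layers.dense(state, 9, name="act_output")  # 9 = hypothetical num_outputs

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(logits, {obs: np.zeros((1, 84, 84, 4), np.float32)})
    print(out.shape)  # expect (1, 9)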
Custom preprocessor:
class NormalizePreprocessor(Preprocessor):
    @override(Preprocessor)
    def _init_shape(self, obs_space, options):
        return obs_space.shape  # doesn't need to change observation space

    @override(Preprocessor)
    def transform(self, observation):
        scaled = observation * 1. / 255.
        return scaled  # return the preprocessed observation
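A quick standalone check of the preprocessor (a sketch only; it assumes the Preprocessor base class can be constructed directly from a gym space, and the 84x84x3 uint8 frame is a hypothetical observation, not my simulator's real one):

import numpy as np
from gym.spaces import Box

obs_space = Box(low=0, high=255, shape=(84, 84, 3), dtype=np.uint8)  # hypothetical obs space
prep = NormalizePreprocessor(obs_space, options={})
frame = np.random.randint(0, 256, size=obs_space.shape, dtype=np.uint8)
scaled = prep.transform(frame)
print(scaled.dtype, scaled.min(), scaled.max())  # values should now lie in [0, 1]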
Training code:
def main(args):
    ray.init()
    ModelCatalog.register_custom_model("atari_net", AtariNet)
    ModelCatalog.register_custom_preprocessor("NormPrep", NormalizePreprocessor)

    config = ppo.DEFAULT_CONFIG.copy()
    config["log_level"] = "DEBUG"
    config["num_gpus"] = 1
    config["num_cpus_for_driver"] = 4
    config["num_workers"] = 2
    config["num_cpus_per_worker"] = 2
    config["model"]["custom_model"] = "atari_net"
    # config["model"]["custom_preprocessor"] = "NormPrep"
    config["clip_actions"] = False
    config["use_gae"] = True
    config["lr"] = 0.1
    config["vf_share_layers"] = True
    config["vf_loss_coeff"] = 0.1
    config["train_batch_size"] = 500
    config["batch_mode"] = "complete_episodes"
    config["lambda"] = 0.95
    # config["kl_coeff"] = 0.0
    config["kl_coeff"] = 0.5
    # config["entropy_coeff"] = 0.0
    config["sample_batch_size"] = 100
    config["entropy_coeff"] = 0.01
    config["sgd_minibatch_size"] = 500
    # config["num_sgd_iter"] = 30
    config["num_sgd_iter"] = 10
    # config["clip_rewards"] = True
    config["observation_filter"] = "NoFilter"

    trainer = ppo.PPOTrainer(config=config, env=<my simulator environment>)

    for i in range(1000):
        # Perform one iteration of training the policy with PPO
        result = trainer.train()
        print(pretty_print(result))

        '''
        policy_weights = trainer.get_policy().get_weights()
        print("&&& policy shape:{}".format(policy_weights.shape))
        print("&&& policy weights:{}".format(policy_weights))
        '''

        weights = trainer.workers.foreach_worker(lambda ev: ev.get_policy().get_weights())
        for n in weights:
            print(n[0:30])
            print(n[5000:5030])
            print(n[-30:])
            print('=' * 20)

        if i % 10 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
I find that a large portion of the weights never change, and I don't understand why.
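For reference, this is the kind of check I run between iterations to quantify the problem (a minimal sketch, assuming get_weights() returns a flat numpy array, which is what the slicing above suggests, and reusing the trainer from the training code):

import numpy as np

before = np.array(trainer.get_policy().get_weights())  # snapshot before one training iteration
trainer.train()
after = np.array(trainer.get_policy().get_weights())   # snapshot after
delta = np.abs(after - before)
print("max |delta|:", delta.max())
print("fraction of weights changed:", np.mean(delta > 1e-8))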