
yaml update

/develop/bisim-review
yanchaosun, 4 years ago
Current commit
a80915a8
5 files changed, with 69 insertions and 43 deletions
  1. config/ppo_transfer/CrawlerStatic.yaml (11 lines changed)
  2. config/ppo_transfer/CrawlerStaticOpbuffer.yaml (8 lines changed)
  3. config/ppo_transfer/OldCrawlerStatic.yaml (11 lines changed)
  4. ml-agents/mlagents/trainers/policy/transfer_policy.py (51 lines changed)
  5. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (31 lines changed)

config/ppo_transfer/CrawlerStatic.yaml (11 lines changed)


behaviors:
  OldCrawlerStatic:
  CrawlerStatic:
    trainer_type: ppo_transfer
    hyperparameters:
      batch_size: 2024

      num_epoch: 3
      learning_rate_schedule: linear
      encoder_layers: 2
      policy_layers: 1
      value_layers: 1
      feature_size: 128
      policy_layers: 2
      value_layers: 2
      feature_size: 32
      use_op_buffer: true
      use_var_encoder: true
      in_batch_alter: true
      use_inverse_model: true
    network_settings:
      normalize: true
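
For orientation, here is a minimal sketch of reading a config in this layout with PyYAML. The file path, and the assumption that the encoder/policy/value layer counts, feature_size, and the use_* switches sit under hyperparameters with network_settings as a sibling, are inferred from the diff above rather than confirmed by it.

# Minimal sketch: load a ppo_transfer config and print the fields this commit touches.
import yaml

with open("config/ppo_transfer/CrawlerStatic.yaml") as f:
    config = yaml.safe_load(f)

for behavior_name, settings in config["behaviors"].items():
    hyper = settings["hyperparameters"]
    print(behavior_name, settings["trainer_type"])  # expected: ppo_transfer
    for key in ("batch_size", "encoder_layers", "policy_layers", "value_layers",
                "feature_size", "use_op_buffer", "use_var_encoder",
                "in_batch_alter", "use_inverse_model"):
        print(f"  {key}: {hyper.get(key)}")
    print("  normalize:", settings["network_settings"]["normalize"])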

config/ppo_transfer/CrawlerStaticOpbuffer.yaml (8 lines changed)


num_epoch: 3
learning_rate_schedule: linear
encoder_layers: 2
policy_layers: 1
value_layers: 1
feature_size: 128
policy_layers: 3
value_layers: 2
forward_layers: 1
inverse_layers: 1
feature_size: 32
reuse_encoder: true
use_op_buffer: true
in_batch_alter: true
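
The inverse_layers setting above (and the use_inverse_model switch in the neighboring configs) size an inverse dynamics model. As a hedged illustration only, written in eager TF2/Keras rather than this branch's TF1 graph code, an inverse model regresses the action from the encodings of two consecutive states:

import tensorflow as tf

class InverseModel(tf.keras.Model):
    """Illustrative inverse dynamics model: predict the continuous action
    taken between the current and next encoded states."""

    def __init__(self, act_size=20, hidden_size=256, inverse_layers=1):
        super().__init__()
        self.hidden = [tf.keras.layers.Dense(hidden_size, activation=tf.nn.swish)
                       for _ in range(inverse_layers)]
        self.action_head = tf.keras.layers.Dense(act_size)

    def call(self, encoded_state, encoded_next_state):
        h = tf.concat([encoded_state, encoded_next_state], axis=-1)
        for layer in self.hidden:
            h = layer(h)
        return self.action_head(h)

def inverse_loss(model, encoded_state, encoded_next_state, true_action):
    # Mean squared error between predicted and executed actions.
    pred_action = model(encoded_state, encoded_next_state)
    return tf.reduce_mean(tf.math.squared_difference(pred_action, true_action))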

config/ppo_transfer/OldCrawlerStatic.yaml (11 lines changed)


behaviors:
  OldCrawlerStatic:
  CrawlerStatic:
    trainer_type: ppo_transfer
    hyperparameters:
      batch_size: 2024

      num_epoch: 3
      learning_rate_schedule: linear
      encoder_layers: 2
      policy_layers: 1
      value_layers: 1
      feature_size: 64
      policy_layers: 2
      value_layers: 2
      feature_size: 32
      in_epoch_alter: true
      in_batch_alter: true
      use_var_encoder: true
      use_inverse_model: true
    network_settings:
      normalize: true

ml-agents/mlagents/trainers/policy/transfer_policy.py (51 lines changed)


        self.current_action = tf.placeholder(
            shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"
        )
        self.current_reward = tf.placeholder(
            shape=[None], dtype=tf.float32, name="current_reward"
        )
        self.next_visual_in: List[tf.Tensor] = []

        self.create_inverse_model(self.encoder, self.targ_encoder, inverse_layers)
        with tf.variable_scope("predict"):
            self.create_forward_model(self.encoder, self.targ_encoder, forward_layers)
            self.create_forward_model(self.encoder, self.targ_encoder, forward_layers,
                predict_reward=predict_return)
        # if var_predict:
        #     self.predict_distribution, self.predict = self._create_var_world_model(

        )

    def create_forward_model(
        self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor, forward_layers: int
        self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor, forward_layers: int,
        predict_reward: bool=False, var_predict: bool=False
    ) -> None:
        """
        Creates forward model TensorFlow ops for Curiosity module.

                activation=None,
                name="hidden_{}".format(i)
            )
        # hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
        # predict = tf.layers.dense(
        #     combined_input,
        #     self.h_size
        #     * (self.vis_obs_size + int(self.vec_obs_size > 0)),
        #     activation=None,
        # )
        self.predict = tf.layers.dense(
            hidden,
            self.feature_size,
            name="latent"
        )
        if var_predict:
            predict_distribution = GaussianEncoderDistribution(
                hidden,
                self.feature_size
            )
            self.predict = predict_distribution.sample()
        else:
            self.predict = tf.layers.dense(
                hidden,
                self.feature_size,
                name="latent"
            )
        # self.intrinsic_reward = squared_difference
            )
        )
        if predict_reward:
            self.pred_reward = tf.layers.dense(
                hidden,
                1,
                name="reward"
            )
            self.forward_loss += tf.reduce_mean(
                tf.dynamic_partition(
                    0.5 * tf.reduce_sum(tf.squared_difference(self.pred_reward, self.current_reward), axis=1)
                    , self.mask, 2)[1]
            )
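
To summarize the change above: create_forward_model gains predict_reward and var_predict options, so the forward model can sample the next latent from a predicted Gaussian (GaussianEncoderDistribution) and regress the environment reward alongside it. The following is a self-contained sketch in eager TF2/Keras rather than the TF1 graph code of transfer_policy.py; the Gaussian head is written out as an explicit mean/log-std pair, and all sizes are illustrative.

import tensorflow as tf

class ForwardModel(tf.keras.Model):
    """Illustrative forward model: predict the next latent state (and
    optionally the reward) from the current latent state and action."""

    def __init__(self, feature_size=32, hidden_size=256, forward_layers=1,
                 predict_reward=False, var_predict=False):
        super().__init__()
        self.var_predict = var_predict
        self.predict_reward = predict_reward
        self.hidden = [tf.keras.layers.Dense(hidden_size, activation=tf.nn.swish)
                       for _ in range(forward_layers)]
        if var_predict:
            # Variational head: mean and log-std of a Gaussian over the next latent.
            self.mu = tf.keras.layers.Dense(feature_size)
            self.log_sigma = tf.keras.layers.Dense(feature_size)
        else:
            self.latent = tf.keras.layers.Dense(feature_size)
        if predict_reward:
            self.reward_head = tf.keras.layers.Dense(1)

    def call(self, encoded_state, action):
        h = tf.concat([encoded_state, action], axis=-1)
        for layer in self.hidden:
            h = layer(h)
        if self.var_predict:
            mu, log_sigma = self.mu(h), self.log_sigma(h)
            predict = mu + tf.exp(log_sigma) * tf.random.normal(tf.shape(mu))
        else:
            predict = self.latent(h)
        pred_reward = tf.squeeze(self.reward_head(h), -1) if self.predict_reward else None
        return predict, pred_reward

def forward_loss(model, encoded_state, action, encoded_next_state, reward):
    predict, pred_reward = model(encoded_state, action)
    # Latent prediction error against the (stop-gradient) target encoding.
    loss = tf.reduce_mean(
        tf.math.squared_difference(predict, tf.stop_gradient(encoded_next_state)))
    if pred_reward is not None:
        # Additional term matching the reward head against the observed reward,
        # mirroring the 0.5 * squared-difference term added in the diff.
        loss += 0.5 * tf.reduce_mean(tf.math.squared_difference(pred_reward, reward))
    return loss

For example, ForwardModel(predict_reward=True, var_predict=True) exercises both new options, and forward_loss then returns a single scalar tensor combining the latent and reward terms.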

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (31 lines changed)


        self.abs_policy_loss = tf.abs(self.policy_loss)
        # encoder and predict loss
        self.dis_returns = tf.placeholder(
            shape=[None], dtype=tf.float32, name="dis_returns"
        )
        target = tf.concat([targ_encoder, tf.expand_dims(self.dis_returns, -1)], axis=1)
        if self.predict_return:
            self.model_loss = tf.reduce_mean(tf.squared_difference(predict, target))
        else:
            self.model_loss = tf.reduce_mean(tf.squared_difference(predict, targ_encoder))
        if self.with_prior:
            if self.use_var_encoder:
                self.model_loss += encoder_distribution.kl_standard()
            if self.use_var_predict:
                self.model_loss += self.policy.predict_distribution.kl_standard()
        # self.dis_returns = tf.placeholder(
        #     shape=[None], dtype=tf.float32, name="dis_returns"
        # )
        # target = tf.concat([targ_encoder, tf.expand_dims(self.dis_returns, -1)], axis=1)
        # if self.predict_return:
        #     self.model_loss = tf.reduce_mean(tf.squared_difference(predict, target))
        # else:
        #     self.model_loss = tf.reduce_mean(tf.squared_difference(predict, targ_encoder))
        # if self.with_prior:
        #     if self.use_var_encoder:
        #         self.model_loss += encoder_distribution.kl_standard()
        #     if self.use_var_predict:
        #         self.model_loss += self.policy.predict_distribution.kl_standard()
        self.model_loss = self.policy.forward_loss
        if self.use_inverse_model:
            self.model_loss += self.policy.inverse_loss

            self.policy.processed_vector_next: mini_batch["next_vector_in"],
            # self.policy.next_vector_in: mini_batch["next_vector_in"],
            self.policy.current_action: mini_batch["actions"],
            self.dis_returns: mini_batch["discounted_returns"]
            self.policy.current_reward: mini_batch["extrinsic_rewards"],
            # self.dis_returns: mini_batch["discounted_returns"]
        }
        for name in self.reward_signals:
            feed_dict[self.returns_holders[name]] = mini_batch[
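
Net effect of the optimizer change: the hand-built model loss (latent prediction against targ_encoder concatenated with discounted returns, with optional KL priors) is commented out, and self.model_loss now comes from the policy's forward_loss, plus inverse_loss when use_inverse_model is set; the feed dict correspondingly fills current_reward from the mini-batch's extrinsic_rewards instead of feeding discounted_returns into dis_returns. A hedged sketch of that assembly, with plain arguments standing in for the policy attributes:

def build_model_loss(forward_loss, inverse_loss=None, use_inverse_model=False,
                     encoder_kl=None, predict_kl=None, with_prior=False):
    """Illustrative combination of the world-model losses: start from the
    forward loss, optionally add the inverse-model loss, and (in the older,
    now commented-out path) add KL-to-standard-Gaussian terms for the
    variational encoder and prediction heads."""
    model_loss = forward_loss
    if use_inverse_model and inverse_loss is not None:
        model_loss += inverse_loss
    if with_prior:
        if encoder_kl is not None:
            model_loss += encoder_kl
        if predict_kl is not None:
            model_loss += predict_kl
    return model_loss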
