
with swish activation

/develop/bisim-review
yanchaosun, 5 years ago
Current commit: d1e8d344
2 files changed, 63 insertions(+), 38 deletions(-)
  1. ml-agents/mlagents/trainers/policy/transfer_policy.py (57 changes)
  2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (44 changes)

ml-agents/mlagents/trainers/policy/transfer_policy.py (57 changes)


        with tf.variable_scope("predict"):
            self.create_forward_model(self.encoder, self.targ_encoder, forward_layers,
-                predict_reward=predict_return, var_predict=var_predict)
+                var_predict=var_predict)
+        if predict_return:
+            with tf.variable_scope("reward"):
+                self.create_reward_model(self.encoder, self.targ_encoder, forward_layers)
        # if var_predict:
        #     self.predict_distribution, self.predict = self._create_var_world_model(

            feature_size,
            name="latent",
            reuse=reuse_encoder,
-            # activation=ModelUtils.swish,
-            # kernel_initializer=tf.initializers.variance_scaling(1.0),
+            activation=ModelUtils.swish,
+            kernel_initializer=tf.initializers.variance_scaling(1.0),
        )
        return latent_targ
        # return tf.stop_gradient(latent_targ)

            hidden_stream,
            feature_size,
            name="latent",
-            # activation=ModelUtils.swish,
-            # kernel_initializer=tf.initializers.variance_scaling(1.0),
+            activation=ModelUtils.swish,
+            kernel_initializer=tf.initializers.variance_scaling(1.0),
        )
        return latent
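For reference, the swish activation now enabled on these latent projections is elementwise x * sigmoid(x). A minimal standalone sketch (plain TF1-style ops; the local swish helper is a stand-in for ModelUtils.swish, and the commented usage mirrors the dense call in the hunk above):

import tensorflow as tf

def swish(x: tf.Tensor) -> tf.Tensor:
    # Swish activation (https://arxiv.org/abs/1710.05941): x * sigmoid(x).
    return tf.multiply(x, tf.nn.sigmoid(x))

# Hypothetical usage, mirroring the latent projection above:
# latent = tf.layers.dense(
#     hidden_stream,
#     feature_size,
#     name="latent",
#     activation=swish,
#     kernel_initializer=tf.initializers.variance_scaling(1.0),
# )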

    def create_forward_model(
        self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor, forward_layers: int,
-        predict_reward: bool=False, var_predict: bool=False
+        var_predict: bool=False
    ) -> None:
        """
        Creates forward model TensorFlow ops for Curiosity module.

                self.h_size
                * (self.vis_obs_size + int(self.vec_obs_size > 0)),
                name="hidden_{}".format(i),
-                # activation=ModelUtils.swish,
-                # kernel_initializer=tf.initializers.variance_scaling(1.0),
+                activation=ModelUtils.swish,
+                kernel_initializer=tf.initializers.variance_scaling(1.0),
            )
        if var_predict:

                hidden,
                self.feature_size,
                name="latent",
-                # activation=ModelUtils.swish,
-                # kernel_initializer=tf.initializers.variance_scaling(1.0),
+                activation=ModelUtils.swish,
+                kernel_initializer=tf.initializers.variance_scaling(1.0),
            )
        squared_difference = 0.5 * tf.reduce_sum(

        self.forward_loss = tf.reduce_mean(
            tf.dynamic_partition(squared_difference, self.mask, 2)[1]
        )
-        if predict_reward:
-            self.pred_reward = tf.layers.dense(
-                1,
-                name="reward",
-            self.forward_loss += tf.reduce_mean(
-                tf.dynamic_partition(
-                    0.5 * tf.reduce_sum(tf.squared_difference(self.pred_reward, self.current_reward), axis=1)
-                , self.mask, 2)[1]
-            )

+    def create_reward_model(self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor, forward_layers: int):
+        combined_input = tf.concat(
+            [encoded_state, self.current_action], axis=1
+        )
+        hidden = combined_input
+        for i in range(forward_layers):
+            hidden = tf.layers.dense(
+                self.h_size
+                * (self.vis_obs_size + int(self.vec_obs_size > 0)),
+                name="hidden_{}".format(i),
+        self.pred_reward = tf.layers.dense(
+            hidden,
+            1,
+            name="reward",
+            activation=ModelUtils.swish,
+            kernel_initializer=tf.initializers.variance_scaling(1.0),
+        )
+        self.reward_loss = tf.reduce_mean(
+            tf.squared_difference(self.pred_reward, self.current_reward)
+        )
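Taken together, the new reward head concatenates the encoded state with the current action, runs it through a small dense stack, and regresses the observed reward with an MSE loss. A rough, self-contained sketch of that computation (TF1 style; the placeholder shapes, the flat h_size, and the local swish helper are assumptions standing in for the policy's real encoder output and ModelUtils, and the hidden-layer activation is assumed to be swish since those lines are not visible in the excerpt):

import tensorflow as tf

feature_size, act_size, h_size, forward_layers = 16, 2, 128, 1

# Stand-ins for the policy's tensors (assumed shapes, not the real graph).
encoded_state = tf.placeholder(tf.float32, [None, feature_size], name="encoded_state")
current_action = tf.placeholder(tf.float32, [None, act_size], name="current_action")
current_reward = tf.placeholder(tf.float32, [None, 1], name="current_reward")

def swish(x):
    return x * tf.nn.sigmoid(x)

with tf.variable_scope("reward"):
    # Reward-model input: encoded state concatenated with the action taken.
    hidden = tf.concat([encoded_state, current_action], axis=1)
    for i in range(forward_layers):
        hidden = tf.layers.dense(
            hidden,
            h_size,
            name="hidden_{}".format(i),
            activation=swish,
            kernel_initializer=tf.initializers.variance_scaling(1.0),
        )
    pred_reward = tf.layers.dense(
        hidden,
        1,
        name="reward",
        activation=swish,
        kernel_initializer=tf.initializers.variance_scaling(1.0),
    )
    # MSE between predicted and observed reward, as in the new reward_loss above.
    reward_loss = tf.reduce_mean(tf.squared_difference(pred_reward, current_reward))

One thing a reviewer might flag: applying swish to the final 1-unit layer bounds the prediction below at roughly -0.28, which could matter in environments with strongly negative rewards.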

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (44 changes)


"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Losses/Model Loss": "model_loss",
"Losses/Reward Loss": "reward_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Model Learning Rate": "model_learning_rate",
"Policy/Epsilon": "decay_epsilon",

        if policy.use_continuous_act:
            self._create_cc_critic_old(h_size, hyperparameters.value_layers, vis_encode_type)
        else:
-            self._create_dc_critic(h_size, hyperparameters.value_layers, vis_encode_type)
+            self._create_dc_critic_old(h_size, hyperparameters.value_layers, vis_encode_type)
        with tf.variable_scope("optimizer/"):
            self.learning_rate = ModelUtils.create_schedule(

"value_loss": self.value_loss,
"policy_loss": self.abs_policy_loss,
"model_loss": self.model_loss,
"reward_loss": self.policy.reward_loss,
"update_batch": self.update_batch,
"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,

        # self.model_loss += self.policy.predict_distribution.kl_standard()
        self.model_loss = self.policy.forward_loss
+        if self.predict_return:
+            self.model_loss += self.policy.reward_loss
        if self.with_prior:
            if self.use_var_encoder:
                self.model_loss += 0.2 * self.policy.encoder_distribution.kl_standard()
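In other words, the model loss is now the forward-model error plus, when return prediction is enabled, the reward head's MSE, plus an optional KL prior on the variational encoder. A small self-contained sketch of that composition (the placeholder loss tensors and flag values are illustrative assumptions, not the optimizer's real attributes):

import tensorflow as tf

# Illustrative stand-ins for the policy's loss terms.
forward_loss = tf.placeholder(tf.float32, [], name="forward_loss")
reward_loss = tf.placeholder(tf.float32, [], name="reward_loss")
encoder_kl = tf.placeholder(tf.float32, [], name="encoder_kl")

predict_return, with_prior, use_var_encoder = True, False, False

model_loss = forward_loss               # forward (world-model) prediction error
if predict_return:
    model_loss += reward_loss           # new in this commit: reward-prediction MSE
if with_prior:
    if use_var_encoder:
        model_loss += 0.2 * encoder_kl  # KL of the variational encoder vs. a standard normal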

"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
"reward_loss": self.policy.reward_loss,
}
)

"model_learning_rate": self.model_learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
"reward_loss": self.policy.reward_loss,
}
)

        :return: Results of update.
        """
        feed_dict = self._construct_feed_dict(batch, num_sequences)
-        if update_type == "model":
-            stats_needed = {
-                "Losses/Model Loss": "model_loss",
-                "Policy/Learning Rate": "model_learning_rate",
-                "Policy/Epsilon": "decay_epsilon",
-                "Policy/Beta": "decay_beta",
-            }
-        elif update_type == "policy":
-            stats_needed = {
-                "Losses/Value Loss": "value_loss",
-                "Losses/Policy Loss": "policy_loss",
-                "Policy/Learning Rate": "learning_rate",
-                "Policy/Epsilon": "decay_epsilon",
-                "Policy/Beta": "decay_beta",
-            }
+        stats_needed = self.stats_name_to_update_name
+        # if update_type == "model":
+        #     stats_needed = {
+        #         "Losses/Model Loss": "model_loss",
+        #         "Policy/Learning Rate": "model_learning_rate",
+        #         "Policy/Epsilon": "decay_epsilon",
+        #         "Policy/Beta": "decay_beta",
+        #     }
+        # elif update_type == "policy":
+        #     stats_needed = {
+        #         "Losses/Value Loss": "value_loss",
+        #         "Losses/Policy Loss": "policy_loss",
+        #         "Policy/Learning Rate": "learning_rate",
+        #         "Policy/Epsilon": "decay_epsilon",
+        #         "Policy/Beta": "decay_beta",
+        #     }
        update_stats = {}
        # Collect feed dicts for all reward signals.
        for _, reward_signal in self.reward_signals.items():

        self.policy.run_hard_copy()
        for stat_name, update_name in stats_needed.items():
-            update_stats[stat_name] = update_vals[update_name]
+            if update_name in update_vals.keys():
+                update_stats[stat_name] = update_vals[update_name]
        self.num_updates += 1
        return update_stats
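Since the single stats_name_to_update_name mapping now covers both model and policy updates, a given call returns only a subset of the mapped values, which is what the new membership check handles. A minimal runnable sketch of the pattern (the numeric values are made up for illustration):

# One shared mapping from reported stat names to keys in update_vals.
stats_name_to_update_name = {
    "Losses/Value Loss": "value_loss",
    "Losses/Policy Loss": "policy_loss",
    "Losses/Model Loss": "model_loss",
    "Losses/Reward Loss": "reward_loss",
    "Policy/Learning Rate": "learning_rate",
}

# A model-only update produces only some of those keys (illustrative values).
update_vals = {"model_loss": 0.42, "reward_loss": 0.07}

update_stats = {}
for stat_name, update_name in stats_name_to_update_name.items():
    # The guard added in the diff: skip stats this update did not produce.
    if update_name in update_vals:
        update_stats[stat_name] = update_vals[update_name]

print(update_stats)  # {'Losses/Model Loss': 0.42, 'Losses/Reward Loss': 0.07}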

            self.policy.processed_vector_next: mini_batch["next_vector_in"],
            # self.policy.next_vector_in: mini_batch["next_vector_in"],
            self.policy.current_action: mini_batch["actions"],
-            self.policy.current_reward: mini_batch["discounted_returns"],
+            self.policy.current_reward: mini_batch["extrinsic_rewards"],
            # self.dis_returns: mini_batch["discounted_returns"]
        }
        for name in self.reward_signals:
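With this change the reward head is fed the per-step extrinsic reward rather than the discounted return. For clarity, a tiny sketch of how those two buffer fields differ (illustrative numbers; gamma is an assumed discount factor):

# Per-step extrinsic rewards from a short rollout (illustrative values).
extrinsic_rewards = [1.0, 0.0, 2.0]
gamma = 0.99  # assumed discount factor

# Discounted returns: G_t = r_t + gamma * G_{t+1}, computed backwards.
discounted_returns = []
g = 0.0
for r in reversed(extrinsic_rewards):
    g = r + gamma * g
    discounted_returns.insert(0, g)

print(extrinsic_rewards)   # [1.0, 0.0, 2.0]
print(discounted_returns)  # approximately [2.9602, 1.98, 2.0]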
