
It's mostly training

/develop-pytorch
Ervin Teng, 5 years ago
Commit a665daed
1 changed file with 27 additions and 22 deletions
1. ml-agents/mlagents/trainers/ppo/policy.py (49 changes)

             num_outputs,
             kernel_initializer=tf.keras.initializers.VarianceScaling(scale=0.01),
         )
-        # self.log_sigma_sq = tf.keras.layers.Dense(
-        #     num_outputs,
-        #     kernel_initializer=tf.keras.initializers.VarianceScaling(scale=0.01),
-        # )
-        self.log_sigma_sq = tf.Variable(
-            name="log_sig_sq", dtype=tf.float32, initial_value=tf.zeros([num_outputs])
-        )
+        self.log_sigma_sq = tf.keras.layers.Dense(
+            num_outputs,
+            kernel_initializer=tf.keras.initializers.VarianceScaling(scale=0.01),
+        )
+        # self.log_sigma_sq = tf.Variable(
+        #     name="log_sig_sq", dtype=tf.float32, initial_value=tf.zeros([num_outputs]), trainable=True
+        # )

-        log_sig = self.log_sigma_sq
-        return tfp.distributions.MultivariateNormalDiag(
-            loc=mu, scale_diag=tf.sqrt(tf.exp(log_sig))
-        )
+        log_sig = self.log_sigma_sq(inputs)
+        return tfp.distributions.Normal(loc=mu, scale=tf.sqrt(tf.exp(log_sig)))
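
Review note: this hunk switches the policy's standard deviation from a single trainable variable (state-independent, wrapped in a MultivariateNormalDiag) to a Dense head computed from the inputs (state-dependent, per-component Normal). A minimal standalone sketch of the two parameterizations, assuming TF2 and tensorflow-probability; the class name GaussianHead and the state_dependent_sigma flag are illustrative, not this branch's code:

import tensorflow as tf
import tensorflow_probability as tfp

class GaussianHead(tf.keras.layers.Layer):
    def __init__(self, num_outputs, state_dependent_sigma=True):
        super().__init__()
        self.mu = tf.keras.layers.Dense(
            num_outputs,
            kernel_initializer=tf.keras.initializers.VarianceScaling(scale=0.01),
        )
        self.state_dependent_sigma = state_dependent_sigma
        if state_dependent_sigma:
            # New behavior in this commit: log variance predicted per state.
            self.log_sigma_sq = tf.keras.layers.Dense(
                num_outputs,
                kernel_initializer=tf.keras.initializers.VarianceScaling(scale=0.01),
            )
        else:
            # Old behavior: one trainable log variance shared across all states.
            self.log_sigma_sq = tf.Variable(
                initial_value=tf.zeros([num_outputs]), trainable=True, name="log_sig_sq"
            )

    def call(self, hidden):
        mu = self.mu(hidden)
        log_sig = (
            self.log_sigma_sq(hidden)
            if self.state_dependent_sigma
            else self.log_sigma_sq
        )
        # Normal yields per-dimension distributions, so log_prob and entropy
        # come back per action component, unlike MultivariateNormalDiag,
        # which reduces them to a single scalar per sample.
        return tfp.distributions.Normal(loc=mu, scale=tf.sqrt(tf.exp(log_sig)))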
 class Normalizer(tf.keras.layers.Layer):

         # entropy = dist.entropy()
         return dist
+    @tf.function
     def update_normalization(self, inputs):
         if self.normalize:
             self.normalizer.update(inputs)
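
For context, update_normalization forwards to a Normalizer layer whose body is not shown in this hunk. A hedged sketch of one plausible shape for such a layer, keeping running statistics in non-trainable variables; the sum/sum-of-squares update rule below is an assumption, not necessarily what this branch does:

import tensorflow as tf

class Normalizer(tf.keras.layers.Layer):
    def __init__(self, vec_obs_size):
        super().__init__()
        self.steps = tf.Variable(0.0, trainable=False)
        self.running_sum = tf.Variable(tf.zeros([vec_obs_size]), trainable=False)
        self.running_sum_sq = tf.Variable(tf.ones([vec_obs_size]), trainable=False)

    def call(self, inputs):
        # Normalize with the current running mean/variance, clipped to [-5, 5].
        mean = self.running_sum / tf.maximum(self.steps, 1.0)
        var = self.running_sum_sq / tf.maximum(self.steps, 1.0) - tf.square(mean)
        return tf.clip_by_value(
            (inputs - mean) / tf.sqrt(tf.maximum(var, 1e-8)), -5.0, 5.0
        )

    def update(self, batch):
        # Accumulate counts, sums, and sums of squares over the batch.
        self.steps.assign_add(tf.cast(tf.shape(batch)[0], tf.float32))
        self.running_sum.assign_add(tf.reduce_sum(batch, axis=0))
        self.running_sum_sq.assign_add(tf.reduce_sum(tf.square(batch), axis=0))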

             tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
             * advantage
         )
-        # print(tf.reduce_mean(p_opt_a), tf.reduce_mean(p_opt_b))
         policy_loss = -tf.reduce_mean(tf.minimum(p_opt_a, p_opt_b))
+        # For cleaner stats reporting
+        # abs_policy_loss = tf.abs(policy_loss)
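
The surrounding code is the standard PPO clipped surrogate objective. Pulled out as a self-contained function for reference, reusing the hunk's own names (r_theta, p_opt_a, p_opt_b, decay_epsilon); the 0.2 default epsilon is illustrative:

import tensorflow as tf

def ppo_policy_loss(log_probs, old_log_probs, advantage, decay_epsilon=0.2):
    # Probability ratio between the current and the data-collection policy.
    r_theta = tf.exp(log_probs - old_log_probs)
    # Unclipped and clipped surrogate terms.
    p_opt_a = r_theta * advantage
    p_opt_b = (
        tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
        * advantage
    )
    # Maximizing the clipped objective == minimizing its negation.
    return -tf.reduce_mean(tf.minimum(p_opt_a, p_opt_b))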

         )
         self.update_dict.update(self.reward_signals[reward_signal].update_dict)

+    @tf.function
+    def execute_model(self, observations):
+        action_dist = self.model(observations)
+        action = action_dist.sample()
+        log_probs = action_dist.log_prob(action)
+        entropy = action_dist.entropy()
+        value_heads = self.model.get_values(observations)
+        return action, log_probs, entropy, value_heads
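
execute_model compiles the whole sample-and-evaluate path into one graph via tf.function, instead of running each op eagerly. A minimal free-standing illustration of the same pattern; TinyModel is a stand-in, not this repo's model:

import tensorflow as tf
import tensorflow_probability as tfp

class TinyModel(tf.keras.Model):
    def __init__(self, num_actions=2):
        super().__init__()
        self.mu = tf.keras.layers.Dense(num_actions)

    def call(self, obs):
        # Return a distribution; tfp works inside tf.function graphs.
        return tfp.distributions.Normal(loc=self.mu(obs), scale=1.0)

model = TinyModel()

@tf.function
def execute_model(observations):
    dist = model(observations)
    action = dist.sample()
    return action, dist.log_prob(action), dist.entropy()

action, log_probs, entropy = execute_model(tf.zeros([1, 8]))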
     @timed
     def evaluate(self, brain_info):
         """

         """
         run_out = {}
-        action_dist = self.model(brain_info.vector_observations)
-        action = action_dist.sample()
-        log_probs = action_dist.log_prob(action)
-        entropy = action_dist.entropy()
-        run_out["action"] = action.numpy()
-        run_out["log_probs"] = log_probs.numpy()
-        run_out["entropy"] = entropy.numpy()
-        run_out["value_heads"] = {
-            name: t.numpy()
-            for name, t in self.model.get_values(brain_info.vector_observations).items()
-        }
+        action, log_probs, entropy, value_heads = self.execute_model(
+            brain_info.vector_observations
+        )
+        run_out["action"] = np.array(action)
+        run_out["log_probs"] = np.array(log_probs)
+        run_out["entropy"] = np.array(entropy)
+        run_out["value_heads"] = {name: np.array(t) for name, t in value_heads.items()}
         run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0)
         run_out["learning_rate"] = 0.0
         self.model.update_normalization(brain_info.vector_observations)
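
The single "value" reported per agent is the elementwise mean over all value heads (one head per reward signal). A tiny worked example with hypothetical head names, not ones confirmed by this diff:

import numpy as np

value_heads = {"extrinsic": np.array([0.5, 1.5]), "curiosity": np.array([1.5, 2.5])}
value = np.mean(list(value_heads.values()), 0)  # -> array([1., 2.]) per agent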

             # print(grad, weight.name)
         self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
         update_stats = {}
-        update_stats["Losses/Policy Loss"] = policy_loss
+        update_stats["Losses/Policy Loss"] = abs(policy_loss)
         update_stats["Losses/Value Loss"] = value_loss
         # for stat_name, update_name in stats_needed.items():
         #     update_stats[stat_name] = update_vals[update_name]
