
Add normalizer

/develop-pytorch
Ervin Teng, 5 years ago
Current commit: 5e6de46f
1 changed file, with 83 additions and 103 deletions

ml-agents/mlagents/trainers/ppo/policy.py (186 changed lines)

class GaussianDistribution(tf.keras.layers.Layer):
    def __init__(self, num_outputs, **kwargs):
        super(GaussianDistribution, self).__init__(**kwargs)
        self.mu = tf.keras.layers.Dense(num_outputs)
        self.log_sigma_sq = tf.keras.layers.Dense(num_outputs)
        self.mu = tf.keras.layers.Dense(
            num_outputs,
            kernel_initializer=tf.keras.initializers.VarianceScaling(scale=0.01),
        )
        self.log_sigma_sq = tf.keras.layers.Dense(
            num_outputs,
            kernel_initializer=tf.keras.initializers.VarianceScaling(scale=0.01),
        )

    def call(self, inputs):
        mu = self.mu(inputs)

        # )
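Note: the body of call() is mostly collapsed in this hunk. As a rough, illustrative sketch (not taken from the commit), the two Dense heads above are typically combined into a sampled action via the reparameterization trick; sample_gaussian_action, sigma, and epsilon below are assumed names, not part of the diff.

import tensorflow as tf

# Illustrative sketch only: reparameterized sampling from a diagonal Gaussian,
# given the `mu` and `log_sigma_sq` heads defined above.
def sample_gaussian_action(mu: tf.Tensor, log_sigma_sq: tf.Tensor) -> tf.Tensor:
    sigma = tf.sqrt(tf.exp(log_sigma_sq))        # per-dimension standard deviation
    epsilon = tf.random.normal(tf.shape(mu))     # unit Gaussian noise
    return mu + sigma * epsilon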
class Normalizer(tf.keras.layers.Layer):
    def __init__(self, vec_obs_size, **kwargs):
        super(Normalizer, self).__init__(**kwargs)
        print(vec_obs_size)
        self.normalization_steps = tf.Variable(
            name="normalization_steps", trainable=False, dtype=tf.int32, initial_value=1
        )
        self.running_mean = tf.Variable(
            name="running_mean",
            shape=[vec_obs_size],
            trainable=False,
            dtype=tf.float32,
            initial_value=tf.zeros([vec_obs_size]),
        )
        self.running_variance = tf.Variable(
            name="running_variance",
            shape=[vec_obs_size],
            trainable=False,
            dtype=tf.float32,
            initial_value=tf.ones([vec_obs_size]),
        )

    def call(self, inputs):
        # Normalize with the running statistics and clip to [-5, 5].
        normalized_state = tf.clip_by_value(
            (inputs - self.running_mean)
            / tf.sqrt(
                self.running_variance
                / (tf.cast(self.normalization_steps, tf.float32) + 1)
            ),
            -5,
            5,
            name="normalized_state",
        )
        return normalized_state

    def update(self, vector_input):
        # Online (Welford-style) update of the running mean and of the running
        # sum of squared deviations, computed from the batch mean of the inputs.
        mean_current_observation = tf.cast(
            tf.reduce_mean(vector_input, axis=0), tf.float32
        )
        new_mean = self.running_mean + (
            mean_current_observation - self.running_mean
        ) / tf.cast(tf.add(self.normalization_steps, 1), tf.float32)
        new_variance = self.running_variance + (mean_current_observation - new_mean) * (
            mean_current_observation - self.running_mean
        )
        self.running_mean.assign(new_mean)
        self.running_variance.assign(new_variance)
        self.normalization_steps.assign(self.normalization_steps + 1)
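A minimal usage sketch of the Normalizer layer above, assuming TF2 eager execution; the observation size of 8 and the random batch are illustrative. Note that running_variance holds an unnormalized sum of squared deviations, which call() divides by (normalization_steps + 1) before taking the square root.

import tensorflow as tf

normalizer = Normalizer(vec_obs_size=8)   # illustrative observation size
batch = tf.random.normal([32, 8])         # a batch of vector observations
normalizer.update(batch)                  # fold the batch into the running stats
normalized = normalizer(batch)            # clipped, normalized observations
print(normalized.shape)                   # (32, 8)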
class ActorCriticPolicy(tf.keras.Model):
    def __init__(
        self,

        self.distribution = GaussianDistribution(act_size)
        self.critic = Critic(stream_names, VectorEncoder(h_size, num_layers))
        self.act_size = act_size
        self.normalize = normalize

    def act(self, input):
        _hidden = self.encoder(input)

    def build(self, input_size):
        self.normalizer = Normalizer(input_size[1])

    def call(self, inputs):
        if self.normalize:
            norm_inputs = self.normalizer(inputs)
        _hidden = self.encoder(norm_inputs)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        raw_action = dist.sample()
        action = tf.clip_by_value(raw_action, -3, 3) / 3
        log_prob = dist.log_prob(raw_action)

    def get_values(self, input):
        return self.critic(input)

    def update_normalization(self, inputs):
        if self.normalize:
            self.normalizer.update(inputs)

    def get_values(self, inputs):
        return self.critic(inputs)
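The continuous-action branch of call() above squashes raw Gaussian samples into [-1, 1] by clipping at +/-3 and dividing by 3. A quick standalone check of that mapping (the sample values are illustrative):

import tensorflow as tf

raw = tf.constant([-5.0, -1.5, 0.0, 2.0, 7.0])   # illustrative raw samples
squashed = tf.clip_by_value(raw, -3, 3) / 3
print(squashed.numpy())                          # [-1.  -0.5  0.  0.6666667  1.]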
class PPOPolicy(object):

                trainer_params.get("vis_encode_type", "simple")
            ),
        )
        # self.model.create_ppo_optimizer()
        # self.inference_dict.update(
        #     {
        #         "action": self.model.output,
        #         "log_probs": self.model.all_log_probs,
        #         "value_heads": self.model.value_heads,
        #         "value": self.model.value,
        #         "entropy": self.model.entropy,
        #         "learning_rate": self.model.learning_rate,
        #     }
        # )
        # if self.use_continuous_act:
        #     self.inference_dict["pre_action"] = self.model.output_pre
        # if self.use_recurrent:
        #     self.inference_dict["memory_out"] = self.model.memory_out
        # self.total_policy_loss = self.model.abs_policy_loss
        # self.update_dict.update(
        #     {
        #         "value_loss": self.model.value_loss,
        #         "policy_loss": self.total_policy_loss,
        #         "update_batch": self.model.update_batch,
        #     }
        # )
    def ppo_loss(
        self,

        # beta, self.global_step, max_step, 1e-5, power=1.0
        # )
        decay_epsilon = self.trainer_params["epsilon"]
        decay_beta = self.trainer_params["beta"]
        decay_beta = self.trainer_params["beta"] / 10
        # max_step = self.trainer_params["max_step"]
        value_losses = []

                returns[name], tf.reduce_sum(head, axis=1)
            )
            v_opt_b = tf.math.squared_difference(returns[name], clipped_value_estimate)
            value_loss = tf.reduce_mean(
                tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), masks, 2)[1]
            )
            value_loss = tf.reduce_mean(tf.maximum(v_opt_a, v_opt_b))
        r_theta = tf.exp(probs - old_probs)
        p_opt_a = r_theta * advantage
        p_opt_b = (
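The p_opt_b expression is cut off by the collapsed hunk above. As a minimal sketch of the standard PPO clipped surrogate it usually completes, reusing the names visible in this diff (decay_epsilon is the clip range; the free-function form and the omission of masks are simplifications, not the actual method):

import tensorflow as tf

def clipped_policy_loss(probs, old_probs, advantage, decay_epsilon):
    r_theta = tf.exp(probs - old_probs)                     # likelihood ratio
    p_opt_a = r_theta * advantage
    p_opt_b = (
        tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
        * advantage
    )
    # Negated because the surrogate objective is maximized.
    return -tf.reduce_mean(tf.minimum(p_opt_a, p_opt_b))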

        """
        run_out = {}
        action, log_probs, entropy = self.model.act(brain_info.vector_observations)
        action, log_probs, entropy = self.model(brain_info.vector_observations)
        run_out["action"] = action.numpy()
        run_out["log_probs"] = log_probs.numpy()
        run_out["entropy"] = entropy.numpy()

        }
        run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0)
        print(run_out["value_heads"])
        self.model.update_normalization(brain_info.vector_observations)
        return run_out
    def get_action(self, brain_info: BrainInfo) -> ActionInfo:

        obs = np.array(mini_batch["vector_obs"])
        values = self.model.get_values(obs)
        action, probs, entropy = self.model.act(obs)
        action, probs, entropy = self.model(obs)
            mini_batch["advantages"],
            np.array(mini_batch["advantages"]),
            mini_batch["action_probs"],
            np.array(mini_batch["action_probs"]),
            values,
            old_values,
            returns,

            1000,
        )
        print(grads)
        update_stats["loss"] = loss
        update_stats["Policy/Loss"] = loss
    def construct_feed_dict(self, model, mini_batch, num_sequences):
        feed_dict = {
            model.batch_size: num_sequences,
            model.sequence_length: self.sequence_length,
            model.mask_input: mini_batch["masks"],
            model.advantage: mini_batch["advantages"],
            model.all_old_log_probs: mini_batch["action_probs"],
        }
        for name in self.reward_signals:
            feed_dict[model.returns_holders[name]] = mini_batch[
                "{}_returns".format(name)
            ]
            feed_dict[model.old_values[name]] = mini_batch[
                "{}_value_estimates".format(name)
            ]
        if self.use_continuous_act:
            feed_dict[model.output_pre] = mini_batch["actions_pre"]
            feed_dict[model.epsilon] = mini_batch["random_normal_epsilon"]
        else:
            feed_dict[model.action_holder] = mini_batch["actions"]
            if self.use_recurrent:
                feed_dict[model.prev_action] = mini_batch["prev_action"]
            feed_dict[model.action_masks] = mini_batch["action_mask"]
        if self.use_vec_obs:
            feed_dict[model.vector_in] = mini_batch["vector_obs"]
        if self.model.vis_obs_size > 0:
            for i, _ in enumerate(self.model.visual_in):
                feed_dict[model.visual_in[i]] = mini_batch["visual_obs%d" % i]
        if self.use_recurrent:
            mem_in = [
                mini_batch["memory"][i]
                for i in range(0, len(mini_batch["memory"]), self.sequence_length)
            ]
            feed_dict[model.memory_in] = mem_in
        return feed_dict
    def get_value_estimates(
        self, brain_info: BrainInfo, idx: int, done: bool
    ) -> Dict[str, float]:

        :return: The value estimate dictionary with key being the name of the reward signal and the value the
        corresponding value estimate.
        """
        # feed_dict: Dict[tf.Tensor, Any] = {
        #     self.model.batch_size: 1,
        #     self.model.sequence_length: 1,
        # }
        # for i in range(len(brain_info.visual_observations)):
        #     feed_dict[self.model.visual_in[i]] = [
        #         brain_info.visual_observations[i][idx]
        #     ]
        # if self.use_vec_obs:
        #     feed_dict[self.model.vector_in] = [brain_info.vector_observations[idx]]
        # if self.use_recurrent:
        #     if brain_info.memories.shape[1] == 0:
        #         brain_info.memories = self.make_empty_memory(len(brain_info.agents))
        #     feed_dict[self.model.memory_in] = [brain_info.memories[idx]]
        # if not self.use_continuous_act and self.use_recurrent:
        #     feed_dict[self.model.prev_action] = [
        #         brain_info.previous_vector_actions[idx]
        #     ]
        value_estimates = self.model.get_values(
            np.expand_dims(brain_info.vector_observations[idx], 0)
        )
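In eager mode, get_values() presumably returns one tensor per reward-signal head (compare run_out["value_heads"] in evaluate above), while this method's signature promises Dict[str, float]. A hedged helper along these lines would bridge the two; the name and exact shapes are assumptions:

import numpy as np

def to_float_dict(value_heads) -> dict:
    # Flatten single-element value tensors into plain Python floats.
    return {name: float(np.squeeze(np.asarray(v))) for name, v in value_heads.items()}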
