
Somewhat running

/develop-pytorch
Ervin Teng, 5 years ago
Current commit 748c250e
5 files changed, with 115 insertions and 57 deletions
  1. ml-agents/mlagents/tf_utils/tf.py (2 changes)
  2. ml-agents/mlagents/trainers/ppo/policy.py (121 changes)
  3. ml-agents/mlagents/trainers/ppo/policy_old.py (2 changes)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (26 changes)
  5. ml-agents/mlagents/trainers/trainer.py (21 changes)

ml-agents/mlagents/tf_utils/tf.py (2 changes)


tf_flatten = tf.layers.flatten
tf_rnn = tf.nn.rnn_cell
tf.disable_v2_behavior()
# tf.disable_v2_behavior()
else:
import tensorflow.contrib.layers as c_layers
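For context, this file appears to alias TF1-style symbols behind a version check; a rough sketch of that pattern with v2 behavior left enabled, as the commented-out call above suggests (the version test and surrounding structure are assumptions, not the file's actual contents):

from distutils.version import LooseVersion
import tensorflow

if LooseVersion(tensorflow.__version__) >= LooseVersion("2.0.0"):
    import tensorflow.compat.v1 as tf  # keep the v1 symbol names available
    tf_flatten = tf.layers.flatten
    tf_rnn = tf.nn.rnn_cell
    # tf.disable_v2_behavior()  # left commented so eager execution stays on
else:
    import tensorflow as tf
    import tensorflow.contrib.layers as c_layers
    tf_flatten = c_layers.flatten
    tf_rnn = tf.nn.rnn_cell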

ml-agents/mlagents/trainers/ppo/policy.py (121 changes)


import logging
# import numpy as np
import numpy as np
from mlagents.tf_utils import tf
import tensorflow as tf
from mlagents.envs.action_info import ActionInfo
from mlagents.trainers.models import EncoderType # , LearningRateSchedule
# from mlagents.trainers.ppo.models import PPOModel

self.mu = tf.keras.layers.Dense(num_outputs)
self.log_sigma_sq = tf.keras.layers.Dense(num_outputs)
def call(self, inputs, epsilon):
def call(self, inputs):
return tfp.distrbutions.Normal(loc=mu, scale=tf.sqrt(tf.exp(log_sig)))
return tfp.distributions.Normal(loc=mu, scale=tf.sqrt(tf.exp(log_sig)))
# action = mu + tf.sqrt(tf.exp(log_sig)) + epsilon
# def log_probs(self, inputs) # Compute probability of model output.
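Pulling the lines of this hunk together, the Gaussian action head could look roughly like the following standalone Keras layer; the class name, base class, and the tensorflow_probability import are assumptions beyond what the diff shows:

import tensorflow as tf
import tensorflow_probability as tfp

class GaussianDistribution(tf.keras.layers.Layer):  # hypothetical name
    """Maps a hidden vector to a diagonal Gaussian over continuous actions."""

    def __init__(self, num_outputs):
        super().__init__()
        self.mu = tf.keras.layers.Dense(num_outputs)
        self.log_sigma_sq = tf.keras.layers.Dense(num_outputs)

    def call(self, inputs):
        mu = self.mu(inputs)
        log_sig = self.log_sigma_sq(inputs)
        # scale is the standard deviation: sqrt(exp(log sigma^2))
        return tfp.distributions.Normal(loc=mu, scale=tf.sqrt(tf.exp(log_sig)))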

self.create_model(
brain, trainer_params, reward_signal_configs, is_training, load, seed
)
self.brain = brain
self.sequence_length = (
    1
    if not self.trainer_params["use_recurrent"]
    else self.trainer_params["sequence_length"]
)
self.global_step = tf.Variable(0)
self.create_reward_signals(reward_signal_configs)
# with self.graph.as_default():

:param seed: Random seed.
"""
self.model = ActorCriticPolicy(
brain=brain,
act_size=sum(brain.vector_action_space_size),
m_size=self.m_size,
m_size=trainer_params["memory_size"],
stream_names=list(reward_signal_configs.keys()),
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")

entropy,
beta,
epsilon,
lr,
max_step,
):
"""
Creates training-specific Tensorflow ops for PPO models.

# )
decay_epsilon = self.trainer_params["epsilon"]
decay_beta = self.trainer_params["beta"]
# max_step = self.trainer_params["max_step"]
value_losses = []
for name, head in values.items():

decay_epsilon,
)
v_opt_a = tf.squared_difference(returns[name], tf.reduce_sum(head, axis=1))
v_opt_b = tf.squared_difference(returns[name], clipped_value_estimate)
v_opt_a = tf.math.squared_difference(returns[name], tf.reduce_sum(head, axis=1))
v_opt_b = tf.math.squared_difference(returns[name], clipped_value_estimate)
value_loss = tf.reduce_mean(
tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), masks, 2)[1]
)

tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
* advantage
)
policy_loss = -tf.reduce_mean(
tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), masks, 2)[1]
)
policy_loss = -tf.reduce_mean(tf.minimum(p_opt_a, p_opt_b))
# For cleaner stats reporting
# abs_policy_loss = tf.abs(policy_loss)

- decay_beta * tf.reduce_mean(tf.dynamic_partition(entropy, masks, 2)[1])
- decay_beta * tf.reduce_mean(entropy)
)
return loss
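Taken together, these pieces follow the standard clipped PPO objective; below is a compact, hypothetical sketch of that loss in eager mode, with the tf.dynamic_partition masking dropped as in the simplified lines above (the function signature and the 0.5 value-loss weight are assumptions):

def ppo_loss_sketch(advantage, r_theta, returns, value_heads, old_values,
                    entropy, decay_epsilon, decay_beta):
    # Clipped value loss, one term per reward-signal value head.
    value_losses = []
    for name, head in value_heads.items():
        value_estimate = tf.reduce_sum(head, axis=1)
        clipped_value_estimate = old_values[name] + tf.clip_by_value(
            value_estimate - old_values[name], -decay_epsilon, decay_epsilon
        )
        v_opt_a = tf.math.squared_difference(returns[name], value_estimate)
        v_opt_b = tf.math.squared_difference(returns[name], clipped_value_estimate)
        value_losses.append(tf.reduce_mean(tf.maximum(v_opt_a, v_opt_b)))
    value_loss = tf.reduce_mean(value_losses)

    # Clipped policy (surrogate) loss.
    p_opt_a = r_theta * advantage
    p_opt_b = tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage
    policy_loss = -tf.reduce_mean(tf.minimum(p_opt_a, p_opt_b))

    # Entropy bonus encourages exploration; beta decays over training.
    return policy_loss + 0.5 * value_loss - decay_beta * tf.reduce_mean(entropy)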

:param reward_signal_configs: Reward signal config.
"""
self.reward_signals = {}
with self.graph.as_default():
# Create reward signals
for reward_signal, config in reward_signal_configs.items():
self.reward_signals[reward_signal] = create_reward_signal(
self, self.model, reward_signal, config
)
self.update_dict.update(self.reward_signals[reward_signal].update_dict)
# with self.graph.as_default():
# Create reward signals
for reward_signal, config in reward_signal_configs.items():
self.reward_signals[reward_signal] = create_reward_signal(
self, self.model, reward_signal, config
)
self.update_dict.update(self.reward_signals[reward_signal].update_dict)
@timed
def evaluate(self, brain_info):

"""
run_out = {}
run_out["action"], run_out["log_probs"], run_out["entropy"] = self.model.act(
brain_info.vector_observations
)
run_out["value_heads"] = self.model.get_values(brain_info.vector_observations)
run_out["value"] = tf.reduce_mean(list(self.value_heads.values()), 0)
action, log_probs, entropy = self.model.act(brain_info.vector_observations)
run_out["action"] = action.numpy()
run_out["log_probs"] = log_probs.numpy()
run_out["entropy"] = entropy.numpy()
run_out["value_heads"] = {
name: t.numpy()
for name, t in self.model.get_values(brain_info.vector_observations).items()
}
run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0)
print(run_out["value_heads"])
def get_action(self, brain_info: BrainInfo) -> ActionInfo:
"""
Decides actions given observations information, and takes them in environment.
:param brain_info: A dictionary of brain names and BrainInfo from environment.
:return: an ActionInfo containing action, memories, values and an object
to be passed to add experiences
"""
if len(brain_info.agents) == 0:
return ActionInfo([], [], None)
run_out = self.evaluate(brain_info) # pylint: disable=assignment-from-no-return
return ActionInfo(
action=run_out.get("action"), value=run_out.get("value"), outputs=run_out
)
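Note that in eager mode the model returns EagerTensors, which evaluate() converts with .numpy() before they reach the trainer's buffers; for example:

import tensorflow as tf
t = tf.constant([[0.1, -0.2]])
arr = t.numpy()  # EagerTensor -> NumPy array, as done for run_out["action"] above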
@timed
def update(self, mini_batch, num_sequences):
"""

returns[name] = mini_batch["{}_returns".format(name)]
old_values[name] = mini_batch["{}_value_estimates".format(name)]
values = self.model.get_values(mini_batch["vector_obs"])
action, probs, entropy = self.model.act(mini_batch["vector_obs"])
obs = np.array(mini_batch["vector_obs"])
values = self.model.get_values(obs)
action, probs, entropy = self.model.act(obs)
loss = self.ppo_loss(
mini_batch["advantages"],
probs,

returns,
mini_batch["masks"],
np.array(mini_batch["masks"], dtype=np.uint32),
entropy,
1e-3,
1000,

print(grads)
self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
update_stats = {}
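The apply_gradients line implies the loss was computed under a tf.GradientTape; a generic sketch of that eager pattern (not the file's actual code; model, optimizer, and loss_fn are stand-ins):

import tensorflow as tf

def gradient_step(model, optimizer, loss_fn, inputs):
    # Record the forward pass so the loss can be differentiated eagerly.
    with tf.GradientTape() as tape:
        loss = loss_fn(model, inputs)
    # Same update pattern as above: pair each gradient with its trainable weight.
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss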

# feed_dict[self.model.prev_action] = [
# brain_info.previous_vector_actions[idx]
# ]
value_estimates = self.model.get_values(brain_info.vector_observations[idx])
value_estimates = self.model.get_values(np.expand_dims(brain_info.vector_observations[idx],0))
value_estimates = {k: float(v) for k, v in value_estimates.items()}

value_estimates[k] = 0.0
return value_estimates
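The np.expand_dims(..., 0) change above adds the batch dimension a Keras model expects when evaluating a single observation; for example:

import numpy as np
obs = np.zeros(8, dtype=np.float32)   # one vector observation (size is illustrative)
batched = np.expand_dims(obs, 0)      # shape (1, 8): a batch of one for get_values()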
@property
def vis_obs_size(self):
return self.brain.number_visual_observations
@property
def vec_obs_size(self):
return self.brain.vector_observation_space_size
@property
def use_vis_obs(self):
return self.vis_obs_size > 0
@property
def use_vec_obs(self):
return self.vec_obs_size > 0
@property
def use_recurrent(self):
return False
@property
def use_continuous_act(self):
return True
def get_current_step(self):
"""
Gets current model step.
:return: current model step.
"""
step = self.global_step.numpy()
return step
def increment_step(self, n_steps):
"""
Increments model step.
"""
self.global_step.assign(self.global_step + n_steps)
return self.get_current_step()
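With eager execution the step counter is a plain tf.Variable, so reading and incrementing it needs no session or increment op; a quick usage sketch:

import tensorflow as tf
global_step = tf.Variable(0)
global_step.assign(global_step + 5)   # increment_step(5)
print(int(global_step.numpy()))       # get_current_step() -> 5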

ml-agents/mlagents/trainers/ppo/policy_old.py (2 changes)


brain, trainer_params, reward_signal_configs, is_training, load, seed
)
self.create_reward_signals(reward_signal_configs)
self.trainer_params = trainer_params
with self.graph.as_default():
self.bc_module: Optional[BCModule] = None
# Create pretrainer if needed

ml-agents/mlagents/trainers/ppo/trainer.py (26 changes)


:param next_info: Dictionary of all next brains and corresponding BrainInfo.
"""
info = next_info[self.brain_name]
if self.is_training:
self.policy.update_normalization(info.vector_observations)
# if self.is_training:
# self.policy.update_normalization(info.vector_observations)
for l in range(len(info.agents)):
agent_actions = self.training_buffer[info.agents[l]]["actions"]
if (

Takes the output of the last action and store it into the training buffer.
"""
actions = take_action_outputs["action"]
if self.policy.use_continuous_act:
actions_pre = take_action_outputs["pre_action"]
self.training_buffer[agent_id]["actions_pre"].append(actions_pre[agent_idx])
epsilons = take_action_outputs["random_normal_epsilon"]
self.training_buffer[agent_id]["random_normal_epsilon"].append(
epsilons[agent_idx]
)
# if self.policy.use_continuous_act:
# actions_pre = take_action_outputs["pre_action"]
# self.training_buffer[agent_id]["actions_pre"].append(actions_pre[agent_idx])
# epsilons = take_action_outputs["random_normal_epsilon"]
# self.training_buffer[agent_id]["random_normal_epsilon"].append(
# epsilons[agent_idx]
# )
a_dist = take_action_outputs["log_probs"]
# value is a dictionary from name of reward to value estimate of the value head
self.training_buffer[agent_id]["actions"].append(actions[agent_idx])

for stat, stat_list in batch_update_stats.items():
self.stats[stat].append(np.mean(stat_list))
if self.policy.bc_module:
update_stats = self.policy.bc_module.update()
for stat, val in update_stats.items():
self.stats[stat].append(val)
# if self.policy.bc_module:
# update_stats = self.policy.bc_module.update()
# for stat, val in update_stats.items():
# self.stats[stat].append(val)
self.clear_update_buffer()
self.trainer_metrics.end_policy_update()

ml-agents/mlagents/trainers/trainer.py (21 changes)


import logging
from typing import Dict, List, Deque, Any
import os
from mlagents.tf_utils import tf
import tensorflow as tf
import numpy as np
from collections import deque, defaultdict

self.trainer_metrics = TrainerMetrics(
path=self.summary_path + ".csv", brain_name=self.brain_name
)
self.summary_writer = tf.summary.FileWriter(self.summary_path)
self.summary_writer = tf.summary.create_file_writer(self.summary_path)
self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
self.policy: TFPolicy = None
self.step: int = 0

self.run_id, self.brain_name, step, is_training
)
)
summary = tf.Summary()
for key in self.stats:
if len(self.stats[key]) > 0:
stat_mean = float(np.mean(self.stats[key]))
summary.value.add(tag="{}".format(key), simple_value=stat_mean)
self.stats[key] = []
summary.value.add(tag="Environment/Lesson", simple_value=lesson_num)
self.summary_writer.add_summary(summary, step)
self.summary_writer.flush()
with self.summary_writer.as_default():
for key in self.stats:
if len(self.stats[key]) > 0:
stat_mean = float(np.mean(self.stats[key]))
tf.summary.scalar("{}".format(key), stat_mean, step=step)
self.stats[key] = []
tf.summary.scalar("Environment/Lesson", lesson_num, step)
def write_tensorboard_text(self, key: str, input_dict: Dict[str, Any]) -> None:
"""

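For reference, the summary-writer change above maps the TF1 FileWriter / tf.Summary pair onto the TF2 API: scalars are written inside writer.as_default() with an explicit step. A minimal usage sketch (path, tag, and values are illustrative):

import tensorflow as tf

writer = tf.summary.create_file_writer("./summaries/some_run")  # path is illustrative
with writer.as_default():
    # Each tf.summary.scalar call replaces a summary.value.add(...) entry.
    tf.summary.scalar("Environment/Lesson", 0, step=0)
writer.flush()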