浏览代码

Update reward signals in parallel with policy (#2362)

/develop-gpu-test
GitHub 6 年前
当前提交
bd7eb286
共有 13 个文件被更改,包括 334 次插入346 次删除
  1. 31
      docs/Reward-Signals.md
  2. 91
      ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py
  3. 17
      ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py
  4. 29
      ml-agents/mlagents/trainers/components/reward_signals/gail/model.py
  5. 173
      ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py
  6. 24
      ml-agents/mlagents/trainers/components/reward_signals/reward_signal.py
  7. 8
      ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py
  8. 2
      ml-agents/mlagents/trainers/ppo/models.py
  9. 130
      ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py
  10. 129
      ml-agents/mlagents/trainers/ppo/policy.py
  11. 29
      ml-agents/mlagents/trainers/ppo/trainer.py
  12. 10
      ml-agents/mlagents/trainers/tests/test_multigpu.py
  13. 7
      ml-agents/mlagents/trainers/tests/test_reward_signals.py

31
docs/Reward-Signals.md


### Curiosity Reward Signal
The `curiosity` reward signal enables the Intrinsic Curiosity Module. This is an implementation
The `curiosity` Reward Signal enables the Intrinsic Curiosity Module. This is an implementation
of the approach described in "Curiosity-driven Exploration by Self-supervised Prediction"
by Pathak, et al. It trains two networks:
* an inverse model, which takes the current and next obersvation of the agent, encodes them, and

Default Value: `3e-4`
Typical Range: `1e-5` - `1e-3`
#### (Optional) Num Epochs
`num_epoch` The number of passes to make through the experience buffer when performing gradient
descent optimization for the ICM. This typically should be set to the same as used for PPO.
Default Value: `3`
Typical Range: `3` - `10`
### GAIL Reward Signal

unstable, or unable to learn the task at hand.
Default Value: `false`
#### (Optional) Samples Per Update
`samples_per_update` is the maximum number of samples to use during each discriminator update. You may
want to lower this if your buffer size is very large to avoid overfitting the discriminator on current data.
If set to 0, we will use the minimum of buffer size and the number of demonstration samples.
Default Value: `0`
Typical Range: Approximately equal to [`buffer_size`](Training-PPO.md)
#### (Optional) Num Epochs
`num_epoch` The number of passes to make through the experience buffer when performing gradient
descent optimization for the discriminator. To avoid overfitting, this typically should be set to
the same as or less than used for PPO.
Default Value: `3`
Typical Range: `1` - `10`

91
ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py


import numpy as np
from mlagents.envs.brain import BrainInfo
import tensorflow as tf
from mlagents.trainers.models import LearningModel
class CuriosityRewardSignal(RewardSignal):

policy_model: LearningModel,
num_epoch: int = 3,
):
"""
Creates the Curiosity reward generator

:param gamma: The time discounting factor used for this reward.
:param encoding_size: The size of the hidden encoding layer for the ICM
:param learning_rate: The learning rate for the ICM.
:param num_epoch: The number of epochs to train over the training buffer for the ICM.
super().__init__(policy, strength, gamma)
super().__init__(policy, policy_model, strength, gamma)
policy.model, encoding_size=encoding_size, learning_rate=learning_rate
policy_model, encoding_size=encoding_size, learning_rate=learning_rate
self.num_epoch = num_epoch
"forward_loss": self.model.forward_loss,
"inverse_loss": self.model.inverse_loss,
"update": self.model.update_batch,
"curiosity_forward_loss": self.model.forward_loss,
"curiosity_inverse_loss": self.model.inverse_loss,
"curiosity_update": self.model.update_batch,
}
self.stats_name_to_update_name = {
"Losses/Curiosity Forward Loss": "curiosity_forward_loss",
"Losses/Curiosity Inverse Loss": "curiosity_inverse_loss",
}
self.has_updated = False

param_keys = ["strength", "gamma", "encoding_size"]
super().check_config(config_dict, param_keys)
def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]:
"""
Updates Curiosity model using training buffer. Divides training buffer into mini batches and performs
gradient descent.
:param update_buffer: Update buffer from which to pull data from.
:param num_sequences: Number of sequences in the update buffer.
:return: Dict of stats that should be reported to Tensorboard.
def prepare_update(
self,
policy_model: LearningModel,
mini_batch: Dict[str, np.ndarray],
num_sequences: int,
) -> Dict[tf.Tensor, Any]:
forward_total: List[float] = []
inverse_total: List[float] = []
for _ in range(self.num_epoch):
update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = update_buffer
for l in range(len(update_buffer["actions"]) // num_sequences):
start = l * num_sequences
end = (l + 1) * num_sequences
run_out_curio = self._update_batch(
buffer.make_mini_batch(start, end), num_sequences
)
inverse_total.append(run_out_curio["inverse_loss"])
forward_total.append(run_out_curio["forward_loss"])
update_stats = {
"Losses/Curiosity Forward Loss": np.mean(forward_total),
"Losses/Curiosity Inverse Loss": np.mean(inverse_total),
}
return update_stats
def _update_batch(
self, mini_batch: Dict[str, np.ndarray], num_sequences: int
) -> Dict[str, float]:
"""
Updates model using buffer.
Prepare for update and get feed_dict.
:return: Output from update process.
:return: Feed_dict needed for update.
self.policy.model.batch_size: num_sequences,
self.policy.model.sequence_length: self.policy.sequence_length,
self.policy.model.mask_input: mini_batch["masks"],
self.policy.model.advantage: mini_batch["advantages"],
self.policy.model.all_old_log_probs: mini_batch["action_probs"],
policy_model.batch_size: num_sequences,
policy_model.sequence_length: self.policy.sequence_length,
policy_model.mask_input: mini_batch["masks"],
policy_model.advantage: mini_batch["advantages"],
policy_model.all_old_log_probs: mini_batch["action_probs"],
feed_dict[self.policy.model.output_pre] = mini_batch["actions_pre"]
feed_dict[policy_model.output_pre] = mini_batch["actions_pre"]
feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
feed_dict[policy_model.action_holder] = mini_batch["actions"]
feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
if self.policy.model.vis_obs_size > 0:
for i, _ in enumerate(self.policy.model.visual_in):
feed_dict[self.policy.model.visual_in[i]] = mini_batch[
"visual_obs%d" % i
]
for i, _ in enumerate(self.policy.model.visual_in):
if policy_model.vis_obs_size > 0:
for i, _ in enumerate(policy_model.visual_in):
feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i]
for i, _ in enumerate(policy_model.visual_in):
run_out = self.policy._execute_model(feed_dict, self.update_dict)
return run_out
return feed_dict

17
ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py


from mlagents.trainers.buffer import Buffer
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.models import LearningModel
def __init__(self, policy: TFPolicy, strength: float, gamma: float):
def __init__(
self,
policy: TFPolicy,
policy_model: LearningModel,
strength: float,
gamma: float,
):
"""
The extrinsic reward generator. Returns the reward received by the environment
:param policy: The Policy object (e.g. PPOPolicy) that this Reward Signal will apply to.

"""
super().__init__(policy, strength, gamma)
super().__init__(policy, policy_model, strength, gamma)
@classmethod
def check_config(

unscaled_reward = np.array(next_info.rewards)
scaled_reward = self.strength * unscaled_reward
return RewardSignalResult(scaled_reward, unscaled_reward)
def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]:
"""
This method does nothing, as there is nothing to update.
"""
return {}

29
ml-agents/mlagents/trainers/components/reward_signals/gail/model.py


self.gradient_penalty_weight = gradient_penalty_weight
self.use_vail = use_vail
self.use_actions = use_actions # True # Not using actions
self.make_beta()
if self.use_vail:
self.make_beta_update()
def make_beta(self) -> None:
def make_beta_update(self) -> None:
self.beta = tf.get_variable(
"gail_beta",
[],
trainable=False,
dtype=tf.float32,
initializer=tf.ones_initializer(),
)
self.kl_div_input = tf.placeholder(shape=[], dtype=tf.float32)
self.beta + self.alpha * (self.kl_div_input - self.mutual_information),
EPSILON,
self.beta + self.alpha * (self.kl_loss - self.mutual_information), EPSILON
self.update_beta = tf.assign(self.beta, new_beta)
with tf.control_dependencies(self.update_batch):
self.update_beta = tf.assign(self.beta, new_beta)
def make_inputs(self) -> None:
"""

"""
self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
if self.use_vail:
self.beta = tf.get_variable(
"gail_beta",
[],
trainable=False,
dtype=tf.float32,
initializer=tf.ones_initializer(),
)
self.discriminator_loss = -tf.reduce_mean(
tf.log(self.expert_estimate + EPSILON)

173
ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py


from mlagents.trainers.buffer import Buffer
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.models import LearningModel
from .model import GAILModel
from mlagents.trainers.demo_loader import demo_to_buffer

def __init__(
self,
policy: TFPolicy,
policy_model: LearningModel,
num_epoch: int = 3,
samples_per_update: int = 0,
use_actions: bool = False,
use_vail: bool = False,
):

:param num_epoch: The number of epochs to train over the training buffer for the discriminator.
:param encoding_size: The size of the the hidden layers of the discriminator
:param learning_rate: The Learning Rate used during GAIL updates.
:param samples_per_update: The maximum number of samples to update during GAIL updates.
super().__init__(policy, strength, gamma)
self.num_epoch = num_epoch
self.samples_per_update = samples_per_update
super().__init__(policy, policy_model, strength, gamma)
self.use_terminal_states = False
self.model = GAILModel(

self.has_updated = False
self.update_dict: Dict[str, tf.Tensor] = {
"gail_loss": self.model.loss,
"gail_update_batch": self.model.update_batch,
"gail_policy_estimate": self.model.policy_estimate,
"gail_expert_estimate": self.model.expert_estimate,
}
if self.model.use_vail:
self.update_dict["kl_loss"] = self.model.kl_loss
self.update_dict["z_log_sigma_sq"] = self.model.z_log_sigma_sq
self.update_dict["z_mean_expert"] = self.model.z_mean_expert
self.update_dict["z_mean_policy"] = self.model.z_mean_policy
self.update_dict["beta_update"] = self.model.update_beta
self.stats_name_to_update_name = {"Losses/GAIL Loss": "gail_loss"}
def evaluate(
self, current_info: BrainInfo, next_info: BrainInfo

param_keys = ["strength", "gamma", "demo_path"]
super().check_config(config_dict, param_keys)
def update(self, update_buffer: Buffer, n_sequences: int) -> Dict[str, float]:
"""
Updates model using buffer.
:param update_buffer: The policy buffer containing the trajectories for the current policy.
:param n_sequences: The number of sequences from demo and policy used in each mini batch.
:return: The loss of the update.
"""
batch_losses = []
# Divide by 2 since we have two buffers, so we have roughly the same batch size
n_sequences = max(n_sequences // 2, 1)
possible_demo_batches = (
len(self.demonstration_buffer.update_buffer["actions"]) // n_sequences
)
possible_policy_batches = len(update_buffer["actions"]) // n_sequences
possible_batches = min(possible_policy_batches, possible_demo_batches)
max_batches = self.samples_per_update // n_sequences
kl_loss = []
policy_estimate = []
expert_estimate = []
z_log_sigma_sq = []
z_mean_expert = []
z_mean_policy = []
n_epoch = self.num_epoch
for _epoch in range(n_epoch):
self.demonstration_buffer.update_buffer.shuffle(
sequence_length=self.policy.sequence_length
)
update_buffer.shuffle(sequence_length=self.policy.sequence_length)
if max_batches == 0:
num_batches = possible_batches
else:
num_batches = min(possible_batches, max_batches)
for i in range(num_batches):
demo_update_buffer = self.demonstration_buffer.update_buffer
policy_update_buffer = update_buffer
start = i * n_sequences
end = (i + 1) * n_sequences
mini_batch_demo = demo_update_buffer.make_mini_batch(start, end)
mini_batch_policy = policy_update_buffer.make_mini_batch(start, end)
run_out = self._update_batch(mini_batch_demo, mini_batch_policy)
loss = run_out["gail_loss"]
policy_estimate.append(run_out["policy_estimate"])
expert_estimate.append(run_out["expert_estimate"])
if self.model.use_vail:
kl_loss.append(run_out["kl_loss"])
z_log_sigma_sq.append(run_out["z_log_sigma_sq"])
z_mean_policy.append(run_out["z_mean_policy"])
z_mean_expert.append(run_out["z_mean_expert"])
batch_losses.append(loss)
self.has_updated = True
print_list = ["n_epoch", "beta", "policy_estimate", "expert_estimate"]
print_vals = [
n_epoch,
self.policy.sess.run(self.model.beta),
np.mean(policy_estimate),
np.mean(expert_estimate),
]
if self.model.use_vail:
print_list += [
"kl_loss",
"z_mean_expert",
"z_mean_policy",
"z_log_sigma_sq",
]
print_vals += [
np.mean(kl_loss),
np.mean(z_mean_expert),
np.mean(z_mean_policy),
np.mean(z_log_sigma_sq),
]
LOGGER.debug(
"GAIL Debug:\n\t\t"
+ "\n\t\t".join(
"{0}: {1}".format(_name, _val)
for _name, _val in zip(print_list, print_vals)
)
)
update_stats = {"Losses/GAIL Loss": np.mean(batch_losses)}
return update_stats
def _update_batch(
def prepare_update(
mini_batch_demo: Dict[str, np.ndarray],
policy_model: LearningModel,
) -> Dict[str, float]:
num_sequences: int,
) -> Dict[tf.Tensor, Any]:
Helper method for update.
Prepare inputs for update. .
:return: Output from update process.
:return: Feed_dict for update process.
max_num_experiences = min(
len(mini_batch_policy["actions"]),
len(self.demonstration_buffer.update_buffer["actions"]),
)
# If num_sequences is less, we need to shorten the input batch.
for key, element in mini_batch_policy.items():
mini_batch_policy[key] = element[:max_num_experiences]
# Get demo buffer
self.demonstration_buffer.update_buffer.shuffle(1)
# TODO: Replace with SAC sample method
mini_batch_demo = self.demonstration_buffer.update_buffer.make_mini_batch(
0, len(mini_batch_policy["actions"])
)
feed_dict: Dict[tf.Tensor, Any] = {
self.model.done_expert_holder: mini_batch_demo["done"],
self.model.done_policy_holder: mini_batch_policy["done"],

feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
if self.policy.use_continuous_act:
feed_dict[self.policy.model.selected_actions] = mini_batch_policy["actions"]
feed_dict[policy_model.selected_actions] = mini_batch_policy["actions"]
feed_dict[self.policy.model.action_holder] = mini_batch_policy["actions"]
feed_dict[policy_model.action_holder] = mini_batch_policy["actions"]
for i in range(len(self.policy.model.visual_in)):
feed_dict[self.policy.model.visual_in[i]] = mini_batch_policy[
for i in range(len(policy_model.visual_in)):
feed_dict[policy_model.visual_in[i]] = mini_batch_policy[
"visual_obs%d" % i
]
feed_dict[self.model.expert_visual_in[i]] = mini_batch_demo[

feed_dict[self.policy.model.vector_in] = mini_batch_policy["vector_obs"]
feed_dict[policy_model.vector_in] = mini_batch_policy["vector_obs"]
out_dict = {
"gail_loss": self.model.loss,
"update_batch": self.model.update_batch,
"policy_estimate": self.model.policy_estimate,
"expert_estimate": self.model.expert_estimate,
}
if self.model.use_vail:
out_dict["kl_loss"] = self.model.kl_loss
out_dict["z_log_sigma_sq"] = self.model.z_log_sigma_sq
out_dict["z_mean_expert"] = self.model.z_mean_expert
out_dict["z_mean_policy"] = self.model.z_mean_policy
run_out = self.policy.sess.run(out_dict, feed_dict=feed_dict)
if self.model.use_vail:
self.update_beta(run_out["kl_loss"])
return run_out
def update_beta(self, kl_div: float) -> None:
"""
Updates the Beta parameter with the latest kl_divergence value.
The larger Beta, the stronger the importance of the kl divergence in the loss function.
:param kl_div: The KL divergence
"""
self.policy.sess.run(
self.model.update_beta, feed_dict={self.model.kl_div_input: kl_div}
)
self.has_updated = True
return feed_dict

24
ml-agents/mlagents/trainers/components/reward_signals/reward_signal.py


from mlagents.envs.brain import BrainInfo
from mlagents.trainers.trainer import UnityTrainerException
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.models import LearningModel
from mlagents.trainers.buffer import Buffer
logger = logging.getLogger("mlagents.trainers")

class RewardSignal(abc.ABC):
def __init__(self, policy: TFPolicy, strength: float, gamma: float):
def __init__(
self,
policy: TFPolicy,
policy_model: LearningModel,
strength: float,
gamma: float,
):
"""
Initializes a reward signal. At minimum, you must pass in the policy it is being applied to,
the reward strength, and the gamma (discount factor.)

# Terminate discounted reward computation at Done. Can disable to mitigate positive bias in rewards with
# no natural end, e.g. GAIL or Curiosity
self.use_terminal_states = True
self.update_dict: Dict[str, tf.Tensor] = {}
self.policy_model = policy_model
self.stats_name_to_update_name: Dict[str, str] = {}
def evaluate(
self, current_info: BrainInfo, next_info: BrainInfo

np.zeros(len(current_info.agents)),
)
def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]:
def prepare_update(
self,
policy_model: LearningModel,
mini_batch: Dict[str, np.ndarray],
num_sequences: int,
) -> Dict[tf.Tensor, Any]:
If the reward signal has an internal model (e.g. GAIL or Curiosity), update that model.
If the reward signal has an internal model (e.g. GAIL or Curiosity), get the feed_dict
needed to update the buffer..
:return: A dict of {"Stat Name": stat} to be added to Tensorboard
:return: A dict that corresponds to the feed_dict needed for the update.
"""
return {}

8
ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py


CuriosityRewardSignal,
)
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.models import LearningModel
logger = logging.getLogger("mlagents.trainers")

def create_reward_signal(
policy: TFPolicy, name: str, config_entry: Dict[str, Any]
policy: TFPolicy,
policy_model: LearningModel,
name: str,
config_entry: Dict[str, Any],
) -> RewardSignal:
"""
Creates a reward signal class based on the name and config entry provided as a dict.

raise UnityTrainerException("Unknown reward signal type {0}".format(name))
rcls.check_config(config_entry)
try:
class_inst = rcls(policy, **config_entry)
class_inst = rcls(policy, policy_model, **config_entry)
except TypeError:
raise UnityTrainerException(
"Unknown parameters given for reward signal {0}".format(name)

2
ml-agents/mlagents/trainers/ppo/models.py


self.policy_loss = -tf.reduce_mean(
tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.mask, 2)[1]
)
# For cleaner stats reporting
self.abs_policy_loss = tf.abs(self.policy_loss)
self.loss = (
self.policy_loss

130
ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py


"""
super().__init__(seed, brain, trainer_params, is_training, load)
with self.graph.as_default():
avg_grads = self.average_gradients([t.grads for t in self.towers])
self.update_batch = self.model.optimizer.apply_gradients(avg_grads)
self.update_dict = {"update_batch": self.update_batch}
self.update_dict.update(
{
"value_loss_" + str(i): self.towers[i].value_loss
for i in range(len(self.towers))
}
)
self.update_dict.update(
{
"policy_loss_" + str(i): self.towers[i].policy_loss
for i in range(len(self.towers))
}
)
def create_model(self, brain, trainer_params, reward_signal_configs, seed):
def create_model(
self, brain, trainer_params, reward_signal_configs, is_training, load, seed
):
"""
Create PPO models, one on each device
:param brain: Assigned Brain object.

self.towers[-1].create_ppo_optimizer()
self.model = self.towers[0]
avg_grads = self.average_gradients([t.grads for t in self.towers])
update_batch = self.model.optimizer.apply_gradients(avg_grads)
avg_value_loss = tf.reduce_mean(
tf.stack([model.value_loss for model in self.towers]), 0
)
avg_policy_loss = tf.reduce_mean(
tf.stack([model.policy_loss for model in self.towers]), 0
)
self.inference_dict.update(
{
"action": self.model.output,
"log_probs": self.model.all_log_probs,
"value_heads": self.model.value_heads,
"value": self.model.value,
"entropy": self.model.entropy,
"learning_rate": self.model.learning_rate,
}
)
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.model.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.model.memory_out
if (
is_training
and self.use_vec_obs
and trainer_params["normalize"]
and not load
):
self.inference_dict["update_mean"] = self.model.update_normalization
self.total_policy_loss = self.model.abs_policy_loss
self.update_dict.update(
{
"value_loss": avg_value_loss,
"policy_loss": avg_policy_loss,
"update_batch": update_batch,
}
)
def create_reward_signals(self, reward_signal_configs):
"""
Create reward signals
:param reward_signal_configs: Reward signal config.
"""
self.reward_signal_towers = []
with self.graph.as_default():
with tf.variable_scope(TOWER_SCOPE_NAME, reuse=tf.AUTO_REUSE):
for device_id, device in enumerate(self.devices):
with tf.device(device):
reward_tower = {}
for reward_signal, config in reward_signal_configs.items():
reward_tower[reward_signal] = create_reward_signal(
self, self.towers[device_id], reward_signal, config
)
for k, v in reward_tower[reward_signal].update_dict.items():
self.update_dict[k + "_" + str(device_id)] = v
self.reward_signal_towers.append(reward_tower)
for _, reward_tower in self.reward_signal_towers[0].items():
for _, update_key in reward_tower.stats_name_to_update_name.items():
all_reward_signal_stats = tf.stack(
[
self.update_dict[update_key + "_" + str(i)]
for i in range(len(self.towers))
]
)
mean_reward_signal_stats = tf.reduce_mean(
all_reward_signal_stats, 0
)
self.update_dict.update({update_key: mean_reward_signal_stats})
self.reward_signals = self.reward_signal_towers[0]
@timed
def update(self, mini_batch, num_sequences):
"""

:return: Output from update process.
"""
feed_dict = {}
stats_needed = self.stats_name_to_update_name
{k: v[i : i + device_batch_size] for (k, v) in mini_batch.items()}
{
k: v[
i * device_batch_size : i * device_batch_size
+ device_batch_size
]
for (k, v) in mini_batch.items()
}
for batch, tower in zip(device_batches, self.towers):
for batch, tower, reward_tower in zip(
device_batches, self.towers, self.reward_signal_towers
):
stats_needed.update(self.stats_name_to_update_name)
for _, reward_signal in reward_tower.items():
feed_dict.update(
reward_signal.prepare_update(tower, batch, num_sequences)
)
stats_needed.update(reward_signal.stats_name_to_update_name)
out = self._execute_model(feed_dict, self.update_dict)
run_out = {}
run_out["value_loss"] = np.mean(
[out["value_loss_" + str(i)] for i in range(len(self.towers))]
)
run_out["policy_loss"] = np.mean(
[out["policy_loss_" + str(i)] for i in range(len(self.towers))]
)
run_out["update_batch"] = out["update_batch"]
return run_out
update_vals = self._execute_model(feed_dict, self.update_dict)
update_stats = {}
for stat_name, update_name in stats_needed.items():
update_stats[stat_name] = update_vals[update_name]
return update_stats
def average_gradients(self, tower_grads):
"""

129
ml-agents/mlagents/trainers/ppo/policy.py


super().__init__(seed, brain, trainer_params)
reward_signal_configs = trainer_params["reward_signals"]
self.inference_dict = {}
self.update_dict = {}
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
}
self.create_model(brain, trainer_params, reward_signal_configs, seed)
self.create_model(
brain, trainer_params, reward_signal_configs, is_training, load, seed
)
self.create_reward_signals(reward_signal_configs)
self.reward_signals = {}
# Create reward signals
for reward_signal, config in reward_signal_configs.items():
self.reward_signals[reward_signal] = create_reward_signal(
self, reward_signal, config
)
# Create pretrainer if needed
if "pretraining" in trainer_params:
BCModule.check_config(trainer_params["pretraining"])

else:
self._initialize_graph()
self.inference_dict = {
"action": self.model.output,
"log_probs": self.model.all_log_probs,
"value": self.model.value,
"value_heads": self.model.value_heads,
"entropy": self.model.entropy,
"learning_rate": self.model.learning_rate,
}
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.model.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.model.memory_out
if (
is_training
and self.use_vec_obs
and trainer_params["normalize"]
and not load
):
self.inference_dict["update_mean"] = self.model.update_normalization
self.total_policy_loss = self.model.policy_loss
self.update_dict = {
"value_loss": self.model.value_loss,
"policy_loss": self.total_policy_loss,
"update_batch": self.model.update_batch,
}
def create_model(self, brain, trainer_params, reward_signal_configs, seed):
def create_model(
self, brain, trainer_params, reward_signal_configs, is_training, load, seed
):
"""
Create PPO model
:param brain: Assigned Brain object.

)
self.model.create_ppo_optimizer()
self.inference_dict.update(
{
"action": self.model.output,
"log_probs": self.model.all_log_probs,
"value_heads": self.model.value_heads,
"value": self.model.value,
"entropy": self.model.entropy,
"learning_rate": self.model.learning_rate,
}
)
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.model.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.model.memory_out
if (
is_training
and self.use_vec_obs
and trainer_params["normalize"]
and not load
):
self.inference_dict["update_mean"] = self.model.update_normalization
self.total_policy_loss = self.model.abs_policy_loss
self.update_dict.update(
{
"value_loss": self.model.value_loss,
"policy_loss": self.total_policy_loss,
"update_batch": self.model.update_batch,
}
)
def create_reward_signals(self, reward_signal_configs):
"""
Create reward signals
:param reward_signal_configs: Reward signal config.
"""
self.reward_signals = {}
with self.graph.as_default():
# Create reward signals
for reward_signal, config in reward_signal_configs.items():
self.reward_signals[reward_signal] = create_reward_signal(
self, self.model, reward_signal, config
)
self.update_dict.update(self.reward_signals[reward_signal].update_dict)
@timed
def evaluate(self, brain_info):
"""

@timed
def update(self, mini_batch, num_sequences):
"""
Updates model using buffer.
:param num_sequences: Number of trajectories in batch.
:param mini_batch: Experience batch.
:return: Output from update process.
Performs update on model.
:param mini_batch: Batch of experiences.
:param num_sequences: Number of sequences to process.
:return: Results of update.
run_out = self._execute_model(feed_dict, self.update_dict)
return run_out
stats_needed = self.stats_name_to_update_name
update_stats = {}
# Collect feed dicts for all reward signals.
for _, reward_signal in self.reward_signals.items():
feed_dict.update(
reward_signal.prepare_update(self.model, mini_batch, num_sequences)
)
stats_needed.update(reward_signal.stats_name_to_update_name)
update_vals = self._execute_model(feed_dict, self.update_dict)
for stat_name, update_name in stats_needed.items():
update_stats[stat_name] = update_vals[update_name]
return update_stats
self.model.batch_size: num_sequences,
self.model.sequence_length: self.sequence_length,
self.model.mask_input: mini_batch["masks"],
self.model.advantage: mini_batch["advantages"],
self.model.all_old_log_probs: mini_batch["action_probs"],
model.batch_size: num_sequences,
model.sequence_length: self.sequence_length,
model.mask_input: mini_batch["masks"],
model.advantage: mini_batch["advantages"],
model.all_old_log_probs: mini_batch["action_probs"],
}
for name in self.reward_signals:
feed_dict[model.returns_holders[name]] = mini_batch[

]
if self.use_continuous_act:
feed_dict[model.output_pre] = mini_batch["actions_pre"]
feed_dict[model.epsilon] = mini_batch["random_normal_epsilon"]
else:

29
ml-agents/mlagents/trainers/ppo/trainer.py


Uses demonstration_buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.
"""
buffer_length = len(self.training_buffer.update_buffer["actions"])
number_experiences=len(self.training_buffer.update_buffer["actions"]),
number_experiences=buffer_length,
mean_return=float(np.mean(self.cumulative_returns_since_policy_update)),
)
self.cumulative_returns_since_policy_update = []

)
value_total, policy_total = [], []
batch_update_stats = defaultdict(list)
for l in range(
0, len(self.training_buffer.update_buffer["actions"]), batch_size
):
run_out = self.policy.update(
max_num_batch = buffer_length // batch_size
for l in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.policy.update(
value_total.append(run_out["value_loss"])
policy_total.append(np.abs(run_out["policy_loss"]))
self.stats["Losses/Value Loss"].append(np.mean(value_total))
self.stats["Losses/Policy Loss"].append(np.mean(policy_total))
for _, reward_signal in self.policy.reward_signals.items():
update_stats = reward_signal.update(
self.training_buffer.update_buffer, n_sequences
)
for stat, val in update_stats.items():
self.stats[stat].append(val)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self.stats[stat].append(np.mean(stat_list))
if self.policy.bc_module:
update_stats = self.policy.bc_module.update()
for stat, val in update_stats.items():

10
ml-agents/mlagents/trainers/tests/test_multigpu.py


mock_get_devices.return_value = ["/device:GPU:0", "/device:GPU:1"]
mock_construct_feed_dict.return_value = {}
mock_execute_model.return_value = {
"value_loss_0": 0.1,
"value_loss_1": 0.3,
"policy_loss_0": 0.5,
"policy_loss_1": 0.7,
"value_loss": 0.1,
"policy_loss": 0.3,
"update_batch": None,
}

assert mock_mini_batch.items.call_count == len(mock_get_devices.return_value)
assert mock_construct_feed_dict.call_count == len(mock_get_devices.return_value)
assert run_out["value_loss"] == 0.2
assert run_out["policy_loss"] == 0.6
assert run_out["Losses/Value Loss"] == 0.1
assert run_out["Losses/Policy Loss"] == 0.3
if __name__ == "__main__":

7
ml-agents/mlagents/trainers/tests/test_reward_signals.py


def reward_signal_update(env, policy, reward_signal_name):
buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
out = policy.reward_signals[reward_signal_name].update(buffer.update_buffer, 2)
feed_dict = policy.reward_signals[reward_signal_name].prepare_update(
policy.model, buffer.update_buffer.make_mini_batch(0, 10), 2
)
out = policy._execute_model(
feed_dict, policy.reward_signals[reward_signal_name].update_dict
)
assert type(out) is dict

正在加载...
取消
保存