Andrew Cohen
4 years ago
Current commit
418cc778
4 files changed, 705 insertions and 0 deletions
- ml-agents/mlagents/trainers/coma/__init__.py (+0)
- ml-agents/mlagents/trainers/coma/optimizer_torch.py (+419)
- ml-agents/mlagents/trainers/coma/trainer.py (+286)
ml-agents/mlagents/trainers/coma/optimizer_torch.py

from typing import Dict, List, Optional, Tuple, cast

from mlagents.torch_utils import torch

from mlagents_envs.base_env import ObservationSpec, ActionSpec
from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil

from mlagents_envs.timers import timed
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.settings import TrainerSettings, PPOSettings, NetworkSettings
from mlagents.trainers.torch.networks import Critic, ValueNetwork
from mlagents.trainers.torch.encoders import VectorInput
from mlagents.trainers.torch.layers import (
    EntityEmbedding,
    ResidualSelfAttention,
    LinearEncoder,
    LSTM,
)
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.trajectory import ObsUtil

# NOTE: the module paths for NetworkSettings, ObservationSpec/ActionSpec, ValueNetwork,
# VectorInput and LSTM above are assumed from the ml-agents package layout of this
# release; the classes below reference these names.


class TorchCOMAOptimizer(TorchOptimizer):
    class COMAValueNetwork(torch.nn.Module, Critic):
        def __init__(
            self,
            stream_names: List[str],
            observation_specs: List[ObservationSpec],
            network_settings: NetworkSettings,
            action_spec: ActionSpec,
        ):
            super().__init__()
            self.normalize = network_settings.normalize
            self.use_lstm = network_settings.memory is not None
            # Scale network depending on num agents
            self.h_size = network_settings.hidden_units
            self.m_size = (
                network_settings.memory.memory_size
                if network_settings.memory is not None
                else 0
            )
            self.processors, _input_size = ModelUtils.create_input_processors(
                observation_specs,
                self.h_size,
                network_settings.vis_encode_type,
                normalize=self.normalize,
            )
            self.action_spec = action_spec

            # Modules for self-attention
            obs_only_ent_size = sum(_input_size)
            q_ent_size = (
                sum(_input_size)
                + sum(self.action_spec.discrete_branches)
                + self.action_spec.continuous_size
            )
            self.obs_encoder = EntityEmbedding(
                0, obs_only_ent_size, None, self.h_size, concat_self=False
            )
            self.obs_action_encoder = EntityEmbedding(
                0, q_ent_size, None, self.h_size, concat_self=False
            )

            self.self_attn = ResidualSelfAttention(self.h_size)

            self.linear_encoder = LinearEncoder(
                self.h_size,
                network_settings.num_layers,
                self.h_size,
                kernel_gain=(0.125 / self.h_size) ** 0.5,
            )

            if self.use_lstm:
                self.lstm = LSTM(self.h_size, self.m_size)
            else:
                self.lstm = None  # type: ignore

        @property
        def memory_size(self) -> int:
            return self.lstm.memory_size if self.use_lstm else 0

        def update_normalization(self, buffer: AgentBuffer) -> None:
            obs = ObsUtil.from_buffer(buffer, len(self.processors))
            for vec_input, enc in zip(obs, self.processors):
                if isinstance(enc, VectorInput):
                    enc.update_normalization(torch.as_tensor(vec_input))

        def copy_normalization(self, other_network: "NetworkBody") -> None:
            if self.normalize:
                for n1, n2 in zip(self.processors, other_network.processors):
                    if isinstance(n1, VectorInput) and isinstance(n2, VectorInput):
                        n1.copy_normalization(n2)

        def _get_masks_from_nans(self, obs_tensors: List[torch.Tensor]) -> torch.Tensor:
            """
            Get attention masks by grabbing an arbitrary obs across all the agents.
            Since these are raw obs, the padded values are still NaN.
            """
            only_first_obs = [_all_obs[0] for _all_obs in obs_tensors]
            obs_for_mask = torch.stack(only_first_obs, dim=1)
            # Get the mask from NaNs
            attn_mask = torch.any(obs_for_mask.isnan(), dim=2).type(torch.FloatTensor)
            return attn_mask

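        # Shape note (illustrative): obs_tensors holds one list of obs tensors per
        # agent; stacking each agent's first obs gives (batch, n_agents, obs_size),
        # so torch.any(..., dim=2) yields a (batch, n_agents) mask that is 1.0
        # wherever an agent slot was NaN-padded.
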
        def baseline(
            self,
            self_obs: List[List[torch.Tensor]],
            obs: List[List[torch.Tensor]],
            actions: List[AgentAction],
            memories: Optional[torch.Tensor] = None,
            sequence_length: int = 1,
        ) -> Tuple[torch.Tensor, torch.Tensor]:

            self_attn_masks = []

            f_inp = None
            concat_f_inp = []
            for inputs, action in zip(obs, actions):
                encodes = []
                for idx, processor in enumerate(self.processors):
                    obs_input = inputs[idx]
                    obs_input[obs_input.isnan()] = 0.0  # Remove NaNs
                    processed_obs = processor(obs_input)
                    encodes.append(processed_obs)
                cat_encodes = [
                    torch.cat(encodes, dim=-1),
                    action.to_flat(self.action_spec.discrete_branches),
                ]
                concat_f_inp.append(torch.cat(cat_encodes, dim=1))

            if concat_f_inp:
                f_inp = torch.stack(concat_f_inp, dim=1)
                self_attn_masks.append(self._get_masks_from_nans(obs))

            concat_encoded_obs = []
            encodes = []
            for idx, processor in enumerate(self.processors):
                obs_input = self_obs[idx]
                obs_input[obs_input.isnan()] = 0.0  # Remove NaNs
                processed_obs = processor(obs_input)
                encodes.append(processed_obs)
            concat_encoded_obs.append(torch.cat(encodes, dim=-1))
            g_inp = torch.stack(concat_encoded_obs, dim=1)
            # Get the mask from NaNs
            self_attn_masks.append(self._get_masks_from_nans([self_obs]))
            encoding, memories = self.forward(
                f_inp,
                g_inp,
                self_attn_masks,
                memories=memories,
                sequence_length=sequence_length,
            )
            return encoding, memories

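        # Note: baseline() above appears to compute a counterfactual-style baseline:
        # it attends over the other agents' observations and actions (f_inp) together
        # with this agent's own observation without its action (g_inp), so the value
        # it produces conditions on everything except the agent's own action.
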
        def critic_pass(
            self,
            obs: List[List[torch.Tensor]],
            memories: Optional[torch.Tensor] = None,
            sequence_length: int = 1,
        ) -> Tuple[torch.Tensor, torch.Tensor]:

            self_attn_masks = []
            concat_encoded_obs = []
            for inputs in obs:
                encodes = []
                for idx, processor in enumerate(self.processors):
                    obs_input = inputs[idx]
                    obs_input[obs_input.isnan()] = 0.0  # Remove NaNs
                    processed_obs = processor(obs_input)
                    encodes.append(processed_obs)
                concat_encoded_obs.append(torch.cat(encodes, dim=-1))
            g_inp = torch.stack(concat_encoded_obs, dim=1)
            # Get the mask from NaNs
            self_attn_masks.append(self._get_masks_from_nans(obs))
            encoding, memories = self.forward(
                None,
                g_inp,
                self_attn_masks,
                memories=memories,
                sequence_length=sequence_length,
            )
            return encoding, memories

        def forward(
            self,
            f_enc: torch.Tensor,
            g_enc: torch.Tensor,
            self_attn_masks: List[torch.Tensor],
            memories: Optional[torch.Tensor] = None,
            sequence_length: int = 1,
        ) -> Tuple[torch.Tensor, torch.Tensor]:

            self_attn_inputs = []

            if f_enc is not None:
                self_attn_inputs.append(self.obs_action_encoder(None, f_enc))
            if g_enc is not None:
                self_attn_inputs.append(self.obs_encoder(None, g_enc))

            encoded_entity = torch.cat(self_attn_inputs, dim=1)
            encoded_state = self.self_attn(encoded_entity, self_attn_masks)

            inputs = encoded_state
            encoding = self.linear_encoder(inputs)

            if self.use_lstm:
                # Resize to (batch, sequence length, encoding size)
                encoding = encoding.reshape([-1, sequence_length, self.h_size])
                encoding, memories = self.lstm(encoding, memories)
                encoding = encoding.reshape([-1, self.m_size // 2])
            return encoding, memories

    def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
        """
        Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
        The COMA optimizer has a value estimator and a PPO-style loss function.
        :param policy: A TorchPolicy object that will be updated by this COMA Optimizer.
        :param trainer_settings: Trainer parameters that specify the properties of the trainer.
        """
        super().__init__(policy, trainer_settings)
        reward_signal_configs = trainer_settings.reward_signals
        reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]

        if policy.shared_critic:
            self.value_net = policy.actor
        else:
            self.value_net = ValueNetwork(
                reward_signal_names,
                policy.behavior_spec.observation_specs,
                network_settings=trainer_settings.network_settings,
            )

        params = list(self.policy.actor.parameters()) + list(
            self.value_net.parameters()
        )
        self.hyperparameters: PPOSettings = cast(
            PPOSettings, trainer_settings.hyperparameters
        )
        self.decay_learning_rate = ModelUtils.DecayedValue(
            self.hyperparameters.learning_rate_schedule,
            self.hyperparameters.learning_rate,
            1e-10,
            self.trainer_settings.max_steps,
        )
        self.decay_epsilon = ModelUtils.DecayedValue(
            self.hyperparameters.learning_rate_schedule,
            self.hyperparameters.epsilon,
            0.1,
            self.trainer_settings.max_steps,
        )
        self.decay_beta = ModelUtils.DecayedValue(
            self.hyperparameters.learning_rate_schedule,
            self.hyperparameters.beta,
            1e-5,
            self.trainer_settings.max_steps,
        )

        self.optimizer = torch.optim.Adam(
            params, lr=self.trainer_settings.hyperparameters.learning_rate
        )
        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
        }

        self.stream_names = list(self.reward_signals.keys())

    @property
    def critic(self):
        return self.value_net

    def ppo_value_loss(
        self,
        values: Dict[str, torch.Tensor],
        old_values: Dict[str, torch.Tensor],
        returns: Dict[str, torch.Tensor],
        epsilon: float,
        loss_masks: torch.Tensor,
    ) -> torch.Tensor:
        """
        Evaluates value loss for PPO.
        :param values: Value output of the current network.
        :param old_values: Value stored with experiences in buffer.
        :param returns: Computed returns.
        :param epsilon: Clipping value for value estimate.
        :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
        """
        value_losses = []
        for name, head in values.items():
            old_val_tensor = old_values[name]
            returns_tensor = returns[name]
            clipped_value_estimate = old_val_tensor + torch.clamp(
                head - old_val_tensor, -1 * epsilon, epsilon
            )
            v_opt_a = (returns_tensor - head) ** 2
            v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
            value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
            value_losses.append(value_loss)
        value_loss = torch.mean(torch.stack(value_losses))
        return value_loss

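    # The value loss above is PPO's clipped value objective, averaged over reward
    # streams and masked over padded timesteps:
    #   L_V = mean_t[ max( (R_t - V_t)^2, (R_t - clip(V_t, V_old_t - eps, V_old_t + eps))^2 ) ]
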
    def ppo_policy_loss(
        self,
        advantages: torch.Tensor,
        log_probs: torch.Tensor,
        old_log_probs: torch.Tensor,
        loss_masks: torch.Tensor,
    ) -> torch.Tensor:
        """
        Evaluate PPO policy loss.
        :param advantages: Computed advantages.
        :param log_probs: Current policy log probabilities.
        :param old_log_probs: Past policy log probabilities.
        :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
        """
        advantage = advantages.unsqueeze(-1)

        decay_epsilon = self.hyperparameters.epsilon
        r_theta = torch.exp(log_probs - old_log_probs)
        p_opt_a = r_theta * advantage
        p_opt_b = (
            torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage
        )
        policy_loss = -1 * ModelUtils.masked_mean(
            torch.min(p_opt_a, p_opt_b), loss_masks
        )
        return policy_loss

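    # The policy loss above is the standard PPO clipped surrogate objective,
    #   L_clip = -mean_t[ min( r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t ) ]
    # with r_t = exp(log_pi(a_t|s_t) - log_pi_old(a_t|s_t)), masked over padded
    # timesteps when an LSTM is used.
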
    @timed
    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
        """
        Performs update on model.
        :param batch: Batch of experiences.
        :param num_sequences: Number of sequences to process.
        :return: Results of update.
        """
        # Get decayed parameters
        decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
        decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
        decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
        returns = {}
        old_values = {}
        for name in self.reward_signals:
            old_values[name] = ModelUtils.list_to_tensor(
                batch[RewardSignalUtil.value_estimates_key(name)]
            )
            returns[name] = ModelUtils.list_to_tensor(
                batch[RewardSignalUtil.returns_key(name)]
            )

        n_obs = len(self.policy.behavior_spec.observation_specs)
        current_obs = ObsUtil.from_buffer(batch, n_obs)
        # Convert to tensors
        current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]

        act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
        actions = AgentAction.from_buffer(batch)

        memories = [
            ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
            for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
        ]
        if len(memories) > 0:
            memories = torch.stack(memories).unsqueeze(0)

        log_probs, entropy = self.policy.evaluate_actions(
            current_obs,
            masks=act_masks,
            actions=actions,
            memories=memories,
            seq_len=self.policy.sequence_length,
        )
        values, _ = self.critic.critic_pass(
            current_obs, memories=memories, sequence_length=self.policy.sequence_length
        )
        old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
        log_probs = log_probs.flatten()
        loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
        value_loss = self.ppo_value_loss(
            values, old_values, returns, decay_eps, loss_masks
        )
        policy_loss = self.ppo_policy_loss(
            ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]),
            log_probs,
            old_log_probs,
            loss_masks,
        )
        loss = (
            policy_loss
            + 0.5 * value_loss
            - decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
        )

        # Set optimizer learning rate
        ModelUtils.update_learning_rate(self.optimizer, decay_lr)
        self.optimizer.zero_grad()
        loss.backward()

        self.optimizer.step()
        update_stats = {
            # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
            # TODO: After PyTorch is default, change to something more correct.
            "Losses/Policy Loss": torch.abs(policy_loss).item(),
            "Losses/Value Loss": value_loss.item(),
            "Policy/Learning Rate": decay_lr,
            "Policy/Epsilon": decay_eps,
            "Policy/Beta": decay_bet,
        }

        for reward_provider in self.reward_signals.values():
            update_stats.update(reward_provider.update(batch))

        return update_stats

    def get_modules(self):
        modules = {"Optimizer": self.optimizer}
        for reward_provider in self.reward_signals.values():
            modules.update(reward_provider.get_modules())
        return modules

ml-agents/mlagents/trainers/coma/trainer.py

# # Unity ML-Agents Toolkit
# ## ML-Agent Learning (COMA)
# Contains a multi-agent trainer built on PPO, as described in: https://arxiv.org/abs/1707.06347

from collections import defaultdict
from typing import cast

import numpy as np

from mlagents_envs.logging_util import get_logger
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.coma.optimizer_torch import TorchCOMAOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, PPOSettings

logger = get_logger(__name__)


class COMATrainer(RLTrainer):
    """The COMATrainer is an implementation of the COMA2 algorithm."""

    def __init__(
        self,
        behavior_name: str,
        reward_buff_cap: int,
        trainer_settings: TrainerSettings,
        training: bool,
        load: bool,
        seed: int,
        artifact_path: str,
    ):
        """
        Responsible for collecting experiences and training the COMA model.
        :param behavior_name: The name of the behavior associated with trainer config
        :param reward_buff_cap: Max reward history to track in the reward buffer
        :param trainer_settings: The parameters for the trainer.
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with
        :param artifact_path: The directory within which to store artifacts from this trainer.
        """
        super().__init__(
            behavior_name,
            trainer_settings,
            training,
            load,
            artifact_path,
            reward_buff_cap,
        )
        self.hyperparameters: PPOSettings = cast(
            PPOSettings, self.trainer_settings.hyperparameters
        )
        self.seed = seed
        self.policy: Policy = None  # type: ignore

    def _process_trajectory(self, trajectory: Trajectory) -> None:
        """
        Takes a trajectory and processes it, putting it into the update buffer.
        Processing involves calculating value and advantage targets for the model updating step.
        :param trajectory: The Trajectory tuple containing the steps to be processed.
        """
        super()._process_trajectory(trajectory)
        agent_id = trajectory.agent_id  # All the agents should have the same ID

        agent_buffer_trajectory = trajectory.to_agentbuffer()
        # Update the normalization
        if self.is_training:
            self.policy.update_normalization(agent_buffer_trajectory)

        # Get all value estimates
        value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
            agent_buffer_trajectory,
            trajectory.next_obs,
            trajectory.done_reached and not trajectory.interrupted,
        )

        for name, v in value_estimates.items():
            agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(
                v
            )
            self._stats_reporter.add_stat(
                f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
                np.mean(v),
            )

        # Evaluate all reward functions
        self.collected_rewards["environment"][agent_id] += np.sum(
            agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
        )
        for name, reward_signal in self.optimizer.reward_signals.items():
            evaluate_result = (
                reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
            )
            agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend(
                evaluate_result
            )
            # Report the reward signals
            self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

        # Compute GAE and returns
        tmp_advantages = []
        tmp_returns = []
        for name in self.optimizer.reward_signals:
            bootstrap_value = value_next[name]

            local_rewards = agent_buffer_trajectory[
                RewardSignalUtil.rewards_key(name)
            ].get_batch()
            local_value_estimates = agent_buffer_trajectory[
                RewardSignalUtil.value_estimates_key(name)
            ].get_batch()

            local_advantage = get_gae(
                rewards=local_rewards,
                value_estimates=local_value_estimates,
                value_next=bootstrap_value,
                gamma=self.optimizer.reward_signals[name].gamma,
                lambd=self.hyperparameters.lambd,
            )
            local_return = local_advantage + local_value_estimates
            # This is later used as the target for the different value estimates
            agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(
                local_return
            )
            agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set(
                local_advantage
            )
            tmp_advantages.append(local_advantage)
            tmp_returns.append(local_return)

        # Get global advantages
        global_advantages = list(
            np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
        )
        global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
        agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)
        agent_buffer_trajectory[BufferKey.DISCOUNTED_RETURNS].set(global_returns)
        # Append to update buffer
        agent_buffer_trajectory.resequence_and_append(
            self.update_buffer, training_length=self.policy.sequence_length
        )

        # If this was a terminal trajectory, append stats and reset reward collection
        if trajectory.done_reached:
            self._update_end_episode_stats(agent_id, self.optimizer)

    def _is_ready_update(self):
        """
        Returns whether or not the trainer has enough elements to run an update of the model.
        :return: A boolean corresponding to whether or not update_model() can be run
        """
        size_of_buffer = self.update_buffer.num_experiences
        return size_of_buffer > self.hyperparameters.buffer_size

    def _update_policy(self):
        """
        Uses the update buffer to update the policy.
        The reward signal generators must be updated in this method at their own pace.
        """
        buffer_length = self.update_buffer.num_experiences
        self.cumulative_returns_since_policy_update.clear()

        # Make sure batch_size is a multiple of sequence length. During training, we
        # will need to reshape the data into a batch_size x sequence_length tensor.
        batch_size = (
            self.hyperparameters.batch_size
            - self.hyperparameters.batch_size % self.policy.sequence_length
        )
        # Make sure there is at least one sequence
        batch_size = max(batch_size, self.policy.sequence_length)

        n_sequences = max(
            int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
        )

        advantages = self.update_buffer[BufferKey.ADVANTAGES].get_batch()
        self.update_buffer[BufferKey.ADVANTAGES].set(
            (advantages - advantages.mean()) / (advantages.std() + 1e-10)
        )
        num_epoch = self.hyperparameters.num_epoch
        batch_update_stats = defaultdict(list)
        for _ in range(num_epoch):
            self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
            buffer = self.update_buffer
            max_num_batch = buffer_length // batch_size
            for i in range(0, max_num_batch * batch_size, batch_size):
                update_stats = self.optimizer.update(
                    buffer.make_mini_batch(i, i + batch_size), n_sequences
                )
                for stat_name, value in update_stats.items():
                    batch_update_stats[stat_name].append(value)

        for stat, stat_list in batch_update_stats.items():
            self._stats_reporter.add_stat(stat, np.mean(stat_list))

        if self.optimizer.bc_module:
            update_stats = self.optimizer.bc_module.update()
            for stat, val in update_stats.items():
                self._stats_reporter.add_stat(stat, val)
        self._clear_update_buffer()
        return True

    def create_torch_policy(
        self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
    ) -> TorchPolicy:
        """
        Creates a policy with a PyTorch backend and PPO hyperparameters.
        :param parsed_behavior_id:
        :param behavior_spec: specifications for policy construction
        :return: policy
        """
        policy = TorchPolicy(
            self.seed,
            behavior_spec,
            self.trainer_settings,
            condition_sigma_on_obs=False,  # Faster training for PPO
            separate_critic=True,  # Match network architecture with TF
        )
        return policy

    def create_coma_optimizer(self) -> TorchCOMAOptimizer:
        return TorchCOMAOptimizer(  # type: ignore
            cast(TorchPolicy, self.policy), self.trainer_settings  # type: ignore
        )  # type: ignore

    def add_policy(
        self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
    ) -> None:
        """
        Adds policy to trainer.
        :param parsed_behavior_id: Behavior identifiers that the policy should belong to.
        :param policy: Policy to associate with name_behavior_id.
        """
        self.policy = policy
        self.policies[parsed_behavior_id.behavior_id] = policy
        self.optimizer = self.create_coma_optimizer()
        for _reward_signal in self.optimizer.reward_signals.keys():
            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

        self.model_saver.register(self.policy)
        self.model_saver.register(self.optimizer)
        self.model_saver.initialize_or_load()

        # Needed to resume loads properly
        self.step = policy.get_current_step()

    def get_policy(self, name_behavior_id: str) -> Policy:
        """
        Gets policy from trainer associated with name_behavior_id
        :param name_behavior_id: full identifier of policy
        """

        return self.policy


def discount_rewards(r, gamma=0.99, value_next=0.0):
    """
    Computes discounted sum of future rewards for use in updating value estimate.
    :param r: List of rewards.
    :param gamma: Discount factor.
    :param value_next: T+1 value estimate for returns calculation.
    :return: discounted sum of future rewards as list.
    """
    discounted_r = np.zeros_like(r)
    running_add = value_next
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
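
# Worked example (illustrative): discount_rewards(np.array([1.0, 1.0, 1.0]), gamma=0.5)
# returns [1.75, 1.5, 1.0]: each entry is r[t] + gamma * (discounted sum of the
# rewards after t), with value_next (default 0.0) bootstrapping the tail.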


def lambda_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(0, r.size - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + r[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns
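

# NOTE: _process_trajectory above calls get_gae(), which is not shown in this excerpt
# of the diff. The sketch below is an assumed helper following the standard GAE
# computation used by ml-agents' PPO trainer; treat it as illustrative rather than
# the committed code.
def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
    """
    Computes a generalized advantage estimate for updating the policy.
    :param rewards: list of rewards for the trajectory.
    :param value_estimates: list of value estimates for the trajectory.
    :param value_next: value estimate for the step after the trajectory ends.
    :param gamma: discount factor.
    :param lambd: GAE weighing factor.
    :return: list of advantage estimates for the trajectory.
    """
    # Append the bootstrap value, form the TD residuals, then discount them by
    # gamma * lambd using discount_rewards defined above.
    value_estimates = np.append(value_estimates, value_next)
    delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
    advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
    return advantage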