
Weight decay, regularization loss

/develop/coma2/clip
Ervin Teng, 4 years ago
Current commit
adad5183
2 changed files with 49 additions and 29 deletions
  1. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (26 changes)
  2. ml-agents/mlagents/trainers/ppo/trainer.py (52 changes)

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (26 changes)


)
self.optimizer = torch.optim.Adam(
params, lr=self.trainer_settings.hyperparameters.learning_rate
params, lr=self.trainer_settings.hyperparameters.learning_rate, weight_decay=1e-6
)
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",

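The first half of this commit adds weight decay to the PPO optimizer: torch.optim.Adam applies an L2 penalty of weight_decay * param to each parameter's gradient on every step. A minimal, self-contained sketch of the same wiring (the toy model and data below are illustrative, not part of ml-agents):

import torch

model = torch.nn.Linear(8, 2)  # stand-in for the policy/value networks
optimizer = torch.optim.Adam(
    model.parameters(), lr=3e-4, weight_decay=1e-6
)

x, y = torch.randn(16, 8), torch.randn(16, 2)
loss = torch.nn.functional.mse_loss(model(x), y)
optimizer.zero_grad()
loss.backward()
optimizer.step()  # the 1e-6 decay term is folded into the gradients here

Note that with plain Adam this is classic L2 regularization rather than decoupled (AdamW-style) weight decay, which matters mainly at larger decay values.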
value_loss = torch.mean(torch.stack(value_losses))
return value_loss
def coma_regularizer_loss(self, values: Dict[str, torch.Tensor], baseline_values: Dict[str, torch.Tensor]):
reg_losses = []
for name, head in values.items():
reg_loss = (baseline_values[name] - head) ** 2
reg_losses.append(reg_loss)
value_loss = torch.mean(torch.stack(reg_losses))
return value_loss
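The new coma_regularizer_loss ties the centralized value heads to the corresponding baseline heads by penalizing their squared difference, averaged over reward streams and timesteps. In the notation below (mine, not the source's), V_k is the value head and B_k the baseline head for reward stream k over T timesteps:

\mathcal{L}_{\mathrm{reg}} = \frac{1}{K\,T}\sum_{k=1}^{K}\sum_{t=1}^{T}\bigl(B_k(t) - V_k(t)\bigr)^2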
def ppo_policy_loss(
self,
advantages: torch.Tensor,

)
return policy_loss
@timed
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
"""

decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
returns_b = {}
# returns_b = {}
returns_v = {}
old_values = {}
old_marg_values = {}

batch[f"{name}_baseline_estimates"]
)
returns_v[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_v"])
returns_b[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_b"])
# returns_b[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_b"])
#
n_obs = len(self.policy.behavior_spec.sensor_specs)

log_probs = log_probs.flatten()
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
# q_loss = self.ppo_value_loss(qs, old_values, returns_q, decay_eps, loss_masks)
# Use trust region from value, not baseline
baseline_vals, old_marg_values, returns_b, decay_eps, loss_masks
baseline_vals, old_marg_values, returns_v, decay_eps, loss_masks
regularizer_loss = self.coma_regularizer_loss(values, baseline_vals)
policy_loss = self.ppo_policy_loss(
ModelUtils.list_to_tensor(batch["advantages"]),
log_probs,

loss = (
policy_loss
+ 0.5 * (value_loss + baseline_loss)
+ 0.25 * (value_loss + baseline_loss)
+ 1.0 * regularizer_loss
- decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
)
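Written out, the objective after this change is (with the masked mean over the loss masks and the decayed entropy coefficient beta):

\mathcal{L} = \mathcal{L}_{\mathrm{policy}} + 0.25\,\bigl(\mathcal{L}_{\mathrm{value}} + \mathcal{L}_{\mathrm{baseline}}\bigr) + 1.0\,\mathcal{L}_{\mathrm{reg}} - \beta\,\overline{\mathcal{H}}(\pi)

That is, the two critic terms are down-weighted from 0.5 to 0.25 and the new regularizer enters at full weight.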

"Losses/Value Loss": value_loss.item(),
# "Losses/Q Loss": q_loss.item(),
"Losses/Baseline Value Loss": baseline_loss.item(),
"Losses/Regularization Loss": regularizer_loss.item(),
"Policy/Learning Rate": decay_lr,
"Policy/Epsilon": decay_eps,
"Policy/Beta": decay_bet,

ml-agents/mlagents/trainers/ppo/trainer.py (52 changes)


trajectory.next_obs,
trajectory.next_collab_obs,
trajectory.done_reached and not trajectory.interrupted,
trajectory.teammate_dones_reached
and trajectory.done_reached
and not trajectory.interrupted,
)
for name, v in value_estimates.items():

baseline_estimates = agent_buffer_trajectory[
f"{name}_baseline_estimates"
].get_batch()
v_estimates = agent_buffer_trajectory[
f"{name}_value_estimates"
].get_batch()
# next_value_estimates = agent_buffer_trajectory[
# ].get_batch()
# next_m_value_estimates = agent_buffer_trajectory[
# ].get_batch()
returns_v, returns_b = get_team_returns(
rewards=local_rewards,

gamma=self.optimizer.reward_signals[name].gamma,
lambd=self.hyperparameters.lambd,
)
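get_team_returns is driven by each reward signal's own gamma and the shared lambd hyperparameter. Judging from the recursion added near the bottom of this file, the per-stream targets are standard TD(lambda) returns:

G^{\lambda}_t = r_t + \gamma\bigl[(1-\lambda)\,V(s_{t+1}) + \lambda\,G^{\lambda}_{t+1}\bigr], \qquad G^{\lambda}_T = r_T + \gamma\,V_{\mathrm{next}}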
#print("loc", local_rewards[-1])
#print("tdlam", returns_v)
# print("loc", local_rewards[-1])
# print("tdlam", returns_v)
# local_advantage = get_team_gae(
# rewards=local_rewards,
# value_estimates=v_estimates,
# baseline=baseline_estimates,

# )
# local_advantage = np.array(returns_v) - baseline_estimates
# self._stats_reporter.add_stat(
# )
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Advantage Estimate",

local_return = local_advantage + baseline_estimates
# local_return = local_advantage + q_estimates
# This is later used as the target for the different value estimates
# agent_buffer_trajectory[f"{name}_returns"].set(local_return)
agent_buffer_trajectory[f"{name}_returns_b"].set(returns_v)

)
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
agent_buffer_trajectory["advantages"].set(global_advantages)
agent_buffer_trajectory["discounted_returns"].set(global_returns)
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(

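The per-reward-signal advantages and returns are then averaged element-wise into a single global signal before being written to the update buffer. A small illustration of the np.mean(..., axis=0) step (the arrays are made up):

import numpy as np

# two reward signals, three timesteps each (illustrative values)
tmp_returns = [np.array([1.0, 2.0, 3.0]), np.array([0.0, 1.0, 1.0])]
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
# -> [0.5, 1.5, 2.0]: element-wise mean across the reward signals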
n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
# Normalize advantages
list((advantages - advantages.mean()) / (advantages.std() + 1e-10))
)
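Before the update epochs, the advantages are standardized to roughly zero mean and unit variance; the 1e-10 keeps the division safe when the batch has near-constant advantages. An equivalent standalone snippet:

import numpy as np

advantages = np.array([0.5, -1.2, 2.0, 0.1], dtype=np.float32)
normalized = (advantages - advantages.mean()) / (advantages.std() + 1e-10)
# normalized.mean() is ~0 and normalized.std() is ~1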
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)

returns = np.zeros_like(r)
returns[-1] = r[-1] + gamma * value_next
for t in reversed(range(0, r.size - 1)):
returns[t] = (
gamma * lambd * returns[t + 1]
+ r[t]
+ (1 - lambd) * gamma * value_estimates[t + 1]
)
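For completeness, here is the recursion above dropped into a self-contained function; the name lambd_return and the numpy plumbing are assumptions, only the loop body comes from the diff:

import numpy as np

def lambd_return(r, value_estimates, gamma=0.99, lambd=0.95, value_next=0.0):
    # Backward recursion for TD(lambda) returns, bootstrapping from value_next.
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(0, r.size - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + r[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns

# Example: lambd_return(np.array([1.0, 0.0]), np.array([0.5, 0.4]),
#                       gamma=0.9, lambd=0.8) -> [1.072, 0.0]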
def get_team_gae(
rewards, value_estimates, baseline, value_next=0.0, gamma=0.99, lambd=0.95
):
"""
Computes generalized advantage estimate for use in updating policy.
:param rewards: list of rewards for time-steps t to T.

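The body of get_team_gae is not included in this diff (its call above is commented out), so the following is only a hypothetical sketch: a GAE variant whose TD errors bootstrap from the value estimates while subtracting the per-agent baseline, which is consistent with the signature and with the commented-out alternative local_advantage = returns_v - baseline_estimates.

import numpy as np

def get_team_gae_sketch(
    rewards, value_estimates, baseline, value_next=0.0, gamma=0.99, lambd=0.95
):
    # Hypothetical: bootstrap from V(s_{t+1}), subtract the counterfactual baseline.
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.append(np.asarray(value_estimates, dtype=np.float32), value_next)
    baseline = np.asarray(baseline, dtype=np.float32)
    deltas = rewards + gamma * values[1:] - baseline
    advantages = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * lambd * running
        advantages[t] = running
    return advantages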