
add value clipping

/develop/coma-withq
Andrew Cohen  4 years ago
Current commit
2c3147b9
1 changed file with 11 insertions and 14 deletions
ml-agents/mlagents/trainers/ppo/optimizer_torch.py  (25 changed lines)


"""
value_losses = []
for name, head in values.items():
# old_val_tensor = old_values[name]
returns_tensor = returns[name] # + 0.99 * old_val_tensor
# clipped_value_estimate = old_val_tensor + torch.clamp(
# head - old_val_tensor, -1 * epsilon, epsilon
# )
# value_loss = (returns_tensor - head) ** 2
old_val_tensor = old_values[name]
returns_tensor = returns[name]
clipped_value_estimate = old_val_tensor + torch.clamp(
head - old_val_tensor, -1 * epsilon, epsilon
)
# v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
# value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
value_loss = ModelUtils.masked_mean(v_opt_a, loss_masks)
v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
#value_loss = ModelUtils.masked_mean(v_opt_a, loss_masks)
value_losses.append(value_loss)
value_loss = torch.mean(torch.stack(value_losses))
return value_loss
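For context, the value clipping enabled by this commit mirrors PPO's clipped policy objective: the new value estimate is not allowed to move more than epsilon away from the estimate recorded when the data was collected, and the larger of the clipped and unclipped squared errors is used. Below is a minimal, self-contained sketch of that computation in plain PyTorch; the function name compute_clipped_value_loss and the dummy tensors are illustrative only, and the trainer code above additionally averages over the reward-signal heads and applies a sequence mask via ModelUtils.masked_mean.

import torch

def compute_clipped_value_loss(
    values: torch.Tensor,      # current value estimates V(s)
    old_values: torch.Tensor,  # value estimates stored when the data was collected
    returns: torch.Tensor,     # empirical returns R
    epsilon: float,            # clipping range
) -> torch.Tensor:
    # Keep the new estimate within +/- epsilon of the old one.
    clipped_values = old_values + torch.clamp(values - old_values, -epsilon, epsilon)
    # Pessimistic combination: take the larger of the two squared errors.
    unclipped_loss = (returns - values) ** 2
    clipped_loss = (returns - clipped_values) ** 2
    return torch.max(unclipped_loss, clipped_loss).mean()

# Example with dummy tensors:
v = torch.tensor([0.5, 1.2, -0.3])
v_old = torch.tensor([0.4, 0.9, -0.1])
ret = torch.tensor([1.0, 1.0, 0.0])
print(compute_clipped_value_loss(v, v_old, ret, epsilon=0.2))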

decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
returns_q = {}
returns_b = {}
returns_v = {}
old_values = {}

batch[f"{name}_value_estimates_next"]
batch[f"{name}_value_estimates"]
batch[f"{name}_marginalized_value_estimates_next"]
batch[f"{name}_baseline_estimates"]
returns_q[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_q"])
returns_b[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_b"])
#
n_obs = len(self.policy.behavior_spec.sensor_specs)
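As a rough sketch of how the per-reward-signal tensors in this hunk could be assembled, assuming `batch` maps string keys to lists of floats and the trainer iterates over its reward-signal names: to_tensor stands in for ModelUtils.list_to_tensor, and gather_returns plus the "extrinsic" example key are illustrative names rather than the actual trainer code.

from typing import Dict, List, Sequence
import torch

def to_tensor(values: Sequence[float]) -> torch.Tensor:
    # Stand-in for ModelUtils.list_to_tensor: convert a buffer field to a float tensor.
    return torch.as_tensor(values, dtype=torch.float32)

def gather_returns(
    batch: Dict[str, Sequence[float]], reward_signals: List[str]
) -> Dict[str, Dict[str, torch.Tensor]]:
    returns_q: Dict[str, torch.Tensor] = {}
    returns_b: Dict[str, torch.Tensor] = {}
    old_values: Dict[str, torch.Tensor] = {}
    for name in reward_signals:
        returns_q[name] = to_tensor(batch[f"{name}_returns_q"])
        returns_b[name] = to_tensor(batch[f"{name}_returns_b"])
        old_values[name] = to_tensor(batch[f"{name}_value_estimates"])
    return {"returns_q": returns_q, "returns_b": returns_b, "old_values": old_values}

# Example with a single "extrinsic" reward signal:
batch = {
    "extrinsic_returns_q": [1.0, 0.5],
    "extrinsic_returns_b": [0.9, 0.4],
    "extrinsic_value_estimates": [0.8, 0.3],
}
print(gather_returns(batch, ["extrinsic"]))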
