
add lambda return and target network

/develop/coma-noact
Andrew Cohen, 4 years ago
Commit: feb38012
6 changed files with 136 additions and 44 deletions
1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (4 changes)
2. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (90 changes)
3. ml-agents/mlagents/trainers/ppo/trainer.py (37 changes)
4. ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py (3 changes)
5. ml-agents/mlagents/trainers/torch/networks.py (44 changes)
6. ml-agents/mlagents/trainers/trajectory.py (2 changes)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (4 changes)


memory = torch.zeros([1, 1, self.policy.m_size])
- value_estimates, marg_val_estimates, mem = self.policy.actor_critic.critic_pass(
+ value_estimates, marg_val_estimates, mem = self.policy.actor_critic.target_critic_pass(
current_obs,
actions,
memory,

)
- next_value_estimates, next_marg_val_estimates, next_mem = self.policy.actor_critic.critic_pass(
+ next_value_estimates, next_marg_val_estimates, next_mem = self.policy.actor_critic.target_critic_pass(
next_obs,
next_actions,
memory,
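The change above routes value estimation through a target critic: the bootstrapping targets come from a slowly-moving copy of the critic rather than the network currently being optimized, which keeps the regression targets from shifting on every gradient step. A minimal, self-contained sketch of that pattern in plain PyTorch (toy networks and dummy tensors, not the ml-agents classes):

import copy
import torch
from torch import nn

# Toy critic: maps an observation vector to a scalar value estimate.
critic = nn.Linear(4, 1)
target_critic = copy.deepcopy(critic)   # frozen copy, refreshed only by (soft) parameter copies

obs = torch.randn(8, 4)        # dummy batch of current observations
next_obs = torch.randn(8, 4)   # dummy batch of next observations
rewards = torch.randn(8, 1)
dones = torch.zeros(8, 1)
gamma = 0.99

with torch.no_grad():
    # Bootstrap targets from the target network so they stay fixed during this update.
    td_targets = rewards + gamma * target_critic(next_obs) * (1.0 - dones)

# The online critic is regressed toward those fixed targets.
value_loss = torch.mean((critic(obs) - td_targets) ** 2)
value_loss.backward()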

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (90 changes)


from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.trajectory import ObsUtil, TeamObsUtil
from mlagents.trainers.torch.networks import CentralizedValueNetwork
class TorchPPOOptimizer(TorchOptimizer):
def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):

self.stream_names = list(self.reward_signals.keys())
+ ModelUtils.soft_update(
+     self.policy.actor_critic.critic, self.policy.actor_critic.target, 1.0
+ )
def ppo_value_loss(
self,
values: Dict[str, torch.Tensor],

"""
value_losses = []
for name, head in values.items():
- old_val_tensor = old_values[name]
- returns_tensor = returns[name] + 0.99 * old_val_tensor
- # clipped_value_estimate = old_val_tensor + torch.clamp(
+ #old_val_tensor = old_values[name]
+ returns_tensor = returns[name]# + 0.99 * old_val_tensor
+ #clipped_value_estimate = old_val_tensor + torch.clamp(
# )
- value_loss = (returns_tensor - head) ** 2
- # v_opt_a = (returns_tensor - head) ** 2
- # v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
- # value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
+ #)
+ #value_loss = (returns_tensor - head) ** 2
+ v_opt_a = (returns_tensor - head) ** 2
+ #v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
+ #value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
+ value_loss = ModelUtils.masked_mean(v_opt_a, loss_masks)
value_losses.append(value_loss)
value_loss = torch.mean(torch.stack(value_losses))
return value_loss

decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
returns = {}
+ returns_q = {}
+ returns_b = {}
old_values = {}
old_marg_values = {}
for name in self.reward_signals:

old_marg_values[name] = ModelUtils.list_to_tensor(
batch[f"{name}_marginalized_value_estimates_next"]
)
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
+ returns_q[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_q"])
+ returns_b[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_b"])
#
# padded_team_rewards = list(
# map(
# lambda x: np.asanyarray(x),
# itertools.zip_longest(*batch["team_rewards"], fillvalue=np.nan),
# )
# )
#padded_team_rewards = torch.tensor(
# np.array(
# list(itertools.zip_longest(*batch["team_rewards"], fillvalue=np.nan))
# )
#)
#padded_team_rewards = np.array(
# list(itertools.zip_longest(*batch["team_rewards"], fillvalue=np.nan))
# )
padded_team_rewards = list(
map(
lambda x: np.asanyarray(x),
itertools.zip_longest(*batch["team_rewards"], fillvalue=np.nan),
)
)
padded_team_rewards = torch.tensor(
np.array(
list(itertools.zip_longest(*batch["team_rewards"], fillvalue=np.nan))
)
)
# Average team rewards
if "extrinsic" in returns:
all_rewards = torch.cat(
[torch.unsqueeze(returns["extrinsic"], 0), padded_team_rewards], dim=0
)
returns["extrinsic"] = torch.mean(
all_rewards[~torch.isnan(all_rewards)], dim=0
)
#all_rewards = np.concatenate((np.expand_dims(batch["environment_rewards"], axis=0), padded_team_rewards), axis=0)
#average_team_rewards = batch["average_team_reward"]
#returns["extrinsic"] = torch.tensor(average_team_rewards)
## Average team rewards
#if "extrinsic" in returns:
# env_rewards = ModelUtils.list_to_tensor(batch["environment_rewards"])
# all_rewards = torch.cat(
# [torch.unsqueeze(env_rewards, 0), padded_team_rewards], dim=0
# )
# returns["extrinsic"] = torch.mean(
# all_rewards[~torch.isnan(all_rewards)], dim=0
# )
# print(all_rewards[~torch.isnan(all_rewards)].shape)
# print(all_rewards.shape)
n_obs = len(self.policy.behavior_spec.sensor_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)

log_probs = log_probs.flatten()
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
value_loss = self.ppo_value_loss(
- values, old_values, returns, decay_eps, loss_masks
+ values, old_values, returns_q, decay_eps, loss_masks
- marginalized_vals, old_marg_values, returns, decay_eps, loss_masks
+ marginalized_vals, old_marg_values, returns_b, decay_eps, loss_masks
)
policy_loss = self.ppo_policy_loss(
ModelUtils.list_to_tensor(batch["advantages"]),

loss.backward()
self.optimizer.step()
+ ModelUtils.soft_update(
+     self.policy.actor_critic.critic, self.policy.actor_critic.target, .001
+ )
"Losses/Baseline Value Loss": marg_value_loss.item(),
"Policy/Learning Rate": decay_lr,
"Policy/Epsilon": decay_eps,
"Policy/Beta": decay_bet,

ml-agents/mlagents/trainers/ppo/trainer.py (37 changes)


m_value_estimates = agent_buffer_trajectory[
f"{name}_marginalized_value_estimates"
].get_batch()
next_value_estimates = agent_buffer_trajectory[
f"{name}_value_estimates_next"
].get_batch()
next_m_value_estimates = agent_buffer_trajectory[
f"{name}_marginalized_value_estimates_next"
].get_batch()
- local_advantage = get_team_gae(
+ returns_q, returns_b = get_team_returns(
value_estimates=local_value_estimates,
marginalized_value_estimates=m_value_estimates,
next_value_estimates=next_value_estimates,
next_marginalized_value_estimates=next_m_value_estimates,
+ local_advantage = np.array(local_value_estimates) - np.array(m_value_estimates)
- agent_buffer_trajectory[f"{name}_returns"].set(local_return)
+ #agent_buffer_trajectory[f"{name}_returns"].set(local_return)
+ agent_buffer_trajectory[f"{name}_returns_q"].set(returns_q)
+ agent_buffer_trajectory[f"{name}_returns_b"].set(returns_b)
agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)
tmp_advantages.append(local_advantage)
tmp_returns.append(local_return)

discounted_r[t] = running_add
return discounted_r
+ def lambd_return(r, next_value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
+     returns = np.zeros_like(r)
+     returns[-1] = r[-1] + gamma * next_value_estimates[-1]
+     for t in reversed(range(0, r.size - 1)):
+         returns[t] = gamma * lambd * returns[t+1] + (1 - lambd) * (r[t] + gamma * next_value_estimates[t])
+     return returns
def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
"""

return advantage
- def get_team_gae(
+ def get_team_returns(
value_estimates,
marginalized_value_estimates,
next_value_estimates,
next_marginalized_value_estimates,
- lambd=0.95,
+ lambd=0.8,
):
"""
Computes generalized advantage estimate for use in updating policy.

:param lambd: GAE weighing factor.
:return: list of advantage estimates for time-steps t to T.
"""
- advantage = np.array(value_estimates) - np.array(marginalized_value_estimates)
- return advantage
+ rewards = np.array(rewards)
+ returns_q = lambd_return(rewards, next_value_estimates, gamma=gamma, lambd=lambd)
+ returns_b = lambd_return(rewards, next_marginalized_value_estimates, gamma=gamma, lambd=lambd)
+ return returns_q, returns_b
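For intuition on the new lambd_return helper: the last step gets the one-step target r[-1] + gamma * V(s_T), and every earlier step mixes the discounted tail gamma * lambd * G[t+1] with a (1 - lambd) weight on its own one-step target r[t] + gamma * V(s[t+1]). get_team_returns then runs this recursion twice, once against the centralized critic's next-step value estimates (returns_q) and once against the marginalized baseline's next-step estimates (returns_b). A self-contained numeric check of the recursion, using standalone NumPy and made-up rewards and next-step value estimates:

import numpy as np

def lambd_return(r, next_value_estimates, gamma=0.99, lambd=0.8):
    # Same recursion as the helper above.
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * next_value_estimates[-1]
    for t in reversed(range(0, r.size - 1)):
        returns[t] = gamma * lambd * returns[t+1] + (1 - lambd) * (r[t] + gamma * next_value_estimates[t])
    return returns

rewards = np.array([0.0, 0.0, 1.0], dtype=np.float32)
next_values = np.array([0.5, 0.8, 0.0], dtype=np.float32)  # V(s_{t+1}) for each step

g = lambd_return(rewards, next_values)
# g[2] = 1.0 + 0.99 * 0.0                     = 1.0
# g[1] = 0.99*0.8*g[2] + 0.2*(0.0 + 0.99*0.8) = 0.792 + 0.1584 = 0.9504
# g[0] = 0.99*0.8*g[1] + 0.2*(0.0 + 0.99*0.5) = 0.7527 + 0.099 ≈ 0.8517
print(g)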

ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py (3 changes)


class ExtrinsicRewardProvider(BaseRewardProvider):
def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
- return np.array(mini_batch["environment_rewards"], dtype=np.float32)
+ return np.array(mini_batch["average_team_reward"], dtype=np.float32)
+ # return np.array(mini_batch["environment_rewards"], dtype=np.float32)
def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
return {}

ml-agents/mlagents/trainers/torch/networks.py (44 changes)


self.critic = CentralizedValueNetwork(
stream_names, sensor_specs, network_settings, action_spec=action_spec
)
+ self.target = CentralizedValueNetwork(
+ stream_names, sensor_specs, network_settings, action_spec=action_spec
+ )
@property
def memory_size(self) -> int:

critic_mem = None
actor_mem = None
return actor_mem, critic_mem
+ def target_critic_pass(
+     self,
+     inputs: List[torch.Tensor],
+     actions: AgentAction,
+     memories: Optional[torch.Tensor] = None,
+     sequence_length: int = 1,
+     team_obs: List[List[torch.Tensor]] = None,
+     team_act: List[AgentAction] = None,
+ ) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], torch.Tensor]:
+     actor_mem, critic_mem = self._get_actor_critic_mem(memories)
+     all_obs = [inputs]
+     if team_obs is not None and team_obs:
+         all_obs.extend(team_obs)
+     all_acts = [actions]
+     if team_act is not None and team_act:
+         all_acts.extend(team_act)
+     baseline_outputs, _ = self.target.baseline(
+         inputs,
+         team_obs,
+         team_act,
+         memories=critic_mem,
+         sequence_length=sequence_length,
+     )
+     value_outputs, critic_mem_out = self.target.q_net(
+         all_obs, all_acts, memories=critic_mem, sequence_length=sequence_length
+     )
+     # if mar_value_outputs is None:
+     #     mar_value_outputs = value_outputs
+     if actor_mem is not None:
+         # Make memories with the actor mem unchanged
+         memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
+     else:
+         memories_out = None
+     return value_outputs, baseline_outputs, memories_out
def critic_pass(
self,

ml-agents/mlagents/trainers/trajectory.py (2 changes)


teammate_discrete_actions
)
agent_buffer_trajectory["team_rewards"].append(teammate_rewards)
+ team_reward = teammate_rewards + [exp.reward]
+ agent_buffer_trajectory["average_team_reward"].append(sum(team_reward)/len(team_reward))
# Next actions
teammate_cont_next_actions = []
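The trajectory change stores, for every step, the mean of the agent's own reward and its teammates' rewards under the new average_team_reward key, which the extrinsic reward provider above now reads in place of the raw per-agent environment reward. A tiny illustration of that bookkeeping with made-up numbers (own_reward stands in for exp.reward):

# Hypothetical per-step rewards for one agent and its two teammates.
own_reward = 1.0
teammate_rewards = [0.5, -0.25]

# Mirror of the buffer logic above: pool every agent's reward for the step and average it.
team_reward = teammate_rewards + [own_reward]
average_team_reward = sum(team_reward) / len(team_reward)

print(average_team_reward)  # (0.5 - 0.25 + 1.0) / 3 ≈ 0.4167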
