浏览代码

value estimator

/develop/coma-noact
Andrew Cohen 4 年前
当前提交
a4c336c2
共有 6 个文件被更改,包括 251 次插入123 次删除
  1. 8
      config/ppo/PushBlock.yaml
  2. 80
      ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
  3. 4
      ml-agents/mlagents/trainers/policy/torch_policy.py
  4. 56
      ml-agents/mlagents/trainers/ppo/optimizer_torch.py
  5. 121
      ml-agents/mlagents/trainers/ppo/trainer.py
  6. 105
      ml-agents/mlagents/trainers/torch/networks.py

8
config/ppo/PushBlock.yaml


learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
hidden_units: 512
num_layers: 2
vis_encode_type: simple
reward_signals:

keep_checkpoints: 5
max_steps: 20000000
time_horizon: 1000
max_steps: 50000000
time_horizon: 64
env_settings:
num_envs: 8

80
ml-agents/mlagents/trainers/optimizer/torch_optimizer.py


n_obs = len(self.policy.behavior_spec.sensor_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)
next_obs = ObsUtil.from_buffer_next(batch, n_obs)
next_team_obs = TeamObsUtil.from_buffer_next(batch, n_obs)
#next_obs = ObsUtil.from_buffer_next(batch, n_obs)
#next_team_obs = TeamObsUtil.from_buffer_next(batch, n_obs)
next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
next_team_obs = [
[ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
for _teammate_obs in next_team_obs
]
#next_team_obs = [
# [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
# for _teammate_obs in next_team_obs
#]
next_actions = AgentAction.from_dict_next(batch)
next_team_actions = AgentAction.from_team_dict_next(batch)
#next_actions = AgentAction.from_dict_next(batch)
#next_team_actions = AgentAction.from_team_dict_next(batch)
# next_obs = [obs.unsqueeze(0) for obs in next_obs]
next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
next_obs = [obs.unsqueeze(0) for obs in next_obs]
# critic_obs = TeamObsUtil.from_buffer(batch, n_obs)
# critic_obs = [

# next_critic_obs = [
# ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_critic_obs
# ]
next_critic_obs = [
ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_critic_obs
]
# next_critic_obs = [
# [_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_critic_obs
# ]
next_critic_obs = [
[_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_critic_obs
]
value_estimates, marg_val_estimates, mem = self.policy.actor_critic.target_critic_pass(
q_estimates, baseline_estimates, mem = self.policy.actor_critic.target_critic_pass(
current_obs,
actions,
memory,

)
next_value_estimates, next_marg_val_estimates, next_mem = self.policy.actor_critic.target_critic_pass(
value_estimates, mem = self.policy.actor_critic.target_critic_value(
current_obs,
memory,
sequence_length=batch.num_experiences,
team_obs=team_obs,
)
boot_value_estimates, mem = self.policy.actor_critic.target_critic_value(
next_actions,
team_obs=next_team_obs,
team_act=next_team_actions,
team_obs=next_critic_obs,
#next_value_estimates, next_marg_val_estimates, next_mem = self.policy.actor_critic.target_critic_pass(
# next_obs,
# next_actions,
# memory,
# sequence_length=batch.num_experiences,
# team_obs=next_team_obs,
# team_act=next_team_actions,
#)
# # Actions is a hack here, we need the next actions
# next_value_estimate, next_marg_val_estimate, _ = self.policy.actor_critic.critic_pass(
# next_obs, actions, next_memory, sequence_length=1, critic_obs=next_critic_obs

for name, estimate in q_estimates.items():
q_estimates[name] = ModelUtils.to_numpy(estimate)
for name, estimate in baseline_estimates.items():
baseline_estimates[name] = ModelUtils.to_numpy(estimate)
for name, estimate in next_value_estimates.items():
next_value_estimates[name] = ModelUtils.to_numpy(estimate)
for name, estimate in marg_val_estimates.items():
marg_val_estimates[name] = ModelUtils.to_numpy(estimate)
for name, estimate in next_marg_val_estimates.items():
next_marg_val_estimates[name] = ModelUtils.to_numpy(estimate)
for name, estimate in boot_value_estimates.items():
boot_value_estimates[name] = ModelUtils.to_numpy(estimate)
for k in next_value_estimates:
for k in boot_value_estimates:
next_value_estimates[k][-1] = 0.0
boot_value_estimates[k][-1] = 0.0
q_estimates,
baseline_estimates,
marg_val_estimates,
next_value_estimates,
next_marg_val_estimates,
boot_value_estimates,
)

4
ml-agents/mlagents/trainers/policy/torch_policy.py


team_obs: Optional[List[List[torch.Tensor]]] = None,
team_act: Optional[List[AgentAction]] = None,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
log_probs, entropies, value_heads, marg_vals = self.actor_critic.get_stats_and_value(
log_probs, entropies, q_heads, baseline, values = self.actor_critic.get_stats_and_value(
return log_probs, entropies, value_heads, marg_vals
return log_probs, entropies, q_heads, baseline, values
@timed
def evaluate(

56
ml-agents/mlagents/trainers/ppo/optimizer_torch.py


decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
returns_q = {}
returns_b = {}
returns_v = {}
old_values = {}
old_marg_values = {}
for name in self.reward_signals:

)
returns_q[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_q"])
returns_b[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_b"])
returns_v[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_v"])
# padded_team_rewards = list(
# map(
# lambda x: np.asanyarray(x),
# itertools.zip_longest(*batch["team_rewards"], fillvalue=np.nan),
# )
# )
#padded_team_rewards = torch.tensor(
# np.array(
# list(itertools.zip_longest(*batch["team_rewards"], fillvalue=np.nan))
# )
#)
#padded_team_rewards = np.array(
# list(itertools.zip_longest(*batch["team_rewards"], fillvalue=np.nan))
# )
#all_rewards = np.concatenate((np.expand_dims(batch["environment_rewards"], axis=0), padded_team_rewards), axis=0)
#average_team_rewards = batch["average_team_reward"]
#returns["extrinsic"] = torch.tensor(average_team_rewards)
## Average team rewards
#if "extrinsic" in returns:
# env_rewards = ModelUtils.list_to_tensor(batch["environment_rewards"])
# all_rewards = torch.cat(
# [torch.unsqueeze(env_rewards, 0), padded_team_rewards], dim=0
# )
# returns["extrinsic"] = torch.mean(
# all_rewards[~torch.isnan(all_rewards)], dim=0
# )
# print(all_rewards[~torch.isnan(all_rewards)].shape)
# print(all_rewards.shape)
n_obs = len(self.policy.behavior_spec.sensor_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)

if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
log_probs, entropy, values, marginalized_vals = self.policy.evaluate_actions(
log_probs, entropy, qs, baseline_vals, values = self.policy.evaluate_actions(
current_obs,
masks=act_masks,
actions=actions,

old_log_probs = ActionLogProbs.from_dict(batch).flatten()
log_probs = log_probs.flatten()
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
value_loss = self.ppo_value_loss(
values, old_values, returns_q, decay_eps, loss_masks
q_loss = self.ppo_value_loss(
qs, old_values, returns_q, decay_eps, loss_masks
marg_value_loss = self.ppo_value_loss(
marginalized_vals, old_marg_values, returns_b, decay_eps, loss_masks
baseline_loss = self.ppo_value_loss(
baseline_vals, old_marg_values, returns_b, decay_eps, loss_masks
value_loss = self.ppo_value_loss(
values, old_values, returns_v, decay_eps, loss_masks
)
policy_loss = self.ppo_policy_loss(
ModelUtils.list_to_tensor(batch["advantages"]),
log_probs,

loss = (
policy_loss
+ 0.5 * (value_loss + marg_value_loss)
+ 0.5 * (q_loss + value_loss + baseline_loss)
- decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
)

self.optimizer.step()
ModelUtils.soft_update(
self.policy.actor_critic.critic, self.policy.actor_critic.target, 0.005
self.policy.actor_critic.critic, self.policy.actor_critic.target, 1.0
"Losses/Baseline Value Loss": marg_value_loss.item(),
"Policy/Advantages": torch.mean(ModelUtils.list_to_tensor(batch["advantages"])).item(),
"Losses/Q Loss": q_loss.item(),
"Losses/Baseline Value Loss": baseline_loss.item(),
"Policy/Learning Rate": decay_lr,
"Policy/Epsilon": decay_eps,
"Policy/Beta": decay_bet,

121
ml-agents/mlagents/trainers/ppo/trainer.py


# Get all value estimates
(
q_estimates,
baseline_estimates,
marginalized_value_estimates,
marg_value_next,
) = self.optimizer.get_trajectory_value_estimates(
agent_buffer_trajectory,
trajectory.next_obs,

for name, v in value_estimates.items():
agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
agent_buffer_trajectory[f"{name}_marginalized_value_estimates"].extend(
marginalized_value_estimates[name]
for name, v in q_estimates.items():
agent_buffer_trajectory[f"{name}_q_estimates"].extend(v)
agent_buffer_trajectory[f"{name}_baseline_estimates"].extend(
baseline_estimates[name]
)
agent_buffer_trajectory[f"{name}_value_estimates"].extend(value_estimates[name])
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Q Estimate",
np.mean(v),
)
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate",
np.mean(baseline_estimates[name]),
np.mean(v),
np.mean(baseline_estimates[name]),
for name, v in value_next.items():
agent_buffer_trajectory[f"{name}_value_estimates_next"].extend(v)
agent_buffer_trajectory[f"{name}_marginalized_value_estimates_next"].extend(
marg_value_next[name]
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Advantage Estimate",
np.mean(v - baseline_estimates[name]),
#for name, v in value_next.items():
# agent_buffer_trajectory[f"{name}_value_estimates_next"].extend(v)
# agent_buffer_trajectory[f"{name}_marginalized_value_estimates_next"].extend(
# marg_value_next[name]
# )
# Evaluate all reward functions
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]

bootstrap_value = value_next[name]
local_rewards = agent_buffer_trajectory[f"{name}_rewards"].get_batch()
local_value_estimates = agent_buffer_trajectory[
f"{name}_value_estimates"
q_estimates = agent_buffer_trajectory[
f"{name}_q_estimates"
m_value_estimates = agent_buffer_trajectory[
f"{name}_marginalized_value_estimates"
].get_batch()
next_value_estimates = agent_buffer_trajectory[
f"{name}_value_estimates_next"
baseline_estimates = agent_buffer_trajectory[
f"{name}_baseline_estimates"
next_m_value_estimates = agent_buffer_trajectory[
f"{name}_marginalized_value_estimates_next"
v_estimates = agent_buffer_trajectory[
f"{name}_value_estimates"
returns_q, returns_b = get_team_returns(
#next_value_estimates = agent_buffer_trajectory[
# f"{name}_value_estimates_next"
#].get_batch()
#next_m_value_estimates = agent_buffer_trajectory[
# f"{name}_marginalized_value_estimates_next"
#].get_batch()
returns_q, returns_b, returns_v = get_team_returns(
next_value_estimates=next_value_estimates,
next_marginalized_value_estimates=next_m_value_estimates,
q_estimates=q_estimates,
baseline_estimates=baseline_estimates,
v_estimates=v_estimates,
local_advantage = np.array(local_value_estimates) - np.array(
m_value_estimates
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Lam",
np.mean(returns_v),
)
local_advantage = np.array(q_estimates) - np.array(
baseline_estimates
local_return = local_advantage + local_value_estimates
local_return = local_advantage + q_estimates
agent_buffer_trajectory[f"{name}_returns_q"].set(returns_q)
agent_buffer_trajectory[f"{name}_returns_b"].set(returns_b)
agent_buffer_trajectory[f"{name}_returns_q"].set(returns_v)
agent_buffer_trajectory[f"{name}_returns_b"].set(returns_v)
agent_buffer_trajectory[f"{name}_returns_v"].set(returns_v)
agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)
tmp_advantages.append(local_advantage)
tmp_returns.append(local_return)

)
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
agent_buffer_trajectory["advantages"].set(global_advantages)
agent_buffer_trajectory["discounted_returns"].set(global_returns)
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(

n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
# Normalize advantages
# advantages = np.array(self.update_buffer["advantages"].get_batch())
# self.update_buffer["advantages"].set(
# list((advantages - advantages.mean()) / (advantages.std() + 1e-10))
# )
#Normalize advantages
advantages = np.array(self.update_buffer["advantages"].get_batch())
self.update_buffer["advantages"].set(
list((advantages - advantages.mean()) / (advantages.std() + 1e-10))
)
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):

return discounted_r
def lambd_return(r, next_value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
def lambd_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
returns[-1] = r[-1] + gamma * next_value_estimates[-1]
returns[-1] = r[-1] + gamma * value_next
returns[t] = gamma * lambd * returns[t + 1] + (1 - lambd) * (
r[t] + gamma * next_value_estimates[t]
)
returns[t] = gamma * lambd * returns[t + 1] + r[t] + (1 - lambd) * gamma * value_estimates[t + 1]
return returns

def get_team_returns(
rewards,
next_value_estimates,
next_marginalized_value_estimates,
q_estimates,
baseline_estimates,
v_estimates,
value_next=0.0,
gamma=0.99,
lambd=0.8,

:return: list of advantage estimates for time-steps t to T.
"""
rewards = np.array(rewards)
returns_q = lambd_return(rewards, next_value_estimates, gamma=gamma, lambd=lambd)
returns_q = lambd_return(rewards, q_estimates, gamma=gamma, lambd=lambd, value_next=value_next)
rewards, next_marginalized_value_estimates, gamma=gamma, lambd=lambd
rewards, baseline_estimates, gamma=gamma, lambd=lambd, value_next=value_next
)
returns_v = lambd_return(
rewards, v_estimates, gamma=gamma, lambd=lambd, value_next=value_next
return returns_q, returns_b
if rewards[-1] > 0:
print(returns_v)
print(rewards)
return returns_q, returns_b, returns_v

105
ml-agents/mlagents/trainers/torch/networks.py


)
return encoding, memories
def value(
    self,
    obs: List[List[torch.Tensor]],
    memories: Optional[torch.Tensor] = None,
    sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Encode the observations of every agent (no actions) and pass the stacked
    encodings to ``self.forward`` as its second positional input.

    :param obs: One list of observation tensors per agent. The i-th tensor of
        each agent is fed to ``self.processors[i]``. NaN entries are
        overwritten with 0.0 in place, i.e. the caller's tensors are modified.
    :param memories: Optional memories, forwarded unchanged to ``self.forward``.
    :param sequence_length: Forwarded unchanged to ``self.forward``.
    :return: The ``(encoding, memories)`` pair returned by ``self.forward``.
    """
    # Build the attention mask BEFORE the loop below: the loop zeroes NaNs in
    # place, so a mask computed afterwards would never see any NaN.
    # NOTE(review): assumes _get_masks_from_nans only reads `obs` and derives
    # the mask from its NaN entries (presumably padding for absent agents)
    # -- confirm against its definition.
    self_attn_masks = [self._get_masks_from_nans(obs)]

    concat_encoded_obs = []
    for inputs in obs:
        encodes = []
        for idx, processor in enumerate(self.processors):
            obs_input = inputs[idx]
            obs_input[obs_input.isnan()] = 0.0  # Remove NaNs
            encodes.append(processor(obs_input))
        # One flat encoding per agent: all processor outputs side by side.
        concat_encoded_obs.append(torch.cat(encodes, dim=-1))

    # Stack the per-agent encodings along a new dimension 1.
    g_inp = torch.stack(concat_encoded_obs, dim=1)

    # The first input of forward() is left empty (None) on this path.
    encoding, memories = self.forward(
        None,
        g_inp,
        self_attn_masks,
        memories=memories,
        sequence_length=sequence_length,
    )
    return encoding, memories
def forward(
self,
f_enc: torch.Tensor,

) -> Tuple[torch.Tensor, torch.Tensor]:
encoding, memories = self.network_body.q_net(
obs, actions, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories
def value(
    self,
    obs: List[List[torch.Tensor]],
    memories: Optional[torch.Tensor] = None,
    sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Run ``self.network_body.value`` on the observations and apply
    ``self.value_heads`` to the resulting encoding.

    :param obs: One list of observation tensors per agent, passed through
        to the network body as-is.
    :param memories: Optional memories, passed through to the network body.
    :param sequence_length: Passed through to the network body.
    :return: ``(value_heads(encoding), memories)`` where ``encoding`` and
        ``memories`` are what the network body returned.
    """
    body_result = self.network_body.value(obs, memories, sequence_length)
    hidden, updated_memories = body_result
    return self.value_heads(hidden), updated_memories

actor_mem = None
return actor_mem, critic_mem
def target_critic_value(
    self,
    inputs: List[torch.Tensor],
    memories: Optional[torch.Tensor] = None,
    sequence_length: int = 1,
    team_obs: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[Dict[str, torch.Tensor], Optional[torch.Tensor]]:
    """
    Evaluate ``self.target.value`` on this agent's observations followed by
    the teammates' observations.

    :param inputs: Observation tensors of this agent; always placed first in
        the list handed to the target network.
    :param memories: Optional combined memories. They are split with
        ``self._get_actor_critic_mem`` and only the critic part is passed on.
    :param sequence_length: Forwarded to ``self.target.value``.
    :param team_obs: Optional per-teammate observation lists, appended after
        ``inputs``. ``None`` or an empty list means no teammates.
    :return: ``(value_outputs, memories_out)``. ``memories_out`` is the
        unchanged actor memory concatenated on the last dimension with the
        critic memory returned by the target network, or ``None`` when there
        is no actor memory.
    """
    actor_mem, critic_mem = self._get_actor_critic_mem(memories)

    all_obs = [inputs]
    if team_obs:
        all_obs.extend(team_obs)

    # Keep the memory returned by the network: it is needed below to rebuild
    # the combined memory (it used to be discarded, leaving `critic_mem_out`
    # undefined and raising NameError whenever memories were in use).
    value_outputs, critic_mem_out = self.target.value(
        all_obs,
        memories=critic_mem,
        sequence_length=sequence_length,
    )

    if actor_mem is not None:
        # Make memories with the actor mem unchanged
        memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
    else:
        memories_out = None
    return value_outputs, memories_out
def critic_value(
    self,
    inputs: List[torch.Tensor],
    memories: Optional[torch.Tensor] = None,
    sequence_length: int = 1,
    team_obs: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[Dict[str, torch.Tensor], Optional[torch.Tensor]]:
    """
    Evaluate ``self.critic.value`` on this agent's observations followed by
    the teammates' observations.

    :param inputs: Observation tensors of this agent; always placed first in
        the list handed to the critic.
    :param memories: Optional combined memories. They are split with
        ``self._get_actor_critic_mem`` and only the critic part is passed on.
    :param sequence_length: Forwarded to ``self.critic.value``.
    :param team_obs: Optional per-teammate observation lists, appended after
        ``inputs``. ``None`` or an empty list means no teammates.
    :return: ``(value_outputs, memories_out)``. ``memories_out`` is the
        unchanged actor memory concatenated on the last dimension with the
        critic memory returned by the critic, or ``None`` when there is no
        actor memory.
    """
    actor_mem, critic_mem = self._get_actor_critic_mem(memories)

    all_obs = [inputs]
    if team_obs:
        all_obs.extend(team_obs)

    # Keep the memory returned by the critic: it is needed below to rebuild
    # the combined memory (it used to be discarded, leaving `critic_mem_out`
    # undefined and raising NameError whenever memories were in use).
    value_outputs, critic_mem_out = self.critic.value(
        all_obs,
        memories=critic_mem,
        sequence_length=sequence_length,
    )

    if actor_mem is not None:
        # Make memories with the actor mem unchanged
        memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
    else:
        memories_out = None
    return value_outputs, memories_out
def target_critic_pass(
self,
inputs: List[torch.Tensor],

)
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
value_outputs, mar_value_outputs, _ = self.critic_pass(
q_outputs, baseline_outputs, _ = self.critic_pass(
inputs,
actions,
memories=critic_mem,

)
value_outputs, _ = self.critic_value(inputs, memories=critic_mem, sequence_length=sequence_length, team_obs=team_obs)
return log_probs, entropies, value_outputs, mar_value_outputs
return log_probs, entropies, q_outputs, baseline_outputs, value_outputs
def get_action_stats(
self,

正在加载...
取消
保存