
use only value funcs

/develop/coma-noact
Andrew Cohen, 4 years ago
Current commit
9af22d30
5 files changed, with 93 insertions and 292 deletions
1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (46)
2. ml-agents/mlagents/trainers/policy/torch_policy.py (4)
3. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (8)
4. ml-agents/mlagents/trainers/ppo/trainer.py (66)
5. ml-agents/mlagents/trainers/torch/networks.py (261)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (46 changes)


memory = torch.zeros([1, 1, self.policy.m_size])
q_estimates, baseline_estimates, mem = self.policy.actor_critic.critic_pass(
current_obs,
actions,
memory,
sequence_length=batch.num_experiences,
team_obs=team_obs,
team_act=team_actions,
)
value_estimates, mem = self.policy.actor_critic.target_critic_value(
value_estimates, baseline_estimates, mem = self.policy.actor_critic.critic_pass(
current_obs,
memory,
sequence_length=batch.num_experiences,

boot_value_estimates, mem = self.policy.actor_critic.target_critic_value(
next_value_estimates, next_baseline_estimates, mem = self.policy.actor_critic.critic_pass(
next_obs,
memory,
sequence_length=batch.num_experiences,

#next_value_estimates, next_marg_val_estimates, next_mem = self.policy.actor_critic.target_critic_pass(
# next_obs,
# next_actions,
# memory,
# sequence_length=batch.num_experiences,
# team_obs=next_team_obs,
# team_act=next_team_actions,
#)
# # Actions is a hack here, we need the next actions
# next_value_estimate, next_marg_val_estimate, _ = self.policy.actor_critic.critic_pass(
# next_obs, actions, next_memory, sequence_length=1, critic_obs=next_critic_obs

for name, estimate in q_estimates.items():
q_estimates[name] = ModelUtils.to_numpy(estimate)
for name, estimate in baseline_estimates.items():
baseline_estimates[name] = ModelUtils.to_numpy(estimate)

# the baseline and V should not be on the same done flag
for name, estimate in boot_value_estimates.items():
boot_value_estimates[name] = ModelUtils.to_numpy(estimate)
for name, estimate in next_baseline_estimates.items():
next_baseline_estimates[name] = ModelUtils.to_numpy(estimate)
died = False
for name, estimate in next_value_estimates.items():
next_value_estimates[name] = ModelUtils.to_numpy(estimate)
for k in boot_value_estimates:
for k in next_baseline_estimates:
died = True
next_baseline_estimates[k][-1] = 0.0
boot_value_estimates[k][-1] = 0.0
next_value_estimates[k][-1] = 0.0
# else:
# print(len(next_critic_obs))
# print(baseline_estimates)

return (
q_estimates,
value_estimates,
boot_value_estimates,
died,
next_value_estimates,
next_baseline_estimates,
)
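The hunk above zeroes the trailing bootstrap entries whenever the trajectory ended in a terminal state ("died"), so that no value is bootstrapped past the episode boundary. A minimal NumPy sketch of that convention, using illustrative names rather than the ML-Agents API:

import numpy as np

# Illustrative sketch only (not part of the commit): zero the bootstrap value that
# was appended after the final step if the episode actually terminated there.
def zero_terminal_bootstrap(estimates: dict, terminated: bool) -> dict:
    if terminated:
        for name in estimates:
            estimates[name][-1] = 0.0  # nothing to bootstrap past a terminal step
    return estimates

boot = {"extrinsic": np.array([0.9, 0.7, 0.5])}
print(zero_terminal_bootstrap(boot, terminated=True))  # last entry becomes 0.0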

ml-agents/mlagents/trainers/policy/torch_policy.py (4 changes)


team_obs: Optional[List[List[torch.Tensor]]] = None,
team_act: Optional[List[AgentAction]] = None,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
log_probs, entropies, q_heads, baseline, values = self.actor_critic.get_stats_and_value(
log_probs, entropies, values, baseline = self.actor_critic.get_stats_and_value(
return log_probs, entropies, q_heads, baseline, values
return log_probs, entropies, values, baseline
@timed
def evaluate(

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (8 changes)


self.stream_names = list(self.reward_signals.keys())
ModelUtils.soft_update(
self.policy.actor_critic.critic, self.policy.actor_critic.target, 1.0
)
#ModelUtils.soft_update(
# self.policy.actor_critic.critic, self.policy.actor_critic.target, 1.0
#)
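For reference, ModelUtils.soft_update with tau=1.0 amounts to a hard copy of the critic parameters into the target network at construction time. A generic Polyak-update sketch (illustrative, not the ModelUtils implementation):

import torch

# Generic Polyak update: target <- tau * source + (1 - tau) * target.
# With tau=1.0 this is a hard copy, which is what the call above does once in __init__.
@torch.no_grad()
def polyak_update(source: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    for s_param, t_param in zip(source.parameters(), target.parameters()):
        t_param.data.mul_(1.0 - tau).add_(tau * s_param.data)

critic = torch.nn.Linear(4, 1)
target = torch.nn.Linear(4, 1)
polyak_update(critic, target, tau=1.0)  # target now equals critic exactly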
def ppo_value_loss(
self,

if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
log_probs, entropy, qs, baseline_vals, values = self.policy.evaluate_actions(
log_probs, entropy, values, baseline_vals, = self.policy.evaluate_actions(
current_obs,
masks=act_masks,
actions=actions,

ml-agents/mlagents/trainers/ppo/trainer.py (66 changes)


# Get all value estimates
(
q_estimates,
value_estimates,
died,
baseline_next,
) = self.optimizer.get_trajectory_value_estimates(
agent_buffer_trajectory,
trajectory.next_obs,

for name, v in q_estimates.items():
agent_buffer_trajectory[f"{name}_q_estimates"].extend(v)
for name, v in value_estimates.items():
agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
agent_buffer_trajectory[f"{name}_value_estimates"].extend(value_estimates[name])
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Q Estimate",
np.mean(v),
)
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate",
np.mean(baseline_estimates[name]),

)
#for name, v in value_next.items():
# agent_buffer_trajectory[f"{name}_value_estimates_next"].extend(v)
# agent_buffer_trajectory[f"{name}_marginalized_value_estimates_next"].extend(
# marg_value_next[name]
# )
# Evaluate all reward functions
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
)

for name in self.optimizer.reward_signals:
local_rewards = agent_buffer_trajectory[f"{name}_rewards"].get_batch()
q_estimates = agent_buffer_trajectory[
f"{name}_q_estimates"
].get_batch()
baseline_estimates = agent_buffer_trajectory[
f"{name}_baseline_estimates"
].get_batch()

#next_value_estimates = agent_buffer_trajectory[
# f"{name}_value_estimates_next"
#].get_batch()
#next_m_value_estimates = agent_buffer_trajectory[
# f"{name}_marginalized_value_estimates_next"
#].get_batch()
#print(local_rewards[-1])
#print(died)
#print(value_next[name])
returns_q, returns_b, returns_v = get_team_returns(
returns_v, returns_b = get_team_returns(
q_estimates=q_estimates,
v_estimates=v_estimates,
died=died,
baseline_next=baseline_next[name],
gamma=self.optimizer.reward_signals[name].gamma,
lambd=self.hyperparameters.lambd,
)

#local_advantage = np.array(q_estimates) - np.array(
local_advantage = np.array(returns_v) - np.array(
baseline_estimates
)
local_advantage = np.array(returns_v) - np.array(returns_b)
#self._stats_reporter.add_stat(
# f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} GAE Advantage Estimate",
# np.mean(gae_advantage),

local_return = local_advantage + baseline_estimates
# This is later use as target for the different value estimates
# agent_buffer_trajectory[f"{name}_returns"].set(local_return)
agent_buffer_trajectory[f"{name}_returns_q"].set(returns_v)
agent_buffer_trajectory[f"{name}_returns_b"].set(returns_b)
agent_buffer_trajectory[f"{name}_returns_v"].set(returns_v)
agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)

def lambd_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0, baseline=False, died=False):
returns = np.zeros_like(r)
if baseline:
# this is incorrect
if died:
returns[-1] = r[-1]
else:
returns[-1] = value_estimates[-1]
else:
returns[-1] = r[-1] + gamma * value_next
for t in reversed(range(0, r.size - 1)):
returns[t] = gamma * lambd * returns[t + 1] + r[t] + (1 - lambd) * gamma * value_estimates[t + 1]

def get_team_returns(
rewards,
q_estimates,
v_estimates,
died=False,
baseline_next=0.0,
gamma=0.99,
lambd=0.8,
):

:return: list of advantage estimates for time-steps t to T.
"""
rewards = np.array(rewards)
returns_q = lambd_return(rewards, q_estimates, gamma=gamma, lambd=lambd, value_next=value_next)
rewards, baseline_estimates, gamma=gamma, lambd=lambd, baseline=True, died=died
rewards, baseline_estimates, gamma=gamma, lambd=lambd, value_next=baseline_next
return returns_q, returns_b, returns_v
return returns_v, returns_b
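Taken together, the trainer computes TD(lambda)-style returns for both the value estimates and the baseline estimates and uses their difference as the advantage. A self-contained NumPy sketch of the recursion used in lambd_return above, with made-up numbers:

import numpy as np

# Standalone sketch of the lambda-return recursion above; `value_next` is the
# bootstrap value after the final step (0.0 when the episode terminated).
def lambda_returns(r, v, gamma=0.99, lambd=0.8, value_next=0.0):
    returns = np.zeros_like(r, dtype=np.float64)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(r.size - 1)):
        returns[t] = gamma * lambd * returns[t + 1] + r[t] + (1 - lambd) * gamma * v[t + 1]
    return returns

rewards = np.array([0.0, 0.0, 1.0])
values = np.array([0.6, 0.7, 0.9])      # per-step value estimates
baselines = np.array([0.5, 0.6, 0.8])   # per-step counterfactual baseline estimates
returns_v = lambda_returns(rewards, values)
returns_b = lambda_returns(rewards, baselines)
advantage = returns_v - returns_b       # mirrors `local_advantage` in the trainer
print(advantage)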

ml-agents/mlagents/trainers/torch/networks.py (261 changes)


sensor_specs: List[SensorSpec],
network_settings: NetworkSettings,
action_spec: ActionSpec,
baseline: bool = False
):
super().__init__()
self.normalize = network_settings.normalize

if network_settings.memory is not None
else 0
)
self.is_baseline = True
self.processors, _input_size = ModelUtils.create_input_processors(
sensor_specs,
self.h_size,

# Modules for self-attention
obs_only_ent_size = sum(_input_size)
q_ent_size = (
sum(_input_size)
+ sum(self.action_spec.discrete_branches)
+ self.action_spec.continuous_size
)
self.obs_encoder = EntityEmbedding(
0, obs_only_ent_size, None, self.h_size, concat_self=False
)
self.obs_action_encoder = EntityEmbedding(
0, q_ent_size, None, self.h_size, concat_self=False
)
self.self_encoder = None
if baseline:
self.self_encoder = LinearEncoder(
obs_only_ent_size, 1, self.h_size
)
self.obs_encoder = EntityEmbedding(
self.h_size, obs_only_ent_size, None, self.h_size, concat_self=True
)
else:
self.obs_encoder = EntityEmbedding(
0, obs_only_ent_size, None, self.h_size, concat_self=False
)
self.linear_encoder = LinearEncoder(
encoder_input_size, network_settings.num_layers, self.h_size

attn_mask = torch.any(obs_for_mask.isnan(), dim=2).type(torch.FloatTensor)
return attn_mask
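_get_masks_from_nans flags padded (absent) agents, whose observations arrive as NaN, so that self-attention can ignore them; the NaNs themselves are zeroed before encoding. A tiny illustration of the same pattern:

import torch

# Padded agents are marked by NaN observations; the mask is 1.0 where an entity
# should be ignored by attention, and the NaNs are replaced with zeros before encoding.
obs = torch.tensor([[[0.1, 0.2], [float("nan"), float("nan")]]])     # (batch, agents, obs)
attn_mask = torch.any(obs.isnan(), dim=2).type(torch.FloatTensor)    # tensor([[0., 1.]])
obs = torch.nan_to_num(obs)  # same effect as obs_input[obs_input.isnan()] = 0.0
print(attn_mask, obs)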
def q_net(
self,
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
self_attn_masks = []
concat_f_inp = []
for inputs, action in zip(obs, actions):
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
cat_encodes = [
torch.cat(encodes, dim=-1),
action.to_flat(self.action_spec.discrete_branches),
]
concat_f_inp.append(torch.cat(cat_encodes, dim=1))
f_inp = torch.stack(concat_f_inp, dim=1)
self_attn_masks.append(self._get_masks_from_nans(obs))
encoding, memories = self.forward(
f_inp,
None,
self_attn_masks,
memories=memories,
sequence_length=sequence_length,
)
return encoding, memories
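The q_net path builds one feature vector per agent by concatenating the encoded observations with the flattened (one-hot) actions, then stacks agents along an entity dimension for self-attention. A shape-only sketch with placeholder tensors rather than the real EntityEmbedding/processor API:

import torch
import torch.nn.functional as F

# Two agents, batch of 2: each agent contributes [encoded_obs ++ one-hot action],
# and the per-agent features are stacked along an entity (agent) dimension.
batch, obs_dim, n_discrete = 2, 4, 3
agents_obs = [torch.randn(batch, obs_dim) for _ in range(2)]
agents_act = [torch.randint(0, n_discrete, (batch,)) for _ in range(2)]
per_agent = [
    torch.cat([o, F.one_hot(a, n_discrete).float()], dim=1)   # (batch, obs_dim + n_discrete)
    for o, a in zip(agents_obs, agents_act)
]
f_inp = torch.stack(per_agent, dim=1)                          # (batch, agents, obs_dim + n_discrete)
print(f_inp.shape)  # torch.Size([2, 2, 7])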
actions: List[AgentAction],
f_inp = None
concat_f_inp = []
for inputs, action in zip(obs, actions):
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
cat_encodes = [
torch.cat(encodes, dim=-1),
action.to_flat(self.action_spec.discrete_branches),
]
concat_f_inp.append(torch.cat(cat_encodes, dim=1))
if concat_f_inp:
f_inp = torch.stack(concat_f_inp, dim=1)
concat_encoded_obs = []
g_inp = None
if len(obs) > 0:
for inputs in obs:
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
concat_encoded_obs.append(torch.cat(encodes, dim=-1))
g_inp = torch.stack(concat_encoded_obs, dim=1)
concat_encoded_obs = []
encodes = []
self_encodes = []
encodes.append(processed_obs)
concat_encoded_obs.append(torch.cat(encodes, dim=-1))
g_inp = torch.stack(concat_encoded_obs, dim=1)
self_encodes.append(processed_obs)
encoded_self = self.self_encoder(torch.cat(self_encodes, dim=-1))
self_attn_masks.append(self._get_masks_from_nans([self_obs]))
f_inp,
encoded_self,
return encoding, memories
def value(

def forward(
self,
f_enc: torch.Tensor,
encoded_self: torch.Tensor,
g_enc: torch.Tensor,
self_attn_masks: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,

self_attn_inputs = []
if f_enc is not None:
self_attn_inputs.append(self.obs_action_encoder(None, f_enc))
if g_enc is not None:
if self.is_baseline:
if g_enc is not None:
self_attn_inputs.append(self.obs_encoder(encoded_self, g_enc))
encoded_entity = torch.cat(self_attn_inputs, dim=1)
inputs = self.self_attn(encoded_entity, self_attn_masks)
else:
inputs = encoded_self
else:
encoded_entity = torch.cat(self_attn_inputs, dim=1)
encoded_state = self.self_attn(encoded_entity, self_attn_masks)
encoded_entity = torch.cat(self_attn_inputs, dim=1)
inputs = self.self_attn(encoded_entity, self_attn_masks)
inputs = encoded_state
encoding = self.linear_encoder(inputs)
if self.use_lstm:

network_settings: NetworkSettings,
action_spec: ActionSpec,
outputs_per_stream: int = 1,
baseline: bool = False
observation_shapes, network_settings, action_spec=action_spec
observation_shapes, network_settings, action_spec=action_spec, baseline=baseline
)
if network_settings.memory is not None:
encoding_size = network_settings.memory.memory_size // 2

def q_net(
self,
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
encoding, memories = self.network_body.q_net(
obs, actions, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories
def value(
self,
obs: List[List[torch.Tensor]],

self,
self_obs: List[List[torch.Tensor]],
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
self_obs, obs, actions, memories, sequence_length
self_obs, obs, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories

value_inputs: List[List[torch.Tensor]],
q_inputs: List[List[torch.Tensor]],
q_actions: List[AgentAction] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

self.critic = CentralizedValueNetwork(
stream_names, sensor_specs, network_settings, action_spec=action_spec
)
self.target = CentralizedValueNetwork(
stream_names, sensor_specs, network_settings, action_spec=action_spec
self.baseline_critic = CentralizedValueNetwork(
stream_names, sensor_specs, network_settings, action_spec=action_spec, baseline=True
)
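The actor-critic therefore keeps three centralized networks: a value critic over the joint observations, a target copy of it used for bootstrapping, and a baseline network (constructed with baseline=True) that additionally conditions on teammates' actions. A minimal wiring sketch with placeholder modules, not the CentralizedValueNetwork class itself:

import copy
import torch

# Placeholder critic: a single linear value head over a flat input.
class TinyCritic(torch.nn.Module):
    def __init__(self, in_size: int):
        super().__init__()
        self.head = torch.nn.Linear(in_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.head(x)

obs_size, team_act_size = 8, 4
critic = TinyCritic(obs_size)                           # V over joint observations
target = copy.deepcopy(critic)                          # hard copy, as soft_update(..., 1.0) does
baseline_critic = TinyCritic(obs_size + team_act_size)  # also sees teammates' actions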

actor_mem = None
return actor_mem, critic_mem
def target_critic_value(
self,
inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: List[List[torch.Tensor]] = None,
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], torch.Tensor]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
all_obs = [inputs]
if team_obs is not None and team_obs:
all_obs.extend(team_obs)
value_outputs, _ = self.target.value(
all_obs,
memories=critic_mem,
sequence_length=sequence_length,
)
# if mar_value_outputs is None:
# mar_value_outputs = value_outputs
if actor_mem is not None:
# Make memories with the actor mem unchanged
memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
else:
memories_out = None
return value_outputs, memories_out
def critic_value(
self,
inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: List[List[torch.Tensor]] = None,
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], torch.Tensor]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
all_obs = [inputs]
if team_obs is not None and team_obs:
all_obs.extend(team_obs)
value_outputs, _ = self.critic.value(
all_obs,
memories=critic_mem,
sequence_length=sequence_length,
)
# if mar_value_outputs is None:
# mar_value_outputs = value_outputs
if actor_mem is not None:
# Make memories with the actor mem unchanged
memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
else:
memories_out = None
return value_outputs, memories_out
def target_critic_pass(
self,
inputs: List[torch.Tensor],
actions: AgentAction,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: List[List[torch.Tensor]] = None,
team_act: List[AgentAction] = None,
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], torch.Tensor]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
all_obs = [inputs]
if team_obs is not None and team_obs:
all_obs.extend(team_obs)
all_acts = [actions]
if team_act is not None and team_act:
all_acts.extend(team_act)
baseline_outputs, _ = self.target.baseline(
inputs,
team_obs,
team_act,
memories=critic_mem,
sequence_length=sequence_length,
)
value_outputs, critic_mem_out = self.target.q_net(
all_obs, all_acts, memories=critic_mem, sequence_length=sequence_length
)
# if mar_value_outputs is None:
# mar_value_outputs = value_outputs

def critic_pass(
self,
inputs: List[torch.Tensor],
actions: AgentAction,
team_act: List[AgentAction] = None,
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], torch.Tensor]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)

all_acts = [actions]
if team_act is not None and team_act:
all_acts.extend(team_act)
baseline_outputs, _ = self.critic.baseline(
baseline_outputs, _ = self.baseline_critic.baseline(
team_act,
value_outputs, critic_mem_out = self.critic.q_net(
all_obs, all_acts, memories=critic_mem, sequence_length=sequence_length
value_outputs, critic_mem_out = self.critic.value(
all_obs, memories=critic_mem, sequence_length=sequence_length
)
# if mar_value_outputs is None:

)
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
q_outputs, baseline_outputs, _ = self.critic_pass(
value_outputs, baseline_outputs, _ = self.critic_pass(
actions,
team_act=team_act,
value_outputs, _ = self.target_critic_value(inputs, memories=critic_mem, sequence_length=sequence_length, team_obs=team_obs)
return log_probs, entropies, q_outputs, baseline_outputs, value_outputs
return log_probs, entropies, value_outputs, baseline_outputs
def get_action_stats(
self,
