
Add next action and next team obs

/develop/centralizedcritic/counterfact
Ervin Teng, 4 years ago
Commit 0919a32d
3 files changed, 101 insertions(+), 18 deletions(-)
  1. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (1 changed line)
  2. ml-agents/mlagents/trainers/torch/agent_action.py (36 changed lines)
  3. ml-agents/mlagents/trainers/trajectory.py (82 changed lines)

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (1 changed line)


        act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
        actions = AgentAction.from_dict(batch)
        # Teammates' actions at this step and at the next step, read from the batch
        team_actions = AgentAction.from_team_dict(batch)
        next_team_actions = AgentAction.from_team_dict_next(batch)
        memories = [
            ModelUtils.list_to_tensor(batch["memory"][i])
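
A quick orientation sketch, not part of the commit: the buffer keys each of these constructors reads, inferred from the key strings in agent_action.py and trajectory.py below.

# Hypothetical summary dict; the key names match the strings used in this commit.
batch_keys = {
    "from_dict": ["continuous_action", "discrete_action"],
    "from_team_dict": ["team_continuous_action", "team_discrete_action"],
    "from_team_dict_next": [
        "team_next_continuous_action",
        "team_next_discrete_action",
    ],
}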

ml-agents/mlagents/trainers/torch/agent_action.py (36 changed lines)


        return AgentAction(continuous, discrete)

    @staticmethod
    def _from_team_dict(
        buff: Dict[str, np.ndarray], cont_action_key: str, disc_action_key: str
    ) -> List["AgentAction"]:
        continuous_tensors = []
        discrete_tensors = []
        if cont_action_key in buff:
            continuous_tensors = ModelUtils.list_to_tensor(
                buff[cont_action_key]
            )
        if disc_action_key in buff:
            discrete_tensors = ModelUtils.list_to_tensor(
                buff[disc_action_key], dtype=torch.long
            )
        actions_list = []
        for _cont, _disc in zip(continuous_tensors, discrete_tensors):
            # Split the stacked discrete tensor into one tensor per branch
            _disc = [_disc[..., i] for i in range(_disc.shape[-1])]
            actions_list.append(AgentAction(_cont, _disc))
        return actions_list
    @staticmethod
    def from_team_dict(buff: Dict[str, np.ndarray]) -> List["AgentAction"]:
        """
        A static method that accesses continuous and discrete action fields in an
        AgentBuffer and constructs the corresponding AgentActions from the retrieved
        np arrays.
        """
        return AgentAction._from_team_dict(
            buff, "team_continuous_action", "team_discrete_action"
        )

    @staticmethod
    def from_team_dict_next(buff: Dict[str, np.ndarray]) -> List["AgentAction"]:
        """
        A static method that accesses the next-step continuous and discrete action
        fields in an AgentBuffer and constructs the corresponding AgentActions from
        the retrieved np arrays.
        """
        return AgentAction._from_team_dict(
            buff, "team_next_continuous_action", "team_next_discrete_action"
        )

    def to_flat(self, discrete_branches: List[int]) -> torch.Tensor:
        discrete_oh = ModelUtils.actions_to_onehot(
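
A minimal, self-contained sketch (not from the repo) of the per-branch split that `_from_team_dict` performs on each teammate's discrete actions, assuming one (batch, num_branches) array per teammate:

import numpy as np
import torch

# One teammate's discrete actions: batch of 3 steps, 2 discrete branches.
disc = torch.as_tensor(np.zeros((3, 2)), dtype=torch.long)

# Same indexing as in _from_team_dict: one (batch,) tensor per branch.
branches = [disc[..., i] for i in range(disc.shape[-1])]
assert [b.shape for b in branches] == [torch.Size([3]), torch.Size([3])]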

ml-agents/mlagents/trainers/trajectory.py (82 changed lines)


return f"team_obs_{index}"
@staticmethod
def get_name_at_next(index: int) -> str:
"""
returns the name of the next team observation given the index of the observation
"""
return f"team_obs_next_{index}"
@staticmethod
def _padded_time_to_batch(
agent_buffer_field: AgentBuffer.AgentBufferField,
) -> List[np.ndarray]:

result = TeamObsUtil._transpose_list_of_lists(separated_obs)
return result
    @staticmethod
    def from_buffer_next(batch: AgentBuffer, num_obs: int) -> List[np.array]:
        """
        Creates the list of next team observations from an AgentBuffer.
        """
        separated_obs: List[np.array] = []
        for i in range(num_obs):
            separated_obs.append(
                TeamObsUtil._padded_time_to_batch(
                    batch[TeamObsUtil.get_name_at_next(i)]
                )
            )
        # separated_obs contains a List(num_obs) of Lists(num_agents); we want to flip
        # that and get a List(num_agents) of Lists(num_obs)
        result = TeamObsUtil._transpose_list_of_lists(separated_obs)
        return result
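
`_transpose_list_of_lists` is not shown in this diff; here is a minimal sketch of the flip the comment above describes (the repo's exact implementation may differ), assuming a plain zip-based transpose:

from typing import Any, List

def _transpose_list_of_lists(list_list: List[List[Any]]) -> List[List[Any]]:
    # Element [i][j] of the input becomes element [j][i] of the output.
    return list(map(list, zip(*list_list)))

# List(num_obs=2) of Lists(num_agents=2) -> List(num_agents) of Lists(num_obs)
assert _transpose_list_of_lists([["o0a0", "o0a1"], ["o1a0", "o1a1"]]) == [
    ["o0a0", "o1a0"],
    ["o0a1", "o1a1"],
]
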
class Trajectory(NamedTuple):
    steps: List[AgentExperience]

        agent_buffer_trajectory = AgentBuffer()
        obs = self.steps[0].obs
        for step, exp in enumerate(self.steps):
            is_last_step = step == len(self.steps) - 1
            if not is_last_step:
                next_obs = self.steps[step + 1].obs
            else:
                next_obs = self.next_obs

            for i in range(num_obs):
                agent_buffer_trajectory[ObsUtil.get_name_at(i)].append(obs[i])
                agent_buffer_trajectory[ObsUtil.get_name_at_next(i)].append(next_obs[i])
            # Take care of teammate obs and actions
            teammate_continuous_actions, teammate_discrete_actions, teammate_rewards = (
                [],
                [],
                [],
            )
            for teammate_status in exp.teammate_status:
                teammate_continuous_actions.append(teammate_status.action.continuous)
                teammate_discrete_actions.append(teammate_status.action.discrete)
                teammate_rewards.append(teammate_status.reward)

            # Team actions
            agent_buffer_trajectory["team_continuous_action"].append(
                teammate_continuous_actions
            )
            agent_buffer_trajectory["team_discrete_action"].append(
                teammate_discrete_actions
            )
            agent_buffer_trajectory["team_rewards"].append(teammate_rewards)

            # Next actions
            teammate_cont_next_actions = []
            teammate_disc_next_actions = []
            if not is_last_step:
                next_exp = self.steps[step + 1]
                for teammate_status in next_exp.teammate_status:
                    teammate_cont_next_actions.append(teammate_status.action.continuous)
                    teammate_disc_next_actions.append(teammate_status.action.discrete)
            agent_buffer_trajectory["team_next_continuous_action"].append(
                teammate_cont_next_actions
            )
            agent_buffer_trajectory["team_next_discrete_action"].append(
                teammate_disc_next_actions
            )
            for i in range(num_obs):
                ith_team_obs = []
                for _teammate_status in exp.teammate_status:
                    # Assume teammates have same obs space
                    ith_team_obs.append(_teammate_status.obs[i])
                agent_buffer_trajectory[TeamObsUtil.get_name_at(i)].append(ith_team_obs)

                ith_team_obs_next = []
                if is_last_step:
                    for _obs in self.next_collab_obs:
                        ith_team_obs_next.append(_obs[i])
                else:
                    next_teammate_status = self.steps[step + 1].teammate_status
                    for _teammate_status in next_teammate_status:
                        # Assume teammates have same obs space
                        ith_team_obs_next.append(_teammate_status.obs[i])
                agent_buffer_trajectory[TeamObsUtil.get_name_at_next(i)].append(
                    ith_team_obs_next
                )
            if exp.memory is not None:
                agent_buffer_trajectory["memory"].append(exp.memory)

            agent_buffer_trajectory["continuous_action"].append(exp.action.continuous)
            agent_buffer_trajectory["discrete_action"].append(exp.action.discrete)

            # Next actions: the action taken at step + 1, zero-filled at the last step
            cont_next_actions = np.zeros_like(exp.action.continuous)
            disc_next_actions = np.zeros_like(exp.action.discrete)
            if not is_last_step:
                next_action = self.steps[step + 1].action
                cont_next_actions = next_action.continuous
                disc_next_actions = next_action.discrete
            agent_buffer_trajectory["next_continuous_action"].append(cont_next_actions)
            agent_buffer_trajectory["next_discrete_action"].append(disc_next_actions)

            agent_buffer_trajectory["continuous_log_probs"].append(
                exp.action_probs.continuous
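
The net effect of the new next_* fields: each step's buffer entry holds the action taken at step + 1, with zeros at the final step of the trajectory. A toy check of that alignment (hypothetical, using plain numpy):

import numpy as np

# Toy per-step continuous actions for a 3-step trajectory.
actions = [np.array([0.1]), np.array([0.2]), np.array([0.3])]

next_actions = []
for step in range(len(actions)):
    is_last_step = step == len(actions) - 1
    # Same pattern as above: shift by one, zero-fill at the last step.
    next_actions.append(
        np.zeros_like(actions[step]) if is_last_step else actions[step + 1]
    )

assert [a.tolist() for a in next_actions] == [[0.2], [0.3], [0.0]]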
