
Actions added but untested

Branch: /develop/centralizedcritic/counterfact
Ervin Teng, 4 years ago
Current commit: 65b866b0
3 files changed, 107 insertions(+) and 30 deletions(-)
  1. ml-agents/mlagents/trainers/agent_processor.py (46 changes)
  2. ml-agents/mlagents/trainers/torch/agent_action.py (52 changes)
  3. ml-agents/mlagents/trainers/trajectory.py (39 changes)

ml-agents/mlagents/trainers/agent_processor.py (46 changes)


    StatsAggregationMethod,
    EnvironmentStats,
)
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.trajectory import TeammateStatus, Trajectory, AgentExperience
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

        )
        # last_group_obs is used to collect the last seen obs of all the agents in the same group,
        # and assemble the collab_obs.
        self.last_group_obs: Dict[str, Dict[str, List[np.ndarray]]] = defaultdict(
            lambda: defaultdict(list)
        )
        # current_group_rewards is used to collect the last seen rewards of all the agents in the same group.
        self.current_group_rewards: Dict[str, Dict[str, float]] = defaultdict(
            lambda: defaultdict(float)
        self.teammate_status: Dict[str, Dict[str, TeammateStatus]] = defaultdict(
            lambda: defaultdict(None)
        )
        # last_take_action_outputs stores the action a_t taken before the current observation s_(t+1), while
        # grabbing previous_action from the policy grabs the action PRIOR to that, a_(t-1).

        self, step: Union[TerminalStep, DecisionStep], global_id: str
    ) -> None:
        stored_decision_step, idx = self.last_step_result.get(global_id, (None, None))
        if stored_decision_step is not None:
        stored_take_action_outputs = self.last_take_action_outputs.get(global_id, None)
        if stored_decision_step is not None and stored_take_action_outputs is not None:
            self.last_group_obs[step.team_manager_id][
                global_id
            ] = stored_decision_step.obs
            stored_actions = stored_take_action_outputs["action"]
            action_tuple = ActionTuple(
                continuous=stored_actions.continuous[idx],
                discrete=stored_actions.discrete[idx],
            )
            teammate_status = TeammateStatus(
                obs=stored_decision_step.obs,
                reward=step.reward,
                action=action_tuple,
            )
            self.teammate_status[step.team_manager_id][global_id] = teammate_status
            self.current_group_rewards[step.team_manager_id][
                global_id
            ] = step.reward
        self._delete_in_nested_dict(self.last_group_obs, global_id)
        self._delete_in_nested_dict(self.current_group_rewards, global_id)
        self._delete_in_nested_dict(self.teammate_status, global_id)

    def _delete_in_nested_dict(self, nested_dict, key):
        for _manager_id, _team_group in nested_dict.items():

            prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
            # Assemble teammate_obs. If none saved, then it will be an empty list.
            collab_obs = []
            for _id, _obs in self.last_group_obs[step.team_manager_id].items():
            teammate_statuses = []
            for _id, _obs in self.teammate_status[step.team_manager_id].items():
                    collab_obs.append(_obs)
            teammate_rewards = []
            for _id, _rew in self.current_group_rewards[step.team_manager_id].items():
                if _id != global_id:
                    teammate_rewards.append(_rew)
                    teammate_statuses.append(_obs)
                collab_obs=collab_obs,
                teammate_status=teammate_statuses,
                team_rewards=teammate_rewards,
                done=done,
                action=action_tuple,
                action_probs=log_probs_tuple,
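
For context, this hunk keeps one TeammateStatus per (team_manager_id, agent id) and, when building an agent's experience, gathers every other agent's latest status. A minimal standalone sketch of that bookkeeping (DummyStatus and the toy ids are illustrative, not part of the commit):

from collections import defaultdict
from typing import Dict, List, NamedTuple

class DummyStatus(NamedTuple):
    # Stand-in for TeammateStatus: the last obs, reward, and action of one teammate.
    obs: List[float]
    reward: float
    action: int

# Nested mapping, as in the diff: team_manager_id -> agent id -> latest status.
teammate_status: Dict[str, Dict[str, DummyStatus]] = defaultdict(dict)

# Each processed step overwrites that agent's entry for its team.
teammate_status["team_0"]["agent_a"] = DummyStatus([0.1], 1.0, 0)
teammate_status["team_0"]["agent_b"] = DummyStatus([0.2], 0.5, 1)

# When assembling agent_a's experience, keep every *other* agent's status.
global_id = "agent_a"
teammate_statuses = [
    status
    for agent_id, status in teammate_status["team_0"].items()
    if agent_id != global_id
]
print(teammate_statuses)  # [DummyStatus(obs=[0.2], reward=0.5, action=1)]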

ml-agents/mlagents/trainers/torch/agent_action.py (52 changes)


from typing import List, Optional, NamedTuple, Dict
import itertools
from mlagents.trainers.buffer import AgentBuffer
class AgentAction(NamedTuple):

        discrete = ModelUtils.to_numpy(self.discrete_tensor[:, 0, :])
        action_tuple.add_discrete(discrete)
        return action_tuple

    @staticmethod
    def _padded_time_to_batch(
        agent_buffer_field: AgentBuffer.AgentBufferField,
        dtype: torch.dtype = torch.float32,
    ) -> List[torch.Tensor]:
        action_shape = None
        for _action in agent_buffer_field:
            if _action:
                action_shape = _action.shape
                break
        # If there were no actions at all
        if action_shape is None:
            return []
        new_list = list(
            map(
                lambda x: ModelUtils.list_to_tensor(x, dtype=dtype),
                itertools.zip_longest(
                    *agent_buffer_field, fillvalue=np.full(action_shape, np.nan)
                ),
            )
        )
        return new_list

    @staticmethod
    def from_dict(buff: Dict[str, np.ndarray]) -> "AgentAction":

            discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
        ]
        return AgentAction(continuous, discrete)

    @staticmethod
    def from_team_dict(buff: Dict[str, np.ndarray]) -> List["AgentAction"]:
        """
        A static method that accesses the teammate continuous and discrete action fields
        in an AgentBuffer and constructs a list of corresponding AgentActions from the
        retrieved np arrays.
        """
        continuous_tensors: List[torch.Tensor] = []
        discrete_tensors: List[torch.Tensor] = []  # type: ignore
        if "team_continuous_action" in buff:
            continuous_tensors = AgentAction._padded_time_to_batch(
                buff["team_continuous_action"]
            )
        if "team_discrete_action" in buff:
            discrete_tensors = AgentAction._padded_time_to_batch(
                buff["team_discrete_action"], dtype=torch.long
            )
        actions_list = []
        for _cont, _disc in itertools.zip_longest(
            continuous_tensors, discrete_tensors, fillvalue=None
        ):
            if _disc is not None:
                _disc = [_disc[..., i] for i in range(_disc.shape[-1])]
            actions_list.append(AgentAction(_cont, _disc))
        return actions_list

    def to_flat(self, discrete_branches: List[int]) -> torch.Tensor:
        discrete_oh = ModelUtils.actions_to_onehot(
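
To illustrate the padding that _padded_time_to_batch performs, here is a toy, self-contained sketch; the data is made up, and only itertools.zip_longest and the NaN fill value come from the diff:

import itertools
import numpy as np

# Per-timestep lists of teammate actions; lengths vary when teammates drop out.
per_step_actions = [
    [np.array([1.0, 2.0]), np.array([3.0, 4.0])],  # t=0: two teammates
    [np.array([5.0, 6.0])],                        # t=1: one teammate
]

# zip_longest transposes the time-major lists into per-teammate sequences,
# padding the missing teammate at t=1 with a NaN array of the same shape.
action_shape = per_step_actions[0][0].shape
transposed = list(
    itertools.zip_longest(*per_step_actions, fillvalue=np.full(action_shape, np.nan))
)
print(np.stack(transposed[1]))  # teammate 1 over time: [3., 4.], then [nan, nan]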

ml-agents/mlagents/trainers/trajectory.py (39 changes)


@attr.s(auto_attribs=True)
class TeammateStatus:
    """
    Stores data related to an agent's teammate.
    """

    obs: List[np.ndarray]
    reward: float
    action: ActionTuple


@attr.s(auto_attribs=True)
    collab_obs: List[List[np.ndarray]]
    teammate_status: List[TeammateStatus]
    team_rewards: List[float]
    done: bool
    action: ActionTuple
    action_probs: LogProbsTuple

                agent_buffer_trajectory[ObsUtil.get_name_at(i)].append(obs[i])
                agent_buffer_trajectory[ObsUtil.get_name_at_next(i)].append(next_obs[i])
            teammate_continuous_actions, teammate_discrete_actions, teammate_rewards = (
                [],
                [],
                [],
            )
            for teammate_status in exp.teammate_status:
                teammate_rewards.append(teammate_status.reward)
                teammate_continuous_actions.append(teammate_status.action.continuous)
                teammate_discrete_actions.append(teammate_status.action.discrete)
                for _team_obs in exp.collab_obs:
                for _teammate_status in exp.teammate_status:
                    ith_team_obs.append(_team_obs[i])
                    ith_team_obs.append(_teammate_status.obs[i])
            agent_buffer_trajectory["team_rewards"].append(exp.team_rewards)
            agent_buffer_trajectory["team_rewards"].append(teammate_rewards)
            if exp.memory is not None:
                agent_buffer_trajectory["memory"].append(exp.memory)

            # Adds the log prob and action of continuous/discrete separately
            agent_buffer_trajectory["continuous_action"].append(exp.action.continuous)
            agent_buffer_trajectory["discrete_action"].append(exp.action.discrete)
            # Team actions
            agent_buffer_trajectory["team_continuous_action"].append(
                teammate_continuous_actions
            )
            agent_buffer_trajectory["team_discrete_action"].append(
                teammate_discrete_actions
            )
            agent_buffer_trajectory["continuous_log_probs"].append(
                exp.action_probs.continuous
            )
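
As a toy sketch of the unpacking this to_agentbuffer hunk performs, plain dicts and a NamedTuple stand in for AgentBuffer and TeammateStatus; all names and values here are illustrative, not part of the commit:

from collections import defaultdict
from typing import Any, Dict, List, NamedTuple

import numpy as np

class Status(NamedTuple):
    # Stand-in for TeammateStatus with a reward and a (continuous, discrete) action pair.
    reward: float
    continuous: np.ndarray
    discrete: np.ndarray

# One experience's teammate statuses.
teammate_status = [
    Status(1.0, np.array([0.1, 0.2]), np.array([0])),
    Status(0.5, np.array([0.3, 0.4]), np.array([1])),
]

buffer: Dict[str, List[Any]] = defaultdict(list)

# Split the per-teammate statuses into parallel lists, one per buffer key,
# mirroring the team_rewards / team_continuous_action / team_discrete_action appends above.
buffer["team_rewards"].append([s.reward for s in teammate_status])
buffer["team_continuous_action"].append([s.continuous for s in teammate_status])
buffer["team_discrete_action"].append([s.discrete for s in teammate_status])

print(buffer["team_rewards"])  # [[1.0, 0.5]]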
