
send and process team manager id

/develop/centralizedcritic/counterfact
Ruo-Ping Dong, 4 years ago
Current commit
fbfdc05b
4 changed files with 61 additions and 10 deletions
  1. com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs (5 changes)
  2. ml-agents-envs/mlagents_envs/base_env.py (18 changes)
  3. ml-agents-envs/mlagents_envs/rpc_utils.py (29 changes)
  4. ml-agents/mlagents/trainers/agent_processor.py (19 changes)

com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs (5 changes)


                agentInfoProto.ActionMask.AddRange(ai.discreteActionMasks);
            }
            if (ai.teamManagerId != null)
            {
                agentInfoProto.TeamManagerId = ai.teamManagerId;
            }
            return agentInfoProto;
        }
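
The C# change copies the agent's teamManagerId into the outgoing AgentInfoProto only when it is set, so the id reaches the trainer process over gRPC. A minimal sketch of reading that field back from a parsed AgentInfoProto on the Python side, assuming the proto exposes it as team_manager_id (as the rpc_utils.py changes below do); proto3 string fields default to the empty string:

    from typing import Optional

    # Sketch only (not part of the diff): proto3 string fields default to "",
    # so treat an empty team_manager_id as "no team manager" when reading the proto.
    def team_manager_or_none(agent_info_proto) -> Optional[str]:
        return agent_info_proto.team_manager_id or None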

ml-agents-envs/mlagents_envs/base_env.py (18 changes)


    reward: float
    agent_id: AgentId
    action_mask: Optional[List[np.ndarray]]
    team_manager_id: Optional[str]

class DecisionSteps(Mapping):

    this simulation step.
    """

    def __init__(self, obs, reward, agent_id, action_mask):
    def __init__(self, obs, reward, agent_id, action_mask, team_manager_id=None):
        self.team_manager_id: Optional[List[str]] = team_manager_id
        self.action_mask: Optional[List[np.ndarray]] = action_mask
        self._agent_id_to_index: Optional[Dict[AgentId, int]] = None

            agent_mask = []
            for mask in self.action_mask:
                agent_mask.append(mask[agent_index])
        team_manager_id = None
        if self.team_manager_id is not None and self.team_manager_id != "":
            team_manager_id = self.team_manager_id[agent_index]
            team_manager_id=team_manager_id,
        )
    def __iter__(self) -> Iterator[Any]:

            reward=np.zeros(0, dtype=np.float32),
            agent_id=np.zeros(0, dtype=np.int32),
            action_mask=None,
            team_manager_id=None,
        )

    reward: float
    interrupted: bool
    agent_id: AgentId
    team_manager_id: Optional[str]

class TerminalSteps(Mapping):

    across simulation steps.
    """

    def __init__(self, obs, reward, interrupted, agent_id):
    def __init__(self, obs, reward, interrupted, agent_id, team_manager_id=None):
        self.team_manager_id: Optional[List[str]] = team_manager_id

    @property
    def agent_id_to_index(self) -> Dict[AgentId, int]:

        agent_obs = []
        for batched_obs in self.obs:
            agent_obs.append(batched_obs[agent_index])
        team_manager_id = None
        if self.team_manager_id is not None and self.team_manager_id != "":
            team_manager_id = self.team_manager_id[agent_index]
            team_manager_id=team_manager_id,
        )
    def __iter__(self) -> Iterator[Any]:

            reward=np.zeros(0, dtype=np.float32),
            interrupted=np.zeros(0, dtype=np.bool),
            agent_id=np.zeros(0, dtype=np.int32),
            team_manager_id=None,
        )
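
Taken together, the base_env.py changes thread an optional per-agent team_manager_id list through DecisionSteps and TerminalSteps and surface it on the per-agent DecisionStep/TerminalStep tuples. An illustrative sketch of the new constructor argument, assuming this branch's base_env.py (the ids, shapes, and values are made up for the example):

    import numpy as np
    from mlagents_envs.base_env import DecisionSteps

    # Illustrative only: a two-agent batch carrying per-agent team manager ids.
    steps = DecisionSteps(
        obs=[np.zeros((2, 4), dtype=np.float32)],   # one obs tensor, batch size 2
        reward=np.array([0.0, 1.0], dtype=np.float32),
        agent_id=np.array([10, 11], dtype=np.int32),
        action_mask=None,
        team_manager_id=["team_a", "team_b"],       # new optional field
    )
    print(steps[11].team_manager_id)                # per-agent lookup -> "team_b"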

ml-agents-envs/mlagents_envs/rpc_utils.py (29 changes)


    decision_rewards = np.array(
        [agent_info.reward for agent_info in decision_agent_info_list], dtype=np.float32
    )
    decision_team_manager = [
        agent_info.team_manager_id
        for agent_info in decision_agent_info_list
        if agent_info.team_manager_id is not None
    ]
    if len(decision_team_manager) == 0:
        decision_team_manager = None
    terminal_team_manager = [
        agent_info.team_manager_id
        for agent_info in terminal_agent_info_list
        if agent_info.team_manager_id is not None
    ]
    if len(terminal_team_manager) == 0:
        terminal_team_manager = None
    _raise_on_nan_and_inf(decision_rewards, "rewards")
    _raise_on_nan_and_inf(terminal_rewards, "rewards")

        action_mask = np.split(action_mask, indices, axis=1)
    return (
        DecisionSteps(
            decision_obs_list, decision_rewards, decision_agent_id, action_mask
            decision_obs_list,
            decision_rewards,
            decision_agent_id,
            action_mask,
            decision_team_manager,
        ),
        TerminalSteps(terminal_obs_list, terminal_rewards, max_step, terminal_agent_id),
        TerminalSteps(
            terminal_obs_list,
            terminal_rewards,
            max_step,
            terminal_agent_id,
            terminal_team_manager,
        ),
    )
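
The new list comprehensions fall back to None when no agent in the batch reported a team manager id, so DecisionSteps and TerminalSteps keep receiving an optional value. The same pattern, condensed into a hypothetical helper for clarity (the diff inlines it separately for the decision and terminal lists):

    # Hypothetical helper, not in the diff: collect team manager ids or fall back to None.
    def collect_team_managers(agent_info_list):
        ids = [
            agent_info.team_manager_id
            for agent_info in agent_info_list
            if agent_info.team_manager_id is not None
        ]
        return ids if ids else None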

ml-agents/mlagents/trainers/agent_processor.py (19 changes)


        self.experience_buffers: Dict[str, List[AgentExperience]] = defaultdict(list)
        self.last_experience: Dict[str, AgentExperience] = {}
        self.last_step_result: Dict[str, Tuple[DecisionStep, int]] = {}
        # current_obs is used to collect the last seen obs of all the agents, and assemble the next_collab_obs.
        self.current_obs: Dict[str, List[np.ndarray]] = {}
        # current_group_obs is used to collect the last seen obs of all the agents, and assemble the next_collab_obs.
        self.current_group_obs: Dict[str, Dict[str, List[np.ndarray]]] = defaultdict(
            lambda: defaultdict(list)
        )
        # last_take_action_outputs stores the action a_t taken before the current observation s_(t+1), while
        # grabbing previous_action from the policy grabs the action PRIOR to that, a_(t-1).
        self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = {}
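
current_group_obs is keyed first by team manager id and then by global agent id; defaultdict(lambda: defaultdict(list)) creates both levels lazily on first access. A standalone sketch of that access pattern (ids are placeholders):

    from collections import defaultdict

    # Sketch of the two-level lazy dict: team_manager_id -> global_agent_id -> obs list.
    current_group_obs = defaultdict(lambda: defaultdict(list))

    # Accessing an unseen team/agent pair creates both inner entries on the fly;
    # += extends the inner list, mirroring how step.obs is accumulated below.
    current_group_obs["team_0"]["agent_7"] += ["obs_t0"]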

        self.last_take_action_outputs[global_id] = take_action_outputs
        # Iterate over all the terminal steps
        # print("processing terminal_step")
        for terminal_step in terminal_steps.values():
            local_id = terminal_step.agent_id
            global_id = get_global_agent_id(worker_id, local_id)

            local_id = terminal_step.agent_id
            global_id = get_global_agent_id(worker_id, local_id)
            self._assemble_trajectory(terminal_step, global_id)
        self.current_obs.clear()
        self.current_group_obs.clear()
        # print("clear terminal_step")
        # Clean the last experience dictionary for terminal steps
        for terminal_step in terminal_steps.values():

        # Iterate over all the decision steps
        # print("processing decision_steps")
        for ongoing_step in decision_steps.values():
            local_id = ongoing_step.agent_id
            global_id = get_global_agent_id(worker_id, local_id)

            local_id = ongoing_step.agent_id
            global_id = get_global_agent_id(worker_id, local_id)
            self._assemble_trajectory(ongoing_step, global_id)
        self.current_obs.clear()
        self.current_group_obs.clear()
        # print("clear decision_steps")
        for _gid in action_global_agent_ids:
            # If the ID doesn't have a last step result, the agent just reset,

            interrupted=interrupted,
            memory=memory,
        )
        self.current_obs[global_id] = step.obs
        if step.team_manager_id is not None:
            self.current_group_obs[step.team_manager_id][global_id] += step.obs
        self.last_experience[global_id] = experience

    def _assemble_trajectory(

    ):
        next_obs = step.obs
        next_collab_obs = []
        for _id, _exp in self.current_obs.items():
        for _id, _exp in self.current_group_obs[step.team_manager_id].items():
            if _id == global_id:
                continue
            else:
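
_assemble_trajectory now gathers teammates' latest observations from current_group_obs[step.team_manager_id] instead of the flat current_obs dict, skipping the agent's own entry. A stripped-down, standalone sketch of that gather loop with placeholder data:

    from collections import defaultdict

    # Placeholder data standing in for self.current_group_obs.
    current_group_obs = defaultdict(lambda: defaultdict(list))
    current_group_obs["manager_1"]["agent_a"] = ["obs_a"]
    current_group_obs["manager_1"]["agent_b"] = ["obs_b"]

    def gather_collab_obs(team_manager_id, global_id):
        # Collect the latest obs of every teammate except the agent itself.
        next_collab_obs = []
        for _id, _obs in current_group_obs[team_manager_id].items():
            if _id == global_id:
                continue
            next_collab_obs.append(_obs)
        return next_collab_obs

    print(gather_collab_obs("manager_1", "agent_a"))    # -> [['obs_b']]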
