
Get next critic observations into value estimate

/comms-grad
Ervin Teng, 4 years ago
Current commit
56dcd75a
6 changed files with 55 additions and 13 deletions
  1. ml-agents/mlagents/trainers/agent_processor.py (27 changed lines)
  2. ml-agents/mlagents/trainers/buffer.py (17 changed lines)
  3. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (17 changed lines)
  4. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (5 changed lines)
  5. ml-agents/mlagents/trainers/ppo/trainer.py (1 changed line)
  6. ml-agents/mlagents/trainers/trajectory.py (1 changed line)

ml-agents/mlagents/trainers/agent_processor.py (27 changed lines)


import sys
import numpy as np
from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union
from collections import defaultdict, Counter
import queue

self.experience_buffers: Dict[str, List[AgentExperience]] = defaultdict(list)
self.last_experience: Dict[str, AgentExperience] = {}
self.last_step_result: Dict[str, Tuple[DecisionStep, int]] = {}
# current_obs collects the last seen obs of all agents and is used to assemble next_collab_obs.
self.current_obs: Dict[str, List[np.ndarray]] = {}
# last_take_action_outputs stores the action a_t taken before the current observation s_(t+1), while
# previous_action from the policy is the action PRIOR to that, a_(t-1).
self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = {}

for terminal_step in terminal_steps.values():
local_id = terminal_step.agent_id
global_id = get_global_agent_id(worker_id, local_id)
self._assemble_trajectory(
terminal_step, global_id, terminal_steps.agent_id_to_index[local_id]
)
self._assemble_trajectory(terminal_step, global_id)
self.current_obs.clear()
# Clean the last experience dictionary for terminal steps
for terminal_step in terminal_steps.values():
local_id = terminal_step.agent_id

for ongoing_step in decision_steps.values():
local_id = ongoing_step.agent_id
global_id = get_global_agent_id(worker_id, local_id)
self._assemble_trajectory(
ongoing_step, global_id, decision_steps.agent_id_to_index[local_id]
)
self._assemble_trajectory(ongoing_step, global_id)
self.current_obs.clear()
for _gid in action_global_agent_ids:
# If the ID doesn't have a last step result, the agent just reset,

interrupted=interrupted,
memory=memory,
)
self.current_obs[global_id] = step.obs
self, step: Union[TerminalStep, DecisionStep], global_id: str, index: int
self, step: Union[TerminalStep, DecisionStep], global_id: str
# Add remaining obs to AgentExperience
# Add remaining shared obs to AgentExperience
for _id, _exp in self.last_experience.items():
if _id == global_id:
continue

or terminated
):
next_obs = step.obs
next_collab_obs = []
for _id, _exp in self.current_obs.items():
if _id == global_id:
continue
else:
next_collab_obs.append(_exp)
next_collab_obs=next_collab_obs,
behavior_id=self.behavior_id,
)
for traj_queue in self.trajectory_queues:
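For context on the assembly step above: the processor keeps each agent's last seen obs in current_obs and, when building a trajectory for one agent, gathers every other agent's obs into next_collab_obs. Below is a minimal sketch of that gathering as a standalone function; assemble_next_collab_obs is a hypothetical helper used only for illustration and is not part of this diff.

from typing import Dict, List

import numpy as np


def assemble_next_collab_obs(
    current_obs: Dict[str, List[np.ndarray]], global_id: str
) -> List[List[np.ndarray]]:
    """Collect the last seen obs of every agent except global_id.

    Mirrors the loop in _assemble_trajectory: the agent's own obs become
    next_obs, while the other agents' obs become next_collab_obs.
    """
    next_collab_obs = []
    for _id, _obs in current_obs.items():
        if _id == global_id:
            continue
        next_collab_obs.append(_obs)
    return next_collab_obs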

ml-agents/mlagents/trainers/buffer.py (17 changed lines)


# Transpose the List of Lists and stack each obs over time into a single batched array
new_list = list(map(lambda x: np.asanyarray(list(x)), zip(*obs_list)))
return new_list
@staticmethod
def obs_list_list_to_obs_batch(
obs_list_list: List[List[List[np.ndarray]]]
) -> List[List[np.ndarray]]:
"""
Convert a List of Lists of obs, where one dimension is time and the other is the number of agents
(e.g. a variable number of critic observations), to a List of obs where time is folded into the batch
dimension of each obs and the outer List runs over the variable number of agents.
"""
new_list = list(
map(
lambda x: AgentBuffer.obs_list_to_obs_batch(list(x)),
zip(*obs_list_list),
)
)
return new_list
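To make the transpose concrete, here is a small standalone shape check. It re-implements the two helpers locally rather than importing AgentBuffer, and the 3-timestep / 2-teammate / (4,)-dim shapes are illustrative assumptions.

from typing import List

import numpy as np


def obs_list_to_obs_batch(obs_list: List[List[np.ndarray]]) -> List[np.ndarray]:
    # Transpose (time, obs_spec) -> (obs_spec, time) and stack each obs over time.
    return list(map(lambda x: np.asanyarray(list(x)), zip(*obs_list)))


def obs_list_list_to_obs_batch(
    obs_list_list: List[List[List[np.ndarray]]],
) -> List[List[np.ndarray]]:
    # The outer zip swaps the (time, agent) dimensions; each agent's obs are then batched.
    return list(map(lambda x: obs_list_to_obs_batch(list(x)), zip(*obs_list_list)))


# 3 timesteps, 2 teammate agents, each with one (4,)-shaped vector obs.
critic_obs = [[[np.zeros(4)], [np.zeros(4)]] for _ in range(3)]
batched = obs_list_list_to_obs_batch(critic_obs)
print(len(batched), batched[0][0].shape)  # 2 agents, each obs batched to (3, 4)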

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (17 changed lines)


)
def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
self,
batch: AgentBuffer,
next_obs: List[np.ndarray],
next_critic_obs: List[List[np.ndarray]],
done: bool,
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(batch["obs"])
)

# This line doesn't work
critic_obs = [
ModelUtils.list_to_tensor_list(AgentBuffer.obs_list_to_obs_batch(agent_obs))
for agent_obs in batch["critic_obs"]
]
critic_obs_np = AgentBuffer.obs_list_list_to_obs_batch(batch["critic_obs"])
critic_obs = [
ModelUtils.list_to_tensor_list(_agent_obs) for _agent_obs in critic_obs_np
]
next_critic_obs = [
ModelUtils.list_to_tensor_list(_obs) for _obs in next_critic_obs
]
memory = torch.zeros([1, 1, self.policy.m_size])

next_value_estimate, _ = self.policy.actor_critic.critic_pass(
next_obs, next_memory, sequence_length=1
next_obs, next_memory, sequence_length=1, critic_obs=next_critic_obs
)
for name, estimate in value_estimates.items():
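As a rough shape check for the new next_critic_obs argument, the sketch below builds the expected List-of-List structure and converts it to tensors. torch.as_tensor stands in for ModelUtils.list_to_tensor_list, and the single teammate with an (8,)-dim vector obs is an assumption.

import numpy as np
import torch

# One teammate, whose final observation is a single (8,)-dim vector.
next_critic_obs = [[np.zeros(8, dtype=np.float32)]]

# Mirrors the conversion in get_trajectory_value_estimates: each teammate's
# obs list becomes a list of tensors before being handed to critic_pass.
next_critic_obs_t = [
    [torch.as_tensor(_o) for _o in _obs] for _obs in next_critic_obs
]
print(len(next_critic_obs_t), next_critic_obs_t[0][0].shape)  # 1 torch.Size([8])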

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (5 changed lines)


obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(batch["obs"])
)
critic_obs = [
ModelUtils.list_to_tensor_list(AgentBuffer.obs_list_to_obs_batch(agent_obs))
for agent_obs in batch["critic_obs"]
]
critic_obs_np = AgentBuffer.obs_list_list_to_obs_batch(batch["critic_obs"])
critic_obs = [
ModelUtils.list_to_tensor_list(_agent_obs) for _agent_obs in critic_obs_np
]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:

ml-agents/mlagents/trainers/ppo/trainer.py (1 changed line)


value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
agent_buffer_trajectory,
trajectory.next_obs,
trajectory.next_collab_obs,
trajectory.done_reached and not trajectory.interrupted,
)

ml-agents/mlagents/trainers/trajectory.py (1 changed line)


next_obs: List[np.ndarray]  # Observation following the trajectory, for bootstrapping
next_collab_obs: List[List[np.ndarray]]
agent_id: str
behavior_id: str
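Tying the pieces together, here is a minimal sketch of how the new field travels from the trajectory to the bootstrap decision. TrajectorySketch is a stand-in NamedTuple containing only the fields used here, not the real Trajectory class, and the shapes are assumptions.

from typing import List, NamedTuple

import numpy as np


class TrajectorySketch(NamedTuple):
    next_obs: List[np.ndarray]  # the agent's own obs after the trajectory
    next_collab_obs: List[List[np.ndarray]]  # other agents' obs at that same step
    done_reached: bool
    interrupted: bool


traj = TrajectorySketch(
    next_obs=[np.zeros(8)],
    next_collab_obs=[[np.zeros(8)]],  # one teammate
    done_reached=True,
    interrupted=False,
)

# Mirrors the trainer call site: treat the trajectory as done only when the
# episode truly terminated rather than being interrupted (e.g. max steps).
done = traj.done_reached and not traj.interrupted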
