
Policy output actiontuple (#4651)

/develop/actionmodel-csharp
GitHub · 4 years ago
Current commit: cc948a41
14 files changed, with 156 additions and 105 deletions
  1. ml-agents-envs/mlagents_envs/base_env.py (74 changed lines)
  2. ml-agents-envs/mlagents_envs/environment.py (4 changed lines)
  3. ml-agents-envs/mlagents_envs/tests/test_envs.py (4 changed lines)
  4. ml-agents/mlagents/trainers/agent_processor.py (26 changed lines)
  5. ml-agents/mlagents/trainers/env_manager.py (12 changed lines)
  6. ml-agents/mlagents/trainers/policy/policy.py (13 changed lines)
  7. ml-agents/mlagents/trainers/policy/tf_policy.py (16 changed lines)
  8. ml-agents/mlagents/trainers/policy/torch_policy.py (8 changed lines)
  9. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (6 changed lines)
  10. ml-agents/mlagents/trainers/simple_env_manager.py (3 changed lines)
  11. ml-agents/mlagents/trainers/subprocess_env_manager.py (5 changed lines)
  12. ml-agents/mlagents/trainers/torch/action_log_probs.py (44 changed lines)
  13. ml-agents/mlagents/trainers/torch/agent_action.py (21 changed lines)
  14. ml-agents/mlagents/trainers/trajectory.py (25 changed lines)

ml-agents-envs/mlagents_envs/base_env.py (74 changed lines)


)
class ActionTuple:
class _ActionTupleBase(ABC):
An object whose fields correspond to actions of different types.
Continuous and discrete actions are numpy arrays of type float32 and
int32, respectively and are type checked on construction.
Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively.
An object whose fields correspond to action data of continuous and discrete
spaces. Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
"""
def __init__(

):
if continuous is not None and continuous.dtype != np.float32:
continuous = continuous.astype(np.float32, copy=False)
self._continuous = continuous
if discrete is not None and discrete.dtype != np.int32:
discrete = discrete.astype(np.int32, copy=False)
self._discrete = discrete
self._continuous: Optional[np.ndarray] = None
self._discrete: Optional[np.ndarray] = None
if continuous is not None:
self.add_continuous(continuous)
if discrete is not None:
self.add_discrete(discrete)
@property
def continuous(self) -> np.ndarray:

def discrete(self) -> np.ndarray:
return self._discrete
def add_continuous(self, continuous: np.ndarray) -> None:
if continuous.dtype != np.float32:
continuous = continuous.astype(np.float32, copy=False)
if self._discrete is None:
_discrete_dtype = self.get_discrete_dtype()
self._discrete = np.zeros((continuous.shape[0], 0), dtype=_discrete_dtype)
self._continuous = continuous
def add_discrete(self, discrete: np.ndarray) -> None:
_discrete_dtype = self.get_discrete_dtype()
if discrete.dtype != _discrete_dtype:
discrete = discrete.astype(np.int32, copy=False)
if self._continuous is None:
self._continuous = np.zeros((discrete.shape[0], 0), dtype=np.float32)
self._discrete = discrete
@abstractmethod
def get_discrete_dtype(self) -> np.dtype:
pass
class ActionTuple(_ActionTupleBase):
"""
An object whose fields correspond to actions of different types.
Continuous and discrete actions are numpy arrays of type float32 and
int32, respectively and are type checked on construction.
Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
"""
def get_discrete_dtype(self) -> np.dtype:
"""
The dtype of a discrete action.
"""
return np.int32
class ActionSpec(NamedTuple):
"""

for a number of agents.
:param n_agents: The number of agents that will have actions generated
"""
continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
return ActionTuple(continuous, discrete)
_continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
_discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
return ActionTuple(continuous=_continuous, discrete=_discrete)
def random_action(self, n_agents: int) -> ActionTuple:
"""

"""
continuous = np.random.uniform(
_continuous = np.random.uniform(
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
_discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
discrete = np.column_stack(
_discrete = np.column_stack(
[
np.random.randint(
0,

for i in range(self.discrete_size)
]
)
return ActionTuple(continuous, discrete)
return ActionTuple(continuous=_continuous, discrete=_discrete)
def _validate_action(
self, actions: ActionTuple, n_agents: int, name: str
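As a quick illustration of the refactored API above, here is a minimal sketch (array contents made up) of how the _ActionTupleBase/ActionTuple split behaves: inputs are coerced to float32/int32, and adding only one action type leaves the other field as an (n_agents, 0) placeholder, so callers can index both fields unconditionally.

```python
import numpy as np
from mlagents_envs.base_env import ActionTuple

# Incremental construction via the new add_continuous / add_discrete methods.
act = ActionTuple()
act.add_continuous(np.random.randn(3, 2))      # float64 input, coerced to float32
act.add_discrete(np.array([[0], [1], [2]]))    # integer input, coerced to int32
print(act.continuous.dtype, act.continuous.shape)  # float32 (3, 2)
print(act.discrete.dtype, act.discrete.shape)      # int32 (3, 1)

# A continuous-only tuple still exposes a zero-width discrete array.
cont_only = ActionTuple(continuous=np.ones((3, 2), dtype=np.float32))
print(cont_only.discrete.shape)                    # (3, 0)
```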

ml-agents-envs/mlagents_envs/environment.py (4 changed lines)


if n_agents == 0:
continue
for i in range(n_agents):
# TODO: extend to AgentBuffers
if vector_action[b].continuous is not None:
# TODO: This check will be removed when the proto supports hybrid actions
if vector_action[b].continuous.shape[1] > 0:
_act = vector_action[b].continuous[i]
else:
_act = vector_action[b].discrete[i]
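A small sketch (batch contents made up) of the per-agent branch above: until the proto supports hybrid actions, only one side of the ActionTuple is forwarded for each agent.

```python
import numpy as np
from mlagents_envs.base_env import ActionTuple

# Discrete-only behavior with two agents; the continuous side is zero-width.
batch = ActionTuple(discrete=np.array([[1], [0]], dtype=np.int32))

for i in range(batch.discrete.shape[0]):
    if batch.continuous.shape[1] > 0:
        _act = batch.continuous[i]
    else:
        _act = batch.discrete[i]
    print(_act)  # [1] then [0]
```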

ml-agents-envs/mlagents_envs/tests/test_envs.py (4 changed lines)


decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
n_agents = len(decision_steps)
_empty_act = spec.action_spec.empty_action(n_agents)
next_action = ActionTuple(_empty_act.continuous - 1, _empty_act.discrete - 1)
next_action = ActionTuple()
next_action.add_continuous(_empty_act.continuous - 1)
next_action.add_discrete(_empty_act.discrete - 1)
env.set_actions("RealFakeBrain", next_action)
env.step()

ml-agents/mlagents/trainers/agent_processor.py (26 changed lines)


from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union
from collections import defaultdict, Counter
import queue
import numpy as np
ActionTuple,
DecisionSteps,
DecisionStep,
TerminalSteps,

from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.behavior_id_utils import get_global_agent_id

done = terminated # Since this is an ongoing step
interrupted = step.interrupted if terminated else False
# Add the outputs of the last eval
action_dict = stored_take_action_outputs["action"]
action: Dict[str, np.ndarray] = {}
for act_type, act_array in action_dict.items():
action[act_type] = act_array[idx]
stored_actions = stored_take_action_outputs["action"]
action_tuple = ActionTuple(
continuous=stored_actions.continuous[idx],
discrete=stored_actions.discrete[idx],
)
action_probs_dict = stored_take_action_outputs["log_probs"]
action_probs: Dict[str, np.ndarray] = {}
for prob_type, prob_array in action_probs_dict.items():
action_probs[prob_type] = prob_array[idx]
stored_action_probs = stored_take_action_outputs["log_probs"]
log_probs_tuple = LogProbsTuple(
continuous=stored_action_probs.continuous[idx],
discrete=stored_action_probs.discrete[idx],
)
action_mask = stored_decision_step.action_mask
prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
experience = AgentExperience(

action=action,
action_probs=action_probs,
action=action_tuple,
action_probs=log_probs_tuple,
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
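To make the slicing above concrete, a sketch with made-up batch contents: the stored take-action outputs cover the whole batch, and one agent's row is re-wrapped into fresh ActionTuple/LogProbsTuple objects for its AgentExperience.

```python
import numpy as np
from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

# Made-up stored outputs for a batch of 4 agents (2 continuous dims, 1 discrete branch).
stored_actions = ActionTuple(
    continuous=np.zeros((4, 2), dtype=np.float32),
    discrete=np.zeros((4, 1), dtype=np.int32),
)
stored_action_probs = LogProbsTuple(
    continuous=np.zeros((4, 2), dtype=np.float32),
    discrete=np.zeros((4, 1), dtype=np.float32),
)

idx = 2  # position of one agent within the batched output
action_tuple = ActionTuple(
    continuous=stored_actions.continuous[idx],
    discrete=stored_actions.discrete[idx],
)
log_probs_tuple = LogProbsTuple(
    continuous=stored_action_probs.continuous[idx],
    discrete=stored_action_probs.discrete[idx],
)
print(action_tuple.continuous.shape, log_probs_tuple.discrete.shape)  # (2,) (1,)
```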

ml-agents/mlagents/trainers/env_manager.py (12 changed lines)


from abc import ABC, abstractmethod
import numpy as np
from typing import List, Dict, NamedTuple, Iterable, Tuple
from mlagents_envs.base_env import (

BehaviorName,
ActionTuple,
)
from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats

step_info.environment_stats, step_info.worker_id
)
return len(step_infos)
@staticmethod
def action_tuple_from_numpy_dict(action_dict: Dict[str, np.ndarray]) -> ActionTuple:
continuous: np.ndarray = None
discrete: np.ndarray = None
if "continuous_action" in action_dict:
continuous = action_dict["continuous_action"]
if "discrete_action" in action_dict:
discrete = action_dict["discrete_action"]
return ActionTuple(continuous, discrete)

ml-agents/mlagents/trainers/policy/policy.py (13 changed lines)


from typing import Dict, List, Optional
import numpy as np
from mlagents_envs.base_env import DecisionSteps
from mlagents_envs.base_env import ActionTuple, BehaviorSpec, DecisionSteps
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.settings import TrainerSettings, NetworkSettings

)
def save_previous_action(
self, agent_ids: List[str], action_dict: Dict[str, np.ndarray]
self, agent_ids: List[str], action_tuple: ActionTuple
if action_dict is None or "discrete_action" not in action_dict:
return
# if action_dict is None or "discrete_action" not in action_dict:
# return
self.previous_action_dict[agent_id] = action_dict["discrete_action"][
index, :
]
self.previous_action_dict[agent_id] = action_tuple.discrete[index, :]
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = self.make_empty_previous_action(len(agent_ids))
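A standalone sketch of the new save_previous_action contract (agent ids and sizes are made up): the cache is keyed off action_tuple.discrete directly, which also works for continuous-only specs because the discrete field is then an (n_agents, 0) array.

```python
import numpy as np
from mlagents_envs.base_env import ActionTuple

previous_action_dict = {}  # agent_id -> last discrete action, as kept on the Policy

def save_previous_action(agent_ids, action_tuple: ActionTuple) -> None:
    for index, agent_id in enumerate(agent_ids):
        previous_action_dict[agent_id] = action_tuple.discrete[index, :]

acts = ActionTuple(discrete=np.array([[2, 0], [1, 1]], dtype=np.int32))
save_previous_action(["agent-0", "agent-1"], acts)
print(previous_action_dict["agent-1"])  # [1 1]
```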

ml-agents/mlagents/trainers/policy/tf_policy.py (16 changed lines)


from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents_envs.base_env import DecisionSteps
from mlagents_envs.base_env import DecisionSteps, ActionTuple
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, EncoderType
from mlagents.trainers import __version__

self.save_memories(global_agent_ids, run_out.get("memory_out"))
# For Compatibility with buffer changes for hybrid action support
if "log_probs" in run_out:
run_out["log_probs"] = {"action_probs": run_out["log_probs"]}
log_probs_tuple = LogProbsTuple()
if self.behavior_spec.action_spec.is_continuous():
log_probs_tuple.add_continuous(run_out["log_probs"])
else:
log_probs_tuple.add_discrete(run_out["log_probs"])
run_out["log_probs"] = log_probs_tuple
action_tuple = ActionTuple()
run_out["action"] = {"continuous_action": run_out["action"]}
action_tuple.add_continuous(run_out["action"])
run_out["action"] = {"discrete_action": run_out["action"]}
action_tuple.add_discrete(run_out["action"])
run_out["action"] = action_tuple
return ActionInfo(
action=run_out.get("action"),
value=run_out.get("value"),
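A hedged sketch of the wrapping above, with fake network outputs: the TF policy still produces a single flat array per step, so it is packed into one side of an ActionTuple/LogProbsTuple depending on whether the action spec is continuous.

```python
import numpy as np
from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

def wrap_tf_outputs(raw_action, raw_log_probs, is_continuous):
    """Mirrors the conversion sketched above; inputs are plain numpy arrays."""
    action_tuple = ActionTuple()
    log_probs_tuple = LogProbsTuple()
    if is_continuous:
        action_tuple.add_continuous(raw_action)
        log_probs_tuple.add_continuous(raw_log_probs)
    else:
        action_tuple.add_discrete(raw_action)
        log_probs_tuple.add_discrete(raw_log_probs)
    return {"action": action_tuple, "log_probs": log_probs_tuple}

run_out = wrap_tf_outputs(
    raw_action=np.zeros((5, 3), dtype=np.float32),
    raw_log_probs=np.zeros((5, 3), dtype=np.float32),
    is_continuous=True,
)
print(run_out["action"].discrete.shape)  # (5, 0): the discrete side stays empty
```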

ml-agents/mlagents/trainers/policy/torch_policy.py (8 changed lines)


action, log_probs, entropy, memories = self.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
)
action_dict = action.to_numpy_dict()
run_out["action"] = action_dict
action_tuple = action.to_action_tuple()
run_out["action"] = action_tuple
action_dict["continuous_action"] if self.use_continuous_act else None
action_tuple.continuous if self.use_continuous_act else None
run_out["log_probs"] = log_probs.to_numpy_dict()
run_out["log_probs"] = log_probs.to_log_probs_tuple()
run_out["entropy"] = ModelUtils.to_numpy(entropy)
run_out["learning_rate"] = 0.0
if self.use_recurrent:
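A sketch of the torch-side run_out after this change, with made-up contents; the key holding the pre-clip action is not visible in this fragment, so the pre_action name below is only an assumption.

```python
import numpy as np
from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

use_continuous_act = True
action_tuple = ActionTuple(continuous=np.zeros((2, 3), dtype=np.float32))
log_probs_tuple = LogProbsTuple(continuous=np.zeros((2, 3), dtype=np.float32))

run_out = {
    "action": action_tuple,        # was a dict keyed by "continuous_action"/"discrete_action"
    "log_probs": log_probs_tuple,  # was a dict of per-type log prob arrays
    "learning_rate": 0.0,
    # Assumed key name; the diff only shows the value expression:
    "pre_action": action_tuple.continuous if use_continuous_act else None,
}
```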

ml-agents/mlagents/trainers/ppo/optimizer_tf.py (6 changed lines)


self.policy.sequence_length_ph: self.policy.sequence_length,
self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
self.advantage: mini_batch["advantages"],
self.all_old_log_probs: mini_batch["action_probs"],
if self.policy.use_continuous_act: # For hybrid action buffer support
feed_dict[self.all_old_log_probs] = mini_batch["continuous_log_probs"]
else:
feed_dict[self.all_old_log_probs] = mini_batch["discrete_log_probs"]
if self.policy.output_pre is not None and "actions_pre" in mini_batch:
feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]

ml-agents/mlagents/trainers/simple_env_manager.py (3 changed lines)


self.previous_all_action_info = all_action_info
for brain_name, action_info in all_action_info.items():
_action = EnvManager.action_tuple_from_numpy_dict(action_info.action)
self.env.set_actions(brain_name, _action)
self.env.set_actions(brain_name, action_info.action)
self.env.step()
all_step_result = self._generate_all_results()

ml-agents/mlagents/trainers/subprocess_env_manager.py (5 changed lines)


all_action_info = req.payload
for brain_name, action_info in all_action_info.items():
if len(action_info.action) != 0:
_action = EnvManager.action_tuple_from_numpy_dict(
action_info.action
)
env.set_actions(brain_name, _action)
env.set_actions(brain_name, action_info.action)
env.step()
all_step_result = _generate_all_results()
# The timers in this process are independent from all the processes and the "main" process

ml-agents/mlagents/trainers/torch/action_log_probs.py (44 changed lines)


import numpy as np
from mlagents.trainers.torch.utils import ModelUtils
from mlagents_envs.base_env import _ActionTupleBase
class LogProbsTuple(_ActionTupleBase):
"""
An object whose fields correspond to the log probs of actions of different types.
Continuous and discrete are numpy arrays
Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
"""
def get_discrete_dtype(self) -> np.dtype:
"""
The dtype of a discrete log probability.
"""
return np.float32
class ActionLogProbs(NamedTuple):

"""
return torch.cat(self.all_discrete_list, dim=1)
def to_numpy_dict(self) -> Dict[str, np.ndarray]:
def to_log_probs_tuple(self) -> LogProbsTuple:
Returns a Dict of np arrays with an entry corresponding to the continuous log probs
and an entry corresponding to the discrete log probs. "continuous_log_probs" and
"discrete_log_probs" are added to the agents buffer individually to maintain a flat buffer.
Returns a LogProbsTuple. Only adds if tensor is not None. Otherwise,
LogProbsTuple uses a default.
array_dict: Dict[str, np.ndarray] = {}
log_probs_tuple = LogProbsTuple()
array_dict["continuous_log_probs"] = ModelUtils.to_numpy(
self.continuous_tensor
)
continuous = ModelUtils.to_numpy(self.continuous_tensor)
log_probs_tuple.add_continuous(continuous)
array_dict["discrete_log_probs"] = ModelUtils.to_numpy(self.discrete_tensor)
return array_dict
discrete = ModelUtils.to_numpy(self.discrete_tensor)
log_probs_tuple.add_discrete(discrete)
return log_probs_tuple
def _to_tensor_list(self) -> List[torch.Tensor]:
"""

continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"])
if "discrete_log_probs" in buff:
discrete_tensor = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
# This will keep discrete_list = None which enables flatten()
if discrete_tensor.shape[1] > 0:
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
return ActionLogProbs(continuous, discrete, None)
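A standalone sketch of the to_log_probs_tuple conversion (tensors made up, without calling the real ActionLogProbs class): torch tensors are moved to numpy and packed into a LogProbsTuple, whose discrete dtype is float32 rather than ActionTuple's int32.

```python
import torch
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

continuous_tensor = torch.zeros(4, 2)   # stand-in for self.continuous_tensor
discrete_tensor = torch.zeros(4, 3)     # stand-in for self.discrete_tensor

log_probs_tuple = LogProbsTuple()
log_probs_tuple.add_continuous(continuous_tensor.detach().numpy())
log_probs_tuple.add_discrete(discrete_tensor.detach().numpy())
print(log_probs_tuple.continuous.dtype, log_probs_tuple.discrete.dtype)  # float32 float32
```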

ml-agents/mlagents/trainers/torch/agent_action.py (21 changed lines)


import numpy as np
from mlagents.trainers.torch.utils import ModelUtils
from mlagents_envs.base_env import ActionTuple
class AgentAction(NamedTuple):

"""
return torch.stack(self.discrete_list, dim=-1)
def to_numpy_dict(self) -> Dict[str, np.ndarray]:
def to_action_tuple(self) -> ActionTuple:
Returns a Dict of np arrays with an entry corresponding to the continuous action
and an entry corresponding to the discrete action. "continuous_action" and
"discrete_action" are added to the agents buffer individually to maintain a flat buffer.
Returns an ActionTuple
array_dict: Dict[str, np.ndarray] = {}
action_tuple = ActionTuple()
array_dict["continuous_action"] = ModelUtils.to_numpy(
self.continuous_tensor
)
continuous = ModelUtils.to_numpy(self.continuous_tensor)
action_tuple.add_continuous(continuous)
array_dict["discrete_action"] = ModelUtils.to_numpy(
self.discrete_tensor[:, 0, :]
)
return array_dict
discrete = ModelUtils.to_numpy(self.discrete_tensor[:, 0, :])
action_tuple.add_discrete(discrete)
return action_tuple
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "AgentAction":
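Similarly, a standalone sketch of to_action_tuple (tensors made up, without constructing a real AgentAction): the continuous tensor and the middle-squeezed discrete tensor become the two numpy fields of an ActionTuple.

```python
import torch
from mlagents_envs.base_env import ActionTuple

continuous_tensor = torch.randn(4, 2)                      # stand-in for self.continuous_tensor
discrete_tensor = torch.zeros(4, 1, 3, dtype=torch.long)   # stand-in for self.discrete_tensor

action_tuple = ActionTuple()
action_tuple.add_continuous(continuous_tensor.detach().numpy())
action_tuple.add_discrete(discrete_tensor[:, 0, :].detach().numpy())
print(action_tuple.continuous.dtype, action_tuple.discrete.dtype)  # float32 int32
```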

ml-agents/mlagents/trainers/trajectory.py (25 changed lines)


from typing import List, NamedTuple, Dict
from typing import List, NamedTuple
from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
class AgentExperience(NamedTuple):

action: Dict[str, np.ndarray]
action_probs: Dict[str, np.ndarray]
action: ActionTuple
action_probs: LogProbsTuple
action_pre: np.ndarray # TODO: Remove this
action_mask: np.ndarray
prev_action: np.ndarray

agent_buffer_trajectory["actions_pre"].append(exp.action_pre)
# Adds the log prob and action of continuous/discrete separately
for act_type, act_array in exp.action.items():
agent_buffer_trajectory[act_type].append(act_array)
for log_type, log_array in exp.action_probs.items():
agent_buffer_trajectory[log_type].append(log_array)
agent_buffer_trajectory["continuous_action"].append(exp.action.continuous)
agent_buffer_trajectory["discrete_action"].append(exp.action.discrete)
agent_buffer_trajectory["continuous_log_probs"].append(
exp.action_probs.continuous
)
agent_buffer_trajectory["discrete_log_probs"].append(
exp.action_probs.discrete
)
# Store action masks if necessary. Note that 1 means active, while
# in AgentExperience False means active.

# This should never be needed unless the environment somehow doesn't supply the
# action mask in a discrete space.
if "discrete_action" in exp.action:
action_shape = exp.action["discrete_action"].shape
else:
action_shape = exp.action["continuous_action"].shape
action_shape = exp.action.discrete.shape
agent_buffer_trajectory["action_mask"].append(
np.ones(action_shape, dtype=np.float32), padding_value=1
)
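A sketch of the flattened buffer layout that results (one made-up experience): the tuple fields land under separate flat keys so the existing AgentBuffer machinery keeps working, and the fallback action-mask shape now keys off the discrete field unconditionally.

```python
import numpy as np
from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

exp_action = ActionTuple(
    continuous=np.zeros(2, dtype=np.float32),
    discrete=np.array([1], dtype=np.int32),
)
exp_log_probs = LogProbsTuple(
    continuous=np.zeros(2, dtype=np.float32),
    discrete=np.zeros(1, dtype=np.float32),
)

buffer = AgentBuffer()
buffer["continuous_action"].append(exp_action.continuous)
buffer["discrete_action"].append(exp_action.discrete)
buffer["continuous_log_probs"].append(exp_log_probs.continuous)
buffer["discrete_log_probs"].append(exp_log_probs.discrete)

# Fallback mask when the environment supplied none (see the comment above).
buffer["action_mask"].append(
    np.ones(exp_action.discrete.shape, dtype=np.float32), padding_value=1
)
```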
