Ervin Teng
5 years ago
Current commit
c330f6f6
59 files changed, with 1444 insertions and 793 deletions
- UnitySDK/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs (2)
- UnitySDK/Assets/ML-Agents/Examples/GridWorld/Scripts/GridArea.cs (2)
- UnitySDK/Assets/ML-Agents/Scripts/Academy.cs (42)
- docs/Installation.md (3)
- docs/Migrating.md (1)
- docs/Python-API.md (202)
- gym-unity/README.md (4)
- gym-unity/gym_unity/envs/__init__.py (120)
- gym-unity/gym_unity/tests/test_gym.py (90)
- ml-agents-envs/mlagents/envs/environment.py (337)
- ml-agents-envs/mlagents/envs/exception.py (8)
- ml-agents-envs/mlagents/envs/tests/test_envs.py (82)
- ml-agents/mlagents/trainers/components/reward_signals/__init__.py (2)
- ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (2)
- ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (2)
- ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (2)
- ml-agents/mlagents/trainers/demo_loader.py (2)
- ml-agents/mlagents/trainers/exception.py (8)
- ml-agents/mlagents/trainers/learn.py (34)
- ml-agents/mlagents/trainers/models.py (2)
- ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py (2)
- ml-agents/mlagents/trainers/ppo/policy.py (2)
- ml-agents/mlagents/trainers/sac/policy.py (2)
- ml-agents/mlagents/trainers/tests/mock_brain.py (2)
- ml-agents/mlagents/trainers/tests/test_bc.py (23)
- ml-agents/mlagents/trainers/tests/test_policy.py (4)
- ml-agents/mlagents/trainers/tests/test_ppo.py (20)
- ml-agents/mlagents/trainers/tests/test_simple_rl.py (116)
- ml-agents/mlagents/trainers/tests/test_trainer_controller.py (6)
- ml-agents/mlagents/trainers/tests/test_trainer_util.py (6)
- ml-agents/mlagents/trainers/tf_policy.py (6)
- ml-agents/mlagents/trainers/trainer.py (4)
- ml-agents/mlagents/trainers/trainer_controller.py (6)
- ml-agents/mlagents/trainers/trainer_util.py (2)
- notebooks/getting-started-gym.ipynb (2)
- notebooks/getting-started.ipynb (80)
- ml-agents/mlagents/trainers/tests/test_sampler_class.py (10)
- ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (6)
- ml-agents/mlagents/trainers/sampler_class.py (2)
- ml-agents/mlagents/trainers/brain.py (46)
- ml-agents/mlagents/trainers/env_manager.py (6)
- ml-agents/mlagents/trainers/simple_env_manager.py (46)
- ml-agents/mlagents/trainers/subprocess_env_manager.py (50)
- ml-agents-envs/mlagents/envs/base_env.py (301)
- ml-agents-envs/mlagents/envs/rpc_utils.py (165)
- ml-agents-envs/mlagents/envs/tests/test_rpc_utils.py (187)
- ml-agents/mlagents/trainers/brain_conversion_utils.py (70)
- ml-agents/mlagents/trainers/policy.py (10)
- ml-agents-envs/mlagents/envs/tests/test_brain.py (73)
- ml-agents-envs/mlagents/envs/policy.py (10)
- ml-agents-envs/mlagents/envs/base_unity_environment.py (25)
- /ml-agents/mlagents/trainers/tests/test_sampler_class.py (0)
- /ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (0)
- /ml-agents/mlagents/trainers/action_info.py (0)
- /ml-agents/mlagents/trainers/sampler_class.py (0)
- /ml-agents/mlagents/trainers/brain.py (0)
- /ml-agents/mlagents/trainers/env_manager.py (0)
- /ml-agents/mlagents/trainers/simple_env_manager.py (0)
- /ml-agents/mlagents/trainers/subprocess_env_manager.py (0)
ml-agents-envs/mlagents/envs/base_env.py

"""
Python Environment API for the ML-Agents toolkit
The aim of this API is to expose groups of similar Agents evolving in Unity
to perform reinforcement learning on.
There can be multiple groups of similar Agents (same observation and action
spaces) in the simulation. These groups are identified by an agent_group that
corresponds to a single group of Agents in the simulation.
For performance reasons, the data of each group of agents is processed in a
batched manner. When retrieving the state of a group of Agents, said state
contains the data for the whole group. Agents in these groups are identified
by a unique int identifier that allows tracking of Agents across simulation
steps. Note that there is no guarantee that the number or order of the Agents
in the state will be consistent across simulation steps.
A simulation step corresponds to moving the simulation forward until at least
one agent in the simulation sends its observations to Python again. Since
Agents can request decisions at different frequencies, a simulation step does
not necessarily correspond to a fixed simulation time increment.
"""

from abc import ABC, abstractmethod
from typing import List, NamedTuple, Tuple, Optional, Union, Dict, NewType
import numpy as np
from enum import Enum

AgentId = NewType("AgentId", int)
AgentGroup = NewType("AgentGroup", str)


class StepResult(NamedTuple):
    """
    Contains the data a single Agent collected since the last
    simulation step.
    - obs is a list of numpy arrays containing the observations collected by
    the agent.
    - reward is a float. Corresponds to the rewards collected by the agent
    since the last simulation step.
    - done is a bool. Is true if the Agent was terminated during the last
    simulation step.
    - max_step is a bool. Is true if the Agent reached its maximum number of
    steps during the last simulation step.
    - agent_id is an int and a unique identifier for the corresponding Agent.
    - action_mask is an optional list of one-dimensional arrays of booleans.
    Only available in multi-discrete action space type.
    Each array corresponds to an action branch. Each array contains a mask
    for each action of the branch. If true, the action is not available for
    the agent during this simulation step.
    """

    obs: List[np.ndarray]
    reward: float
    done: bool
    max_step: bool
    agent_id: AgentId
    action_mask: Optional[List[np.ndarray]]


class BatchedStepResult:
    """
    Contains the data a group of similar Agents collected since the last
    simulation step. Note that all Agents do not necessarily have new
    information to send at each simulation step. Therefore, the ordering of
    agents and the batch size of the BatchedStepResult are not fixed across
    simulation steps.
    - obs is a list of numpy arrays containing the observations collected by
    the group of agents. Each obs has one extra dimension compared to
    StepResult: the first dimension of the array corresponds to the batch
    size of the group.
    - reward is a float vector of length batch size. Corresponds to the
    rewards collected by each agent since the last simulation step.
    - done is an array of booleans of length batch size. Is true if the
    associated Agent was terminated during the last simulation step.
    - max_step is an array of booleans of length batch size. Is true if the
    associated Agent reached its maximum number of steps during the last
    simulation step.
    - agent_id is an int vector of length batch size containing unique
    identifiers for the corresponding Agents. This is used to track Agents
    across simulation steps.
    - action_mask is an optional list of two-dimensional arrays of booleans.
    Only available in multi-discrete action space type.
    Each array corresponds to an action branch. The first dimension of each
    array is the batch size and the second contains a mask for each action of
    the branch. If true, the action is not available for the agent during
    this simulation step.
    """

    def __init__(self, obs, reward, done, max_step, agent_id, action_mask):
        self.obs: List[np.ndarray] = obs
        self.reward: np.ndarray = reward
        self.done: np.ndarray = done
        self.max_step: np.ndarray = max_step
        self.agent_id: np.ndarray = agent_id
        self.action_mask: Optional[List[np.ndarray]] = action_mask
        self._agent_id_to_index: Optional[Dict[int, int]] = None

    def contains_agent(self, agent_id: AgentId) -> bool:
        if self._agent_id_to_index is None:
            self._agent_id_to_index = {}
            for a_idx, a_id in enumerate(self.agent_id):
                self._agent_id_to_index[a_id] = a_idx
        return agent_id in self._agent_id_to_index

    def get_agent_step_result(self, agent_id: AgentId) -> StepResult:
        """
        Returns the step result for a specific agent.
        :param agent_id: The id of the agent
        :returns: obs, reward, done, agent_id and optional action mask for a
        specific agent
        """
        if not self.contains_agent(agent_id):
            raise IndexError(
                "agent_id {} is not present in the BatchedStepResult".format(agent_id)
            )
        agent_index = self._agent_id_to_index[agent_id]  # type: ignore
        agent_obs = []
        for batched_obs in self.obs:
            agent_obs.append(batched_obs[agent_index])
        agent_mask = None
        if self.action_mask is not None:
            agent_mask = []
            for mask in self.action_mask:
                agent_mask.append(mask[agent_index])
        return StepResult(
            obs=agent_obs,
            reward=self.reward[agent_index],
            done=self.done[agent_index],
            max_step=self.max_step[agent_index],
            agent_id=agent_id,
            action_mask=agent_mask,
        )

    @staticmethod
    def empty(spec: "AgentGroupSpec") -> "BatchedStepResult":
        """
        Returns an empty BatchedStepResult.
        :param spec: The AgentGroupSpec for the BatchedStepResult
        """
        obs: List[np.ndarray] = []
        for shape in spec.observation_shapes:
            obs += [np.zeros((0,) + shape, dtype=np.float32)]
        return BatchedStepResult(
            obs=obs,
            reward=np.zeros(0, dtype=np.float32),
            done=np.zeros(0, dtype=np.bool),
            max_step=np.zeros(0, dtype=np.bool),
            agent_id=np.zeros(0, dtype=np.int32),
            action_mask=None,
        )

    def n_agents(self) -> int:
        return len(self.agent_id)
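As a quick illustration of the batched layout described in the docstring above (this sketch is not part of the commit; the shapes and numbers are invented), a BatchedStepResult can be built by hand and queried per agent:

import numpy as np

# Hypothetical batch of two agents sharing a single (3,)-shaped vector observation.
batch = BatchedStepResult(
    obs=[np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], dtype=np.float32)],
    reward=np.array([1.0, -0.5], dtype=np.float32),
    done=np.array([False, True]),
    max_step=np.array([False, False]),
    agent_id=np.array([7, 9], dtype=np.int32),
    action_mask=None,
)
assert batch.n_agents() == 2
assert batch.contains_agent(9)

# Per-agent view: strips the batch dimension and returns a StepResult.
single = batch.get_agent_step_result(9)
assert single.reward == -0.5 and single.done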

class ActionType(Enum):
    DISCRETE = 0
    CONTINUOUS = 1


class AgentGroupSpec(NamedTuple):
    """
    A NamedTuple containing information about the observation and action
    spaces for a group of Agents.
    - observation_shapes is a List of Tuples of int : Each Tuple corresponds
    to an observation's dimensions. The shape tuples have the same ordering as
    the ordering of the BatchedStepResult and StepResult.
    - action_type is the type of data of the action. It can be discrete or
    continuous. If discrete, the action tensors are expected to be int32. If
    continuous, the actions are expected to be float32.
    - action_shape is:
      - An int in continuous action space corresponding to the number of
        floats that constitute the action.
      - A Tuple of int in discrete action space where each int corresponds to
        the number of discrete actions available to the agent.
    """

    observation_shapes: List[Tuple]
    action_type: ActionType
    action_shape: Union[int, Tuple[int, ...]]

    def is_action_discrete(self) -> bool:
        """
        Returns true if the Agent group uses discrete actions.
        """
        return self.action_type == ActionType.DISCRETE

    def is_action_continuous(self) -> bool:
        """
        Returns true if the Agent group uses continuous actions.
        """
        return self.action_type == ActionType.CONTINUOUS

    @property
    def action_size(self) -> int:
        """
        Returns the dimension of the action.
        - In the continuous case, will return the number of continuous actions.
        - In the (multi-)discrete case, will return the number of action
        branches.
        """
        if self.action_type == ActionType.DISCRETE:
            return len(self.action_shape)  # type: ignore
        else:
            return self.action_shape  # type: ignore

    @property
    def discrete_action_branches(self) -> Optional[Tuple[int, ...]]:
        """
        Returns a Tuple of int corresponding to the number of possible actions
        for each branch (only for discrete actions). Will return None for
        continuous actions.
        """
        if self.action_type == ActionType.DISCRETE:
            return self.action_shape  # type: ignore
        else:
            return None

    def create_empty_action(self, n_agents: int) -> np.ndarray:
        if self.action_type == ActionType.DISCRETE:
            return np.zeros((n_agents, self.action_size), dtype=np.int32)
        else:
            return np.zeros((n_agents, self.action_size), dtype=np.float32)
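To make the action_shape convention concrete, here is a small sketch (illustrative only, not part of the commit) of how the helpers behave for a continuous and a multi-discrete spec:

# Continuous group: one (8,) vector observation, 4 continuous actions.
continuous_spec = AgentGroupSpec(
    observation_shapes=[(8,)], action_type=ActionType.CONTINUOUS, action_shape=4
)
assert continuous_spec.action_size == 4
assert continuous_spec.discrete_action_branches is None
assert continuous_spec.create_empty_action(2).shape == (2, 4)  # float32 zeros

# Multi-discrete group: two branches with 3 and 2 choices respectively.
discrete_spec = AgentGroupSpec(
    observation_shapes=[(8,)], action_type=ActionType.DISCRETE, action_shape=(3, 2)
)
assert discrete_spec.action_size == 2  # number of branches
assert discrete_spec.discrete_action_branches == (3, 2)
assert discrete_spec.create_empty_action(2).dtype == np.int32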

class BaseEnv(ABC):
    @abstractmethod
    def step(self) -> None:
        """
        Signals the environment that it must move the simulation forward
        by one step.
        """
        pass

    @abstractmethod
    def reset(self) -> None:
        """
        Signals the environment that it must reset the simulation.
        """
        pass

    @abstractmethod
    def close(self) -> None:
        """
        Signals the environment that it must close.
        """
        pass

    @abstractmethod
    def get_agent_groups(self) -> List[AgentGroup]:
        """
        Returns the list of the agent group names present in the environment.
        Agents grouped under the same group name have the same action and
        observation specs, and are expected to behave similarly in the
        environment. This list can grow with time as new policies are
        instantiated.
        :return: the list of agent group names.
        """
        pass

    @abstractmethod
    def set_actions(self, agent_group: AgentGroup, action: np.ndarray) -> None:
        """
        Sets the action for all of the agents in the simulation for the next
        step. The actions must be in the same order as the order received in
        the step result.
        :param agent_group: The name of the group the agents are part of
        :param action: A two dimensional np.ndarray corresponding to the action
        (either int or float)
        """
        pass

    @abstractmethod
    def set_action_for_agent(
        self, agent_group: AgentGroup, agent_id: AgentId, action: np.ndarray
    ) -> None:
        """
        Sets the action for one of the agents in the simulation for the next
        step.
        :param agent_group: The name of the group the agent is part of
        :param agent_id: The id of the agent the action is set for
        :param action: A two dimensional np.ndarray corresponding to the action
        (either int or float)
        """
        pass

    @abstractmethod
    def get_step_result(self, agent_group: AgentGroup) -> BatchedStepResult:
        """
        Retrieves the observations of the agents that requested a step in the
        simulation.
        :param agent_group: The name of the group the agents are part of
        :return: A BatchedStepResult containing the observations, the rewards
        and the done flags for this group of agents.
        """
        pass

    @abstractmethod
    def get_agent_group_spec(self, agent_group: AgentGroup) -> AgentGroupSpec:
        """
        Get the AgentGroupSpec corresponding to the agent group name.
        :param agent_group: The name of the group the agents are part of
        :return: An AgentGroupSpec corresponding to that agent group name
        """
        pass
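Putting the API together, a typical driver loop over a BaseEnv implementation could look like the sketch below. This is illustrative only and not part of the commit: `env` stands for any concrete BaseEnv (for example the UnityEnvironment updated elsewhere in this change set), and the random action choice is a stand-in for a real policy.

import numpy as np

def run_random_steps(env: BaseEnv, n_steps: int = 100) -> None:
    # Act randomly for every agent group, respecting each group's spec.
    env.reset()
    for _ in range(n_steps):
        for group in env.get_agent_groups():
            spec = env.get_agent_group_spec(group)
            step_result = env.get_step_result(group)
            n = step_result.n_agents()
            if spec.is_action_continuous():
                action = np.random.randn(n, spec.action_size).astype(np.float32)
            else:
                action = np.column_stack(
                    [
                        np.random.randint(0, branch, size=n)
                        for branch in spec.discrete_action_branches
                    ]
                ).astype(np.int32)
            env.set_actions(group, action)
        env.step()
    env.close()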
ml-agents-envs/mlagents/envs/rpc_utils.py

from mlagents.envs.base_env import AgentGroupSpec, ActionType, BatchedStepResult
from mlagents.envs.timers import hierarchical_timer, timed
from mlagents.envs.communicator_objects.agent_info_pb2 import AgentInfoProto
from mlagents.envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
import logging
import numpy as np
import io
from typing import List, Tuple
from PIL import Image

logger = logging.getLogger("mlagents.envs")


def agent_group_spec_from_proto(
    brain_param_proto: BrainParametersProto, agent_info: AgentInfoProto
) -> AgentGroupSpec:
    """
    Converts brain parameter and agent info protobuf objects to an AgentGroupSpec object.
    :param brain_param_proto: protobuf object.
    :param agent_info: protobuf object.
    :return: AgentGroupSpec object.
    """
    observation_shape = [tuple(obs.shape) for obs in agent_info.observations]
    action_type = (
        ActionType.DISCRETE
        if brain_param_proto.vector_action_space_type == 0
        else ActionType.CONTINUOUS
    )
    action_shape = None
    if action_type == ActionType.CONTINUOUS:
        action_shape = brain_param_proto.vector_action_size[0]
    else:
        action_shape = tuple(brain_param_proto.vector_action_size)
    return AgentGroupSpec(observation_shape, action_type, action_shape)


@timed
def process_pixels(image_bytes: bytes, gray_scale: bool) -> np.ndarray:
    """
    Converts a byte array observation image into a numpy array and optionally
    converts it to grayscale.
    :param image_bytes: input byte array corresponding to image
    :param gray_scale: whether to convert the image to grayscale
    :return: processed numpy array of observation from environment
    """
    with hierarchical_timer("image_decompress"):
        image_bytearray = bytearray(image_bytes)
        image = Image.open(io.BytesIO(image_bytearray))
        # Normally Image loads lazily, this forces it to do loading in the timer scope.
        image.load()
    s = np.array(image) / 255.0
    if gray_scale:
        s = np.mean(s, axis=2)
        s = np.reshape(s, [s.shape[0], s.shape[1], 1])
    return s


@timed
def _process_visual_observation(
    obs_index: int, shape: Tuple[int, int, int], agent_info_list: List[AgentInfoProto]
) -> np.ndarray:
    if len(agent_info_list) == 0:
        return np.zeros((0, shape[0], shape[1], shape[2]), dtype=np.float32)

    gray_scale = shape[2] == 1
    batched_visual = [
        process_pixels(agent_obs.observations[obs_index].compressed_data, gray_scale)
        for agent_obs in agent_info_list
    ]
    return np.array(batched_visual, dtype=np.float32)


@timed
def _process_vector_observation(
    obs_index: int, shape: Tuple[int, ...], agent_info_list: List[AgentInfoProto]
) -> np.ndarray:
    if len(agent_info_list) == 0:
        return np.zeros((0, shape[0]), dtype=np.float32)
    np_obs = np.array(
        [
            agent_obs.observations[obs_index].float_data.data
            for agent_obs in agent_info_list
        ],
        dtype=np.float32,
    )
    # Check for NaNs or Infs in the observations
    # If there's a NaN in the observations, the np.mean() result will be NaN
    # If there's an Inf (either sign) then the result will be Inf
    # See https://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy for background
    # Note that very large values (larger than sqrt(float_max)) will result in an Inf value here
    # This is OK though, worst case it results in an unnecessary (but harmless) nan_to_num call.
    d = np.mean(np_obs)
    has_nan = np.isnan(d)
    has_inf = not np.isfinite(d)

    # If we have any NaNs or Infs, use np.nan_to_num to replace them with finite values
    if has_nan or has_inf:
        np_obs = np.nan_to_num(np_obs)

    if has_nan:
        logger.warning("An agent had a NaN observation in the environment")
    return np_obs
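The mean-based check above is a fast way to detect bad values without scanning the whole array twice. A tiny illustration (not part of the commit) of how NaN and Inf propagate through np.mean and what np.nan_to_num does afterwards:

import numpy as np

clean = np.array([1.0, 2.0, 3.0], dtype=np.float32)
with_nan = np.array([1.0, np.nan, 3.0], dtype=np.float32)
with_inf = np.array([1.0, np.inf, 3.0], dtype=np.float32)

assert np.isfinite(np.mean(clean))         # fast path: no nan_to_num needed
assert np.isnan(np.mean(with_nan))         # a single NaN poisons the mean
assert not np.isfinite(np.mean(with_inf))  # so does an Inf
assert list(np.nan_to_num(with_nan)) == [1.0, 0.0, 3.0]  # NaN is replaced by 0.0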

@timed
def batched_step_result_from_proto(
    agent_info_list: List[AgentInfoProto], group_spec: AgentGroupSpec
) -> BatchedStepResult:
    obs_list: List[np.ndarray] = []
    for obs_index, obs_shape in enumerate(group_spec.observation_shapes):
        is_visual = len(obs_shape) == 3
        if is_visual:
            obs_list += [
                _process_visual_observation(obs_index, obs_shape, agent_info_list)
            ]
        else:
            obs_list += [
                _process_vector_observation(obs_index, obs_shape, agent_info_list)
            ]
    rewards = np.array(
        [agent_info.reward for agent_info in agent_info_list], dtype=np.float32
    )

    d = np.dot(rewards, rewards)
    has_nan = np.isnan(d)
    has_inf = not np.isfinite(d)
    # If we have any NaNs or Infs, use np.nan_to_num to replace them with finite values
    if has_nan or has_inf:
        rewards = np.nan_to_num(rewards)
    if has_nan:
        logger.warning("An agent had a NaN reward in the environment")

    done = np.array([agent_info.done for agent_info in agent_info_list], dtype=np.bool)
    max_step = np.array(
        [agent_info.max_step_reached for agent_info in agent_info_list], dtype=np.bool
    )
    agent_id = np.array(
        [agent_info.id for agent_info in agent_info_list], dtype=np.int32
    )
    action_mask = None
    if group_spec.is_action_discrete():
        if any(agent_info.action_mask is not None for agent_info in agent_info_list):
            n_agents = len(agent_info_list)
            a_size = np.sum(group_spec.discrete_action_branches)
            mask_matrix = np.ones((n_agents, a_size), dtype=np.bool)
            for agent_index, agent_info in enumerate(agent_info_list):
                if agent_info.action_mask is not None:
                    if len(agent_info.action_mask) == a_size:
                        mask_matrix[agent_index, :] = [
                            False if agent_info.action_mask[k] else True
                            for k in range(a_size)
                        ]
            action_mask = (1 - mask_matrix).astype(np.bool)
            indices = _generate_split_indices(group_spec.discrete_action_branches)
            action_mask = np.split(action_mask, indices, axis=1)
    return BatchedStepResult(obs_list, rewards, done, max_step, agent_id, action_mask)


def _generate_split_indices(dims):
    if len(dims) <= 1:
        return ()
    result = (dims[0],)
    for i in range(len(dims) - 2):
        result += (dims[i + 1] + result[i],)
    return result
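For reference, a short illustration (not part of the commit, branch sizes invented) of how _generate_split_indices and np.split turn the flat per-agent mask into one array per action branch:

import numpy as np

branches = (7, 3)                    # two branches: 7 and 3 discrete actions
indices = _generate_split_indices(branches)
assert indices == (7,)               # cut the 10 mask columns after column 7

flat_mask = np.zeros((2, 10), dtype=bool)          # 2 agents, 7 + 3 actions
per_branch = np.split(flat_mask, indices, axis=1)
assert per_branch[0].shape == (2, 7)
assert per_branch[1].shape == (2, 3)

# Three branches of sizes (2, 2, 6) -> cut points at columns 2 and 4.
assert _generate_split_indices((2, 2, 6)) == (2, 4)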
ml-agents-envs/mlagents/envs/tests/test_rpc_utils.py

from typing import List, Tuple
from mlagents.envs.communicator_objects.agent_info_pb2 import AgentInfoProto
from mlagents.envs.communicator_objects.observation_pb2 import ObservationProto
from mlagents.envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
import numpy as np
from mlagents.envs.base_env import AgentGroupSpec, ActionType
import io
from mlagents.envs.rpc_utils import (
    agent_group_spec_from_proto,
    process_pixels,
    _process_visual_observation,
    _process_vector_observation,
    batched_step_result_from_proto,
)
from PIL import Image


def generate_list_agent_proto(
    n_agent: int, shape: List[Tuple[int]]
) -> List[AgentInfoProto]:
    result = []
    for agent_index in range(n_agent):
        ap = AgentInfoProto()
        ap.reward = agent_index
        ap.done = agent_index % 2 == 0
        ap.max_step_reached = agent_index % 2 == 1
        ap.id = agent_index
        ap.action_mask.extend([True, False] * 5)
        obs_proto_list = []
        for obs_index in range(len(shape)):
            obs_proto = ObservationProto()
            obs_proto.shape.extend(list(shape[obs_index]))
            obs_proto.compression_type = 0
            obs_proto.float_data.data.extend([0.1] * np.prod(shape[obs_index]))
            obs_proto_list.append(obs_proto)
        ap.observations.extend(obs_proto_list)
        result.append(ap)
    return result


def generate_compressed_data(in_array: np.ndarray) -> bytes:
    image_arr = (in_array * 255).astype(np.uint8)
    im = Image.fromarray(image_arr, "RGB")
    byteIO = io.BytesIO()
    im.save(byteIO, format="PNG")
    return byteIO.getvalue()


def generate_compressed_proto_obs(in_array: np.ndarray) -> ObservationProto:
    obs_proto = ObservationProto()
    obs_proto.compressed_data = generate_compressed_data(in_array)
    obs_proto.compression_type = 1
    obs_proto.shape.extend(in_array.shape)
    return obs_proto


def test_process_pixels():
    in_array = np.random.rand(128, 128, 3)
    byte_arr = generate_compressed_data(in_array)
    out_array = process_pixels(byte_arr, False)
    assert out_array.shape == (128, 128, 3)
    assert np.sum(in_array - out_array) / np.prod(in_array.shape) < 0.01
    assert (in_array - out_array < 0.01).all()


def test_process_pixels_gray():
    in_array = np.random.rand(128, 128, 3)
    byte_arr = generate_compressed_data(in_array)
    out_array = process_pixels(byte_arr, True)
    assert out_array.shape == (128, 128, 1)
    assert np.mean(in_array.mean(axis=2, keepdims=True) - out_array) < 0.01
    assert (in_array.mean(axis=2, keepdims=True) - out_array < 0.01).all()


def test_vector_observation():
    n_agents = 10
    shapes = [(3,), (4,)]
    list_proto = generate_list_agent_proto(n_agents, shapes)
    for obs_index, shape in enumerate(shapes):
        arr = _process_vector_observation(obs_index, shape, list_proto)
        assert list(arr.shape) == ([n_agents] + list(shape))
        assert (np.abs(arr - 0.1) < 0.01).all()


def test_process_visual_observation():
    in_array_1 = np.random.rand(128, 128, 3)
    proto_obs_1 = generate_compressed_proto_obs(in_array_1)
    in_array_2 = np.random.rand(128, 128, 3)
    proto_obs_2 = generate_compressed_proto_obs(in_array_2)
    ap1 = AgentInfoProto()
    ap1.observations.extend([proto_obs_1])
    ap2 = AgentInfoProto()
    ap2.observations.extend([proto_obs_2])
    ap_list = [ap1, ap2]
    arr = _process_visual_observation(0, (128, 128, 3), ap_list)
    assert list(arr.shape) == [2, 128, 128, 3]
    assert (arr[0, :, :, :] - in_array_1 < 0.01).all()
    assert (arr[1, :, :, :] - in_array_2 < 0.01).all()


def test_batched_step_result_from_proto():
    n_agents = 10
    shapes = [(3,), (4,)]
    group_spec = AgentGroupSpec(shapes, ActionType.CONTINUOUS, 3)
    ap_list = generate_list_agent_proto(n_agents, shapes)
    result = batched_step_result_from_proto(ap_list, group_spec)
    assert list(result.reward) == list(range(n_agents))
    assert list(result.agent_id) == list(range(n_agents))
    for index in range(n_agents):
        assert result.done[index] == (index % 2 == 0)
        assert result.max_step[index] == (index % 2 == 1)
    assert list(result.obs[0].shape) == [n_agents] + list(shapes[0])
    assert list(result.obs[1].shape) == [n_agents] + list(shapes[1])


def test_action_masking_discrete():
    n_agents = 10
    shapes = [(3,), (4,)]
    group_spec = AgentGroupSpec(shapes, ActionType.DISCRETE, (7, 3))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    result = batched_step_result_from_proto(ap_list, group_spec)
    masks = result.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 2
    assert masks[0].shape == (n_agents, 7)
    assert masks[1].shape == (n_agents, 3)
    assert masks[0][0, 0]
    assert not masks[1][0, 0]
    assert masks[1][0, 1]


def test_action_masking_discrete_1():
    n_agents = 10
    shapes = [(3,), (4,)]
    group_spec = AgentGroupSpec(shapes, ActionType.DISCRETE, (10,))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    result = batched_step_result_from_proto(ap_list, group_spec)
    masks = result.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 1
    assert masks[0].shape == (n_agents, 10)
    assert masks[0][0, 0]


def test_action_masking_discrete_2():
    n_agents = 10
    shapes = [(3,), (4,)]
    group_spec = AgentGroupSpec(shapes, ActionType.DISCRETE, (2, 2, 6))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    result = batched_step_result_from_proto(ap_list, group_spec)
    masks = result.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 3
    assert masks[0].shape == (n_agents, 2)
    assert masks[1].shape == (n_agents, 2)
    assert masks[2].shape == (n_agents, 6)
    assert masks[0][0, 0]


def test_action_masking_continuous():
    n_agents = 10
    shapes = [(3,), (4,)]
    group_spec = AgentGroupSpec(shapes, ActionType.CONTINUOUS, 10)
    ap_list = generate_list_agent_proto(n_agents, shapes)
    result = batched_step_result_from_proto(ap_list, group_spec)
    masks = result.action_mask
    assert masks is None


def test_agent_group_spec_from_proto():
    agent_proto = generate_list_agent_proto(1, [(3,), (4,)])[0]
    bp = BrainParametersProto()
    bp.vector_action_size.extend([5, 4])
    bp.vector_action_space_type = 0
    group_spec = agent_group_spec_from_proto(bp, agent_proto)
    assert group_spec.is_action_discrete()
    assert not group_spec.is_action_continuous()
    assert group_spec.observation_shapes == [(3,), (4,)]
    assert group_spec.discrete_action_branches == (5, 4)
    assert group_spec.action_size == 2
    bp = BrainParametersProto()
    bp.vector_action_size.extend([6])
    bp.vector_action_space_type = 1
    group_spec = agent_group_spec_from_proto(bp, agent_proto)
    assert not group_spec.is_action_discrete()
    assert group_spec.is_action_continuous()
    assert group_spec.action_size == 6
ml-agents/mlagents/trainers/brain_conversion_utils.py

from mlagents.trainers.brain import BrainInfo, BrainParameters, CameraResolution
from mlagents.envs.base_env import BatchedStepResult, AgentGroupSpec
from mlagents.envs.exception import UnityEnvironmentException
import numpy as np
from typing import List


def step_result_to_brain_info(
    step_result: BatchedStepResult,
    group_spec: AgentGroupSpec,
    agent_id_prefix: int = None,
) -> BrainInfo:
    n_agents = step_result.n_agents()
    vis_obs_indices = []
    vec_obs_indices = []
    for index, observation in enumerate(step_result.obs):
        if len(observation.shape) == 2:
            vec_obs_indices.append(index)
        elif len(observation.shape) == 4:
            vis_obs_indices.append(index)
        else:
            raise UnityEnvironmentException(
                "Invalid input received from the environment, the observation should "
                "either be a vector of float or a PNG image"
            )
    if len(vec_obs_indices) == 0:
        vec_obs = np.zeros((n_agents, 0), dtype=np.float32)
    else:
        vec_obs = np.concatenate([step_result.obs[i] for i in vec_obs_indices], axis=1)
    vis_obs = [step_result.obs[i] for i in vis_obs_indices]
    mask = np.ones((n_agents, np.sum(group_spec.action_size)), dtype=np.float32)
    if group_spec.is_action_discrete():
        mask = np.ones(
            (n_agents, np.sum(group_spec.discrete_action_branches)), dtype=np.float32
        )
        if step_result.action_mask is not None:
            mask = 1 - np.concatenate(step_result.action_mask, axis=1)
    if agent_id_prefix is None:
        agent_ids = [str(ag_id) for ag_id in list(step_result.agent_id)]
    else:
        agent_ids = [f"${agent_id_prefix}-{ag_id}" for ag_id in step_result.agent_id]
    return BrainInfo(
        vis_obs,
        vec_obs,
        list(step_result.reward),
        agent_ids,
        list(step_result.done),
        list(step_result.max_step),
        mask,
    )


def group_spec_to_brain_parameters(
    name: str, group_spec: AgentGroupSpec
) -> BrainParameters:
    vec_size = np.sum(
        [shape[0] for shape in group_spec.observation_shapes if len(shape) == 1]
    )
    vis_sizes = [shape for shape in group_spec.observation_shapes if len(shape) == 3]
    cam_res = [CameraResolution(s[0], s[1], s[2]) for s in vis_sizes]
    a_size: List[int] = []
    if group_spec.is_action_discrete():
        a_size += list(group_spec.discrete_action_branches)
        vector_action_space_type = 0
    else:
        a_size += [group_spec.action_size]
        vector_action_space_type = 1
    return BrainParameters(
        name, int(vec_size), cam_res, a_size, [], vector_action_space_type
    )
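As a quick sanity check of the mapping to the legacy structures, here is an illustrative sketch (not part of the commit; the group name and observation shapes are invented) that exercises both helpers with an empty batch:

from mlagents.envs.base_env import ActionType

spec = AgentGroupSpec(
    observation_shapes=[(6,), (84, 84, 3)],
    action_type=ActionType.DISCRETE,
    action_shape=(3, 2),
)
# Legacy BrainParameters: 6 vector floats, one 84x84x3 camera, branches [3, 2].
params = group_spec_to_brain_parameters("ExampleBrain", spec)

# An empty BatchedStepResult converts to a BrainInfo with zero agents.
empty_result = BatchedStepResult.empty(spec)
brain_info = step_result_to_brain_info(empty_result, spec)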
ml-agents/mlagents/trainers/policy.py

from abc import ABC, abstractmethod

from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.action_info import ActionInfo


class Policy(ABC):
    @abstractmethod
    def get_action(self, brain_info: BrainInfo) -> ActionInfo:
        pass
ml-agents-envs/mlagents/envs/tests/test_brain.py

from typing import List
import logging
import numpy as np
from unittest import mock

from mlagents.envs.communicator_objects.agent_info_pb2 import AgentInfoProto
from mlagents.envs.communicator_objects.observation_pb2 import (
    ObservationProto,
    NONE as COMPRESSION_TYPE_NONE,
)
from mlagents.envs.brain import BrainInfo, BrainParameters

test_brain = BrainParameters(
    brain_name="test_brain",
    vector_observation_space_size=3,
    camera_resolutions=[],
    vector_action_space_size=[],
    vector_action_descriptions=[],
    vector_action_space_type=1,
)


def _make_agent_info_proto(vector_obs: List[float]) -> AgentInfoProto:
    obs = ObservationProto(
        float_data=ObservationProto.FloatData(data=vector_obs),
        shape=[len(vector_obs)],
        compression_type=COMPRESSION_TYPE_NONE,
    )
    agent_info_proto = AgentInfoProto(observations=[obs])
    return agent_info_proto


@mock.patch.object(np, "nan_to_num", wraps=np.nan_to_num)
@mock.patch.object(logging.Logger, "warning")
def test_from_agent_proto_nan(mock_warning, mock_nan_to_num):
    agent_info_proto = _make_agent_info_proto([1.0, 2.0, float("nan")])

    brain_info = BrainInfo.from_agent_proto(1, [agent_info_proto], test_brain)
    # nan gets set to 0.0
    expected = [1.0, 2.0, 0.0]
    assert (brain_info.vector_observations == expected).all()
    mock_nan_to_num.assert_called()
    mock_warning.assert_called()


@mock.patch.object(np, "nan_to_num", wraps=np.nan_to_num)
@mock.patch.object(logging.Logger, "warning")
def test_from_agent_proto_inf(mock_warning, mock_nan_to_num):
    agent_info_proto = _make_agent_info_proto([1.0, float("inf"), 0.0])

    brain_info = BrainInfo.from_agent_proto(1, [agent_info_proto], test_brain)
    # inf should get set to float32_max
    float32_max = np.finfo(np.float32).max
    expected = [1.0, float32_max, 0.0]
    assert (brain_info.vector_observations == expected).all()
    mock_nan_to_num.assert_called()
    # We don't warn on inf, just NaN
    mock_warning.assert_not_called()


@mock.patch.object(np, "nan_to_num", wraps=np.nan_to_num)
@mock.patch.object(logging.Logger, "warning")
def test_from_agent_proto_fast_path(mock_warning, mock_nan_to_num):
    """
    Check that all-finite values skip the nan_to_num call.
    """
    agent_info_proto = _make_agent_info_proto([1.0, 2.0, 3.0])

    brain_info = BrainInfo.from_agent_proto(1, [agent_info_proto], test_brain)
    expected = [1.0, 2.0, 3.0]
    assert (brain_info.vector_observations == expected).all()
    mock_nan_to_num.assert_not_called()
    mock_warning.assert_not_called()
ml-agents-envs/mlagents/envs/policy.py

from abc import ABC, abstractmethod

from mlagents.envs.brain import BrainInfo
from mlagents.envs.action_info import ActionInfo


class Policy(ABC):
    @abstractmethod
    def get_action(self, brain_info: BrainInfo) -> ActionInfo:
        pass
ml-agents-envs/mlagents/envs/base_unity_environment.py

from abc import ABC, abstractmethod
from typing import Dict, Optional

from mlagents.envs.brain import AllBrainInfo, BrainParameters


class BaseUnityEnvironment(ABC):
    @abstractmethod
    def step(
        self, vector_action: Optional[Dict] = None, value: Optional[Dict] = None
    ) -> AllBrainInfo:
        pass

    @abstractmethod
    def reset(self) -> AllBrainInfo:
        pass

    @property
    @abstractmethod
    def external_brains(self) -> Dict[str, BrainParameters]:
        pass

    @abstractmethod
    def close(self):
        pass