
Action Model (#4580)

Co-authored-by: Ervin T <ervin@unity3d.com>
Co-authored-by: Vincent-Pierre BERGES <vincentpierre@unity3d.com>
/fix-conflict-base-env
GitHub · 4 years ago
Current commit
3c96a3a2
43 files changed, with 1337 insertions and 1004 deletions
  1. .github/workflows/pytest.yml (4 changes)
  2. ml-agents-envs/mlagents_envs/base_env.py (93 changes)
  3. ml-agents-envs/mlagents_envs/rpc_utils.py (2 changes)
  4. ml-agents/mlagents/trainers/agent_processor.py (26 changes)
  5. ml-agents/mlagents/trainers/demo_loader.py (5 changes)
  6. ml-agents/mlagents/trainers/env_manager.py (21 changes)
  7. ml-agents/mlagents/trainers/policy/policy.py (17 changes)
  8. ml-agents/mlagents/trainers/policy/tf_policy.py (24 changes)
  9. ml-agents/mlagents/trainers/policy/torch_policy.py (54 changes)
  10. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (6 changes)
  11. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (5 changes)
  12. ml-agents/mlagents/trainers/ppo/trainer.py (2 changes)
  13. ml-agents/mlagents/trainers/sac/optimizer_torch.py (319 changes)
  14. ml-agents/mlagents/trainers/simple_env_manager.py (3 changes)
  15. ml-agents/mlagents/trainers/subprocess_env_manager.py (7 changes)
  16. ml-agents/mlagents/trainers/tests/mock_brain.py (20 changes)
  17. ml-agents/mlagents/trainers/tests/simple_test_envs.py (49 changes)
  18. ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (66 changes)
  19. ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py (114 changes)
  20. ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py (2 changes)
  21. ml-agents/mlagents/trainers/tests/test_agent_processor.py (27 changes)
  22. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (2 changes)
  23. ml-agents/mlagents/trainers/tests/test_trajectory.py (4 changes)
  24. ml-agents/mlagents/trainers/tests/torch/test_distributions.py (2 changes)
  25. ml-agents/mlagents/trainers/tests/torch/test_networks.py (78 changes)
  26. ml-agents/mlagents/trainers/tests/torch/test_policy.py (13 changes)
  27. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (28 changes)
  28. ml-agents/mlagents/trainers/tests/torch/test_sac.py (3 changes)
  29. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (118 changes)
  30. ml-agents/mlagents/trainers/tests/torch/test_utils.py (47 changes)
  31. ml-agents/mlagents/trainers/torch/components/bc/module.py (35 changes)
  32. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (75 changes)
  33. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (6 changes)
  34. ml-agents/mlagents/trainers/torch/distributions.py (19 changes)
  35. ml-agents/mlagents/trainers/torch/networks.py (183 changes)
  36. ml-agents/mlagents/trainers/torch/utils.py (240 changes)
  37. ml-agents/mlagents/trainers/trajectory.py (25 changes)
  38. ml-agents/mlagents/trainers/tests/torch/test_action_model.py (81 changes)
  39. ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (122 changes)
  40. ml-agents/mlagents/trainers/torch/action_flattener.py (44 changes)
  41. ml-agents/mlagents/trainers/torch/action_log_probs.py (108 changes)
  42. ml-agents/mlagents/trainers/torch/action_model.py (184 changes)
  43. ml-agents/mlagents/trainers/torch/agent_action.py (58 changes)

.github/workflows/pytest.yml (4 changes)


- 'gym-unity/**'
- 'test_constraints*.txt'
- 'test_requirements.txt'
- '.github/workflows/pytest.yml'
push:
branches: [master]

run: python -c "import sys; print(sys.version)"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
# pin pip to workaround https://github.com/pypa/pip/issues/9180
python -m pip install pip==20.2
python -m pip install --upgrade setuptools
python -m pip install --progress-bar=off -e ./ml-agents-envs -c ${{ matrix.pip_constraints }}
python -m pip install --progress-bar=off -e ./ml-agents -c ${{ matrix.pip_constraints }}

ml-agents-envs/mlagents_envs/base_env.py (93 changes)


)
class ActionTuple:
class _ActionTupleBase(ABC):
An object whose fields correspond to actions of different types.
Continuous and discrete actions are numpy arrays of type float32 and
int32, respectively and are type checked on construction.
Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively.
An object whose fields correspond to action data of continuous and discrete
spaces. Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
def __init__(self, continuous: np.ndarray, discrete: np.ndarray):
def __init__(
self,
continuous: Optional[np.ndarray] = None,
discrete: Optional[np.ndarray] = None,
):
self._continuous: Optional[np.ndarray] = None
self._discrete: Optional[np.ndarray] = None
if continuous is not None:
self.add_continuous(continuous)
if discrete is not None:
self.add_discrete(discrete)
@property
def continuous(self) -> np.ndarray:
return self._continuous
@property
def discrete(self) -> np.ndarray:
return self._discrete
def add_continuous(self, continuous: np.ndarray) -> None:
if self._discrete is None:
self._discrete = np.zeros(
(continuous.shape[0], 0), dtype=self.discrete_dtype
)
if discrete.dtype != np.int32:
discrete = discrete.astype(np.int32, copy=False)
def add_discrete(self, discrete: np.ndarray) -> None:
if discrete.dtype != self.discrete_dtype:
discrete = discrete.astype(self.discrete_dtype, copy=False)
if self._continuous is None:
self._continuous = np.zeros((discrete.shape[0], 0), dtype=np.float32)
def continuous(self) -> np.ndarray:
return self._continuous
@abstractmethod
def discrete_dtype(self) -> np.dtype:
pass
@property
def discrete(self) -> np.ndarray:
return self._discrete
@staticmethod
def create_continuous(continuous: np.ndarray) -> "ActionTuple":
discrete = np.zeros((continuous.shape[0], 0), dtype=np.int32)
return ActionTuple(continuous, discrete)
class ActionTuple(_ActionTupleBase):
"""
An object whose fields correspond to actions of different types.
Continuous and discrete actions are numpy arrays of type float32 and
int32, respectively and are type checked on construction.
Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
"""
@staticmethod
def create_discrete(discrete: np.ndarray) -> "ActionTuple":
continuous = np.zeros((discrete.shape[0], 0), dtype=np.float32)
return ActionTuple(continuous, discrete)
@property
def discrete_dtype(self) -> np.dtype:
"""
The dtype of a discrete action.
"""
return np.int32
class ActionSpec(NamedTuple):

for a number of agents.
:param n_agents: The number of agents that will have actions generated
"""
continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
return ActionTuple(continuous, discrete)
_continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
_discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
return ActionTuple(continuous=_continuous, discrete=_discrete)
def random_action(self, n_agents: int) -> ActionTuple:
"""

"""
continuous = np.random.uniform(
_continuous = np.random.uniform(
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
_discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
discrete = np.column_stack(
_discrete = np.column_stack(
[
np.random.randint(
0,

for i in range(self.discrete_size)
]
)
return ActionTuple(continuous, discrete)
return ActionTuple(continuous=_continuous, discrete=_discrete)
def _validate_action(
self, actions: ActionTuple, n_agents: int, name: str

for the correct number of agents and ensures the type.
"""
_expected_shape = (n_agents, self.continuous_size)
if self.continuous_size > 0 and actions.continuous.shape != _expected_shape:
if actions.continuous.shape != _expected_shape:
raise UnityActionException(
f"The behavior {name} needs a continuous input of dimension "
f"{_expected_shape} for (<number of agents>, <action size>) but "

if self.discrete_size > 0 and actions.discrete.shape != _expected_shape:
if actions.discrete.shape != _expected_shape:
raise UnityActionException(
f"The behavior {name} needs a discrete input of dimension "
f"{_expected_shape} for (<number of agents>, <action size>) but "

ml-agents-envs/mlagents_envs/rpc_utils.py (2 changes)


from mlagents_envs.base_env import (
BehaviorSpec,
BehaviorSpec,
DecisionSteps,
TerminalSteps,
)

ml-agents/mlagents/trainers/agent_processor.py (26 changes)


from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union
from collections import defaultdict, Counter
import queue
import numpy as np
ActionTuple,
DecisionSteps,
DecisionStep,
TerminalSteps,

from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.behavior_id_utils import get_global_agent_id

done = terminated # Since this is an ongoing step
interrupted = step.interrupted if terminated else False
# Add the outputs of the last eval
action_dict = stored_take_action_outputs["action"]
action: Dict[str, np.ndarray] = {}
for act_type, act_array in action_dict.items():
action[act_type] = act_array[idx]
stored_actions = stored_take_action_outputs["action"]
action_tuple = ActionTuple(
continuous=stored_actions.continuous[idx],
discrete=stored_actions.discrete[idx],
)
action_probs_dict = stored_take_action_outputs["log_probs"]
action_probs: Dict[str, np.ndarray] = {}
for prob_type, prob_array in action_probs_dict.items():
action_probs[prob_type] = prob_array[idx]
stored_action_probs = stored_take_action_outputs["log_probs"]
log_probs_tuple = LogProbsTuple(
continuous=stored_action_probs.continuous[idx],
discrete=stored_action_probs.discrete[idx],
)
action_mask = stored_decision_step.action_mask
prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
experience = AgentExperience(

action=action,
action_probs=action_probs,
action=action_tuple,
action_probs=log_probs_tuple,
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
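
The change above slices a single agent's entry out of the batched tuples returned by the policy. A standalone sketch of the same indexing, assuming LogProbsTuple from the new action_log_probs.py listed in this PR (batch sizes are arbitrary):

    import numpy as np
    from mlagents_envs.base_env import ActionTuple
    from mlagents.trainers.torch.action_log_probs import LogProbsTuple

    # Batched outputs for 4 agents: 2 continuous dimensions, 1 discrete branch.
    stored_actions = ActionTuple(
        continuous=np.zeros((4, 2), dtype=np.float32),
        discrete=np.zeros((4, 1), dtype=np.int32),
    )
    stored_log_probs = LogProbsTuple(
        continuous=np.zeros((4, 2), dtype=np.float32),
        discrete=np.zeros((4, 1), dtype=np.float32),  # log probs stay float32
    )

    idx = 2  # index of one agent within the batch
    action_tuple = ActionTuple(
        continuous=stored_actions.continuous[idx],
        discrete=stored_actions.discrete[idx],
    )
    log_probs_tuple = LogProbsTuple(
        continuous=stored_log_probs.continuous[idx],
        discrete=stored_log_probs.discrete[idx],
    )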

ml-agents/mlagents/trainers/demo_loader.py (5 changes)


for i, obs in enumerate(split_obs.visual_observations):
demo_raw_buffer["visual_obs%d" % i].append(obs)
demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
if behavior_spec.action_spec.is_continuous():
# TODO: update to read from the new proto format
if behavior_spec.action_spec.continuous_size > 0:
else:
if behavior_spec.action_spec.discrete_size > 0:
demo_raw_buffer["discrete_action"].append(
current_pair_info.action_info.vector_actions
)
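
With a hybrid action spec, both action fields of the demo buffer now get filled. A minimal sketch using AgentBuffer directly; the key names follow the ones used elsewhere in this diff, and the demo proto parsing is omitted:

    import numpy as np
    from mlagents.trainers.buffer import AgentBuffer

    demo_raw_buffer = AgentBuffer()
    # One experience with 2 continuous action values and 1 discrete branch value.
    demo_raw_buffer["continuous_action"].append(np.zeros(2, dtype=np.float32))
    demo_raw_buffer["discrete_action"].append(np.zeros(1, dtype=np.int32))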

ml-agents/mlagents/trainers/env_manager.py (21 changes)


from abc import ABC, abstractmethod
import numpy as np
from typing import List, Dict, NamedTuple, Iterable, Tuple
from mlagents_envs.base_env import (

BehaviorName,
ActionTuple,
)
from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats

from mlagents_envs.logging_util import get_logger
from mlagents_envs.exception import UnityActionException
AllStepResult = Dict[BehaviorName, Tuple[DecisionSteps, TerminalSteps]]
AllGroupSpec = Dict[BehaviorName, BehaviorSpec]

step_info.environment_stats, step_info.worker_id
)
return len(step_infos)
@staticmethod
def action_tuple_from_numpy_dict(action_dict: Dict[str, np.ndarray]) -> ActionTuple:
if "continuous_action" in action_dict:
continuous = action_dict["continuous_action"]
if "discrete_action" in action_dict:
discrete = action_dict["discrete_action"]
action_tuple = ActionTuple(continuous, discrete)
else:
action_tuple = ActionTuple.create_continuous(continuous)
elif "discrete_action" in action_dict:
discrete = action_dict["discrete_action"]
action_tuple = ActionTuple.create_discrete(discrete)
else:
raise UnityActionException(
"The action dict must contain entries for either continuous_action or discrete_action."
)
return action_tuple

ml-agents/mlagents/trainers/policy/policy.py (17 changes)


from typing import Dict, List, Optional
import numpy as np
from mlagents_envs.base_env import DecisionSteps
from mlagents_envs.base_env import ActionTuple, BehaviorSpec, DecisionSteps
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.settings import TrainerSettings, NetworkSettings

self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
self.seed = seed
if (
self.behavior_spec.action_spec.continuous_size > 0
and self.behavior_spec.action_spec.discrete_size > 0
):
raise UnityPolicyException("Trainers do not support mixed action spaces.")
self.act_size = (
list(self.behavior_spec.action_spec.discrete_branches)
if self.behavior_spec.action_spec.is_discrete()

) -> None:
if memory_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]

)
def save_previous_action(
self, agent_ids: List[str], action_dict: Dict[str, np.ndarray]
self, agent_ids: List[str], action_tuple: ActionTuple
if action_dict is None or "discrete_action" not in action_dict:
return
self.previous_action_dict[agent_id] = action_dict["discrete_action"][
index, :
]
self.previous_action_dict[agent_id] = action_tuple.discrete[index, :]
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = self.make_empty_previous_action(len(agent_ids))
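
A rough sketch of what the ActionTuple-based save_previous_action now stores, with a plain dict standing in for the policy's previous_action_dict:

    import numpy as np
    from mlagents_envs.base_env import ActionTuple

    previous_action_dict = {}
    agent_ids = ["agent-0", "agent-1"]
    action_tuple = ActionTuple(discrete=np.array([[1], [0]], dtype=np.int32))

    # Only the discrete part is tracked as a previous action, as in the diff above.
    for index, agent_id in enumerate(agent_ids):
        previous_action_dict[agent_id] = action_tuple.discrete[index, :]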

ml-agents/mlagents/trainers/policy/tf_policy.py (24 changes)


from mlagents.tf_utils import tf
from mlagents import tf_utils
from mlagents_envs.exception import UnityException
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents_envs.base_env import DecisionSteps
from mlagents_envs.base_env import DecisionSteps, ActionTuple, BehaviorSpec
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, EncoderType
from mlagents.trainers import __version__

reparameterize,
condition_sigma_on_obs,
)
if (
self.behavior_spec.action_spec.continuous_size > 0
and self.behavior_spec.action_spec.discrete_size > 0
):
raise UnityPolicyException(
"TensorFlow does not support mixed action spaces. Please run with the Torch framework."
)
# for ghost trainer save/load snapshots
self.assign_phs: List[tf.Tensor] = []
self.assign_ops: List[tf.Operation] = []

self.save_memories(global_agent_ids, run_out.get("memory_out"))
# For Compatibility with buffer changes for hybrid action support
if "log_probs" in run_out:
run_out["log_probs"] = {"action_probs": run_out["log_probs"]}
log_probs_tuple = LogProbsTuple()
if self.behavior_spec.action_spec.is_continuous():
log_probs_tuple.add_continuous(run_out["log_probs"])
else:
log_probs_tuple.add_discrete(run_out["log_probs"])
run_out["log_probs"] = log_probs_tuple
action_tuple = ActionTuple()
run_out["action"] = {"continuous_action": run_out["action"]}
action_tuple.add_continuous(run_out["action"])
run_out["action"] = {"discrete_action": run_out["action"]}
action_tuple.add_discrete(run_out["action"])
run_out["action"] = action_tuple
return ActionInfo(
action=run_out.get("action"),
value=run_out.get("value"),
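
The TF policy wraps its single-space outputs in the new tuple types. A minimal sketch of that wrapping for a continuous-only behavior (shapes are illustrative):

    import numpy as np
    from mlagents_envs.base_env import ActionTuple
    from mlagents.trainers.torch.action_log_probs import LogProbsTuple

    raw_action = np.zeros((4, 2), dtype=np.float32)     # 4 agents, 2 continuous dims
    raw_log_probs = np.zeros((4, 2), dtype=np.float32)

    action_tuple = ActionTuple()
    action_tuple.add_continuous(raw_action)

    log_probs_tuple = LogProbsTuple()
    log_probs_tuple.add_continuous(raw_log_probs)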

ml-agents/mlagents/trainers/policy/torch_policy.py (54 changes)


GlobalSteps,
)
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
EPSILON = 1e-7 # Small value to avoid divide by zero

) -> Tuple[SplitObservations, np.ndarray]:
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
mask = None
if not self.use_continuous_act:
if self.behavior_spec.action_spec.discrete_size > 0:
mask = torch.ones([len(decision_requests), np.sum(self.act_size)])
if decision_requests.action_mask is not None:
mask = torch.as_tensor(

:param masks: Loss masks for RNN, else None.
:param memories: Input memories when using RNN, else None.
:param seq_len: Sequence length when using RNN.
:return: Tuple of actions, log probabilities (dependent on all_log_probs), entropies, and
output memories, all as Torch Tensors.
:return: Tuple of AgentAction, ActionLogProbs, entropies, and output memories.
if memories is None:
dists, memories = self.actor_critic.get_dists(
vec_obs, vis_obs, masks, memories, seq_len
)
else:
# If we're using LSTM. we need to execute the values to get the critic memories
dists, _, memories = self.actor_critic.get_dist_and_value(
vec_obs, vis_obs, masks, memories, seq_len
)
action_list = self.actor_critic.sample_action(dists)
log_probs_list, entropies, all_logs_list = ModelUtils.get_probs_and_entropy(
action_list, dists
)
actions = AgentAction.create(action_list, self.behavior_spec.action_spec)
log_probs = ActionLogProbs.create(
log_probs_list, self.behavior_spec.action_spec, all_logs_list
actions, log_probs, entropies, _, memories = self.actor_critic.get_action_stats_and_value(
vec_obs, vis_obs, masks, memories, seq_len
# Use the sum of entropy across actions, not the mean
entropy_sum = torch.sum(entropies, dim=1)
return (actions, log_probs, entropy_sum, memories)
return (actions, log_probs, entropies, memories)
def evaluate_actions(
self,

memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
dists, value_heads, _ = self.actor_critic.get_dist_and_value(
vec_obs, vis_obs, masks, memories, seq_len
)
action_list = actions.to_tensor_list()
log_probs_list, entropies, _ = ModelUtils.get_probs_and_entropy(
action_list, dists
log_probs, entropies, value_heads = self.actor_critic.get_stats_and_value(
vec_obs, vis_obs, actions, masks, memories, seq_len
log_probs = ActionLogProbs.create(
log_probs_list, self.behavior_spec.action_spec
)
# Use the sum of entropy across actions, not the mean
entropy_sum = torch.sum(entropies, dim=1)
return log_probs, entropy_sum, value_heads
return log_probs, entropies, value_heads
@timed
def evaluate(

action, log_probs, entropy, memories = self.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
)
action_dict = action.to_numpy_dict()
run_out["action"] = action_dict
action_tuple = action.to_action_tuple()
run_out["action"] = action_tuple
action_dict["continuous_action"] if self.use_continuous_act else None
action_tuple.continuous if self.use_continuous_act else None
run_out["log_probs"] = log_probs.to_numpy_dict()
run_out["log_probs"] = log_probs.to_log_probs_tuple()
run_out["entropy"] = ModelUtils.to_numpy(entropy)
run_out["learning_rate"] = 0.0
if self.use_recurrent:

ml-agents/mlagents/trainers/ppo/optimizer_tf.py (6 changes)


self.policy.sequence_length_ph: self.policy.sequence_length,
self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
self.advantage: mini_batch["advantages"],
self.all_old_log_probs: mini_batch["action_probs"],
if self.policy.use_continuous_act: # For hybrid action buffer support
feed_dict[self.all_old_log_probs] = mini_batch["continuous_log_probs"]
else:
feed_dict[self.all_old_log_probs] = mini_batch["discrete_log_probs"]
if self.policy.output_pre is not None and "actions_pre" in mini_batch:
feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
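
The TF PPO optimizer now reads the old log probs from one of two buffer fields, depending on the (single) action space of the TF policy. A small standalone sketch of that selection; the flag and shapes here are placeholders:

    import numpy as np

    use_continuous_act = True  # placeholder for policy.use_continuous_act
    mini_batch = {
        "continuous_log_probs": np.zeros((8, 2), dtype=np.float32),
        "discrete_log_probs": np.zeros((8, 3), dtype=np.float32),
    }
    old_log_probs = (
        mini_batch["continuous_log_probs"]
        if use_continuous_act
        else mini_batch["discrete_log_probs"]
    )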

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (5 changes)


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
class TorchPPOOptimizer(TorchOptimizer):

vis_obs.append(vis_ob)
else:
vis_obs = []
log_probs, entropy, values = self.policy.evaluate_actions(
vec_obs,
vis_obs,

ml-agents/mlagents/trainers/ppo/trainer.py (2 changes)


behavior_spec,
self.trainer_settings,
condition_sigma_on_obs=False, # Faster training for PPO
separate_critic=behavior_spec.action_spec.is_continuous(),
separate_critic=behavior_spec.action_spec.continuous_size > 0,
)
return policy

ml-agents/mlagents/trainers/sac/optimizer_torch.py (319 changes)


import numpy as np
from typing import Dict, List, Mapping, cast, Tuple, Optional
from typing import Dict, List, Mapping, NamedTuple, cast, Tuple, Optional
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import TrainerSettings, SACSettings
from contextlib import ExitStack

action_spec: ActionSpec,
):
super().__init__()
self.action_spec = action_spec
if self.action_spec.is_continuous():
self.act_size = self.action_spec.continuous_size
num_value_outs = 1
num_action_ins = self.act_size
num_value_outs = max(sum(action_spec.discrete_branches), 1)
num_action_ins = int(action_spec.continuous_size)
else:
self.act_size = self.action_spec.discrete_branches
num_value_outs = sum(self.act_size)
num_action_ins = 0
self.q1_network = ValueNetwork(
stream_names,
observation_shapes,

)
return q1_out, q2_out
class TargetEntropy(NamedTuple):
discrete: List[float] = [] # One per branch
continuous: float = 0.0
class LogEntCoef(nn.Module):
def __init__(self, discrete, continuous):
super().__init__()
self.discrete = discrete
self.continuous = continuous
def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
super().__init__(policy, trainer_params)
hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters)

self.policy = policy
self.act_size = policy.act_size
policy_network_settings = policy.network_settings
self.tau = hyperparameters.tau

name: int(not self.reward_signals[name].ignore_done)
for name in self.stream_names
}
self._action_spec = self.policy.behavior_spec.action_spec
self.policy.behavior_spec.action_spec,
self._action_spec,
)
self.target_network = ValueNetwork(

self.policy.actor_critic.critic, self.target_network, 1.0
)
self._log_ent_coef = torch.nn.Parameter(
torch.log(torch.as_tensor([self.init_entcoef] * len(self.act_size))),
# We create one entropy coefficient per action, whether discrete or continuous.
_disc_log_ent_coef = torch.nn.Parameter(
torch.log(
torch.as_tensor(
[self.init_entcoef] * len(self._action_spec.discrete_branches)
)
),
if self.policy.use_continuous_act:
self.target_entropy = torch.as_tensor(
-1
* self.continuous_target_entropy_scale
* np.prod(self.act_size[0]).astype(np.float32)
)
else:
self.target_entropy = [
self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
for i in self.act_size
]
_cont_log_ent_coef = torch.nn.Parameter(
torch.log(
torch.as_tensor([self.init_entcoef] * self._action_spec.continuous_size)
),
requires_grad=True,
)
self._log_ent_coef = TorchSACOptimizer.LogEntCoef(
discrete=_disc_log_ent_coef, continuous=_cont_log_ent_coef
)
_cont_target = (
-1
* self.continuous_target_entropy_scale
* np.prod(self._action_spec.continuous_size).astype(np.float32)
)
_disc_target = [
self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
for i in self._action_spec.discrete_branches
]
self.target_entropy = TorchSACOptimizer.TargetEntropy(
continuous=_cont_target, discrete=_disc_target
)
self.policy.actor_critic.distribution.parameters()
self.policy.actor_critic.action_model.parameters()
)
value_params = list(self.value_network.parameters()) + list(
self.policy.actor_critic.critic.parameters()

value_params, lr=hyperparameters.learning_rate
)
self.entropy_optimizer = torch.optim.Adam(
[self._log_ent_coef], lr=hyperparameters.learning_rate
self._log_ent_coef.parameters(), lr=hyperparameters.learning_rate
)
self._move_to_device(default_device())

q1p_out: Dict[str, torch.Tensor],
q2p_out: Dict[str, torch.Tensor],
loss_masks: torch.Tensor,
discrete: bool,
_ent_coef = torch.exp(self._log_ent_coef)
for name in values.keys():
if not discrete:
min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name])
else:
action_probs = log_probs.all_discrete_tensor.exp()
_branched_q1p = ModelUtils.break_into_branches(
q1p_out[name] * action_probs, self.act_size
)
_branched_q2p = ModelUtils.break_into_branches(
q2p_out[name] * action_probs, self.act_size
)
_q1p_mean = torch.mean(
torch.stack(
[
torch.sum(_br, dim=1, keepdim=True)
for _br in _branched_q1p
]
),
dim=0,
)
_q2p_mean = torch.mean(
torch.stack(
[
torch.sum(_br, dim=1, keepdim=True)
for _br in _branched_q2p
]
),
dim=0,
)
_cont_ent_coef = self._log_ent_coef.continuous.exp()
_disc_ent_coef = self._log_ent_coef.discrete.exp()
for name in values.keys():
if self._action_spec.discrete_size <= 0:
min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name])
else:
disc_action_probs = log_probs.all_discrete_tensor.exp()
_branched_q1p = ModelUtils.break_into_branches(
q1p_out[name] * disc_action_probs,
self._action_spec.discrete_branches,
)
_branched_q2p = ModelUtils.break_into_branches(
q2p_out[name] * disc_action_probs,
self._action_spec.discrete_branches,
)
_q1p_mean = torch.mean(
torch.stack(
[torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q1p]
),
dim=0,
)
_q2p_mean = torch.mean(
torch.stack(
[torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q2p]
),
dim=0,
)
min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean)
min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean)
if not discrete:
if self._action_spec.discrete_size <= 0:
_ent_coef * log_probs.continuous_tensor, dim=1
_cont_ent_coef * log_probs.continuous_tensor, dim=1
)
value_loss = 0.5 * ModelUtils.masked_mean(
torch.nn.functional.mse_loss(values[name], v_backup), loss_masks

disc_log_probs = log_probs.all_discrete_tensor
log_probs.all_discrete_tensor * log_probs.all_discrete_tensor.exp(),
self.act_size,
disc_log_probs * disc_log_probs.exp(),
self._action_spec.discrete_branches,
torch.sum(_ent_coef[i] * _lp, dim=1, keepdim=True)
torch.sum(_disc_ent_coef[i] * _lp, dim=1, keepdim=True)
for i, _lp in enumerate(branched_per_action_ent)
]
)

branched_ent_bonus, axis=0
)
# Add continuous entropy bonus to minimum Q
if self._action_spec.continuous_size > 0:
torch.sum(
_cont_ent_coef * log_probs.continuous_tensor,
dim=1,
keepdim=True,
)
value_loss = 0.5 * ModelUtils.masked_mean(
torch.nn.functional.mse_loss(values[name], v_backup.squeeze()),
loss_masks,

log_probs: ActionLogProbs,
q1p_outs: Dict[str, torch.Tensor],
loss_masks: torch.Tensor,
discrete: bool,
_ent_coef = torch.exp(self._log_ent_coef)
_cont_ent_coef, _disc_ent_coef = (
self._log_ent_coef.continuous,
self._log_ent_coef.discrete,
)
_cont_ent_coef = _cont_ent_coef.exp()
_disc_ent_coef = _disc_ent_coef.exp()
if not discrete:
mean_q1 = mean_q1.unsqueeze(1)
batch_policy_loss = torch.mean(
_ent_coef * log_probs.continuous_tensor - mean_q1, dim=1
)
policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
else:
action_probs = log_probs.all_discrete_tensor.exp()
batch_policy_loss = 0
if self._action_spec.discrete_size > 0:
disc_log_probs = log_probs.all_discrete_tensor
disc_action_probs = disc_log_probs.exp()
log_probs.all_discrete_tensor * action_probs, self.act_size
disc_log_probs * disc_action_probs, self._action_spec.discrete_branches
mean_q1 * action_probs, self.act_size
mean_q1 * disc_action_probs, self._action_spec.discrete_branches
torch.sum(_ent_coef[i] * _lp - _qt, dim=1, keepdim=True)
torch.sum(_disc_ent_coef[i] * _lp - _qt, dim=1, keepdim=False)
for i, (_lp, _qt) in enumerate(
zip(branched_per_action_ent, branched_q_term)
)

batch_policy_loss = torch.squeeze(branched_policy_loss)
policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
batch_policy_loss += torch.sum(branched_policy_loss, dim=1)
all_mean_q1 = torch.sum(disc_action_probs * mean_q1, dim=1)
else:
all_mean_q1 = mean_q1
if self._action_spec.continuous_size > 0:
cont_log_probs = log_probs.continuous_tensor
batch_policy_loss += torch.mean(
_cont_ent_coef * cont_log_probs - all_mean_q1.unsqueeze(1), dim=1
)
policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
self, log_probs: ActionLogProbs, loss_masks: torch.Tensor, discrete: bool
self, log_probs: ActionLogProbs, loss_masks: torch.Tensor
if not discrete:
with torch.no_grad():
target_current_diff = torch.sum(
log_probs.continuous_tensor + self.target_entropy, dim=1
)
entropy_loss = -1 * ModelUtils.masked_mean(
self._log_ent_coef * target_current_diff, loss_masks
)
else:
_cont_ent_coef, _disc_ent_coef = (
self._log_ent_coef.continuous,
self._log_ent_coef.discrete,
)
entropy_loss = 0
if self._action_spec.discrete_size > 0:
# Break continuous into separate branch
disc_log_probs = log_probs.all_discrete_tensor
log_probs.all_discrete_tensor * log_probs.all_discrete_tensor.exp(),
self.act_size,
disc_log_probs * disc_log_probs.exp(),
self._action_spec.discrete_branches,
branched_per_action_ent, self.target_entropy
branched_per_action_ent, self.target_entropy.discrete
)
],
axis=1,

)
entropy_loss = -1 * ModelUtils.masked_mean(
torch.mean(self._log_ent_coef * target_current_diff, axis=1), loss_masks
entropy_loss += -1 * ModelUtils.masked_mean(
torch.mean(_disc_ent_coef * target_current_diff, axis=1), loss_masks
)
if self._action_spec.continuous_size > 0:
with torch.no_grad():
cont_log_probs = log_probs.continuous_tensor
target_current_diff = torch.sum(
cont_log_probs + self.target_entropy.continuous, dim=1
)
# We update all the _cont_ent_coef as one block
entropy_loss += -1 * ModelUtils.masked_mean(
torch.mean(_cont_ent_coef) * target_current_diff, loss_masks
)
return entropy_loss

) -> Dict[str, torch.Tensor]:
condensed_q_output = {}
onehot_actions = ModelUtils.actions_to_onehot(discrete_actions, self.act_size)
onehot_actions = ModelUtils.actions_to_onehot(
discrete_actions, self._action_spec.discrete_branches
)
branched_q = ModelUtils.break_into_branches(item, self.act_size)
branched_q = ModelUtils.break_into_branches(
item, self._action_spec.discrete_branches
)
only_action_qs = torch.stack(
[
torch.sum(_act * _q, dim=1, keepdim=True)

value_estimates, _ = self.policy.actor_critic.critic_pass(
vec_obs, vis_obs, memories, sequence_length=self.policy.sequence_length
)
if self.policy.use_continuous_act:
squeezed_actions = actions.continuous_tensor
# Only need grad for q1, as that is used for policy.
q1p_out, q2p_out = self.value_network(
vec_obs,
vis_obs,
sampled_actions.continuous_tensor,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q2_grad=False,
)
q1_out, q2_out = self.value_network(
vec_obs,
vis_obs,
squeezed_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)
cont_sampled_actions = sampled_actions.continuous_tensor
cont_actions = actions.continuous_tensor
q1p_out, q2p_out = self.value_network(
vec_obs,
vis_obs,
cont_sampled_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)
q1_out, q2_out = self.value_network(
vec_obs,
vis_obs,
cont_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)
if self._action_spec.discrete_size > 0:
disc_actions = actions.discrete_tensor
q1_stream = self._condense_q_streams(q1_out, disc_actions)
q2_stream = self._condense_q_streams(q2_out, disc_actions)
else:
else:
# For discrete, you don't need to backprop through the Q for the policy
q1p_out, q2p_out = self.value_network(
vec_obs,
vis_obs,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q1_grad=False,
q2_grad=False,
)
q1_out, q2_out = self.value_network(
vec_obs,
vis_obs,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)
q1_stream = self._condense_q_streams(q1_out, actions.discrete_tensor)
q2_stream = self._condense_q_streams(q2_out, actions.discrete_tensor)
with torch.no_grad():
target_values, _ = self.target_network(

sequence_length=self.policy.sequence_length,
)
masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
use_discrete = not self.policy.use_continuous_act
dones = ModelUtils.list_to_tensor(batch["done"])
q1_loss, q2_loss = self.sac_q_loss(

log_probs, value_estimates, q1p_out, q2p_out, masks, use_discrete
log_probs, value_estimates, q1p_out, q2p_out, masks
policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks, use_discrete)
entropy_loss = self.sac_entropy_loss(log_probs, masks, use_discrete)
policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks)
entropy_loss = self.sac_entropy_loss(log_probs, masks)
total_value_loss = q1_loss + q2_loss + value_loss

"Losses/Value Loss": value_loss.item(),
"Losses/Q1 Loss": q1_loss.item(),
"Losses/Q2 Loss": q2_loss.item(),
"Policy/Entropy Coeff": torch.mean(torch.exp(self._log_ent_coef)).item(),
"Policy/Discrete Entropy Coeff": torch.mean(
torch.exp(self._log_ent_coef.discrete)
).item(),
"Policy/Continuous Entropy Coeff": torch.mean(
torch.exp(self._log_ent_coef.continuous)
).item(),
"Policy/Learning Rate": decay_lr,
}
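
A standalone sketch of the per-space entropy targets computed above, for a hybrid spec with 2 continuous dimensions and discrete branches (3, 2). The two scale values are placeholders, not taken from this diff:

    import numpy as np
    from mlagents_envs.base_env import ActionSpec

    action_spec = ActionSpec(continuous_size=2, discrete_branches=(3, 2))
    continuous_target_entropy_scale = 1.0  # placeholder
    discrete_target_entropy_scale = 0.2    # placeholder

    _cont_target = (
        -1
        * continuous_target_entropy_scale
        * np.prod(action_spec.continuous_size).astype(np.float32)
    )
    _disc_target = [
        discrete_target_entropy_scale * np.log(i).astype(np.float32)
        for i in action_spec.discrete_branches
    ]
    # These two values feed TorchSACOptimizer.TargetEntropy(continuous=..., discrete=...).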

ml-agents/mlagents/trainers/simple_env_manager.py (3 changes)


self.previous_all_action_info = all_action_info
for brain_name, action_info in all_action_info.items():
_action = EnvManager.action_tuple_from_numpy_dict(action_info.action)
self.env.set_actions(brain_name, _action)
self.env.set_actions(brain_name, action_info.action)
self.env.step()
all_step_result = self._generate_all_results()

ml-agents/mlagents/trainers/subprocess_env_manager.py (7 changes)


if req.cmd == EnvironmentCommand.STEP:
all_action_info = req.payload
for brain_name, action_info in all_action_info.items():
if len(action_info.action) != 0:
_action = EnvManager.action_tuple_from_numpy_dict(
action_info.action
)
env.set_actions(brain_name, _action)
if len(action_info.agent_ids) > 0:
env.set_actions(brain_name, action_info.action)
env.step()
all_step_result = _generate_all_results()
# The timers in this process are independent from all the processes and the "main" process

ml-agents/mlagents/trainers/tests/mock_brain.py (20 changes)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents_envs.base_env import (
DecisionSteps,

ActionTuple,
)

steps_list = []
action_size = action_spec.discrete_size + action_spec.continuous_size
action_probs = {
"action_probs": np.ones(
int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
dtype=np.float32,
)
}
for _i in range(length - 1):
obs = []
for _shape in observation_shapes:

if action_spec.is_continuous():
action = {"continuous_action": np.zeros(action_size, dtype=np.float32)}
else:
action = {"discrete_action": np.zeros(action_size, dtype=np.float32)}
action = ActionTuple(
continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
)
action_probs = LogProbsTuple(
continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
)
action_pre = np.zeros(action_size, dtype=np.float32)
action_mask = (
[

ml-agents/mlagents/trainers/tests/simple_test_envs.py (49 changes)


OBS_SIZE = 1
VIS_OBS_SIZE = (20, 20, 3)
STEP_SIZE = 0.1
STEP_SIZE = 0.2
TIME_PENALTY = 0.01
MIN_STEPS = int(1.0 / STEP_SIZE) + 1

def __init__(
self,
brain_names,
use_discrete,
action_size=1,
action_sizes=(1, 0),
self.discrete = use_discrete
if use_discrete:
action_spec = ActionSpec.create_discrete(
tuple(2 for _ in range(action_size))
)
else:
action_spec = ActionSpec.create_continuous(action_size)
continuous_action_size, discrete_action_size = action_sizes
discrete_tuple = tuple(2 for _ in range(discrete_action_size))
action_spec = ActionSpec(continuous_action_size, discrete_tuple)
self.total_action_size = (
continuous_action_size + discrete_action_size
) # to set the goals/positions
self.action_spec = action_spec
self.action_size = action_size
self.names = brain_names
self.positions: Dict[str, List[float]] = {}
self.step_count: Dict[str, float] = {}

def _take_action(self, name: str) -> bool:
deltas = []
_act = self.action[name]
if self.action_spec.continuous_size > 0:
for _cont in _act.continuous[0]:
deltas.append(_cont)
if self.action_spec.continuous_size > 0:
for _cont in _act.continuous[0]:
deltas.append(_cont)
for i, _delta in enumerate(deltas):
_delta = clamp(_delta, -self.step_size, self.step_size)
self.positions[name][i] += _delta

return done
def _generate_mask(self):
if self.discrete:
action_mask = None
if self.action_spec.discrete_size > 0:
ndmask = np.array(2 * self.action_size * [False], dtype=np.bool)
ndmask = np.array(
2 * self.action_spec.discrete_size * [False], dtype=np.bool
)
else:
action_mask = None
return action_mask
def _compute_reward(self, name: str, done: bool) -> float:

def _reset_agent(self, name):
self.goal[name] = self.random.choice([-1, 1])
self.positions[name] = [0.0 for _ in range(self.action_size)]
self.positions[name] = [0.0 for _ in range(self.total_action_size)]
self.step_count[name] = 0
self.rewards[name] = 0
self.agent_id[name] = self.agent_id[name] + 1

class MemoryEnvironment(SimpleEnvironment):
def __init__(self, brain_names, use_discrete, step_size=0.2):
super().__init__(brain_names, use_discrete, step_size=step_size)
def __init__(self, brain_names, action_sizes=(1, 0), step_size=0.2):
super().__init__(brain_names, action_sizes=action_sizes, step_size=step_size)
# Number of steps to reveal the goal for. Lower is harder. Should be
# less than 1/step_size to force agent to use memory
self.num_show_steps = 2

def __init__(
self,
brain_names,
use_discrete,
action_sizes=(1, 0),
use_discrete,
action_sizes=action_sizes,
)
self.demonstration_protos: Dict[str, List[AgentInfoActionPairProto]] = {}
self.n_demos = n_demos

def step(self) -> None:
super().step()
for name in self.names:
if self.discrete:
if self.action_spec.discrete_size > 0:
action = self.action[name].discrete
else:
action = self.action[name].continuous

self.reset()
for _ in range(self.n_demos):
for name in self.names:
if self.discrete:
if self.action_spec.discrete_size > 0:
self.action[name] = ActionTuple(
np.array([], dtype=np.float32),
np.array(

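The simple test environment above now derives a hybrid ActionSpec from a (continuous, discrete) action_sizes pair. The same construction in isolation, with action_sizes=(1, 2) as an example:

    from mlagents_envs.base_env import ActionSpec

    continuous_action_size, discrete_action_size = (1, 2)
    discrete_tuple = tuple(2 for _ in range(discrete_action_size))  # two branches of size 2
    action_spec = ActionSpec(continuous_action_size, discrete_tuple)

    actions = action_spec.random_action(n_agents=4)
    assert actions.continuous.shape == (4, 1)
    assert actions.discrete.shape == (4, 2)
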
ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (66 changes)


dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
behavior_spec = optimizer.policy.behavior_spec
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
# NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
if discrete:
n_agents = len(update_buffer["discrete_log_probs"])
update_buffer["discrete_log_probs"] = np.ones(
(n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
dtype=np.float32,
)
else:
n_agents = len(update_buffer["continuous_log_probs"])
update_buffer["continuous_log_probs"] = np.ones(
(n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
behavior_spec = optimizer.policy.behavior_spec
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]

# NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
if discrete:
n_agents = len(update_buffer["discrete_log_probs"])
update_buffer["discrete_log_probs"] = np.ones(
(n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
dtype=np.float32,
)
else:
n_agents = len(update_buffer["continuous_log_probs"])
update_buffer["continuous_log_probs"] = np.ones(
(n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

use_visual=False,
)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
behavior_spec = optimizer.policy.behavior_spec
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]

# NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
n_agents = len(update_buffer["continuous_log_probs"])
update_buffer["continuous_log_probs"] = np.ones(
(n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

buffer["curiosity_returns"] = buffer["environment_rewards"]
buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
buffer["advantages"] = buffer["environment_rewards"]
# NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
if use_discrete:
n_agents = len(buffer["discrete_log_probs"])
buffer["discrete_log_probs"].reset_field()
for _ in range(n_agents):
buffer["discrete_log_probs"].append(
np.ones(
int(sum(mock_behavior_spec.action_spec.discrete_branches)),
dtype=np.float32,
)
)
else:
n_agents = len(buffer["continuous_log_probs"])
buffer["continuous_log_probs"].reset_field()
for _ in range(n_agents):
buffer["continuous_log_probs"].append(
np.ones(
mock_behavior_spec.action_spec.continuous_size, dtype=np.float32
)
)
trainer.update_buffer = buffer
trainer._update_policy()
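
The TF tests above overwrite the stored log-prob fields with placeholders of the right shape. The same buffer pattern in isolation, for a discrete-only spec with branches (3, 2):

    import numpy as np
    from mlagents.trainers.buffer import AgentBuffer
    from mlagents_envs.base_env import ActionSpec

    action_spec = ActionSpec(continuous_size=0, discrete_branches=(3, 2))
    buffer = AgentBuffer()
    buffer["discrete_log_probs"].append(
        np.zeros(int(sum(action_spec.discrete_branches)), dtype=np.float32)
    )

    n_agents = len(buffer["discrete_log_probs"])
    buffer["discrete_log_probs"].reset_field()
    for _ in range(n_agents):
        buffer["discrete_log_probs"].append(
            np.ones(int(sum(action_spec.discrete_branches)), dtype=np.float32)
        )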

ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py (114 changes)


assert all(reward > success_threshold for reward in processed_rewards)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ppo(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
def test_2d_ppo(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
new_hyperparams = attr.evolve(
PPO_TF_CONFIG.hyperparameters, batch_size=64, buffer_size=640
)

_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_visual_ppo(num_visual, use_discrete):
def test_visual_ppo(num_visual, action_sizes):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=0,
step_size=0.2,

def test_visual_advanced_ppo(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
action_sizes=(0, 1),
num_visual=num_visual,
num_vector=0,
step_size=0.5,

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_recurrent_ppo(action_sizes):
env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
new_network_settings = attr.evolve(
PPO_TF_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_sac(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_sac(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
def test_2d_sac(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
new_hyperparams = attr.evolve(SAC_TF_CONFIG.hyperparameters, buffer_init_steps=2000)
config = attr.evolve(
SAC_TF_CONFIG,

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_visual_sac(num_visual, use_discrete):
def test_visual_sac(num_visual, action_sizes):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=0,
step_size=0.2,

def test_visual_advanced_sac(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,